In [1]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm_notebook
FOLDER = "Data/"
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)

['df_2000_2001.csv.gz',
 'df_2000_2001_budget_revenue_title_cert.csv.gz',
 'df_clean_basics.csv.gz',
 'final_basics_df.csv.gz',
 'final_tmdb_data_2000.csv.gz',
 'final_tmdb_data_2001.csv.gz',
 'title_akas.csv.gz',
 'title_basics.csv.gz',
 'title_ratings.csv.gz',
 'tmdb_api_results_2000.json',
 'tmdb_api_results_2001.json',
 'tmdb_results_combined.csv.gz']

In [2]:
basics_url="https://datasets.imdbws.com/title.basics.tsv.gz"

In [3]:
basics = pd.read_csv(basics_url,sep='\t', low_memory=False)

In [4]:
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [5]:
ratings_url="https://datasets.imdbws.com/title.ratings.tsv.gz"

In [6]:
ratings = pd.read_csv(ratings_url,sep='\t', low_memory=False)

In [7]:
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1909
1,tt0000002,5.8,256
2,tt0000003,6.5,1710
3,tt0000004,5.6,169
4,tt0000005,6.2,2525


In [8]:
akas_url="https://datasets.imdbws.com/title.akas.tsv.gz"

In [9]:
akas = pd.read_csv(akas_url,sep='\t', low_memory=False)

In [10]:
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [11]:
basics = basics.replace({'\\N':np.nan})

In [12]:
akas = akas.replace({'\\N':np.nan})

In [13]:
ratings = ratings.replace({'\\N':np.nan})

In [14]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9202163 entries, 0 to 9202162
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 631.9+ MB


In [15]:
basics['runtimeMinutes'].value_counts()

30                         131226
60                         102791
22                          92445
44                          69259
45                          58507
                            ...  
569                             1
670                             1
924                             1
Animation,Comedy,Family         1
2088                            1
Name: runtimeMinutes, Length: 873, dtype: int64

In [16]:
basics.duplicated().sum()

0

In [17]:
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle           11
originalTitle          11
isAdult                 1
startYear         1225225
endYear           9106402
runtimeMinutes    6733736
genres             426756
dtype: int64

In [18]:
basics.dropna(subset=['runtimeMinutes'], inplace=True)
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 1
startYear           37697
endYear           2421645
runtimeMinutes          0
genres              67377
dtype: int64

In [19]:
basics.dropna(subset=['genres'], inplace=True)
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 0
startYear           36357
endYear           2355832
runtimeMinutes          0
genres                  0
dtype: int64

In [20]:
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 0
startYear           30812
endYear           1984662
runtimeMinutes          0
genres                  0
dtype: int64

In [21]:
basics['titleType'].value_counts()

tvEpisode       955232
short           484722
movie           279934
video           139801
tvSeries         74177
tvMovie          56147
tvSpecial        13907
tvMiniSeries     11651
tvShort           7016
videoGame          285
Name: titleType, dtype: int64

In [22]:
basics = basics[basics["titleType"].str.contains("movie")==True]
basics['titleType'].value_counts()

movie    279934
Name: titleType, dtype: int64

In [23]:
basics['startYear'].value_counts()

2018    9580
2017    9398
2019    9312
2016    8996
2015    8544
        ... 
1906       1
1903       1
1908       1
2027       1
1894       1
Name: startYear, Length: 124, dtype: int64

In [24]:
basics.dropna(subset=['startYear'], inplace=True)
basics.isna().sum()

tconst                 0
titleType              0
primaryTitle           0
originalTitle          0
isAdult                0
startYear              0
endYear           275633
runtimeMinutes         0
genres                 0
dtype: int64

In [25]:
basics = basics[basics['startYear'] >= '2000']
basics['startYear'].value_counts()

2018    9580
2017    9398
2019    9312
2016    8996
2015    8544
2014    8143
2021    8000
2013    7763
2020    7486
2012    7270
2011    6742
2010    6344
2009    5952
2022    5841
2008    5182
2007    4604
2006    4370
2005    3881
2004    3506
2003    3215
2002    2971
2001    2843
2000    2716
2023     286
2024      28
2025       6
2026       2
2027       1
Name: startYear, dtype: int64

In [26]:
basics = basics[basics['startYear'] <= '2021']
basics['startYear'].value_counts()

2018    9580
2017    9398
2019    9312
2016    8996
2015    8544
2014    8143
2021    8000
2013    7763
2020    7486
2012    7270
2011    6742
2010    6344
2009    5952
2008    5182
2007    4604
2006    4370
2005    3881
2004    3506
2003    3215
2002    2971
2001    2843
2000    2716
Name: startYear, dtype: int64

In [27]:
keepers =basics['tconst'].isin(akas['titleId'])
keepers

34790      True
61091      True
67637      True
77931      True
86768      True
           ... 
9201835    True
9201844    True
9201883    True
9201928    True
9202012    True
Name: tconst, Length: 136818, dtype: bool

In [28]:
basics = basics[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34790,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61091,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
67637,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
77931,tt0079644,movie,November 1828,November 1828,0,2001,,140,"Drama,War"
86768,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
...,...,...,...,...,...,...,...,...,...
9201835,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
9201844,tt9916190,movie,Safeguard,Safeguard,0,2020,,95,"Action,Adventure,Thriller"
9201883,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0,2020,,84,Thriller
9201928,tt9916362,movie,Coven,Akelarre,0,2020,,92,"Drama,History"


In [29]:
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,,imdbDisplay,,0
1,tt0000001,2,Carmencita,DE,,,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,,imdbDisplay,,0
3,tt0000001,4,Καρμενσίτα,GR,,imdbDisplay,,0
4,tt0000001,5,Карменсита,RU,,imdbDisplay,,0


In [30]:
akas['region'].value_counts()

JP    3951923
FR    3951792
DE    3934949
IN    3880396
ES    3874828
       ...   
TV          1
NU          1
PW          1
NR          1
TC          1
Name: region, Length: 247, dtype: int64

In [31]:
akas = akas[akas["region"].str.contains("US")==True]
akas['region'].value_counts()

US    1347304
Name: region, dtype: int64

In [32]:
akas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1347304 entries, 5 to 33097590
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1347304 non-null  object
 1   ordering         1347304 non-null  int64 
 2   title            1347304 non-null  object
 3   region           1347304 non-null  object
 4   language         3702 non-null     object
 5   types            963869 non-null   object
 6   attributes       44877 non-null    object
 7   isOriginalTitle  1345929 non-null  object
dtypes: int64(1), object(7)
memory usage: 92.5+ MB


In [33]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 136208 entries, 34790 to 9202012
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          136208 non-null  object
 1   titleType       136208 non-null  object
 2   primaryTitle    136208 non-null  object
 3   originalTitle   136208 non-null  object
 4   isAdult         136208 non-null  object
 5   startYear       136208 non-null  object
 6   endYear         0 non-null       object
 7   runtimeMinutes  136208 non-null  object
 8   genres          136208 non-null  object
dtypes: object(9)
memory usage: 10.4+ MB


In [34]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1258876 entries, 0 to 1258875
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1258876 non-null  object 
 1   averageRating  1258876 non-null  float64
 2   numVotes       1258876 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 28.8+ MB


In [35]:
# example making new folder with os
import os
os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")

['df_2000_2001.csv.gz',
 'df_2000_2001_budget_revenue_title_cert.csv.gz',
 'df_clean_basics.csv.gz',
 'final_basics_df.csv.gz',
 'final_tmdb_data_2000.csv.gz',
 'final_tmdb_data_2001.csv.gz',
 'title_akas.csv.gz',
 'title_basics.csv.gz',
 'title_ratings.csv.gz',
 'tmdb_api_results_2000.json',
 'tmdb_api_results_2001.json',
 'tmdb_results_combined.csv.gz']

In [36]:
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

In [37]:
akas.to_csv("Data/title_akas.csv.gz",compression='gzip',index=False)

In [38]:
ratings.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)

In [39]:
import json
with open('C:/Users/fid24/.secret/tmdb_api.json', 'r') as f:
    login = json.load(f)
## Display the keys of the loaded dict
login.keys()

dict_keys(['api-key-tmdb'])

In [40]:
import tmdbsimple as tmdb
tmdb.API_KEY =  login['api-key-tmdb']

In [89]:
YEARS_TO_GET = [2002,2003,2004,2005,2006,2008,2009,2010,2011,2012,2013,2014] 

In [90]:
def get_movie_with_rating(movie_id):
    # Get the movie object for the current id
    movie = tmdb.Movies(movie_id)
    # save the .info .releases dictionaries
    movie_info = movie.info()
    releases = movie.releases()
    # Loop through countries in releases
    for c in releases['countries']:
        # if the country abbreviation==US
        if c['iso_3166_1' ] =='US':
            ## save a "certification" key in the info dict with the certification
            movie_info['certification'] = c['certification']
    return movie_info

In [91]:
 def write_json(new_data, filename): 
    """Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/"""    
    
    with open(filename,'r+') as file:
        # First we load existing data into a dict.
        file_data = json.load(file)
        ## Choose extend or append
        if (type(new_data) == list) & (type(file_data) == list):
            file_data.extend(new_data)
        else:
             file_data.append(new_data)
        # Sets file's current position at offset.
        file.seek(0)
        # convert back to json.
        json.dump(file_data, file)

In [92]:
# Start of OUTER loop
for YEAR in tqdm_notebook(YEARS_TO_GET,desc='YEARS',position=0):
    # If it does not exist: create it
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'
    file_exists = os.path.isfile(JSON_FILE)
    if file_exists == False:    
        with open(JSON_FILE,'w') as f:
            json.dump([{'imdb_id':0}],f)
    # Load in the dataframe from project part 1 as basics:
    basics = pd.read_csv('Data/title_basics.csv.gz')
    #Saving new year as the current df
    df = basics.loc[ basics['startYear']==YEAR].copy()
    # saving movie ids to list
    movie_ids = df['tconst'].copy()#.to_list()
    # Load existing data from json into a dataframe called "previous_df"
    previous_df = pd.read_json(JSON_FILE)
    # filter out any ids that are already in the JSON_FILE
    movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]
    #Get index and movie id from list
    # INNER Loop
    for movie_id in tqdm_notebook(movie_ids_to_get,
                                  desc=f'Movies from {YEAR}',
                                  position=1,
                                  leave=True):
        # Attempt to retrieve then data for the movie id
        try:
            temp = get_movie_with_rating(movie_id)  #This uses your pre-made function
            # Append/extend results to existing file using a pre-made function
            write_json(temp,JSON_FILE)
            # Short 20 ms sleep to prevent overwhelming server
            time.sleep(0.02)
            
        # If it fails,  make a dict with just the id and None for certification.
        except Exception as e:
            continue

    final_year_df = pd.read_json(JSON_FILE)
    final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz", compression="gzip", index=False)

YEARS:   0%|          | 0/1 [00:00<?, ?it/s]

ValueError: Expected object or value

In [45]:
with open('C:/Users/fid24/.secret/tmdb_api.json', 'r') as f:
    login = json.load(f)
## Display the keys of the loaded dict
login.keys()

dict_keys(['api-key-tmdb'])

In [46]:
import tmdbsimple as tmdb
tmdb.API_KEY =  login['api-key-tmdb']

In [77]:
JSON_FILE_2002 = f"C:/Users/fid24/OneDrive/Documents/Github/Project-3-/data/final_tmdb_data_2002.csv.gz"
JSON_FILE_2002

'C:/Users/fid24/OneDrive/Documents/Github/Project-3-/data/final_tmdb_data_2002.csv.gz'

In [78]:
df_2002 = pd.read_csv(JSON_FILE_2002)
df_2002.head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0096056,0.0,/95U3MUDXu4xSCmVLtWgargRipDi.jpg,,0.0,"[{'id': 18, 'name': 'Drama'}]",,109809.0,en,Crime and Punishment,...,0.0,126.0,"[{'english_name': 'Polish', 'iso_639_1': 'pl',...",Released,,Crime and Punishment,0.0,5.5,11.0,
2,tt0118926,0.0,/qR3Dk3ctnrrxkAI6I472RhamIbu.jpg,,0.0,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",,20689.0,en,The Dancer Upstairs,...,5227348.0,132.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,"An honest man caught in a world of intrigue, p...",The Dancer Upstairs,0.0,6.294,51.0,R
3,tt0119980,0.0,,,0.0,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",,563364.0,en,Random Shooting in LA,...,0.0,91.0,[],Released,,Random Shooting in LA,0.0,0.0,0.0,
4,tt0120679,0.0,/s04Ds4xbJU7DzeGVyamccH4LoxF.jpg,,12000000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",https://www.miramax.com/movie/frida,1360.0,en,Frida,...,56298474.0,123.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Prepare to be seduced.,Frida,0.0,7.455,1764.0,R


In [79]:
JSON_FILE_2003 = f"C:/Users/fid24/OneDrive/Documents/Github/Project-3-/data/final_tmdb_data_2003.csv.gz"
JSON_FILE_2003

'C:/Users/fid24/OneDrive/Documents/Github/Project-3-/data/final_tmdb_data_2003.csv.gz'

In [80]:
df_2003 = pd.read_csv(JSON_FILE_2003)
df_2003.head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0119727,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}]",,263128.0,pt,Mulher Polícia,...,0.0,100.0,"[{'english_name': 'Portuguese', 'iso_639_1': '...",Released,,The Policewoman,0.0,5.0,3.0,
2,tt0120607,0.0,,,0.0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,584762.0,en,Between The Sheets,...,0.0,102.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,Between The Sheets,0.0,0.0,0.0,
3,tt0135991,0.0,,,0.0,"[{'id': 27, 'name': 'Horror'}, {'id': 16, 'nam...",,140359.0,en,Dominator,...,0.0,77.0,[],Released,Anyone who says Rock 'n' Roll is dead - WILL be!,Dominator,0.0,1.5,2.0,
4,tt0138524,0.0,/x1BTLDDijvms3Yi9NZoxxniTDGA.jpg,,60000000.0,"[{'id': 80, 'name': 'Crime'}, {'id': 35, 'name...",,11775.0,en,Intolerable Cruelty,...,119940815.0,100.0,"[{'english_name': 'French', 'iso_639_1': 'fr',...",Released,They can't keep their hands off each others as...,Intolerable Cruelty,0.0,5.951,1205.0,PG-13


In [81]:
JSON_FILE_2004 = f"C:/Users/fid24/OneDrive/Documents/Github/Project-3-/data/final_tmdb_data_2004.csv.gz"
JSON_FILE_2004

'C:/Users/fid24/OneDrive/Documents/Github/Project-3-/data/final_tmdb_data_2004.csv.gz'

In [82]:
df_2004 = pd.read_csv(JSON_FILE_2004)
df_2004.head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0092960,0.0,,,0.0,[],,678770.0,en,Emilio Varela vs. Camelia la Texana,...,0.0,102.0,[],Released,,Emilio Varela vs. Camelia la Texana,0.0,5.0,2.0,
2,tt0122247,0.0,,,0.0,"[{'id': 27, 'name': 'Horror'}, {'id': 35, 'nam...",,27743.0,en,Serial Killing 4 Dummys,...,0.0,89.0,[],Released,,Serial Killing 101,0.0,5.6,11.0,
3,tt0146803,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}]",,46730.0,en,"Goodnight, Joseph Parker",...,0.0,0.0,[],Released,When there's no place left to go but home,"Goodnight, Joseph Parker",0.0,0.0,0.0,
4,tt0159290,0.0,/atFOPVD3AKSj16ImD1MJfIklf90.jpg,,0.0,"[{'id': 18, 'name': 'Drama'}]",,101917.0,nl,Bluebird,...,0.0,77.0,"[{'english_name': 'Dutch', 'iso_639_1': 'nl', ...",Released,"Merel, a talented young girl, is suddenly gett...",Bluebird,0.0,6.333,9.0,


In [83]:
JSON_FILE_2005 = f"C:/Users/fid24/OneDrive/Documents/Github/Project-3-/data/final_tmdb_data_2005.csv.gz"
JSON_FILE_2005

'C:/Users/fid24/OneDrive/Documents/Github/Project-3-/data/final_tmdb_data_2005.csv.gz'

In [84]:
df_2005 = pd.read_csv(JSON_FILE_2005)
df_2005.head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0088751,0.0,,,350000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 27, 'nam...",,29163.0,en,The Naked Monster,...,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,The Naked Monster,0.0,3.4,5.0,
2,tt0118141,0.0,/unoJZwLGTlzKc3QkvsERVPLRnFH.jpg,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 14, 'name...",http://www.crispinglover.com/whatisit.htm,54506.0,en,What Is It?,...,0.0,72.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The adventures of a young man whose principle ...,What Is It?,0.0,5.8,22.0,NC-17
3,tt0120667,0.0,/jkBEPKRq4HWlLwsMFMdDiYwaCle.jpg,"{'id': 9744, 'name': 'Fantastic Four Collectio...",100000000.0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",,9738.0,en,Fantastic Four,...,333535934.0,106.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,4 times the action. 4 times the adventure. 4 t...,Fantastic Four,0.0,5.77,8215.0,PG-13
4,tt0121164,0.0,/r4VumNLSafeGhlieKNhGv0BQ4UD.jpg,,40000000.0,"[{'id': 10749, 'name': 'Romance'}, {'id': 14, ...",http://corpsebridemovie.warnerbros.com/,3933.0,en,Corpse Bride,...,118133252.0,77.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,There's been a grave misunderstanding.,Corpse Bride,0.0,7.486,7358.0,


In [85]:
JSON_FILE_2006 = f"C:/Users/fid24/OneDrive/Documents/Github/Project-3-/data/final_tmdb_data_2006.csv.gz"
JSON_FILE_2006

'C:/Users/fid24/OneDrive/Documents/Github/Project-3-/data/final_tmdb_data_2006.csv.gz'

In [86]:
df_2006 = pd.read_csv(JSON_FILE_2006)
df_2006.head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0110476,0.0,/tGCeUKo6g74OehsgJ1rCtpi85aT.jpg,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,63163.0,ru,Мастер и Маргарита,...,0.0,128.0,"[{'english_name': 'Russian', 'iso_639_1': 'ru'...",Released,Manuscripts don't burn,The Master and Margarita,0.0,6.075,20.0,
2,tt0144280,0.0,,,100000.0,[],,30356.0,en,Whispers from a Shallow Grave,...,0.0,90.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,Whispers from a Shallow Grave,0.0,2.0,2.0,
3,tt0166871,0.0,,,0.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 80, 'nam...",,218584.0,pt,Viúva Rica Solteira Não Fica,...,56360.0,135.0,"[{'english_name': 'Portuguese', 'iso_639_1': '...",Released,,Viúva Rica Solteira Não Fica,0.0,7.2,6.0,
4,tt0197633,0.0,/1hGrzTajJ5JC6VfPzrUj8Gj5t5k.jpg,,0.0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",,58520.0,en,Live Freaky! Die Freaky!,...,0.0,75.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,Live Freaky! Die Freaky!,0.0,3.8,8.0,NR


In [87]:
JSON_FILE_2007 = f"C:/Users/fid24/OneDrive/Documents/Github/Project-3-/data/final_tmdb_data_2007.csv.gz"
JSON_FILE_2007

'C:/Users/fid24/OneDrive/Documents/Github/Project-3-/data/final_tmdb_data_2007.csv.gz'

In [88]:
df_2007 = pd.read_csv(JSON_FILE_2007)
df_2007.head()

FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/fid24/OneDrive/Documents/Github/Project-3-/data/final_tmdb_data_2007.csv.gz'

In [47]:
JSON_FILE_2008 = f"C:/Users/fid24/OneDrive/Documents/Github/Project-3-/data/final_tmdb_data_2008.csv.gz"
JSON_FILE_2008

'C:/Users/fid24/OneDrive/Documents/Github/Project-3-/data/final_tmdb_data_2008.csv.gz'

In [48]:
df_2008 = pd.read_csv(JSON_FILE_2008)
df_2008.head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0115686,0.0,,,0.0,"[{'id': 14, 'name': 'Fantasy'}, {'id': 18, 'na...",,710448.0,ja,ビリケン,...,0.0,100.0,"[{'english_name': 'Japanese', 'iso_639_1': 'ja...",Released,,Billiken,0.0,0.0,0.0,
2,tt0119970,0.0,,,0.0,[],,165200.0,en,My Apocalypse,...,0.0,0.0,[],Released,,My Apocalypse,0.0,4.8,2.0,
3,tt0177636,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10402, 'n...",,8890.0,de,Geliebte Clara,...,0.0,107.0,"[{'english_name': 'German', 'iso_639_1': 'de',...",Released,,Clara,0.0,4.7,6.0,
4,tt0200465,0.0,/ynnarxTMs8jWKV9GK0SpUhGNCgH.jpg,,20000000.0,"[{'id': 53, 'name': 'Thriller'}, {'id': 80, 'n...",http://thebankjobmovie.com/,8848.0,en,The Bank Job,...,64828421.0,112.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The true story of a heist gone wrong... in all...,The Bank Job,0.0,6.818,1819.0,R


In [49]:
JSON_FILE_2009 = f"C:/Users/fid24/OneDrive/Documents/Github/Project-3-/data/final_tmdb_data_2009.csv.gz"
JSON_FILE_2009

'C:/Users/fid24/OneDrive/Documents/Github/Project-3-/data/final_tmdb_data_2009.csv.gz'

In [50]:
df_2009 = pd.read_csv(JSON_FILE_2009)
df_2009.head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0116991,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}]",,492619.0,en,Mariette in Ecstasy,...,0.0,86.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,Mariette in Ecstasy,0.0,0.0,0.0,PG-13
2,tt0143558,0.0,/mUdwA3Szik9bkIMWEeux49I4EgL.jpg,,0.0,"[{'id': 18, 'name': 'Drama'}]",,410363.0,hu,Apaföld,...,0.0,80.0,"[{'english_name': 'Hungarian', 'iso_639_1': 'h...",Released,,Father's Acre,0.0,6.8,5.0,
3,tt0153140,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 35, 'name...",,367762.0,hu,Rózsaszín sajt,...,0.0,93.0,"[{'english_name': 'French', 'iso_639_1': 'fr',...",Released,"When you hug too strong, you don’t see the face.",Camembert Rose,0.0,0.0,0.0,
4,tt0205380,0.0,/dcF1eyBqF2Cb9rGpmsduLjZTNKr.jpg,,0.0,"[{'id': 28, 'name': 'Action'}, {'id': 18, 'nam...",,435199.0,hi,Sanam Teri Kasam,...,0.0,170.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,,Sanam Teri Kasam,0.0,7.0,2.0,


In [51]:
JSON_FILE_2010 = f"C:/Users/fid24/OneDrive/Documents/Github/Project-3-/data/final_tmdb_data_2010.csv.gz"
JSON_FILE_2010

'C:/Users/fid24/OneDrive/Documents/Github/Project-3-/data/final_tmdb_data_2010.csv.gz'

In [52]:
df_2010 = pd.read_csv(JSON_FILE_2010)
df_2010.head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0146592,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}]",,163942.0,hu,Pál Adrienn,...,0.0,136.0,"[{'english_name': 'Hungarian', 'iso_639_1': 'h...",Released,,Adrienn Pál,0.0,5.7,7.0,
2,tt0154039,0.0,,,0.0,[],,260823.0,hu,Oda az igazság,...,0.0,100.0,"[{'english_name': 'Hungarian', 'iso_639_1': 'h...",Released,,So Much for Justice!,0.0,4.0,1.0,
3,tt0162942,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}]",,61200.0,en,A zöld sárkány gyermekei,...,0.0,0.0,[],Released,,Children of the Green Dragon,0.0,5.7,3.0,
4,tt0312305,0.0,,,0.0,"[{'id': 10751, 'name': 'Family'}, {'id': 16, '...",http://www.qqthemovie.com/,23738.0,en,Quantum Quest: A Cassini Space Odyssey,...,0.0,45.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,Quantum Quest: A Cassini Space Odyssey,0.0,8.4,7.0,


In [53]:
JSON_FILE_2011 = f"C:/Users/fid24/OneDrive/Documents/Github/Project-3-/data/final_tmdb_data_2011.csv.gz"
JSON_FILE_2011

'C:/Users/fid24/OneDrive/Documents/Github/Project-3-/data/final_tmdb_data_2011.csv.gz'

In [54]:
df_2011 = pd.read_csv(JSON_FILE_2011)
df_2011.head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0176694,0.0,/43VrmHOChLcAOG9S5tIFdsxr3u8.jpg,,0.0,"[{'id': 16, 'name': 'Animation'}, {'id': 18, '...",,116369.0,en,Az ember tragédiája,...,0.0,166.0,"[{'english_name': 'Hungarian', 'iso_639_1': 'h...",Released,,The Tragedy of Man,0.0,7.5,13.0,
2,tt0210470,0.0,,,0.0,"[{'id': 99, 'name': 'Documentary'}]",,239465.0,en,50 Feet of String,...,0.0,53.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,50 Feet of String,0.0,6.0,3.0,
3,tt0247643,0.0,,,0.0,[],,287000.0,en,Los Pájaros se van con la Muerte,...,0.0,90.0,[],Released,,Los Pájaros se van con la Muerte,0.0,0.0,0.0,
4,tt0323808,0.0,,,7750000.0,"[{'id': 9648, 'name': 'Mystery'}, {'id': 27, '...",http://www.thewickertreemovie.com/,79544.0,en,The Wicker Tree,...,0.0,96.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Accept our sacrifice,The Wicker Tree,0.0,3.9,41.0,R


In [55]:
JSON_FILE_2012 = f"C:/Users/fid24/OneDrive/Documents/Github/Project-3-/data/final_tmdb_data_2012.csv.gz"
JSON_FILE_2012

'C:/Users/fid24/OneDrive/Documents/Github/Project-3-/data/final_tmdb_data_2012.csv.gz'

In [56]:
df_2012 = pd.read_csv(JSON_FILE_2012)
df_2012.head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0249516,0.0,/t7zb6CnRQwhzQSq0apR4ESFYiWN.jpg,,65000000.0,"[{'id': 28, 'name': 'Action'}, {'id': 16, 'nam...",http://www.thresholdanimationstudios.com/video...,116977.0,en,Foodfight!,...,73706.0,87.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,Foodfight!,0.0,1.873,106.0,PG-13
2,tt0285252,0.0,/pKOHinAVIkyqLliMaTHCSL1T4zU.jpg,,4000000.0,"[{'id': 35, 'name': 'Comedy'}]",,112074.0,pl,Life's a Beach,...,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,Life's a Beach,0.0,2.5,5.0,R
3,tt0293069,0.0,/cj2isIuskSePAwpb2D6yUBZrhkJ.jpg,,0.0,"[{'id': 53, 'name': 'Thriller'}]",,136884.0,en,Dark Blood,...,0.0,86.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,Dark Blood,0.0,6.1,17.0,
4,tt0337692,0.0,/5dUOTVeNPU2CmEfFniQ8TE6HChG.jpg,,25000000.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 18, '...",http://www.ontheroad-themovie.com/?lang=en,83770.0,en,On the Road,...,8784318.0,137.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The best teacher is experience.,On the Road,0.0,5.61,729.0,R


In [57]:
JSON_FILE_2013 = f"C:/Users/fid24/OneDrive/Documents/Github/Project-3-/data/final_tmdb_data_2013.csv.gz"
JSON_FILE_2013

'C:/Users/fid24/OneDrive/Documents/Github/Project-3-/data/final_tmdb_data_2013.csv.gz'

In [58]:
df_2013 = pd.read_csv(JSON_FILE_2013)
df_2013.head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0255820,0.0,,,2000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,323498.0,en,Return to Babylon,...,0.0,75.0,[],Released,,Return to Babylon,0.0,6.0,22.0,
2,tt0359950,0.0,/v52aVrsuWPxbRXC1wb5plWDJ9UW.jpg,,90000000.0,"[{'id': 12, 'name': 'Adventure'}, {'id': 35, '...",https://www.20thcenturystudios.com/movies/the-...,116745.0,en,The Secret Life of Walter Mitty,...,188133322.0,114.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,"Stop Dreaming, Start Living",The Secret Life of Walter Mitty,0.0,7.153,6645.0,PG
3,tt0409379,0.0,/56NwJuplsUl9WIZfF0h1QH0wsJ1.jpg,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 53, 'name...",http://insecretmovie.com/,172226.0,en,In Secret,...,0.0,101.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,In Secret,0.0,6.232,170.0,R
4,tt0452183,0.0,,,0.0,"[{'id': 35, 'name': 'Comedy'}]",,439174.0,en,Blunt Movie,...,0.0,83.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,From the creators of ABSOLUTELY NOTHING before...,Blunt Movie,0.0,3.7,3.0,R


In [59]:
JSON_FILE_2014 = f"C:/Users/fid24/OneDrive/Documents/Github/Project-3-/data/final_tmdb_data_2014.csv.gz"
JSON_FILE_2014

'C:/Users/fid24/OneDrive/Documents/Github/Project-3-/data/final_tmdb_data_2014.csv.gz'

In [60]:
df_2014 = pd.read_csv(JSON_FILE_2014)
df_2014.head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0329539,0.0,,,0.0,"[{'id': 35, 'name': 'Comedy'}]",,274341.0,en,Rice Girl,...,0.0,95.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,"«For Windy Yee, a big break in Hollywood can m...",Rice Girl,0.0,1.0,2.0,
2,tt0365907,0.0,/isrro0soStk2tSWMsI50lPPhUsU.jpg,,28000000.0,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",,169917.0,en,A Walk Among the Tombstones,...,58834384.0,114.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Some people are afraid of all the wrong things,A Walk Among the Tombstones,0.0,6.31,2434.0,R
3,tt0403935,0.0,/romORaZvZqMRjTPgw0HrAZpVn9V.jpg,,0.0,"[{'id': 28, 'name': 'Action'}, {'id': 53, 'nam...",,308165.0,hi,एक्शन जैकसन,...,0.0,144.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,"Naa Commitment, Naa Appointment, Only Punishment!",Action Jackson,0.0,4.333,24.0,
4,tt0424859,0.0,,,0.0,[],,285294.0,en,The Ninth Cloud,...,0.0,93.0,[],Released,Even Reality can't stop Zena.,The Ninth Cloud,0.0,4.0,1.0,


In [103]:
df = pd.concat([df_2002, df_2003, df_2004, df_2005, df_2006, df_2008, df_2009, df_2010, df_2011, df_2012, df_2013, df_2014], ignore_index=True)
df

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0096056,0.0,/95U3MUDXu4xSCmVLtWgargRipDi.jpg,,0.0,"[{'id': 18, 'name': 'Drama'}]",,109809.0,en,Crime and Punishment,...,0.0,126.0,"[{'english_name': 'Polish', 'iso_639_1': 'pl',...",Released,,Crime and Punishment,0.0,5.500,11.0,
2,tt0118926,0.0,/qR3Dk3ctnrrxkAI6I472RhamIbu.jpg,,0.0,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",,20689.0,en,The Dancer Upstairs,...,5227348.0,132.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,"An honest man caught in a world of intrigue, p...",The Dancer Upstairs,0.0,6.294,51.0,R
3,tt0119980,0.0,,,0.0,"[{'id': 80, 'name': 'Crime'}, {'id': 18, 'name...",,563364.0,en,Random Shooting in LA,...,0.0,91.0,[],Released,,Random Shooting in LA,0.0,0.000,0.0,
4,tt0120679,0.0,/s04Ds4xbJU7DzeGVyamccH4LoxF.jpg,,12000000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",https://www.miramax.com/movie/frida,1360.0,en,Frida,...,56298474.0,123.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Prepare to be seduced.,Frida,0.0,7.455,1764.0,R
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44944,tt9558490,0.0,,"{'id': 337019, 'name': '99 Cahaya Di Langit Er...",0.0,"[{'id': 18, 'name': 'Drama'}]",,337023.0,id,99 Cahaya di Langit Eropa: Part 2,...,0.0,99.0,"[{'english_name': 'German', 'iso_639_1': 'de',...",Released,,99 Cahaya di Langit Eropa: Part 2,0.0,7.000,1.0,
44945,tt9672244,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}]",,665423.0,es,Le fils de Marie,...,0.0,81.0,"[{'english_name': 'Spanish', 'iso_639_1': 'es'...",Released,,Le fils de Marie,0.0,0.000,0.0,
44946,tt9718528,0.0,,,20000.0,"[{'id': 18, 'name': 'Drama'}]",,344109.0,pt,Dois Casamentos,...,0.0,75.0,"[{'english_name': 'Portuguese', 'iso_639_1': '...",Released,,Dois Casamentos,0.0,5.000,3.0,
44947,tt9853118,0.0,,,0.0,"[{'id': 80, 'name': 'Crime'}]",https://sakhamovie.ru/#!/post/59,594004.0,ru,Күрүөйэх,...,0.0,78.0,"[{'english_name': 'Russian', 'iso_639_1': 'ru'...",Released,,Күрүөйэх,0.0,0.000,0.0,


In [94]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44949 entries, 0 to 44948
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                44949 non-null  object 
 1   adult                  44937 non-null  float64
 2   backdrop_path          20939 non-null  object 
 3   belongs_to_collection  2631 non-null   object 
 4   budget                 44937 non-null  float64
 5   genres                 44937 non-null  object 
 6   homepage               8185 non-null   object 
 7   id                     44937 non-null  float64
 8   original_language      44937 non-null  object 
 9   original_title         44937 non-null  object 
 10  overview               42097 non-null  object 
 11  popularity             44937 non-null  float64
 12  poster_path            37687 non-null  object 
 13  production_companies   44937 non-null  object 
 14  production_countries   44937 non-null  object 
 15  re

In [None]:
df.to_csv("Data/df_2002-2014_no_2007(2).csv.gz",compression='gzip',index=False)

In [98]:
df.drop(columns='imdb_id', inplace=True)
df.drop(columns='adult', inplace=True)
df.drop(columns='backdrop_path', inplace=True)
df.drop(columns='belongs_to_collection', inplace=True)
df.drop(columns='genres', inplace=True)
df.drop(columns='homepage', inplace=True)
df.drop(columns='id', inplace=True)
df.drop(columns='original_language', inplace=True)
df.drop(columns='overview', inplace=True)
df.drop(columns='popularity', inplace=True)
df.drop(columns='poster_path', inplace=True)
df.drop(columns='production_companies', inplace=True)
df.drop(columns='production_countries', inplace=True)
df.drop(columns='release_date', inplace=True)
df.drop(columns='runtime', inplace=True)
df.drop(columns='spoken_languages', inplace=True)
df.drop(columns='status', inplace=True)
df.drop(columns='tagline', inplace=True)
df.drop(columns='video', inplace=True)
df.drop(columns='vote_average', inplace=True)
df.drop(columns='vote_count', inplace=True)
df.drop(columns='original_title', inplace=True)
df

Unnamed: 0,budget,revenue,title,certification
0,,,,
1,0.0,0.0,Crime and Punishment,
2,0.0,5227348.0,The Dancer Upstairs,R
3,0.0,0.0,Random Shooting in LA,
4,12000000.0,56298474.0,Frida,R
...,...,...,...,...
44944,0.0,0.0,99 Cahaya di Langit Eropa: Part 2,
44945,0.0,0.0,Le fils de Marie,
44946,20000.0,0.0,Dois Casamentos,
44947,0.0,0.0,Күрүөйэх,


In [99]:
df.isna().sum()

budget               1
revenue              1
title                1
certification    36768
dtype: int64

In [100]:
df.dropna(subset=['budget'], inplace=True)
df.isna().sum()

budget               0
revenue              0
title                0
certification    36767
dtype: int64

In [95]:
df.duplicated().sum()

11

In [97]:
df.drop_duplicates(inplace=True)
df.duplicated().sum()

0

In [101]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44937 entries, 1 to 44948
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   budget         44937 non-null  float64
 1   revenue        44937 non-null  float64
 2   title          44937 non-null  object 
 3   certification  8170 non-null   object 
dtypes: float64(2), object(2)
memory usage: 1.7+ MB


In [102]:
df.to_csv("Data/df_2002-2014_no_2007.csv.gz",compression='gzip',index=False)