In [57]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm_notebook
FOLDER = "Data/"
os.makedirs(FOLDER, exist_ok=True)
os.listdir(FOLDER)

['final_tmdb_data_2000.csv.gz',
 'final_tmdb_data_2001.csv.gz',
 'title_akas.csv.gz',
 'title_basics.csv.gz',
 'title_ratings.csv.gz',
 'tmdb_api_results_2000.json',
 'tmdb_api_results_2001.json']

In [14]:
basics_url="https://datasets.imdbws.com/title.basics.tsv.gz"

In [15]:
basics = pd.read_csv(basics_url,sep='\t', low_memory=False)

In [16]:
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [17]:
ratings_url="https://datasets.imdbws.com/title.ratings.tsv.gz"

In [18]:
ratings = pd.read_csv(ratings_url,sep='\t', low_memory=False)

In [19]:
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1905
1,tt0000002,5.8,256
2,tt0000003,6.5,1704
3,tt0000004,5.6,168
4,tt0000005,6.2,2519


In [20]:
akas_url="https://datasets.imdbws.com/title.akas.tsv.gz"

In [21]:
akas = pd.read_csv(akas_url,sep='\t', low_memory=False)

In [22]:
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [23]:
basics = basics.replace({'\\N':np.nan})

In [24]:
akas = akas.replace({'\\N':np.nan})

In [25]:
ratings = ratings.replace({'\\N':np.nan})

In [26]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9183649 entries, 0 to 9183648
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 630.6+ MB


In [27]:
basics['runtimeMinutes'].value_counts()

30                         131130
60                         102711
22                          92373
44                          69182
45                          58417
                            ...  
569                             1
670                             1
924                             1
Animation,Comedy,Family         1
2088                            1
Name: runtimeMinutes, Length: 873, dtype: int64

In [28]:
basics.duplicated().sum()

0

In [29]:
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle           11
originalTitle          11
isAdult                 1
startYear         1224251
endYear           9088176
runtimeMinutes    6719279
genres             426980
dtype: int64

In [30]:
basics.dropna(subset=['runtimeMinutes'], inplace=True)
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 1
startYear           37664
endYear           2417711
runtimeMinutes          0
genres              67332
dtype: int64

In [31]:
basics.dropna(subset=['genres'], inplace=True)
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 0
startYear           36322
endYear           2351944
runtimeMinutes          0
genres                  0
dtype: int64

In [32]:
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 0
startYear           30792
endYear           1981457
runtimeMinutes          0
genres                  0
dtype: int64

In [33]:
basics['titleType'].value_counts()

tvEpisode       953493
short           483966
movie           279497
video           139664
tvSeries         74068
tvMovie          56095
tvSpecial        13873
tvMiniSeries     11626
tvShort           7012
videoGame          284
Name: titleType, dtype: int64

In [34]:
basics = basics[basics["titleType"].str.contains("movie")==True]
basics['titleType'].value_counts()

movie    279497
Name: titleType, dtype: int64

In [35]:
basics['startYear'].value_counts()

2018    9574
2017    9387
2019    9301
2016    8992
2015    8540
        ... 
1906       1
1903       1
1908       1
2027       1
1894       1
Name: startYear, Length: 124, dtype: int64

In [36]:
basics.dropna(subset=['startYear'], inplace=True)
basics.isna().sum()

tconst                 0
titleType              0
primaryTitle           0
originalTitle          0
isAdult                0
startYear              0
endYear           275218
runtimeMinutes         0
genres                 0
dtype: int64

In [37]:
basics = basics[basics['startYear'] >= '2000']
basics['startYear'].value_counts()

2018    9574
2017    9387
2019    9301
2016    8992
2015    8540
2014    8143
2021    7991
2013    7759
2020    7479
2012    7271
2011    6736
2010    6342
2009    5948
2022    5669
2008    5181
2007    4604
2006    4367
2005    3880
2004    3504
2003    3212
2002    2971
2001    2843
2000    2715
2023     272
2024      28
2025       6
2026       2
2027       1
Name: startYear, dtype: int64

In [38]:
basics = basics[basics['startYear'] <= '2021']
basics['startYear'].value_counts()

2018    9574
2017    9387
2019    9301
2016    8992
2015    8540
2014    8143
2021    7991
2013    7759
2020    7479
2012    7271
2011    6736
2010    6342
2009    5948
2008    5181
2007    4604
2006    4367
2005    3880
2004    3504
2003    3212
2002    2971
2001    2843
2000    2715
Name: startYear, dtype: int64

In [39]:
keepers =basics['tconst'].isin(akas['titleId'])
keepers

34790      True
61090      True
67636      True
77930      True
86767      True
           ... 
9183321    True
9183330    True
9183369    True
9183414    True
9183498    True
Name: tconst, Length: 136740, dtype: bool

In [40]:
basics = basics[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34790,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61090,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
67636,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
77930,tt0079644,movie,November 1828,November 1828,0,2001,,140,"Drama,War"
86767,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
...,...,...,...,...,...,...,...,...,...
9183321,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
9183330,tt9916190,movie,Safeguard,Safeguard,0,2020,,95,"Action,Adventure,Thriller"
9183369,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0,2020,,84,Thriller
9183414,tt9916362,movie,Coven,Akelarre,0,2020,,92,"Drama,History"


In [41]:
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,,imdbDisplay,,0
1,tt0000001,2,Carmencita,DE,,,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,,imdbDisplay,,0
3,tt0000001,4,Καρμενσίτα,GR,,imdbDisplay,,0
4,tt0000001,5,Карменсита,RU,,imdbDisplay,,0


In [42]:
akas['region'].value_counts()

FR    3941917
JP    3941747
DE    3924980
IN    3870159
ES    3865032
       ...   
TV          1
NU          1
PW          1
NR          1
TC          1
Name: region, Length: 247, dtype: int64

In [43]:
akas = akas[akas["region"].str.contains("US")==True]
akas['region'].value_counts()

US    1344596
Name: region, dtype: int64

In [44]:
akas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1344596 entries, 5 to 33011811
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1344596 non-null  object
 1   ordering         1344596 non-null  int64 
 2   title            1344596 non-null  object
 3   region           1344596 non-null  object
 4   language         3689 non-null     object
 5   types            963675 non-null   object
 6   attributes       44802 non-null    object
 7   isOriginalTitle  1343221 non-null  object
dtypes: int64(1), object(7)
memory usage: 92.3+ MB


In [45]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 136130 entries, 34790 to 9183498
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          136130 non-null  object
 1   titleType       136130 non-null  object
 2   primaryTitle    136130 non-null  object
 3   originalTitle   136130 non-null  object
 4   isAdult         136130 non-null  object
 5   startYear       136130 non-null  object
 6   endYear         0 non-null       object
 7   runtimeMinutes  136130 non-null  object
 8   genres          136130 non-null  object
dtypes: object(9)
memory usage: 10.4+ MB


In [46]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1256807 entries, 0 to 1256806
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1256807 non-null  object 
 1   averageRating  1256807 non-null  float64
 2   numVotes       1256807 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 28.8+ MB


In [47]:
# example making new folder with os
import os
os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")

['final_tmdb_data_2000.csv.gz',
 'final_tmdb_data_2001.csv.gz',
 'title_akas.csv.gz',
 'title_basics.csv.gz',
 'title_ratings.csv.gz',
 'tmdb_api_results_2000.json',
 'tmdb_api_results_2001.json']

In [48]:
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

In [49]:
akas.to_csv("Data/title_akas.csv.gz",compression='gzip',index=False)

In [50]:
ratings.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)

In [51]:
import json
with open('C:/Users/fid24/.secret/tmdb_api.json', 'r') as f:
    login = json.load(f)
## Display the keys of the loaded dict
login.keys()

dict_keys(['api-key-tmdb'])

In [52]:
import tmdbsimple as tmdb
tmdb.API_KEY =  login['api-key-tmdb']

In [53]:
YEARS_TO_GET = [2000,2001]

In [54]:
def get_movie_with_rating(movie_id):
    # Get the movie object for the current id
    movie = tmdb.Movies(movie_id)
    # save the .info .releases dictionaries
    movie_info = movie.info()
    releases = movie.releases()
    # Loop through countries in releases
    for c in releases['countries']:
        # if the country abbreviation==US
        if c['iso_3166_1' ] =='US':
            ## save a "certification" key in the info dict with the certification
            movie_info['certification'] = c['certification']
    return movie_info

In [55]:
 def write_json(new_data, filename): 
    """Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/"""    
    
    with open(filename,'r+') as file:
        # First we load existing data into a dict.
        file_data = json.load(file)
        ## Choose extend or append
        if (type(new_data) == list) & (type(file_data) == list):
            file_data.extend(new_data)
        else:
             file_data.append(new_data)
        # Sets file's current position at offset.
        file.seek(0)
        # convert back to json.
        json.dump(file_data, file)

In [58]:
# Start of OUTER loop
for YEAR in tqdm_notebook(YEARS_TO_GET,desc='YEARS',position=0):
    # If it does not exist: create it
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'
    file_exists = os.path.isfile(JSON_FILE)
    if file_exists == False:    
        with open(JSON_FILE,'w') as f:
            json.dump([{'imdb_id':0}],f)
    # Load in the dataframe from project part 1 as basics:
    basics = pd.read_csv('Data/title_basics.csv.gz')
    #Saving new year as the current df
    df = basics.loc[ basics['startYear']==YEAR].copy()
    # saving movie ids to list
    movie_ids = df['tconst'].copy()#.to_list()
    # Load existing data from json into a dataframe called "previous_df"
    previous_df = pd.read_json(JSON_FILE)
    # filter out any ids that are already in the JSON_FILE
    movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]
    #Get index and movie id from list
    # INNER Loop
    for movie_id in tqdm_notebook(movie_ids_to_get,
                                  desc=f'Movies from {YEAR}',
                                  position=1,
                                  leave=True):
        # Attempt to retrieve then data for the movie id
        try:
            temp = get_movie_with_rating(movie_id)  #This uses your pre-made function
            # Append/extend results to existing file using a pre-made function
            write_json(temp,JSON_FILE)
            # Short 20 ms sleep to prevent overwhelming server
            time.sleep(0.02)
            
        # If it fails,  make a dict with just the id and None for certification.
        except Exception as e:
            continue

    final_year_df = pd.read_json(JSON_FILE)
    final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz", compression="gzip", index=False)

YEARS:   0%|          | 0/2 [00:00<?, ?it/s]

Movies from 2000:   0%|          | 0/527 [00:00<?, ?it/s]

Movies from 2001:   0%|          | 0/588 [00:00<?, ?it/s]

In [59]:
test_notebook = get_movie_with_rating("tt0332280")
test_notebook

{'adult': False,
 'backdrop_path': '/qom1SZSENdmHFNZBXbtJAU0WTlC.jpg',
 'belongs_to_collection': None,
 'budget': 29000000,
 'genres': [{'id': 10749, 'name': 'Romance'}, {'id': 18, 'name': 'Drama'}],
 'homepage': 'http://www.newline.com/properties/notebookthe.html',
 'id': 11036,
 'imdb_id': 'tt0332280',
 'original_language': 'en',
 'original_title': 'The Notebook',
 'overview': "An epic love story centered around an older man who reads aloud to a woman with Alzheimer's. From a faded notebook, the old man's words bring to life the story about a couple who is separated by World War II, and is then passionately reunited, seven years later, after they have taken different paths.",
 'popularity': 62.25,
 'poster_path': '/rNzQyW4f8B8cQeg7Dgj3n6eT5k9.jpg',
 'production_companies': [{'id': 12,
   'logo_path': '/iaYpEp3LQmb8AfAtmTvpqd4149c.png',
   'name': 'New Line Cinema',
   'origin_country': 'US'},
  {'id': 1565, 'logo_path': None, 'name': 'Avery Pix', 'origin_country': 'US'},
  {'id': 260

In [60]:
test_avengers = get_movie_with_rating("tt0848228")
test_avengers

{'adult': False,
 'backdrop_path': '/nNmJRkg8wWnRmzQDe2FwKbPIsJV.jpg',
 'belongs_to_collection': {'id': 86311,
  'name': 'The Avengers Collection',
  'poster_path': '/yFSIUVTCvgYrpalUktulvk3Gi5Y.jpg',
  'backdrop_path': '/zuW6fOiusv4X9nnW3paHGfXcSll.jpg'},
 'budget': 220000000,
 'genres': [{'id': 878, 'name': 'Science Fiction'},
  {'id': 28, 'name': 'Action'},
  {'id': 12, 'name': 'Adventure'}],
 'homepage': 'https://www.marvel.com/movies/the-avengers',
 'id': 24428,
 'imdb_id': 'tt0848228',
 'original_language': 'en',
 'original_title': 'The Avengers',
 'overview': 'When an unexpected enemy emerges and threatens global safety and security, Nick Fury, director of the international peacekeeping agency known as S.H.I.E.L.D., finds himself in need of a team to pull the world back from the brink of disaster. Spanning the globe, a daring recruitment effort begins!',
 'popularity': 238.0,
 'poster_path': '/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg',
 'production_companies': [{'id': 420,
   'logo_path':