In [188]:



def get_movie_with_rating(movie_id):
    """Fetch a movie's TMDB info and attach its US certification.

    Adapted from source = https://github.com/celiao/tmdbsimple

    Args:
        movie_id: Identifier passed straight to ``tmdb.Movies`` (the notebook
            calls this with IMDb ids such as "tt0848228").

    Returns:
        dict: The dictionary returned by ``movie.info()``. A 'certification'
        key is added when the release list contains an entry whose
        'iso_3166_1' code is 'US'; if no such entry exists the key is not
        added.
    """
    # Get the movie object for the current id
    movie = tmdb.Movies(movie_id)

    # Save the .info and .releases dictionaries
    info = movie.info()
    releases = movie.releases()

    # Loop through countries in releases and keep only the US certification.
    # Without the country check, every entry would overwrite the previous one
    # and the value of whichever country is listed last would be returned.
    for c in releases['countries']:
        if c['iso_3166_1'] == 'US':
            info['certification'] = c['certification']

    return info
    


In [189]:
def write_json(new_data, filename):
    """Append one record, or a list of records, to a JSON file holding a list.

    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/

    Args:
        new_data: Either a list of records (each element is added to the
            stored list) or a single record (added as one element).
        filename (str): Path of an existing JSON file. Its top-level value
            must be a list, because the loaded data is extended/appended to.

    Returns:
        None. The file is rewritten in place with the combined data.
    """
    with open(filename, 'r+') as file:
        # First we load existing data.
        file_data = json.load(file)
        # Extend when both sides are lists, otherwise append a single record.
        if isinstance(new_data, list) and isinstance(file_data, list):
            file_data.extend(new_data)
        else:
            file_data.append(new_data)
        # Rewind so the combined data overwrites the old content.
        file.seek(0)
        # Convert back to json.
        json.dump(file_data, file)
        # Drop any bytes of the previous content that extend past the new end.
        file.truncate()


In [190]:
def read_and_fix_json(JSON_FILE):
    """Read a JSON file of records, repairing a wrong final character if needed.

    The file is first read with ``pd.read_json``. If that fails, the file's
    last character is replaced with the closing bracket that matches its
    first character ('[' -> ']', '{' -> '}'), the file is rewritten, and it
    is read again.

    Args:
        JSON_FILE (str): filepath of JSON file

    Returns:
        DataFrame: the data from the (possibly corrected) json file

    Raises:
        Exception: if reading failed although the final character already
            matches the first one, i.e. the problem lies elsewhere.
    """
    try:
        previous_df = pd.read_json(JSON_FILE)

    # Catch ordinary errors only; a bare ``except:`` would also trap
    # KeyboardInterrupt and SystemExit and then start rewriting the file.
    except Exception:

        ## manually open the json file
        with open(JSON_FILE, 'r+') as f:
            ## Read in the file as a STRING
            bad_json = f.read()

            ## The first character decides which closing bracket is expected
            first_char = bad_json[0]
            final_brackets = {'[': ']',
                              "{": "}"}
            final_char = final_brackets[first_char]

            ## If the last character is not the expected bracket, swap it in
            if bad_json[-1] != final_char:
                good_json = bad_json[:-1]
                good_json += final_char
            else:
                raise Exception('ERROR is not due to mismatched final bracket.')

            ## Rewind to start of file and write new good_json to disk.
            ## good_json has the same length as bad_json, so nothing is left over.
            f.seek(0)
            f.write(good_json)

        ## Load the json file again now that it is fixed
        previous_df = pd.read_json(JSON_FILE)

    return previous_df
	
	

In [191]:
# Install tmdbsimple (only need to run once)
!pip install tmdbsimple





In [192]:
import pandas as pd
import numpy as np

# example making new folder with os
import os, time,json
import tmdbsimple as tmdb 
# Folder (relative to the working directory) that holds every data file of this notebook
FOLDER = "Data/"
# Create the folder if it is missing; an existing folder is left untouched
os.makedirs(FOLDER, exist_ok=True)
# Show what the folder currently contains
os.listdir(FOLDER)



['data (3) (1).csv',
 '.DS_Store',
 'title.ratings.tsv.gz.csv.gz',
 'title.basics.tsv.gz',
 'tmdb_api_results_2001.json',
 'title_basics.csv.gz',
 'data (5) (1).csv',
 'data (4) (1).csv',
 'title.ratings.tsv.gz',
 'title.basics.csv.gz',
 'title-akas-us-only.csv',
 'final_tmdb_data_2001.csv.gz',
 'data (6) (1).csv',
 'title_akas.csv.gz',
 'title.basics.tsv.gz.csv.gz',
 'title_ratings.csv.gz']

In [193]:
# Locations of the three IMDb dataset files (gzip-compressed, tab-separated)
url_akas = "https://datasets.imdbws.com/title.akas.tsv.gz"
url_basics = "https://datasets.imdbws.com/title.basics.tsv.gz"
url_ratings = "https://datasets.imdbws.com/title.ratings.tsv.gz"

# The same parser options apply to all three downloads
read_opts = dict(compression='gzip', sep='\t', low_memory=False)
akas_df = pd.read_csv(url_akas, **read_opts)
basics_df = pd.read_csv(url_basics, **read_opts)
ratings_df = pd.read_csv(url_ratings, **read_opts)

# Preview the first rows of every dataframe to understand their structure
akas_df.head(), basics_df.head(), ratings_df.head()


(     titleId  ordering                      title region language  \
 0  tt0000001         1                 Карменсіта     UA       \N   
 1  tt0000001         2                 Carmencita     DE       \N   
 2  tt0000001         3  Carmencita - spanyol tánc     HU       \N   
 3  tt0000001         4                 Καρμενσίτα     GR       \N   
 4  tt0000001         5                 Карменсита     RU       \N   
 
          types     attributes isOriginalTitle  
 0  imdbDisplay             \N               0  
 1           \N  literal title               0  
 2  imdbDisplay             \N               0  
 3  imdbDisplay             \N               0  
 4  imdbDisplay             \N               0  ,
       tconst titleType            primaryTitle           originalTitle  \
 0  tt0000001     short              Carmencita              Carmencita   
 1  tt0000002     short  Le clown et ses chiens  Le clown et ses chiens   
 2  tt0000003     short          Pauvre Pierrot          P

In [194]:
# Keep only the alternate-title rows whose region is the United States
akas_df = akas_df.loc[akas_df['region'] == 'US']

In [195]:
# Replace the literal '\N' placeholder with NaN.
# Reassign instead of using inplace=True: akas_df is a filtered selection of a
# larger frame, and in-place edits on such a selection can raise pandas'
# SettingWithCopyWarning.
akas_df = akas_df.replace({'\\N': np.nan})

# Display the first few rows of the processed dataframe
akas_df.head()



Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


In [196]:
# Boolean mask over basics_df: True where the title's tconst appears among the
# titleId values of akas_df (which was filtered to region 'US' earlier)
keepers1 =basics_df['tconst'].isin(akas_df['titleId'])
keepers1



0            True
1            True
2           False
3           False
4            True
            ...  
10135601    False
10135602    False
10135603    False
10135604    False
10135605    False
Name: tconst, Length: 10135606, dtype: bool

In [197]:
# Replace the literal '\N' placeholder with NaN (reassigned rather than
# mutated in place so re-running the cell is harmless)
basics_df = basics_df.replace({'\\N': np.nan})

# Summarise the columns and dtypes of the processed dataframe
basics_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10135606 entries, 0 to 10135605
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 696.0+ MB


In [198]:
# Apply the keepers1 mask so only the selected titles remain, then display the result
basics_df = basics_df.loc[keepers1]
basics_df



Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,,1,"Short,Sport"
...,...,...,...,...,...,...,...,...,...
10135467,tt9916560,tvMovie,March of Dimes Presents: Once Upon a Dime,March of Dimes Presents: Once Upon a Dime,0,1963,,58,Family
10135496,tt9916620,movie,The Copeland Case,The Copeland Case,0,,,,Drama
10135534,tt9916702,short,Loving London: The Playground,Loving London: The Playground,0,,,,"Drama,Short"
10135557,tt9916756,short,Pretty Pretty Black Girl,Pretty Pretty Black Girl,0,2019,,,Short


In [199]:
# Rows must have a runtime, a genre list and a start year ...
has_required_fields = (
    basics_df['runtimeMinutes'].notna()
    & basics_df['genres'].notna()
    & basics_df['startYear'].notna()
)
# ... and must be full-length movies
is_movie = basics_df['titleType'] == 'movie'

# .copy() gives an independent frame, so the column assignment below does not
# act on a slice of the original (avoids SettingWithCopyWarning)
basics_df = basics_df[has_required_fields & is_movie].copy()

# startYear arrives as text; convert it so it can be compared numerically
basics_df['startYear'] = basics_df['startYear'].astype(float)

basics_df.dtypes

tconst             object
titleType          object
primaryTitle       object
originalTitle      object
isAdult            object
startYear         float64
endYear            object
runtimeMinutes     object
genres             object
dtype: object

In [200]:
basics_df

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894.0,,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897.0,,100,"Documentary,News,Sport"
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906.0,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907.0,,90,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908.0,,120,"Adventure,Fantasy"
...,...,...,...,...,...,...,...,...,...
10134961,tt9915436,movie,Vida em Movimento,Vida em Movimento,0,2019.0,,70,Documentary
10135139,tt9915872,movie,The Last White Witch,Boku no kanojo wa mahoutsukai,0,2019.0,,97,"Comedy,Drama,Fantasy"
10135279,tt9916170,movie,The Rehearsal,O Ensaio,0,2019.0,,51,Drama
10135288,tt9916190,movie,Safeguard,Safeguard,0,2020.0,,95,"Action,Adventure,Thriller"


In [201]:
# Keep only the movies whose startYear lies between 2000 and 2021, both ends included
in_year_range = basics_df['startYear'].between(2000, 2021)
basics_df = basics_df[in_year_range]
basics_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 114626 entries, 34800 to 10135372
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          114626 non-null  object 
 1   titleType       114626 non-null  object 
 2   primaryTitle    114626 non-null  object 
 3   originalTitle   114626 non-null  object 
 4   isAdult         114626 non-null  object 
 5   startYear       114626 non-null  float64
 6   endYear         0 non-null       object 
 7   runtimeMinutes  114626 non-null  object 
 8   genres          114626 non-null  object 
dtypes: float64(1), object(8)
memory usage: 8.7+ MB


In [202]:
# Flag every title whose genres text mentions "documentary" (any letter case) ...
is_documentary = basics_df['genres'].str.contains('documentary', case=False)
# ... and keep only the titles that are not flagged
basics_df = basics_df.loc[~is_documentary]
basics_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 82044 entries, 34800 to 10135372
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          82044 non-null  object 
 1   titleType       82044 non-null  object 
 2   primaryTitle    82044 non-null  object 
 3   originalTitle   82044 non-null  object 
 4   isAdult         82044 non-null  object 
 5   startYear       82044 non-null  float64
 6   endYear         0 non-null      object 
 7   runtimeMinutes  82044 non-null  object 
 8   genres          82044 non-null  object 
dtypes: float64(1), object(8)
memory usage: 6.3+ MB


In [203]:
# Replace the literal '\N' placeholder with NaN (reassigned rather than
# mutated in place, matching how the other tables are cleaned)
ratings_df = ratings_df.replace({'\\N': np.nan})

In [204]:
# Boolean mask over ratings_df: True where the rating's tconst is present in
# the filtered basics_df, so ratings are kept only for the retained movies
keepers2 =ratings_df['tconst'].isin(basics_df['tconst'])
keepers2


0          False
1          False
2          False
3          False
4          False
           ...  
1345295    False
1345296    False
1345297    False
1345298    False
1345299    False
Name: tconst, Length: 1345300, dtype: bool

In [205]:
# Apply the keepers2 mask so only ratings of retained movies remain, then display them
ratings_df = ratings_df.loc[keepers2]
ratings_df

Unnamed: 0,tconst,averageRating,numVotes
17896,tt0035423,6.4,87470
40653,tt0062336,6.4,180
46366,tt0068865,5.4,74
46523,tt0069049,6.7,7812
58014,tt0082328,5.9,1747
...,...,...,...
1345219,tt9914942,6.6,181
1345246,tt9915872,6.4,9
1345259,tt9916170,7.0,7
1345260,tt9916190,3.7,243


In [147]:
# Save the filtered akas table as a gzip-compressed CSV
akas_path = "Data/title_akas.csv.gz"
akas_df.to_csv(akas_path, compression='gzip', index=False)

# Read the saved file back and check its columns/dtypes
akas_df = pd.read_csv(akas_path, low_memory=False)
akas_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1464022 entries, 0 to 1464021
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   titleId          1464022 non-null  object 
 1   ordering         1464022 non-null  int64  
 2   title            1464022 non-null  object 
 3   region           1464022 non-null  object 
 4   language         4126 non-null     object 
 5   types            983456 non-null   object 
 6   attributes       47438 non-null    object 
 7   isOriginalTitle  1462680 non-null  float64
dtypes: float64(1), int64(1), object(6)
memory usage: 89.4+ MB


In [206]:
# Save the filtered basics table as a gzip-compressed CSV
basics_path = "Data/title_basics.csv.gz"
basics_df.to_csv(basics_path, compression='gzip', index=False)

# Read the saved file back and check its columns/dtypes
basics_df = pd.read_csv(basics_path, low_memory=False)
basics_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82044 entries, 0 to 82043
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          82044 non-null  object 
 1   titleType       82044 non-null  object 
 2   primaryTitle    82044 non-null  object 
 3   originalTitle   82044 non-null  object 
 4   isAdult         82044 non-null  int64  
 5   startYear       82044 non-null  float64
 6   endYear         0 non-null      float64
 7   runtimeMinutes  82044 non-null  int64  
 8   genres          82044 non-null  object 
dtypes: float64(2), int64(2), object(5)
memory usage: 5.6+ MB


In [207]:
# Save the filtered ratings table as a gzip-compressed CSV
ratings_path = "Data/title_ratings.csv.gz"
ratings_df.to_csv(ratings_path, compression='gzip', index=False)

# Read the saved file back and check its columns/dtypes
ratings_df = pd.read_csv(ratings_path, low_memory=False)
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68192 entries, 0 to 68191
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tconst         68192 non-null  object 
 1   averageRating  68192 non-null  float64
 2   numVotes       68192 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 1.6+ MB


## Using API

In [208]:
import json
import os

# Resolve the credentials file relative to the current user's home directory
# so the notebook is not tied to one machine's absolute path
with open(os.path.expanduser('~/.secret/tmdb_api.json'), 'r') as f:
    login = json.load(f)
## Display the keys of the loaded dict (never the values, which are secret)
login.keys()



dict_keys(['api-key'])

In [209]:
import tmdbsimple as tmdb
# Hand tmdbsimple the key stored under 'api-key' in the login dict
tmdb.API_KEY =  login['api-key']



In [210]:
# Fetch the example movie explicitly so that `movie` and `info` exist on a
# fresh kernel. 587807 is the TMDB id shown in this cell's recorded output
# (imdb_id 'tt1361336').
movie = tmdb.Movies(587807)
info = movie.info()
info

{'adult': False,
 'backdrop_path': '/9ns9463dwOeo1CK1JU2wirL5Yi1.jpg',
 'belongs_to_collection': None,
 'budget': 50000000,
 'genres': [{'id': 35, 'name': 'Comedy'},
  {'id': 10751, 'name': 'Family'},
  {'id': 16, 'name': 'Animation'}],
 'homepage': 'https://www.tomandjerrymovie.com',
 'id': 587807,
 'imdb_id': 'tt1361336',
 'original_language': 'en',
 'original_title': 'Tom & Jerry',
 'overview': 'Tom the cat and Jerry the mouse get kicked out of their home and relocate to a fancy New York hotel, where a scrappy employee named Kayla will lose her job if she can’t evict Jerry before a high-class wedding at the hotel. Her solution? Hiring Tom to get rid of the pesky mouse.',
 'popularity': 43.462,
 'poster_path': '/8XZI9QZ7Pm3fVkigWJPbrXCMzjq.jpg',
 'production_companies': [{'id': 25120,
   'logo_path': '/lMj6nMJBOzfLEd2fu8uF530AJcv.png',
   'name': 'Warner Bros. Pictures Animation',
   'origin_country': 'US'},
  {'id': 8922,
   'logo_path': '/yZWehAyjfKi4KvKeg1bkJ1bm5H8.png',
   'name'

In [211]:
info['budget']


50000000

In [212]:
info['revenue']


136536687

In [213]:
info['imdb_id']



'tt1361336'

In [214]:
# Check get_movie_with_rating with the IMDb id of "The Avengers"
test = get_movie_with_rating("tt0848228")  # returns the info dict for that id
test





{'adult': False,
 'backdrop_path': '/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg',
 'belongs_to_collection': {'id': 86311,
  'name': 'The Avengers Collection',
  'poster_path': '/yFSIUVTCvgYrpalUktulvk3Gi5Y.jpg',
  'backdrop_path': '/zuW6fOiusv4X9nnW3paHGfXcSll.jpg'},
 'budget': 220000000,
 'genres': [{'id': 878, 'name': 'Science Fiction'},
  {'id': 28, 'name': 'Action'},
  {'id': 12, 'name': 'Adventure'}],
 'homepage': 'https://www.marvel.com/movies/the-avengers',
 'id': 24428,
 'imdb_id': 'tt0848228',
 'original_language': 'en',
 'original_title': 'The Avengers',
 'overview': 'When an unexpected enemy emerges and threatens global safety and security, Nick Fury, director of the international peacekeeping agency known as S.H.I.E.L.D., finds himself in need of a team to pull the world back from the brink of disaster. Spanning the globe, a daring recruitment effort begins!',
 'popularity': 102.75,
 'poster_path': '/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg',
 'production_companies': [{'id': 420,
   'logo_path'

In [215]:
# Check get_movie_with_rating with the IMDb id of "The Notebook"
test = get_movie_with_rating("tt0332280")  # returns the info dict for that id
test

{'adult': False,
 'backdrop_path': '/qom1SZSENdmHFNZBXbtJAU0WTlC.jpg',
 'belongs_to_collection': None,
 'budget': 29000000,
 'genres': [{'id': 10749, 'name': 'Romance'}, {'id': 18, 'name': 'Drama'}],
 'homepage': 'http://www.newline.com/properties/notebookthe.html',
 'id': 11036,
 'imdb_id': 'tt0332280',
 'original_language': 'en',
 'original_title': 'The Notebook',
 'overview': "An epic love story centered around an older man who reads aloud to a woman with Alzheimer's. From a faded notebook, the old man's words bring to life the story about a couple who is separated by World War II, and is then passionately reunited, seven years later, after they have taken different paths.",
 'popularity': 50.251,
 'poster_path': '/rNzQyW4f8B8cQeg7Dgj3n6eT5k9.jpg',
 'production_companies': [{'id': 12,
   'logo_path': '/mevhneWSqbjU22D1MXNd4H9x0r0.png',
   'name': 'New Line Cinema',
   'origin_country': 'US'},
  {'id': 1565, 'logo_path': None, 'name': 'Avery Pix', 'origin_country': 'US'},
  {'id': 26

In [216]:
# Start years whose movies should be requested from the API
YEARS_TO_GET = [2000,2001]

YEARS_TO_GET

[2000, 2001]

In [217]:
# Collects [movie_id, exception] pairs for every request that fails in the
# retrieval loop, so they can be inspected after the loops
errors = [ ]



In [219]:
from tqdm.notebook import tqdm_notebook
# Outer progress bar over YEARS_TO_GET; the body only prints each year.
# NOTE(review): once this loop ends, YEAR keeps the last value of
# YEARS_TO_GET, so any later cell that reads YEAR works on that single year.
for YEAR in tqdm_notebook(YEARS_TO_GET, desc='YEARS', position=0):
    # Some code to execute for each YEAR
    print(YEAR)


YEARS:   0%|          | 0/2 [00:00<?, ?it/s]

2000
2001


In [220]:
# Path of the JSON file that stores the API results for the current YEAR
JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'



In [221]:
# True when JSON_FILE already exists as a regular file on disk
file_exists = os.path.isfile(JSON_FILE)



In [222]:
# A missing results file is seeded with a one-record placeholder list
# ([{'imdb_id': 0}]) so that later reads and appends always find valid JSON
if not file_exists:
    with open(JSON_FILE, 'w') as f:
        json.dump([{'imdb_id': 0}], f)



In [223]:
# Rows of basics_df whose startYear equals the YEAR being processed
current_df = basics_df.loc[basics_df['startYear'] == YEAR].copy()
# Movie ids of that year, taken from current_df so that only this year's
# titles are requested
movie_ids = current_df['tconst'].copy()

current_df.head(5)




Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
12,tt0114447,movie,The Silent Force,The Silent Force,0,2001.0,,90,Action
16,tt0116916,movie,The Lord Protector,The Lord Protector,0,2001.0,,101,"Action,Adventure,Fantasy"
21,tt0118589,movie,Glitter,Glitter,0,2001.0,,104,"Drama,Music,Romance"
22,tt0118652,movie,The Attic Expeditions,The Attic Expeditions,0,2001.0,,100,"Comedy,Horror,Mystery"


In [224]:
# Load the records already stored in JSON_FILE into a dataframe called "previous_df"
previous_df = pd.read_json(JSON_FILE)



In [225]:
# Keep only the ids that are not yet present in the imdb_id column of
# previous_df, i.e. the movies that still have to be requested
movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]



## Start of the Inner Loop

In [226]:
    # INNER Loop: request every movie id that is not yet stored for this YEAR.
    # tqdm_notebook is the progress-bar class this notebook imports.
    for movie_id in tqdm_notebook(movie_ids_to_get,
                                  desc=f'Movies from {YEAR}',
                                  position=1,
                                  leave=True):
        try:
            # Retrieve the data for the movie id
            temp = get_movie_with_rating(movie_id)
            # Append/extend results to existing file using a pre-made function
            write_json(temp, JSON_FILE)
            # Short 20 ms sleep to prevent overwhelming server
            time.sleep(0.02)

        except Exception as e:
            # Record the failing id with its exception and move on to the next movie
            errors.append([movie_id, e])



Movies from 2001:   0%|          | 0/1581 [00:00<?, ?it/s]

## After the Loop

In [227]:
# Reload everything stored for this YEAR and export it as a gzip-compressed CSV
final_year_df = pd.read_json(JSON_FILE)
final_csv_path = f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz"
final_year_df.to_csv(final_csv_path, compression="gzip", index=False)



In [228]:
# Number of [movie_id, exception] entries collected in errors
print(f"- Total errors: {len(errors)}")



- Total errors: 231


In [229]:
# Use read_and_fix_json instead of calling pd.read_json directly: if the plain
# read fails it tries to repair a mismatched final bracket before loading
previous_df = read_and_fix_json(JSON_FILE)



### Observation: All years (2000 and 2001) are appended to one file. To retrieve both years, the `for` loop needs to be altered so that the steps following it run once for each year.