In [97]:
import json, time, os
import tmdbsimple as tmdb
from tqdm.notebook import tqdm_notebook
# Relative folder where the API results and the exported CSV files are stored.
FOLDER = "Data/"
# Create the folder if it is missing; exist_ok=True makes re-running this cell harmless.
os.makedirs(FOLDER, exist_ok=True)
# Last expression of the cell, so the current folder contents are displayed.
os.listdir(FOLDER)

['results_in_austin.json',
 ' final_tmdb_data_2000.csv.gz',
 'tmdp_api_results 2001.json',
 'results_in_austin.csv.gz',
 'tmdp_api_results 2000.json',
 ' final_tmdb_data_2001.csv.gz',
 '.ipynb_checkpoints']

# Credentials and Accessing the API

In [98]:
import tmdbsimple as tmdb
# Load the TMDB credentials from a JSON file kept outside the notebook.
# NOTE(review): this is an absolute, machine-specific path; the cell will not
# run on another machine without editing it.
with open("/Users/echo/Documents/0424_Data_Enrichment/.secret/tmdb_api.json", "r") as f:
    login = json.load(f)
## NOTE: this expression is not the last line of the cell, so its value is
## discarded and nothing is displayed.
login.keys()    
# Hand the key to tmdbsimple; raises KeyError if the file has no 'api-key' entry.
tmdb.API_KEY =  login['api-key']

In [99]:
# Look up a single movie by its IMDb id to check that the API key works.
movie = tmdb.Movies("tt0035423")
# Request the movie's details; as the last expression, the returned dict is displayed.
movie.info()

{'adult': False,
 'backdrop_path': '/hfeiSfWYujh6MKhtGTXyK3DD4nN.jpg',
 'belongs_to_collection': None,
 'budget': 48000000,
 'genres': [{'id': 10749, 'name': 'Romance'},
  {'id': 14, 'name': 'Fantasy'},
  {'id': 35, 'name': 'Comedy'}],
 'homepage': '',
 'id': 11232,
 'imdb_id': 'tt0035423',
 'original_language': 'en',
 'original_title': 'Kate & Leopold',
 'overview': "When her scientist ex-boyfriend discovers a portal to travel through time -- and brings back a 19th-century nobleman named Leopold to prove it -- a skeptical Kate reluctantly takes responsibility for showing Leopold the 21st century. The more time Kate spends with Leopold, the harder she falls for him. But if he doesn't return to his own time, his absence will forever alter history.",
 'popularity': 17.026,
 'poster_path': '/mUvikzKJJSg9khrVdxK8kg3TMHA.jpg',
 'production_companies': [{'id': 85,
   'logo_path': None,
   'name': 'Konrad Pictures',
   'origin_country': ''},
  {'id': 14,
   'logo_path': '/m6AHu84oZQxvq7n1rsvM

In [100]:
def write_json(new_data, filename):
    """Append one record, or a list of records, to a JSON file holding a list.

    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/

    Parameters
    ----------
    new_data : dict or list
        A list is merged element-wise into the stored list (extend); any
        other value is added as a single new element (append).
    filename : str
        Path to an existing JSON file whose top-level value is a list.

    Raises
    ------
    FileNotFoundError
        If ``filename`` does not exist (the file is opened in 'r+' mode).
    json.JSONDecodeError
        If the current file content is not valid JSON.
    TypeError
        If the combined data cannot be serialised to JSON. The file is left
        untouched in that case.
    """
    with open(filename, 'r+') as file:
        # Load the records that are already stored in the file.
        file_data = json.load(file)

        # A list of new records is merged in; a single record is appended.
        if isinstance(new_data, list) and isinstance(file_data, list):
            file_data.extend(new_data)
        else:
            file_data.append(new_data)

        # Serialise BEFORE touching the file: if a value is not serialisable
        # the exception is raised here and the existing content survives,
        # instead of being left half-overwritten.
        serialized = json.dumps(file_data)

        # Rewrite from the start and cut off anything left over from the
        # previous content.
        file.seek(0)
        file.write(serialized)
        file.truncate()

In [101]:
def get_movie_with_rating(movie_id):
    """Return a movie's info dict, with its US certification added when available.

    Requests the movie's info and releases through ``tmdb.Movies``. If the
    releases contain an entry whose country code is "US", that entry's
    certification is stored under the "certification" key of the info dict
    (the last US entry wins when there are several). Without a US entry the
    dict is returned without that key.
    """
    tmdb_movie = tmdb.Movies(movie_id)
    details = tmdb_movie.info()
    release_data = tmdb_movie.releases()

    # Collect the certification of every US release entry, in order.
    us_certifications = [
        entry["certification"]
        for entry in release_data["countries"]
        if entry["iso_3166_1"] == "US"
    ]
    if us_certifications:
        details["certification"] = us_certifications[-1]
    return details

# Load in the Title Basics data

In [102]:
import pandas as pd
# Load in the dataframe from project part 1 as basics:
# NOTE(review): this is an absolute, machine-specific path; it has to be
# edited before the cell can run on another machine.
basics = pd.read_csv("/Users/echo/Documents/0424_Data_Enrichment/Project-3/Data/title_basics.csv.gz")
# Preview the first rows of the loaded frame.
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
2,tt0068865,movie,Lives of Performers,Lives of Performers,0,2016.0,,90,Drama
3,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
4,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"


# Create Required Lists for the Loop
###  Define a list of the Years to Extract from the API

In [103]:
# Years whose movies are requested from the API; the outer loop iterates over this list.
YEARS_TO_GET = [2000,2001]

In [104]:
# Keep only the ids that do not already appear in the "imdb_id" column of previous_df.
# NOTE(review): `movie_ids` and `previous_df` are not assigned by any earlier
# cell in this notebook; they are only set inside the yearly loop further
# down, so this cell depends on leftover kernel state and fails after
# Restart & Run All.
movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df["imdb_id"])]
movie_ids_to_get

0        tt0035423
11       tt0114447
15       tt0116916
19       tt0118589
20       tt0118652
           ...    
79662    tt9071078
80118    tt9212730
80151    tt9228234
80938    tt9555974
80992    tt9578462
Name: tconst, Length: 1577, dtype: object

In [105]:
# Try the helper on a single id before using it inside the loop.
temp = get_movie_with_rating("tt0035423")
# Display the returned dictionary.
temp

{'adult': False,
 'backdrop_path': '/hfeiSfWYujh6MKhtGTXyK3DD4nN.jpg',
 'belongs_to_collection': None,
 'budget': 48000000,
 'genres': [{'id': 10749, 'name': 'Romance'},
  {'id': 14, 'name': 'Fantasy'},
  {'id': 35, 'name': 'Comedy'}],
 'homepage': '',
 'id': 11232,
 'imdb_id': 'tt0035423',
 'original_language': 'en',
 'original_title': 'Kate & Leopold',
 'overview': "When her scientist ex-boyfriend discovers a portal to travel through time -- and brings back a 19th-century nobleman named Leopold to prove it -- a skeptical Kate reluctantly takes responsibility for showing Leopold the 21st century. The more time Kate spends with Leopold, the harder she falls for him. But if he doesn't return to his own time, his absence will forever alter history.",
 'popularity': 17.026,
 'poster_path': '/mUvikzKJJSg9khrVdxK8kg3TMHA.jpg',
 'production_companies': [{'id': 85,
   'logo_path': None,
   'name': 'Konrad Pictures',
   'origin_country': ''},
  {'id': 14,
   'logo_path': '/m6AHu84oZQxvq7n1rsvM

### Define an errors list

In [106]:
# Collects a [movie_id, exception] pair for every movie that fails inside the loop.
# Re-running this cell resets the list.
errors = []

# Start OUTER loop

### Set up Progress Bar -> Create File ->
The progress bar wraps the iterable in the `for` statement, so it advances once for each year listed in `YEARS_TO_GET`.

In [107]:
# OUTER loop: one pass per year in YEARS_TO_GET.
# For each year: make sure that year's JSON results file exists, work out
# which movie ids are still missing from it, request them one by one, and
# finally export the year's results as a compressed CSV.
# progress bar based on YEARS_TO_GET. desc is the label shown next to the bar.
for YEAR in tqdm_notebook(YEARS_TO_GET, desc="YEARS", position=0):
    # Defining the JSON file to store results for the current year.
    # This must happen inside the loop: YEAR is only bound by the loop, and
    # every year needs its own file rather than sharing one path.
    # (File names are kept exactly as before so existing files are reused.)
    JSON_FILE = f"{FOLDER}tmdp_api_results {YEAR}.json"
    # Check if this year's file exists
    file_exists = os.path.isfile(JSON_FILE)
    # If it does not exist: create it
    if not file_exists:
        # save a list holding one placeholder record with just "imdb_id",
        # so the file can be read back with an "imdb_id" column.
        with open(JSON_FILE, "w") as f:
            json.dump([{'imdb_id':0}],f)
    # Saving new year as the current df
    df = basics.loc[basics["startYear"] == YEAR].copy()
    # saving movie ids to a Series
    movie_ids = df["tconst"].copy()
    # Load existing data from json into a dataframe called "previous_df"
    previous_df = pd.read_json(JSON_FILE)
    # filter out any ids that are already in the JSON_FILE
    movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df["imdb_id"])]

    # INNER loop: one API request per movie id that is still missing
    for movie_id in tqdm_notebook(movie_ids_to_get,
                                  desc=f'Movies from {YEAR}',
                                  position=1,
                                  leave=True):
        try:
            # Retrieve the data for the movie id
            temp = get_movie_with_rating(movie_id)
            # Append/extend results to existing file using a pre-made function
            write_json(temp,JSON_FILE)
            # Short 20 ms sleep to prevent overwhelming server
            time.sleep(0.02)

        except Exception as e:
            # Keep going, but remember which id failed and why; inspect
            # `errors` afterwards if the total is unexpectedly high.
            errors.append([movie_id, e])

    # Export everything collected for this year as a gzip-compressed CSV.
    # NOTE(review): the file name contains a space right after FOLDER; it is
    # kept unchanged so it matches the files already written to disk.
    final_year_df = pd.read_json(JSON_FILE)
    final_year_df.to_csv(f"{FOLDER} final_tmdb_data_{YEAR}.csv.gz",
                        compression="gzip", index=False)
print(f"- Total errors: {len(errors)}")

YEARS:   0%|          | 0/2 [00:00<?, ?it/s]

Movies from 2000:   0%|          | 0/209 [00:00<?, ?it/s]

Movies from 2001:   0%|          | 0/234 [00:00<?, ?it/s]

- Total errors: 443
