In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os, json, math, time
import glob
import regex as re
import tmdbsimple as tmdb
from tqdm.notebook import tqdm_notebook

In [47]:
KeyPath = %env CODINGDOJO
with open(KeyPath) as f: 
    login = json.load(f)
# other environment variables
data_file = 'tmdb_results_combined.csv.gz'
data_dir = 'data/'
data_dump = 'data_dumps/'
data_basics = 'title_basics.csv.gz'
years_requested = [*range(2022,2023,1)]
tmdb.API_KEY = login['tmdb-api-key-v3']
# if all years ran do not retrive any results, no need to recompile final merged csv
zero_data = True

In [48]:
def get_movie_with_rating(movie_id):
    try:
        movie = tmdb.Movies(movie_id)
        movie_info = movie.info()
        releases = movie.releases()
        x=0
        for c in releases['countries']:
            if c['iso_3166_1'] == 'US' and len(c['certification'])>0:
                x+=1
                movie_info['certification'] = c['certification']
        if x == 0:
            movie_info['certification']=np.NaN            
    except:
        movie_info = {"imdb_id": movie_id}
    return movie_info

In [49]:
%%time
for YEAR in tqdm_notebook(years_requested,desc='YEARS',position=0):
    JSON_FILE = f'{data_dir}tmdb_api_results_{YEAR}.json'
    #Create dump directory for each Year
    if not os.path.exists(f'{data_dump}{YEAR}'):
        os.makedirs(f'{data_dump}{YEAR}')
    file_list =  [x.replace('.json','') for x in os.listdir(f'{data_dump}{YEAR}')]
    #get complete list of movie_ids from the basics file for corresponding year 
    movie_ids = pd.read_csv(data_dir+data_basics)[['tconst','startYear']]
    movie_ids = [*movie_ids['tconst'].loc[movie_ids['startYear']==YEAR]]

    # filter out any ids that are already in file_data
    #possible if there are interuptions when extracting data from API
    movie_ids_to_get = [x for x in movie_ids if x not in file_list]
    
    #Get index and movie id from list
    # INNER Loop
    for movie_id in tqdm_notebook(movie_ids_to_get,
                                  desc=f'Movies from {YEAR}',
                                  position=1,
                                  leave=True):
        #print(movie_id)
        json_req = get_movie_with_rating(movie_id)  #This uses your pre-made function
        # Just rewrite entire file. could run again later to update/add results.
        dump_file = f'{data_dump}{YEAR}/{movie_id}.json'
        with open(dump_file,'w') as file:
            #file_data.append(json_req)
            json.dump(json_req,file)
        # Short 150 ms sleep to prevent overwhelming server
        time.sleep(0.15)

    
    file_list =  glob.glob(f'{data_dump}{YEAR}/*.json') # files = glob.glob('/YOUR/PATH/*')
    json_data = list()
    for x in file_list:
        with open(x,'r') as file:
            json_data.append(json.load(file))
    with open(JSON_FILE, 'w') as output_file:
        json.dump(json_data, output_file)

    final_year_df = pd.read_json(json.dumps(json_data))
    final_year_df.to_csv(f"{data_dir}final_tmdb_data_{YEAR}.csv.gz", compression="gzip", index=False)
    #remove all the individual dumped data no longer needed once in final .json file
    for x in file_list:
        os.remove(x)
    
    

YEARS:   0%|          | 0/1 [00:00<?, ?it/s]

Movies from 2022:   0%|          | 0/2545 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [50]:
%%time
#will always recreate the entire final data file because csv files really cannot be upserted
# and appending can add duplicated rows

#get a list of all csv.gz individual year files 
#https://stackoverflow.com/questions/2225564/get-a-filtered-list-of-files-in-a-directory

files = [f for f in os.listdir(data_dir) if re.match(r'final_tmdb_data_.*\.csv.gz', f)]

#Save a final merged .csv.gz of all of the tmdb api data
#write mode w+ will create file if not exists. a will append to end
if zero_data == False:
    cnt = 1
    for x in files:
        df= pd.read_csv(data_dir+x)
        if cnt == 1:
            df.to_csv(data_dir+data_file,mode='w+' ,compression="gzip", index=False)
            cnt+=1
        elif cnt > 1:
            df.to_csv(data_dir+data_file,mode='a' ,compression="gzip", index=False,header=False)

Wall time: 3 ms


#### Other transformations

In [7]:
%%time
#outside of loop for json to individual csv and combined
file_list =  glob.glob(f'{data_dir}*.json')
final_df = pd.DataFrame()
for x in file_list:
    with open(x,'r') as file:
        YEAR = re.sub(r'[^0-9]','',x)
        df = pd.read_json(json.dumps(json.load(file)))
        df.to_csv(f"{data_dir}final_tmdb_data_{YEAR}.csv.gz", compression="gzip", index=False)
        final_df = pd.concat([final_df,df],axis=0)
final_df.to_csv(data_dir+data_file,mode='w+' ,compression="gzip", index=False)



Wall time: 21.1 s


In [16]:
final_df.duplicated(subset=['imdb_id','budget','revenue','certification']).sum() 

84

In [25]:
final_df = final_df.drop_duplicates(subset=['imdb_id','budget','revenue'],keep = 'last').reset_index(drop = True)
final_df.duplicated(subset=['imdb_id','budget','revenue','certification']).sum() 

0

In [27]:
final_df.to_csv(data_dir+data_file,mode='w+' ,compression="gzip", index=False)

In [46]:
final_df['certification'].str.strip().value_counts()

R                                  6547
NR                                 3476
PG-13                              3417
                                   1835
PG                                 1517
G                                   461
NC-17                               161
Unrated                               5
-                                     1
UR                                    1
Not Rated                             1
ScreamFest Horror Film Festival       1
10                                    1
Name: certification, dtype: int64

In [None]:
final_df['certification']

In [None]:
## fix certification col
repl_cert = {'UR':'NR',
             'Not Rated':'NR',
             'Unrated':'NR',
             '-':'NR',
             '10':np.nan,
             'ScreamFest Horror Film Festival':'NR'}

In [44]:
final_df[final_df['certification'].str.contains('one',case=False)==True ]

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification


In [30]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 69254 entries, 0 to 69253
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  62057 non-null  float64
 1   backdrop_path          38650 non-null  object 
 2   belongs_to_collection  3922 non-null   object 
 3   budget                 62057 non-null  float64
 4   genres                 62057 non-null  object 
 5   homepage               59585 non-null  object 
 6   id                     62057 non-null  float64
 7   imdb_id                69254 non-null  object 
 8   original_language      62057 non-null  object 
 9   original_title         62057 non-null  object 
 10  overview               62057 non-null  object 
 11  popularity             62057 non-null  float64
 12  poster_path            56302 non-null  object 
 13  production_companies   62057 non-null  object 
 14  production_countries   62057 non-null  object 
 15  re

In [126]:
#update ratings in json files
for x in glob.glob(f'{data_dir}/tmdb_api_results_*.json'):
    df = pd.read_json(x)
    try:
        for movie in [df[df['certification'] == '']['imdb_id']][0] :
            df.loc[df['imdb_id']==movie,'certification'] = get_movie_with_rating(movie)['certification']
            time.sleep(0.15)
    except:
        print(movie,':cert error')
        pass
    with open(x,'w') as output_file:
        json.dump(json.loads(df.to_json(orient="records")),output_file)
        print(x)

data\tmdb_api_results_2000-orginal.json
data\tmdb_api_results_2000.json
data\tmdb_api_results_2001.json
data\tmdb_api_results_2002.json
data\tmdb_api_results_2003.json


KeyError: 'certification'