Name: Faris Assallami

In [13]:
! pip install tmdbsimple



In [14]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm_notebook
import os, time,json
import tmdbsimple as tmdb 
import gzip

In [15]:
# Root project folder; the per-year JSON and CSV outputs of this notebook are written here.
FOLDER = "/Users/faris/Documents/GitHub/IMDB/"
# Create the folder if it is missing (no error if it already exists).
os.makedirs(FOLDER, exist_ok=True)
# Display the current folder contents as the cell output.
os.listdir(FOLDER)


['part 2 csvs',
 '.DS_Store',
 'tmdb api results',
 'README.md',
 '.gitattributes',
 'project 3 part 2 (API calls & EDA).ipynb',
 '.git',
 'project 3 part 3.ipynb',
 'project 3 part 1.ipynb',
 'part 1 csvs']

## Defining functions

In [16]:
def write_json(new_data, filename): 
    """Add new_data to the JSON list stored in an existing file.

    Adapted from: https://www.geeksforgeeks.org/append-to-json-file-using-python/

    Args:
        new_data: A single record (e.g. a dict) to append, or a list of
            records to extend the stored list with.
        filename: Path to an existing JSON file whose top-level value is a list.

    Raises:
        FileNotFoundError: If filename does not exist (it is opened in 'r+' mode).
        json.JSONDecodeError: If the existing file content is not valid JSON.
    """
    with open(filename, 'r+') as file:
        # Load the records already stored in the file.
        file_data = json.load(file)
        # A list of records extends the stored list; anything else is appended
        # as one record.
        if isinstance(new_data, list) and isinstance(file_data, list):
            file_data.extend(new_data)
        else:
            file_data.append(new_data)
        # Rewind and overwrite the file with the combined records.
        file.seek(0)
        json.dump(file_data, file)
        # Drop any leftover bytes from the previous content. Without this, a
        # file that used to be longer (e.g. one written with indentation) would
        # keep stale trailing text and no longer parse as JSON.
        file.truncate()


In [17]:
def get_movie_with_rating(movie_id):
    """Return the TMDB info dict for movie_id, with the US certification added.

    Args:
        movie_id: Identifier passed to tmdb.Movies (the notebook passes IMDB
            ids such as "tt0848228").

    Returns:
        The dict returned by .info(). A 'certification' key is added only when
        the releases data contains an entry whose 'iso_3166_1' is 'US'; if
        several such entries exist, the last one wins.
    """
    tmdb_movie = tmdb.Movies(movie_id)
    # Fetch the general info first, then the per-country release data.
    info = tmdb_movie.info()
    releases = tmdb_movie.releases()

    # Collect the certification of every US release entry, in order.
    us_certifications = [
        country['certification']
        for country in releases['countries']
        if country['iso_3166_1'] == 'US'
    ]
    if us_certifications:
        info['certification'] = us_certifications[-1]

    return info

## Loading the title basics, ratings, and akas CSV files from part 1 of the project

In [18]:
# Load the CSVs produced in part 1 of the project into dataframes.
# `basics` drives the download loop below (its 'startYear' and 'tconst' columns are used).
basics = pd.read_csv('/Users/faris/Documents/GitHub/IMDB/part 1 csvs/basics.csv')
ratings = pd.read_csv('/Users/faris/Documents/GitHub/IMDB/part 1 csvs/ratings.csv')
akas = pd.read_csv('/Users/faris/Documents/GitHub/IMDB/part 1 csvs/akas.csv')




## Create Required Lists for the Loop

In [19]:
## Years to extract: movies that started in 2000 and 2001 (stakeholders' request)
YEARS_TO_GET = [2000,2001]

# Collects [movie_id, exception] pairs for API calls that fail in the download loop
errors = [ ]

## Importing the TMDB API key from the secret folder

In [20]:
# Read the API credentials from a JSON file kept outside the repository.
with open('/Users/faris/.secret/tmdb_api.json', 'r') as f:
    login = json.load(f)
## Display only the keys of the loaded dict, so the secret value itself is not shown
login.keys()

dict_keys(['api-key'])

In [21]:
# Register the key with tmdbsimple so later tmdb.Movies(...) calls can use it.
tmdb.API_KEY =  login['api-key']
# NOTE(review): tmdb_api is not referenced anywhere else in the visible notebook.
tmdb_api = tmdb.API_KEY

## Testing the function on movie IDs

In [22]:
# Smoke test of get_movie_with_rating on the movie "The Avengers" (tt0848228)
test1 = get_movie_with_rating("tt0848228")
test1



{'adult': False,
 'backdrop_path': '/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg',
 'belongs_to_collection': {'id': 86311,
  'name': 'The Avengers Collection',
  'poster_path': '/yFSIUVTCvgYrpalUktulvk3Gi5Y.jpg',
  'backdrop_path': '/zuW6fOiusv4X9nnW3paHGfXcSll.jpg'},
 'budget': 220000000,
 'genres': [{'id': 878, 'name': 'Science Fiction'},
  {'id': 28, 'name': 'Action'},
  {'id': 12, 'name': 'Adventure'}],
 'homepage': 'https://www.marvel.com/movies/the-avengers',
 'id': 24428,
 'imdb_id': 'tt0848228',
 'original_language': 'en',
 'original_title': 'The Avengers',
 'overview': 'When an unexpected enemy emerges and threatens global safety and security, Nick Fury, director of the international peacekeeping agency known as S.H.I.E.L.D., finds himself in need of a team to pull the world back from the brink of disaster. Spanning the globe, a daring recruitment effort begins!',
 'popularity': 121.956,
 'poster_path': '/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg',
 'production_companies': [{'id': 420,
   'logo_path

In [23]:
# Smoke test of get_movie_with_rating on the movie "The Notebook" (tt0332280)
test2 = get_movie_with_rating("tt0332280")
test2

{'adult': False,
 'backdrop_path': '/qom1SZSENdmHFNZBXbtJAU0WTlC.jpg',
 'belongs_to_collection': None,
 'budget': 29000000,
 'genres': [{'id': 10749, 'name': 'Romance'}, {'id': 18, 'name': 'Drama'}],
 'homepage': 'http://www.newline.com/properties/notebookthe.html',
 'id': 11036,
 'imdb_id': 'tt0332280',
 'original_language': 'en',
 'original_title': 'The Notebook',
 'overview': "An epic love story centered around an older man who reads aloud to a woman with Alzheimer's. From a faded notebook, the old man's words bring to life the story about a couple who is separated by World War II, and is then passionately reunited, seven years later, after they have taken different paths.",
 'popularity': 55.892,
 'poster_path': '/rNzQyW4f8B8cQeg7Dgj3n6eT5k9.jpg',
 'production_companies': [{'id': 12,
   'logo_path': '/5ThIuO93vsk47oexKTSdfKEr7EC.png',
   'name': 'New Line Cinema',
   'origin_country': 'US'},
  {'id': 1565, 'logo_path': None, 'name': 'Avery Pix', 'origin_country': 'US'},
  {'id': 26

## JSON to CSV

In [24]:
# Start of OUTER loop: one pass per year in YEARS_TO_GET
for YEAR in tqdm_notebook(YEARS_TO_GET, desc='YEARS', position=0):

    # JSON file that accumulates the API results for this year
    JSON_FILE = f'{FOLDER}tmdb_api_results_{YEAR}.json'

    # Create the file on first use, seeded with a placeholder record so that
    # pd.read_json below always yields an 'imdb_id' column.
    if not os.path.isfile(JSON_FILE):
        with open(JSON_FILE, 'w') as f:
            json.dump([{'imdb_id': 0}], f)

    # Everything below runs whether or not the file already existed.
    # (Previously it was nested under the "file does not exist" branch, so a
    # year whose file was already present was never downloaded or resumed.)

    # Rows of `basics` whose startYear is the current year
    df = basics.loc[basics['startYear'] == YEAR].copy()
    # Movie ids for that year
    movie_ids = df['tconst'].copy()

    # Load the results saved so far into a dataframe
    previous_df = pd.read_json(JSON_FILE)

    # Skip ids that are already in JSON_FILE. Ids whose request failed were
    # never written, so they are requested again on the next run.
    movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]

    # INNER loop: one API request per remaining movie id
    for movie_id in tqdm_notebook(movie_ids_to_get, desc=f'Movies from {YEAR}', position=1, leave=True):

        try:
            # Retrieve the data for the movie id
            temp = get_movie_with_rating(movie_id)
            # Append the result to the year's JSON file
            write_json(temp, JSON_FILE)
            # Short 20 ms sleep to prevent overwhelming the server
            time.sleep(0.02)

        except Exception as e:
            # Record the failure and continue with the next id
            errors.append([movie_id, e])

    # Export the accumulated results for the year as a compressed CSV.
    # NOTE: the placeholder record ({'imdb_id': 0}) is still part of the data.
    final_year_df = pd.read_json(JSON_FILE)
    final_year_df.to_csv(f"{FOLDER}final_tmdb_data_{YEAR}.csv.gz", compression="gzip", index=False)

YEARS:   0%|          | 0/2 [00:00<?, ?it/s]

Movies from 2000:   0%|          | 0/2741 [00:00<?, ?it/s]

Movies from 2001:   0%|          | 0/2870 [00:00<?, ?it/s]

## Loading the JSON files into a pandas dataframe

In [25]:
# Load the raw 2000 API results (a list of records) from the JSON file
with open('/Users/faris/Documents/GitHub/IMDB/tmdb_api_results_2000.json', 'r') as f:
    imdb2000 = json.load(f)

# Build a dataframe from the records
df2000 = pd.json_normalize(imdb2000)

# Save the dataframe as a compressed CSV file.
# Writes df2000 (built above); the previous code wrote `df`, a variable left
# over from the download loop that holds a slice of `basics`, not these results.
with gzip.open('/Users/faris/Documents/GitHub/IMDB/final_tmdb_data_2000.csv.gz', 'wt', encoding='utf8') as f:
    df2000.to_csv(f, index=False)

In [26]:
# Load the raw 2001 API results (a list of records) from the JSON file
with open('/Users/faris/Documents/GitHub/IMDB/tmdb_api_results_2001.json', 'r') as f:
    imdb2001 = json.load(f)

# Build a dataframe from the records
df2001 = pd.json_normalize(imdb2001)

# Save the dataframe as a compressed CSV file.
# Writes df2001 (built above); the previous code wrote `df`, a variable left
# over from the download loop that holds a slice of `basics`, not these results.
with gzip.open('/Users/faris/Documents/GitHub/IMDB/final_tmdb_data_2001.csv.gz', 'wt', encoding='utf8') as f:
    df2001.to_csv(f, index=False)

In [27]:
# Number of movie ids whose API request raised an exception in the download loop.
print(f"- Total errors: {len(errors)}")

- Total errors: 1098


## Exploratory Data Analysis

In [28]:
# Create the 2000 (df1) and 2001 (df2) dataframes from the per-year API result files.
# NOTE: each file starts with the placeholder record ({'imdb_id': 0}) written when it was created.
df1 = pd.read_json('/Users/faris/Documents/GitHub/IMDB/tmdb_api_results_2000.json')
df2 = pd.read_json('/Users/faris/Documents/GitHub/IMDB/tmdb_api_results_2001.json')

In [29]:
# Preview the first rows of the 2000 results.
df1.head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0113026,0.0,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,,10000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,62127.0,en,The Fantasticks,...,0.0,86.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Try to remember the first time magic happened,The Fantasticks,0.0,5.5,22.0,
2,tt0113092,0.0,,,0.0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977.0,en,For the Cause,...,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,0.0,5.1,8.0,
3,tt0116391,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,442869.0,hi,Gang,...,0.0,152.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,,Gang,0.0,4.0,1.0,
4,tt0116748,0.0,/wr0hTHwkYIRC82MwNbhOvqrw27N.jpg,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,579396.0,hi,Karobaar,...,0.0,180.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,The Business of Love,Karobaar,0.0,7.0,3.0,


In [30]:
# Preview the first rows of the 2001 results.
df2.head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0035423,0.0,/hfeiSfWYujh6MKhtGTXyK3DD4nN.jpg,,48000000.0,"[{'id': 10749, 'name': 'Romance'}, {'id': 14, ...",,11232.0,en,Kate & Leopold,...,76019048.0,118.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,"If they lived in the same century, they'd be p...",Kate & Leopold,0.0,6.326,1187.0,PG-13
2,tt0114447,0.0,,,0.0,"[{'id': 53, 'name': 'Thriller'}, {'id': 28, 'n...",,151007.0,en,The Silent Force,...,0.0,90.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,They left him for dead... They should have fin...,The Silent Force,0.0,5.0,3.0,
3,tt0114722,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 53, 'name...",,276251.0,es,3 Noches,...,0.0,105.0,"[{'english_name': 'Spanish', 'iso_639_1': 'es'...",Released,,3 Nights,0.0,0.0,0.0,
4,tt0116916,0.0,/rFpHBidSlhjflmnLu7BZilyKeQR.jpg,,0.0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",,73549.0,en,The Dark Mist,...,0.0,101.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,The Dark Mist,0.0,3.5,2.0,PG


In [31]:
# Concatenate both yearly dataframes into one, with a fresh 0..n-1 index.
# NOTE(review): the placeholder row (imdb_id == 0) from each yearly file is carried
# over into df — confirm whether it should be dropped before analysis.
df = pd.concat([df1,df2],ignore_index = True, sort=False)
df.head()

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0113026,0.0,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,,10000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,62127.0,en,The Fantasticks,...,0.0,86.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Try to remember the first time magic happened,The Fantasticks,0.0,5.5,22.0,
2,tt0113092,0.0,,,0.0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977.0,en,For the Cause,...,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,0.0,5.1,8.0,
3,tt0116391,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,442869.0,hi,Gang,...,0.0,152.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,,Gang,0.0,4.0,1.0,
4,tt0116748,0.0,/wr0hTHwkYIRC82MwNbhOvqrw27N.jpg,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,579396.0,hi,Karobaar,...,0.0,180.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,The Business of Love,Karobaar,0.0,7.0,3.0,


In [32]:
# df.info() prints its summary itself and returns None, so it is called directly;
# wrapping it in display() additionally rendered a stray "None" in the output.
df.info()
# Show the column names as the cell output.
df.columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4515 entries, 0 to 4514
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                4515 non-null   object 
 1   adult                  4513 non-null   float64
 2   backdrop_path          2066 non-null   object 
 3   belongs_to_collection  254 non-null    object 
 4   budget                 4513 non-null   float64
 5   genres                 4513 non-null   object 
 6   homepage               4513 non-null   object 
 7   id                     4513 non-null   float64
 8   original_language      4513 non-null   object 
 9   original_title         4513 non-null   object 
 10  overview               4513 non-null   object 
 11  popularity             4513 non-null   float64
 12  poster_path            3887 non-null   object 
 13  production_companies   4513 non-null   object 
 14  production_countries   4513 non-null   object 
 15  rele

None

Index(['imdb_id', 'adult', 'backdrop_path', 'belongs_to_collection', 'budget',
       'genres', 'homepage', 'id', 'original_language', 'original_title',
       'overview', 'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'certification'],
      dtype='object')

## 1- How many movies had at least some valid financial information (values > 0 for budget OR revenue)?

In [33]:
# Keep the rows where at least one of the two financial columns is positive.
has_budget = df['budget'].gt(0)
has_revenue = df['revenue'].gt(0)
valid_fin_info = df.loc[has_budget | has_revenue]
amount = valid_fin_info.shape[0]
print(f'Movies with some valid financial information : {amount}')

Movies with some valid financial information : 740


## 2- How many movies are there in each of the certification categories (G/PG/PG-13/R)?

In [34]:
# Count the movies per certification value.
# NOTE(review): the displayed output contains an unlabeled category, presumably
# empty-string certifications — confirm whether these should be treated as missing.
print(' Movies in each certification categories:')
df['certification'].value_counts()

 Movies in each certification categories:


           1030
R           476
PG-13       183
NR           77
PG           68
G            27
NC-17         7
Unrated       1
Name: certification, dtype: int64

## 3- What is the average revenue per certification category?

In [35]:
# Restrict to movies with a positive revenue, then average within each certification.
with_revenue = df.loc[df['revenue'] > 0]
average_revenue_viewers_category = with_revenue.groupby('certification')['revenue'].mean()

print(f'Average revenue by certification category : {average_revenue_viewers_category}')

Average revenue by certification category : certification
         2.294001e+07
G        1.173648e+08
NC-17    1.167800e+04
NR       1.358396e+07
PG       1.249167e+08
PG-13    1.057456e+08
R        4.433585e+07
Name: revenue, dtype: float64


## 4- What is the average budget per certification category?

In [36]:
# Restrict to movies with a positive budget, then average within each certification.
with_budget = df.loc[df['budget'] > 0]
average_budget_viewers_category = with_budget.groupby('certification')['budget'].mean()

print(f'Average budget by certification category : {average_budget_viewers_category}')

Average budget by certification category : certification
         7.299854e+06
G        4.095383e+07
NR       8.928341e+06
PG       4.614698e+07
PG-13    4.470363e+07
R        2.191733e+07
Name: budget, dtype: float64


In [37]:
# Save the combined 2000 + 2001 dataframe to a gzip-compressed CSV (index column omitted).
df.to_csv('/Users/faris/Documents/GitHub/IMDB/tmdb_results_combined.csv.gz', compression='gzip',index=False)