# Movie Success Predictions

In [1]:
# Import Libraries 
import pandas as pd
import numpy as np
import os, json, time
from tqdm.notebook import tqdm_notebook 

## User Defined Functions

### get_movie_with_rating Function

In [2]:
# function to get movie raiting from tmdb api
def get_movie_with_rating(movie_id):
    """Adapted from source = https://github.com/celiao/tmdbsimple"""
    # get movie object for current id
    movie = tmdb.Movies(movie_id)
    
    # save the .info .releases dictionaries
    info = movie.info()
    releases = movie.releases()
    
    # loop through countries in releases
    for c in releases['countries']:
        # if country abbreviation==US
        if c['iso_3166_1'] == 'US':
            # save a 'certification' key in info with the certification
            info['certification'] = c['certification']
    
    return info

#### Test get_movie_with_rating Function

Test with The Avengers

In [41]:
test1 = get_movie_with_rating('tt0848228')
test1

{'adult': False,
 'backdrop_path': '/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg',
 'belongs_to_collection': {'id': 86311,
  'name': 'The Avengers Collection',
  'poster_path': '/yFSIUVTCvgYrpalUktulvk3Gi5Y.jpg',
  'backdrop_path': '/zuW6fOiusv4X9nnW3paHGfXcSll.jpg'},
 'budget': 220000000,
 'genres': [{'id': 878, 'name': 'Science Fiction'},
  {'id': 28, 'name': 'Action'},
  {'id': 12, 'name': 'Adventure'}],
 'homepage': 'https://www.marvel.com/movies/the-avengers',
 'id': 24428,
 'imdb_id': 'tt0848228',
 'original_language': 'en',
 'original_title': 'The Avengers',
 'overview': 'When an unexpected enemy emerges and threatens global safety and security, Nick Fury, director of the international peacekeeping agency known as S.H.I.E.L.D., finds himself in need of a team to pull the world back from the brink of disaster. Spanning the globe, a daring recruitment effort begins!',
 'popularity': 146.526,
 'poster_path': '/RYMX2wcKCBAr24UyPD7xwmjaTn.jpg',
 'production_companies': [{'id': 420,
   'logo_path

Test with The Notebook 

In [42]:
test2 = get_movie_with_rating('tt0332280')
test2

{'adult': False,
 'backdrop_path': '/qom1SZSENdmHFNZBXbtJAU0WTlC.jpg',
 'belongs_to_collection': None,
 'budget': 29000000,
 'genres': [{'id': 10749, 'name': 'Romance'}, {'id': 18, 'name': 'Drama'}],
 'homepage': 'http://www.newline.com/properties/notebookthe.html',
 'id': 11036,
 'imdb_id': 'tt0332280',
 'original_language': 'en',
 'original_title': 'The Notebook',
 'overview': "An epic love story centered around an older man who reads aloud to a woman with Alzheimer's. From a faded notebook, the old man's words bring to life the story about a couple who is separated by World War II, and is then passionately reunited, seven years later, after they have taken different paths.",
 'popularity': 55.267,
 'poster_path': '/rNzQyW4f8B8cQeg7Dgj3n6eT5k9.jpg',
 'production_companies': [{'id': 12,
   'logo_path': '/iaYpEp3LQmb8AfAtmTvpqd4149c.png',
   'name': 'New Line Cinema',
   'origin_country': 'US'},
  {'id': 1565, 'logo_path': None, 'name': 'Avery Pix', 'origin_country': 'US'},
  {'id': 26

The function works as predicted.

### write_json Function

In [3]:
# appends a list of records (new_data) to a json file
# addapted from : https://www.geeksforgeeks.org/append-to-json-file-using-python/

def write_json(new_data, filename):
    with open(filename,'r+') as file:
        # first load existing data into dict
        file_data = json.load(file)
        # choose extend or append
        if (type(new_data) == list) & (type(file_data) == list):
            file_data.extend(new_data)
        else:
            file_data.append(new_data)
        # set file's current position at offset
        file.seek(0)
        # convert back to json
        json.dump(file_data, file)

#### Test write_json Function

In [46]:
# create new json file with empty list
JSON_FILE = 'Data/test_json.json'
with open(JSON_FILE,'w') as f:
    json.dump([],f)
write_json(test1, JSON_FILE)
write_json(test2, JSON_FILE)

In [47]:
# save json file as dataframe and look at info
test_df = pd.read_json(JSON_FILE)
test_df

Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,False,/9BBTo63ANSmhC4e6r62OJFuK2GL.jpg,"{'id': 86311, 'name': 'The Avengers Collection...",220000000,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",https://www.marvel.com/movies/the-avengers,24428,tt0848228,en,The Avengers,...,1518815515,143,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Some assembly required.,The Avengers,False,7.7,28252,PG-13
1,False,/qom1SZSENdmHFNZBXbtJAU0WTlC.jpg,,29000000,"[{'id': 10749, 'name': 'Romance'}, {'id': 18, ...",http://www.newline.com/properties/notebookthe....,11036,tt0332280,en,The Notebook,...,115603229,123,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Behind every great love is a great story.,The Notebook,False,7.878,10179,PG-13


The function works as predicted. 

## Preprocessing Data 

In [4]:
# Save all url's for the datasets
basics_url = 'https://datasets.imdbws.com/title.basics.tsv.gz'
ratings_url = 'https://datasets.imdbws.com/title.ratings.tsv.gz'
akas_url = 'https://datasets.imdbws.com/title.akas.tsv.gz'

In [5]:
# Convert data sets into dataframes
basics = pd.read_csv(basics_url, sep='\t', low_memory=False)
ratings = pd.read_csv(ratings_url, sep='\t', low_memory=False)
akas = pd.read_csv(akas_url, sep='\t', low_memory=False)

In [6]:
# Replace \N with NaN in all the dataframes
basics.replace({'\\N':np.nan}, inplace=True)
ratings.replace({'\\N':np.nan}, inplace=True)
akas.replace({'\\N':np.nan}, inplace=True)

In [7]:
# check for duplicate rows in dataframes
print(f'Duplicate rows in basics dataframe:', basics.duplicated().sum())
print(f'Duplicate rows in ratings dataframe:', ratings.duplicated().sum())
print(f'Duplicate rows in akas dataframe:', akas.duplicated().sum())

Duplicate rows in basics dataframe: 0
Duplicate rows in ratings dataframe: 0
Duplicate rows in akas dataframe: 0


### Cleaning AKAs Dataframe

In [8]:
# Display info for AKAs dataframe
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35115304 entries, 0 to 35115303
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 2.1+ GB


In [9]:
# Check value_counts for region column
akas['region'].value_counts()

DE    4199276
FR    4195721
JP    4195378
IN    4134627
ES    4117485
       ...   
CC          1
TV          1
NU          1
PW          1
NR          1
Name: region, Length: 247, dtype: int64

In [10]:
# Keep only movies in US region
akas = akas.loc[akas['region'] == 'US']
akas['region'].value_counts()

US    1418401
Name: region, dtype: int64

In [11]:
# Check info of akas dataframe
akas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1418401 entries, 5 to 35115048
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1418401 non-null  object
 1   ordering         1418401 non-null  int64 
 2   title            1418401 non-null  object
 3   region           1418401 non-null  object
 4   language         3840 non-null     object
 5   types            974792 non-null   object
 6   attributes       46099 non-null    object
 7   isOriginalTitle  1417056 non-null  object
dtypes: int64(1), object(7)
memory usage: 97.4+ MB


### Cleaning Title Basics Dataframe

In [12]:
# Display info for title basics dataframe
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9656025 entries, 0 to 9656024
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 663.0+ MB


In [13]:
# Eliminate movies that are null for runtimeMinutes
print(f'NaN count in runtimeMinutes column:', basics['runtimeMinutes'].isna().sum())
basics.dropna(subset=['runtimeMinutes'], inplace=True)
print(f'NaN count after eliminating null rows:',basics['runtimeMinutes'].isna().sum())

NaN count in runtimeMinutes column: 6822569
NaN count after eliminating null rows: 0


In [14]:
# Eliminate movies that are null for genre
print(f'NaN count in genre column:', basics['genres'].isna().sum())
basics.dropna(subset=['genres'], inplace=True)
print(f'NaN count after eliminating null rows:',basics['genres'].isna().sum())

NaN count in genre column: 75942
NaN count after eliminating null rows: 0


In [15]:
# Check value_counts for the titleType column
basics['titleType'].value_counts()

tvEpisode       1383048
short            593147
movie            378114
video            179116
tvMovie           90980
tvSeries          89460
tvSpecial         17753
tvMiniSeries      16899
tvShort            8679
videoGame           318
Name: titleType, dtype: int64

In [16]:
# Keep only rows in which titleType == movie
basics = basics.loc[basics['titleType'] == 'movie']
basics['titleType'].value_counts()

movie    378114
Name: titleType, dtype: int64

In [17]:
# Remove movies that are null for the startYear
# we will do this because we only want to keep movies with a startYear between
# 2000 - 2021, and null values are not between those years
basics.dropna(subset = ['startYear'], inplace=True)
basics['startYear'].isna().sum()

0

In [18]:
# convert start year column to type integer
basics['startYear'] = basics['startYear'].astype(int)
basics.dtypes

tconst            object
titleType         object
primaryTitle      object
originalTitle     object
isAdult           object
startYear          int32
endYear           object
runtimeMinutes    object
genres            object
dtype: object

In [19]:
# Keep rows with startYear between 2000 - 2022
basics = basics.loc[(basics['startYear'] >= 2000) & (basics['startYear'] <= 2021)]

In [20]:
# check that startYear column only kept data between 2000 - 2021
basics['startYear'].describe()

count    209575.000000
mean       2012.837514
std           5.611038
min        2000.000000
25%        2009.000000
50%        2014.000000
75%        2017.000000
max        2021.000000
Name: startYear, dtype: float64

In [21]:
# Eliminate movies that include 'Documentary' in the genre 
is_documentary = basics['genres'].str.contains('documentary', case=False)
basics = basics[~is_documentary]

In [22]:
# Keep only movies made in the US region (as defined by the akas dataframe)
keepers = basics['tconst'].isin(akas['titleId'])
basics = basics[keepers]

In [23]:
# Check info of basics dataframe
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 81087 entries, 34803 to 9655791
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tconst          81087 non-null  object
 1   titleType       81087 non-null  object
 2   primaryTitle    81087 non-null  object
 3   originalTitle   81087 non-null  object
 4   isAdult         81087 non-null  object
 5   startYear       81087 non-null  int32 
 6   endYear         0 non-null      object
 7   runtimeMinutes  81087 non-null  object
 8   genres          81087 non-null  object
dtypes: int32(1), object(8)
memory usage: 5.9+ MB


### Cleaning Title Ratings Dataframe

In [24]:
# check info for ratings dataframe
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1284591 entries, 0 to 1284590
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1284591 non-null  object 
 1   averageRating  1284591 non-null  float64
 2   numVotes       1284591 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 29.4+ MB


In [25]:
# Keep only movies made in the US region (as defined by the akas dataframe)
keepers = ratings['tconst'].isin(akas['titleId'])
ratings = ratings[keepers]

In [26]:
# check info of ratings dataframe
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 490706 entries, 0 to 1284569
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         490706 non-null  object 
 1   averageRating  490706 non-null  float64
 2   numVotes       490706 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 15.0+ MB


### Saving Dataframes in the GitHub Repository

In [27]:
# Create 'Data/' file in GitHub Repository
os.makedirs('Data/', exist_ok=True)
# confirm the folder is created
os.listdir("Data/")

['blue_long_2-9665a76b1ae401a510ec1e0ca40ddcb3b0cfe45f1d51b77a308fea0845885648.svg',
 'title_akas.csv.gz',
 'title_basics.csv.gz',
 'title_ratings.csv.gz']

In [28]:
# save dataframes to files
basics.to_csv("Data/title_basics.csv.gz", compression='gzip', index=False)
ratings.to_csv("Data/title_ratings.csv.gz", compression='gzip', index=False)
akas.to_csv("Data/title_akas.csv.gz", compression='gzip', index=False)

In [29]:
# check basics dataframe was saved correctly
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory=False)
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81087 entries, 0 to 81086
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          81087 non-null  object 
 1   titleType       81087 non-null  object 
 2   primaryTitle    81087 non-null  object 
 3   originalTitle   81087 non-null  object 
 4   isAdult         81087 non-null  int64  
 5   startYear       81087 non-null  int64  
 6   endYear         0 non-null      float64
 7   runtimeMinutes  81087 non-null  int64  
 8   genres          81087 non-null  object 
dtypes: float64(1), int64(3), object(5)
memory usage: 5.6+ MB


In [30]:
# check ratings dataframe was saved correctly
ratings = pd.read_csv("Data/title_ratings.csv.gz", low_memory=False)
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 490706 entries, 0 to 490705
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         490706 non-null  object 
 1   averageRating  490706 non-null  float64
 2   numVotes       490706 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 11.2+ MB


In [31]:
# check akas dataframe was saved correctly
akas = pd.read_csv("Data/title_akas.csv.gz", low_memory=False)
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1418401 entries, 0 to 1418400
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   titleId          1418401 non-null  object 
 1   ordering         1418401 non-null  int64  
 2   title            1418401 non-null  object 
 3   region           1418401 non-null  object 
 4   language         3840 non-null     object 
 5   types            974792 non-null   object 
 6   attributes       46099 non-null    object 
 7   isOriginalTitle  1417056 non-null  float64
dtypes: float64(1), int64(1), object(6)
memory usage: 86.6+ MB


## Extract Data from TMDB

### tmdbsimple Package

In [32]:
!pip install tmdbsimple;
import tmdbsimple as tmdb



In [33]:
# load login credentials using json file in notebook
with open('E:/Dropbox/Coding Dojo/Projects/Project 3/movie_success_predictions/.secret/tmdb_api.json', 'r') as f:
    login = json.load(f)
# display keys of loaded dict
login.keys()

dict_keys(['api-key'])

In [34]:
# save api-key using tmdbsimple
tmdb.API_KEY = login['api-key']

### Extract data from TMDB API

In [35]:
# define list of years to extract
YEARS_TO_GET = [2000, 2001]
# define empty errors list
errors = []

In [37]:
# set up progress bar as start of outer loop
for YEAR in tqdm_notebook(YEARS_TO_GET, desc = 'YEARS', position=0):
    
    # select JSON_FILE name and check if file exists
    JSON_FILE = f'Data/tmdb_api_results_{YEAR}.json'
    # check if JSON_FILE already exists or not
    file_exists = os.path.isfile(JSON_FILE)
    
    # if file does not exist, create it
    if file_exists == False:
        # save empty dict with just 'imdb_id' to the new json file
        with open(JSON_FILE,'w') as f:
            json.dump([{'imdb_id':0}],f)
            
    # save new year as the current df
    df = basics.loc[basics['startYear']==YEAR].copy()
    # save movie ids to list
    movie_ids = df['tconst'].copy()
    
    # load existing data from json to dataframe
    previous_df = pd.read_json(JSON_FILE)
    # filter out any ids that are already in the JSON_FILE
    movie_ids_to_get = movie_ids[~movie_ids.isin(previous_df['imdb_id'])]
    
    # get index and movie id from list
    # INNER LOOP
    for movie_id in tqdm_notebook(movie_ids_to_get,
                                 desc=f'Movies from {YEAR}',
                                 position=1,
                                 leave=True):
        try:
            # retrieve then data for the movie id
            temp = get_movie_with_rating(movie_id)
            # append/extend results to existing file using user defined functions
            write_json(temp, JSON_FILE)
            # short 20ms sleep to prevent overwhelming server
            time.sleep(.02)
            
        except Exception as e:
            # append errors to errors list
            errors.append([movie_id,e])
            
    # convert json file to .csv.gz
    final_year_df = pd.read_json(JSON_FILE)
    final_year_df.to_csv(f'Data/final_tmdb_data_{YEAR}.csv.gz', 
                         compression='gzip',
                        index=False)
    
print(f'- Total errors: {len(errors)}')

YEARS:   0%|          | 0/2 [00:00<?, ?it/s]

Movies from 2000:   0%|          | 0/1434 [00:00<?, ?it/s]

Movies from 2001:   0%|          | 0/1549 [00:00<?, ?it/s]

- Total errors: 456


## Exploratory Data Analysis

### Load Extracted Info as Dataframe

In [49]:
df_2000 = pd.read_csv('Data/final_tmdb_data_2000.csv.gz')
df_2000.head(3)

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0113026,0.0,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,,10000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,62127.0,en,The Fantasticks,...,0.0,86.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Try to remember the first time magic happened,The Fantasticks,0.0,5.5,22.0,
2,tt0113092,0.0,,,0.0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977.0,en,For the Cause,...,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,0.0,5.1,8.0,


In [50]:
df_2001 = pd.read_csv('Data/final_tmdb_data_2001.csv.gz')
df_2001.head(3)

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0035423,0.0,/hfeiSfWYujh6MKhtGTXyK3DD4nN.jpg,,48000000.0,"[{'id': 10749, 'name': 'Romance'}, {'id': 14, ...",,11232.0,en,Kate & Leopold,...,76019048.0,118.0,"[{'english_name': 'French', 'iso_639_1': 'fr',...",Released,"If they lived in the same century, they'd be p...",Kate & Leopold,0.0,6.325,1177.0,PG-13
2,tt0114447,0.0,,,0.0,"[{'id': 53, 'name': 'Thriller'}, {'id': 28, 'n...",,151007.0,en,The Silent Force,...,0.0,90.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,They left him for dead... They should have fin...,The Silent Force,0.0,5.0,3.0,


In [54]:
# concatinate dataframes into a single dataframe
api_df = pd.concat([df_2000, df_2001], ignore_index=True)
api_df.info()
api_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2529 entries, 0 to 2528
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_id                2529 non-null   object 
 1   adult                  2527 non-null   float64
 2   backdrop_path          1352 non-null   object 
 3   belongs_to_collection  203 non-null    object 
 4   budget                 2527 non-null   float64
 5   genres                 2527 non-null   object 
 6   homepage               169 non-null    object 
 7   id                     2527 non-null   float64
 8   original_language      2527 non-null   object 
 9   original_title         2527 non-null   object 
 10  overview               2476 non-null   object 
 11  popularity             2527 non-null   float64
 12  poster_path            2268 non-null   object 
 13  production_companies   2527 non-null   object 
 14  production_countries   2527 non-null   object 
 15  rele

Unnamed: 0,imdb_id,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,original_language,original_title,...,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,certification
0,0,,,,,,,,,,...,,,,,,,,,,
1,tt0113026,0.0,/vMFs7nw6P0bIV1jDsQpxAieAVnH.jpg,,10000000.0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10402, '...",,62127.0,en,The Fantasticks,...,0.0,86.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,Try to remember the first time magic happened,The Fantasticks,0.0,5.5,22.0,
2,tt0113092,0.0,,,0.0,"[{'id': 878, 'name': 'Science Fiction'}]",,110977.0,en,For the Cause,...,0.0,100.0,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,The ultimate showdown on a forbidden planet.,For the Cause,0.0,5.1,8.0,
3,tt0116391,0.0,,,0.0,"[{'id': 18, 'name': 'Drama'}, {'id': 28, 'name...",,442869.0,hi,Gang,...,0.0,152.0,"[{'english_name': 'Hindi', 'iso_639_1': 'hi', ...",Released,,Gang,0.0,4.0,1.0,
4,tt0118694,0.0,/n4GJFGzsc7NinI1VeGDXIcQjtU2.jpg,,150000.0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,843.0,cn,花樣年華,...,12854953.0,99.0,"[{'english_name': 'Cantonese', 'iso_639_1': 'c...",Released,"Feel the heat, keep the feeling burning, let t...",In the Mood for Love,0.0,8.112,2135.0,PG


In [72]:
# save combined dataframe as single .csv.gz file
api_df.to_csv('Data/tmdb_results_combined.csv.gz', compression='gzip', index=False)

### Exploratory Data Analysis

#### How many movies had at least some valid financial information (values > 0 for budget OR revenue)?

In [55]:
# create filter to determine rows with valid financial info
valid_budget = api_df['budget'] > 0 
valid_revenue = api_df['revenue'] > 0

In [59]:
# create new dataframe with only valid financial info
valid_finance_df = api_df[valid_budget | valid_revenue]
print(f'Then number of movies with some valid financial info is {len(valid_finance_df)}.')

Then number of movies with some valid financial info is 632.


#### How many movies are there in each of the certification categories (G/PG/PG-13/R)?

In [60]:
# determine number of movies in each certification category
api_df['certification'].value_countsnts()

R          457
PG-13      183
NR          72
PG          63
G           24
NC-17        6
Unrated      1
-            1
Name: certification, dtype: int64

#### What is the average revenue per certification category?

In [71]:
# determine mean revenue per certificaiton category
avg_revenue = api_df.groupby('certification')['revenue'].mean()
# set option to supress scientific notation
pd.set_option('display.float_format', lambda x: f'{x:,.2f}')
avg_revenue

certification
-                  0.00
G         72,185,327.04
NC-17              0.00
NR         2,263,992.56
PG        62,590,769.00
PG-13     71,057,113.56
R         16,641,994.89
Unrated            0.00
Name: revenue, dtype: float64

#### What is the average budget per certification category?

In [68]:
# determine mean budget per certificaiton category
avg_budget = api_df.groupby('certification')['budget'].mean()
# set option to supress scientific notation
pd.set_option('display.float_format', lambda x: f'{x:,.2f}')
avg_budget

certification
-                  0.00
G         23,833,333.33
NC-17              0.00
NR         1,488,056.86
PG        25,039,638.71
PG-13     30,787,748.23
R          9,894,976.91
Unrated            0.00
Name: budget, dtype: float64