# Movie Success Predictions

In [3]:
# Import Libraries 
import pandas as pd
import numpy as np
import os, json

## Preprocessing Data 

In [2]:
# Locations of the three IMDb dataset files used in this notebook,
# all served from the same root
imdb_datasets_root = 'https://datasets.imdbws.com'
basics_url = imdb_datasets_root + '/title.basics.tsv.gz'
ratings_url = imdb_datasets_root + '/title.ratings.tsv.gz'
akas_url = imdb_datasets_root + '/title.akas.tsv.gz'

In [3]:
# Read each tab-separated dataset into its own dataframe, using the same
# parsing options for all three
tsv_options = {'sep': '\t', 'low_memory': False}
basics = pd.read_csv(basics_url, **tsv_options)
ratings = pd.read_csv(ratings_url, **tsv_options)
akas = pd.read_csv(akas_url, **tsv_options)

In [4]:
# Swap the literal string \N for NaN in every dataframe (edited in place)
basics.replace('\\N', np.nan, inplace=True)
ratings.replace('\\N', np.nan, inplace=True)
akas.replace('\\N', np.nan, inplace=True)

In [5]:
# Count fully duplicated rows in each dataframe.
# Plain string literals are used: the labels contain no placeholders, so an
# f-string prefix would do nothing.
print('Duplicate rows in basics dataframe:', basics.duplicated().sum())
print('Duplicate rows in ratings dataframe:', ratings.duplicated().sum())
print('Duplicate rows in akas dataframe:', akas.duplicated().sum())

Duplicate rows in basics dataframe: 0
Duplicate rows in ratings dataframe: 0
Duplicate rows in akas dataframe: 0


### Cleaning AKAs Dataframe

In [6]:
# Display info for the AKAs dataframe: its size, column names/dtypes
# and memory usage
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34984864 entries, 0 to 34984863
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 2.1+ GB


In [7]:
# Count how many rows each value of the region column has
akas['region'].value_counts()

DE    4181766
FR    4178938
JP    4178526
IN    4117705
ES    4100704
       ...   
GS          1
TV          1
PW          1
NR          1
NU          1
Name: region, Length: 248, dtype: int64

In [8]:
# Restrict akas to the rows whose region is 'US'
akas = akas[akas['region'].eq('US')]
# Confirm that 'US' is the only region left
akas['region'].value_counts()

US    1415820
Name: region, dtype: int64

In [24]:
# Check info of the akas dataframe (row count, per-column dtypes and
# memory usage)
akas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1415820 entries, 5 to 34984608
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1415820 non-null  object
 1   ordering         1415820 non-null  int64 
 2   title            1415820 non-null  object
 3   region           1415820 non-null  object
 4   language         3829 non-null     object
 5   types            973958 non-null   object
 6   attributes       46001 non-null    object
 7   isOriginalTitle  1414475 non-null  object
dtypes: int64(1), object(7)
memory usage: 97.2+ MB


### Cleaning Title Basics Dataframe

In [9]:
# Display info for the title basics dataframe: its size, column
# names/dtypes and memory usage
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9631839 entries, 0 to 9631838
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 661.4+ MB


In [10]:
# Eliminate rows that are null for runtimeMinutes, printing the NaN count
# before and after so the effect of the drop is visible.
# The result is reassigned (rather than dropped in place) and the labels are
# plain strings, since they contain no placeholders.
print('NaN count in runtimeMinutes column:', basics['runtimeMinutes'].isna().sum())
basics = basics.dropna(subset=['runtimeMinutes'])
print('NaN count after eliminating null rows:', basics['runtimeMinutes'].isna().sum())

NaN count in runtimeMinutes column: 6809952
NaN count after eliminating null rows: 0


In [11]:
# Eliminate rows that are null for genres, printing the NaN count before
# and after so the effect of the drop is visible.
# The result is reassigned (rather than dropped in place) and the labels are
# plain strings, since they contain no placeholders.
print('NaN count in genre column:', basics['genres'].isna().sum())
basics = basics.dropna(subset=['genres'])
print('NaN count after eliminating null rows:', basics['genres'].isna().sum())

NaN count in genre column: 75581
NaN count after eliminating null rows: 0


In [12]:
# Count how many rows each value of the titleType column has
basics['titleType'].value_counts()

tvEpisode       1373823
short            592201
movie            377538
video            178952
tvMovie           90899
tvSeries          89328
tvSpecial         17716
tvMiniSeries      16862
tvShort            8670
videoGame           317
Name: titleType, dtype: int64

In [13]:
# Keep only rows in which titleType == movie.
# .copy() makes the result an independent dataframe, so the later in-place
# edits to `basics` (dropping rows, converting startYear) do not act on a
# slice of the original frame and trigger pandas' SettingWithCopyWarning.
basics = basics.loc[basics['titleType'] == 'movie'].copy()
# Confirm that 'movie' is the only titleType left
basics['titleType'].value_counts()

movie    377538
Name: titleType, dtype: int64

In [14]:
# Remove movies that are null for the startYear.
# We only want to keep movies with a startYear between 2000 and 2021, and a
# null startYear can never fall in that range.
# The result is reassigned instead of dropping in place: `basics` is itself
# the result of an earlier filter, and in-place edits on a filtered frame
# can raise pandas' SettingWithCopyWarning.
basics = basics.dropna(subset=['startYear'])
# Confirm no null startYear values remain
basics['startYear'].isna().sum()

0

In [15]:
# Convert the startYear column to type integer.
# .assign builds a new dataframe with the converted column instead of
# writing into `basics`, which is the result of earlier filtering; assigning
# a column on such a frame can raise pandas' SettingWithCopyWarning.
basics = basics.assign(startYear=basics['startYear'].astype(int))
# Show the resulting column dtypes
basics.dtypes

tconst            object
titleType         object
primaryTitle      object
originalTitle     object
isAdult           object
startYear          int32
endYear           object
runtimeMinutes    object
genres            object
dtype: object

In [16]:
# Keep rows with startYear between 2000 and 2021 (both ends inclusive)
basics = basics.loc[basics['startYear'].between(2000, 2021)]

In [17]:
# Check that the startYear column only kept data between 2000 - 2021:
# the min and max of the summary should fall inside that range
basics['startYear'].describe()

count    209447.000000
mean       2012.835810
std           5.610638
min        2000.000000
25%        2009.000000
50%        2014.000000
75%        2017.000000
max        2021.000000
Name: startYear, dtype: float64

In [18]:
# Flag every row whose genres text mentions 'documentary' (any letter case),
# then keep only the rows that are not flagged
is_documentary = basics['genres'].str.contains('documentary', case=False)
basics = basics.loc[~is_documentary]

In [21]:
# Keep only the basics rows whose tconst appears as a titleId in the akas
# dataframe (which was narrowed to the US region earlier)
keepers = basics['tconst'].isin(akas['titleId'])
basics = basics.loc[keepers]

In [25]:
# Check info of the basics dataframe (row count, per-column non-null
# counts and dtypes)
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 80989 entries, 34803 to 9631605
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tconst          80989 non-null  object
 1   titleType       80989 non-null  object
 2   primaryTitle    80989 non-null  object
 3   originalTitle   80989 non-null  object
 4   isAdult         80989 non-null  object
 5   startYear       80989 non-null  int32 
 6   endYear         0 non-null      object
 7   runtimeMinutes  80989 non-null  object
 8   genres          80989 non-null  object
dtypes: int32(1), object(8)
memory usage: 5.9+ MB


### Cleaning Title Ratings Dataframe

In [22]:
# Check info for the ratings dataframe (row count, per-column non-null
# counts and dtypes)
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1281765 entries, 0 to 1281764
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1281765 non-null  object 
 1   averageRating  1281765 non-null  float64
 2   numVotes       1281765 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 29.3+ MB


In [23]:
# Keep only the ratings rows whose tconst appears as a titleId in the akas
# dataframe (which was narrowed to the US region earlier)
keepers = ratings['tconst'].isin(akas['titleId'])
ratings = ratings.loc[keepers]

In [26]:
# Check info of the ratings dataframe (row count, per-column non-null
# counts and dtypes)
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 489683 entries, 0 to 1281743
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         489683 non-null  object 
 1   averageRating  489683 non-null  float64
 2   numVotes       489683 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 14.9+ MB


### Saving Dataframes in the GitHub Repository

In [30]:
# Create the 'Data/' folder in the GitHub repository; exist_ok=True means
# an already existing folder is not an error
data_folder = 'Data/'
os.makedirs(data_folder, exist_ok=True)
# List the folder's contents to confirm it was created
os.listdir(data_folder)

[]

In [35]:
# Write each dataframe to a gzip-compressed CSV in Data/, leaving out the
# index column
gzip_csv_options = {'compression': 'gzip', 'index': False}
basics.to_csv("Data/title_basics.csv.gz", **gzip_csv_options)
ratings.to_csv("Data/title_ratings.csv.gz", **gzip_csv_options)
akas.to_csv("Data/title_akas.csv.gz", **gzip_csv_options)

In [34]:
# Check the basics dataframe was saved correctly by reading the file back.
# Note: this rebinds `basics` to the copy read from disk, so from here on
# the column dtypes are whatever read_csv infers from the CSV.
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory=False)
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80989 entries, 0 to 80988
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          80989 non-null  object 
 1   titleType       80989 non-null  object 
 2   primaryTitle    80989 non-null  object 
 3   originalTitle   80989 non-null  object 
 4   isAdult         80989 non-null  int64  
 5   startYear       80989 non-null  int64  
 6   endYear         0 non-null      float64
 7   runtimeMinutes  80989 non-null  int64  
 8   genres          80989 non-null  object 
dtypes: float64(1), int64(3), object(5)
memory usage: 5.6+ MB


In [37]:
# Check the ratings dataframe was saved correctly by reading the file back.
# Note: this rebinds `ratings` to the copy read from disk.
ratings = pd.read_csv("Data/title_ratings.csv.gz", low_memory=False)
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 489683 entries, 0 to 489682
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         489683 non-null  object 
 1   averageRating  489683 non-null  float64
 2   numVotes       489683 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 11.2+ MB


In [38]:
# Check the akas dataframe was saved correctly by reading the file back.
# Note: this rebinds `akas` to the copy read from disk, so from here on
# the column dtypes are whatever read_csv infers from the CSV.
akas = pd.read_csv("Data/title_akas.csv.gz", low_memory=False)
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1415820 entries, 0 to 1415819
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   titleId          1415820 non-null  object 
 1   ordering         1415820 non-null  int64  
 2   title            1415820 non-null  object 
 3   region           1415820 non-null  object 
 4   language         3829 non-null     object 
 5   types            973958 non-null   object 
 6   attributes       46001 non-null    object 
 7   isOriginalTitle  1414475 non-null  float64
dtypes: float64(1), int64(1), object(6)
memory usage: 86.4+ MB


## Extract Data from TMDB

### tmdbsimple Package

In [2]:
!pip install tmdbsimple;
import tmdbsimple as tmdb



In [None]:
# load login credentials using json file in notebook
with open('C:')