In [1]:
import numpy as np
import pandas as pd

In [16]:
# Load the title-basics table: a gzip-compressed, tab-separated file.
# pandas infers the compression from the ".gz" extension.
# low_memory=False reads each column in one pass instead of in chunks.
fpath = "Data/title.basics.tsv.gz"
basics = pd.read_csv(
    fpath,
    sep="\t",
    low_memory=False,
)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [22]:
# Summarise basics: row count, per-column non-null counts, dtypes and memory use.
# NOTE(review): the recorded output below (non-contiguous index, 1,365,788 rows,
# nulls already present) was produced after the US filter and the \N replacement
# further down, i.e. this cell was executed out of order. On a fresh
# Restart & Run All it will describe the unfiltered table instead.
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1365788 entries, 0 to 10028141
Data columns (total 9 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   tconst          1365788 non-null  object
 1   titleType       1365788 non-null  object
 2   primaryTitle    1365788 non-null  object
 3   originalTitle   1365788 non-null  object
 4   isAdult         1365788 non-null  object
 5   startYear       1267177 non-null  object
 6   endYear         37158 non-null    object
 7   runtimeMinutes  862710 non-null   object
 8   genres          1337311 non-null  object
dtypes: object(9)
memory usage: 104.2+ MB


In [7]:
# Load the akas table. Unlike the other two inputs this is a plain,
# comma-separated CSV (not gzip/TSV), so read_csv's default separator is used.
# The file name indicates it has already been reduced to US rows.
fpath = "Data/title-akas-us-only.csv"
akas = pd.read_csv(
    fpath,
    low_memory=False,
)
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,\N,imdbDisplay,\N,0
1,tt0000002,7,The Clown and His Dogs,US,\N,\N,literal English title,0
2,tt0000005,10,Blacksmith Scene,US,\N,imdbDisplay,\N,0
3,tt0000005,1,Blacksmithing Scene,US,\N,alternative,\N,0
4,tt0000005,6,Blacksmith Scene #1,US,\N,alternative,\N,0


In [21]:
# Summarise akas: row count, per-column non-null counts, dtypes and memory use.
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1452564 entries, 0 to 1452563
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1452564 non-null  object
 1   ordering         1452564 non-null  int64 
 2   title            1452564 non-null  object
 3   region           1452564 non-null  object
 4   language         4018 non-null     object
 5   types            981678 non-null   object
 6   attributes       47016 non-null    object
 7   isOriginalTitle  1451222 non-null  object
dtypes: int64(1), object(7)
memory usage: 88.7+ MB


In [9]:
# Load the title-ratings table: a gzip-compressed, tab-separated file.
# pandas infers the compression from the ".gz" extension.
fpath = "Data/title.ratings.tsv.gz"
ratings = pd.read_csv(
    fpath,
    sep="\t",
    low_memory=False,
)
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1988
1,tt0000002,5.8,265
2,tt0000003,6.5,1849
3,tt0000004,5.5,178
4,tt0000005,6.2,2632


In [20]:
# Summarise ratings: row count, per-column non-null counts, dtypes and memory use.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1332684 entries, 0 to 1332683
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1332684 non-null  object 
 1   averageRating  1332684 non-null  float64
 2   numVotes       1332684 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 30.5+ MB


# Cleaning

## Filter AKAS
* Keep only US movies
* Replace "\N" with np.nan

In [14]:
# Enforce, rather than just eyeball, that akas contains only US rows.
# The assert stops the notebook if a non-US (or missing) region slips in.
# The value_counts() call remains the cell's displayed result.
assert akas["region"].eq("US").all(), "akas contains rows whose region is not 'US'"
akas["region"].value_counts()

US    1452564
Name: region, dtype: int64

In [13]:
# The raw data marks missing values with the literal string \N; turn those
# cells into real NaN so pandas' null handling (isna, dropna, ...) sees them.
# The backslash is doubled ('\\N') so Python treats it as a literal backslash.
# The result is rebound to `akas` instead of using inplace=True, which keeps
# the cell free of in-place mutation and gives the same resulting frame.
akas = akas.replace({'\\N': np.nan})
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


## Title Basics:
* Keep only US movies (Use AKAs table, see "Filtering one dataframe based on another" section below)
* Replace "\N" with np.nan
* Eliminate movies that are null for runtimeMinutes
* Eliminate movies that are null for genres
* Keep only titleType == "movie" (lowercase, as the value appears in the data)
* Convert the startYear column to float data type.
* Filter the dataframe using startYear. Keep years between 2000-2021 (Including 2000 and 2021)
* Eliminate movies that include "Documentary" in the genre (see tip below).

In [17]:
# Keep only the titles that also appear in the US-only akas table.
# `keepers` is a boolean mask: True where the row's tconst matches some akas titleId.
keepers = basics["tconst"].isin(akas["titleId"])
# Take an explicit copy of the filtered rows. Later cleaning steps modify this
# frame, and an explicit copy keeps those edits from raising SettingWithCopyWarning.
basics = basics.loc[keepers].copy()
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,\N,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,\N,1,"Short,Sport"
...,...,...,...,...,...,...,...,...,...
10028047,tt9916560,tvMovie,March of Dimes Presents: Once Upon a Dime,March of Dimes Presents: Once Upon a Dime,0,1963,\N,58,Family
10028076,tt9916620,movie,The Copeland Case,The Copeland Case,0,\N,\N,\N,Drama
10028114,tt9916702,short,Loving London: The Playground,Loving London: The Playground,0,\N,\N,\N,"Drama,Short"
10028137,tt9916756,short,Pretty Pretty Black Girl,Pretty Pretty Black Girl,0,2019,\N,\N,Short


In [18]:
# The raw data marks missing values with the literal string \N; turn those
# cells into real NaN so the null-based filters on runtimeMinutes/genres work.
# The backslash is doubled ('\\N') so Python treats it as a literal backslash.
# `basics` is a filtered subset at this point, so the result is rebound rather
# than edited with inplace=True: in-place edits on a filtered frame can raise
# SettingWithCopyWarning, while rebinding gives the same resulting frame.
basics = basics.replace({'\\N': np.nan})
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,,1,"Short,Sport"


In [19]:
# Count the rows whose runtimeMinutes is missing (NaN); these are the rows the
# cleaning checklist above says to eliminate.
basics["runtimeMinutes"].isna().sum()

503078