In [102]:
import pandas as pd
import numpy as np

In [103]:
#importing gzip file and reading into dataframe
fpath="Data/title.basics.tsv.gz"
basics = pd.read_csv(fpath, sep='\t', low_memory=False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


# Dataset Info

In [104]:
#Viewing number of rows and data types
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10028186 entries, 0 to 10028185
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 688.6+ MB


In [105]:
#importing gzip file and reading into dataframe
fpath="Data/title-akas-us-only.csv"
akas = pd.read_csv(fpath, low_memory=False)
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,\N,imdbDisplay,\N,0
1,tt0000002,7,The Clown and His Dogs,US,\N,\N,literal English title,0
2,tt0000005,10,Blacksmith Scene,US,\N,imdbDisplay,\N,0
3,tt0000005,1,Blacksmithing Scene,US,\N,alternative,\N,0
4,tt0000005,6,Blacksmith Scene #1,US,\N,alternative,\N,0


In [106]:
#Viewing number of rows and data types
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1452564 entries, 0 to 1452563
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1452564 non-null  object
 1   ordering         1452564 non-null  int64 
 2   title            1452564 non-null  object
 3   region           1452564 non-null  object
 4   language         1452564 non-null  object
 5   types            1452564 non-null  object
 6   attributes       1452564 non-null  object
 7   isOriginalTitle  1452564 non-null  object
dtypes: int64(1), object(7)
memory usage: 88.7+ MB


In [107]:
#importing gzip file and reading into dataframe
fpath="Data/title.ratings.tsv.gz"
ratings = pd.read_csv(fpath, sep='\t', low_memory=False)
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1988
1,tt0000002,5.8,265
2,tt0000003,6.5,1849
3,tt0000004,5.5,178
4,tt0000005,6.2,2632


In [108]:
#Viewing number of rows and data types
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1332684 entries, 0 to 1332683
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1332684 non-null  object 
 1   averageRating  1332684 non-null  float64
 2   numVotes       1332684 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 30.5+ MB


# Cleaning

## AKAS
* Keep only US movies
* Replace "\N" with np.nan

In [109]:
#Ensuring our df is only US movies
akas["region"].value_counts()

US    1452564
Name: region, dtype: int64

In [110]:
#Replacing \N values with np.nan. An extra backlash
#is needed to identify backslash.
#Inplace=True to make permanent changes.
akas.replace({'\\N':np.nan}, inplace=True)
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


## Title Basics
* Keep only US movies (Use AKAs table, see "Filtering one dataframe based on another" section below)
* Replace "\N" with np.nan
* Eliminate movies that are null for runtimeMinutes
* Eliminate movies that are null for genre
* Keep only titleType==Movie
* Convert the startYear column to float data type.
* Filter the dataframe using startYear. Keep years between 2000-2021 (Including 2000 and 2021)
* Eliminate movies that include "Documentary" in the genre (see tip below).

In [111]:
#Replacing \N values with np.nan. An extra backlash
#is needed to identify backslash.
#Inplace=True to make permanent changes.
basics.replace({'\\N':np.nan}, inplace=True)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


In [112]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers = basics["tconst"].isin(akas["titleId"])
basics = basics[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,,1,"Short,Sport"
...,...,...,...,...,...,...,...,...,...
10028047,tt9916560,tvMovie,March of Dimes Presents: Once Upon a Dime,March of Dimes Presents: Once Upon a Dime,0,1963,,58,Family
10028076,tt9916620,movie,The Copeland Case,The Copeland Case,0,,,,Drama
10028114,tt9916702,short,Loving London: The Playground,Loving London: The Playground,0,,,,"Drama,Short"
10028137,tt9916756,short,Pretty Pretty Black Girl,Pretty Pretty Black Girl,0,2019,,,Short


In [124]:
#Dropping all rows in "runtimeMinutes" with NaN (null values)
basics = basics.dropna(subset="runtimeMinutes")

In [125]:
#Ensuring we have no null values in "runtimeMinutes" column
basics["runtimeMinutes"].isna().sum()

0

In [126]:
#Dropping all NaN rows from genres column
basics = basics.dropna(subset="genres")

In [127]:
#Ensuring we have dropped all NaN values in genres column
basics["genres"].isna().sum()

0

In [118]:
#Dropping all values in "titleType" except for "movie"
basics.drop(basics[basics["titleType"] != "movie"].index, inplace=True)

In [129]:
#Ensuring our column only has "movie" as its value
basics["titleType"].value_counts()

movie    114442
Name: titleType, dtype: int64

In [130]:
# Convert the startYear column to float data type.
basics["startYear"] = basics["startYear"].astype(dtype=float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  basics["startYear"] = basics["startYear"].astype(dtype=float)


In [131]:
#Ensuring we have converted from object to float
basics["startYear"].info()

<class 'pandas.core.series.Series'>
Int64Index: 114442 entries, 34802 to 10027952
Series name: startYear
Non-Null Count   Dtype  
--------------   -----  
114442 non-null  float64
dtypes: float64(1)
memory usage: 1.7 MB


In [132]:
#Filter the dataframe using startYear. Keep years between 2000-2021 (Including 2000 and 2021)
basics = basics[(basics['startYear'] >= 2000.0) & (basics['startYear'] <= 2021.0)]
basics["startYear"].value_counts()

2019.0    8100
2018.0    7867
2017.0    7813
2016.0    7414
2015.0    7229
2014.0    7170
2020.0    7031
2013.0    6945
2021.0    6929
2012.0    6597
2011.0    6124
2010.0    5580
2009.0    5087
2008.0    4232
2007.0    3610
2006.0    3343
2005.0    2924
2004.0    2534
2003.0    2189
2002.0    2003
2001.0    1932
2000.0    1789
Name: startYear, dtype: int64

In [133]:
# Exclude movies that are included in the documentary category.
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]

## Ratings

In [134]:
# Filter the ratings table down to only include the same filters as the AKAS dataframe
keepers = ratings["tconst"].isin(akas["titleId"])
ratings = ratings[keepers]
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1988
1,tt0000002,5.8,265
4,tt0000005,6.2,2632
5,tt0000006,5.1,182
6,tt0000007,5.4,824
...,...,...,...
1332645,tt9916200,8.1,231
1332646,tt9916204,8.2,264
1332653,tt9916348,8.3,18
1332654,tt9916362,6.4,5427


In [135]:
#Ensuring we do not have any Null values in our Ratings Dataset
ratings.isna().sum()

tconst           0
averageRating    0
numVotes         0
dtype: int64