In [1]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [2]:
# Make sure the data folder exists before anything is read from or written to it.
# `os` is already imported in the imports cell, so it is not imported again here.
# exist_ok=True keeps this cell safe to re-run when the folder is already there.
os.makedirs('Data/', exist_ok=True)

In [3]:
# List the entries currently present in the data folder
data_folder_entries = os.listdir('Data/')
data_folder_entries

['title.basics.tsv.gz',
 'title.ratings.tsv.gz',
 'title-akas-us-only.csv',
 '.ipynb_checkpoints']

In [4]:
# Load the title basics table from its tab-separated .tsv.gz file
basics = pd.read_csv(
    'Data/title.basics.tsv.gz',
    sep='\t',
    low_memory=False,
)
# Preview the first rows
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [6]:
# Load the title ratings table (tab-separated).
# The path points at the gzip-compressed file, which is the only ratings file
# shown in the Data/ folder listing; read_csv infers the compression from the
# '.gz' extension, the same way the basics file is loaded.
ratings = pd.read_csv('Data/title.ratings.tsv.gz', sep='\t', low_memory=False)
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1988
1,tt0000002,5.8,265
2,tt0000003,6.5,1849
3,tt0000004,5.5,178
4,tt0000005,6.2,2632


In [7]:
# Load the US-only title akas table from its comma-separated CSV file
akas = pd.read_csv(
    'Data/title-akas-us-only.csv',
    low_memory=False,
)
# Preview the first rows
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,\N,imdbDisplay,\N,0
1,tt0000002,7,The Clown and His Dogs,US,\N,\N,literal English title,0
2,tt0000005,10,Blacksmith Scene,US,\N,imdbDisplay,\N,0
3,tt0000005,1,Blacksmithing Scene,US,\N,alternative,\N,0
4,tt0000005,6,Blacksmith Scene #1,US,\N,alternative,\N,0


In [8]:
# Keep only the rows of basics whose tconst also appears as a titleId in akas
us_title_ids = akas['titleId']
filter_us_titles = basics['tconst'].isin(us_title_ids)
basics = basics.loc[filter_us_titles]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,\N,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,\N,1,"Short,Sport"
...,...,...,...,...,...,...,...,...,...
8390189,tt6354554,short,The Box,The Box,0,2018,\N,11,"Drama,Short"
8390190,tt6354556,short,Too Dumb to Live,Too Dumb to Live,0,2016,\N,\N,"Comedy,Short"
8390196,tt6354568,movie,Camp Wedding,Camp Wedding,0,2019,\N,97,"Comedy,Horror,Mystery"
8390199,tt6354574,tvEpisode,Santa Claws,Santa Claws,0,2016,\N,21,"Action,Adventure,Animation"


In [10]:
# Turn the literal '\N' placeholder strings in basics into real missing values (NaN)
basics = basics.replace(to_replace={'\\N': np.nan})

In [11]:
# Turn the literal '\N' placeholder strings in akas into real missing values (NaN)
akas = akas.replace(to_replace={'\\N': np.nan})

In [12]:
# Turn the literal '\N' placeholder strings in ratings into real missing values (NaN)
ratings = ratings.replace(to_replace={'\\N': np.nan})

In [13]:
# Count the missing values in each column of basics
(
    basics
    .isna()
    .sum()
)

tconst                  0
titleType               0
primaryTitle            0
originalTitle           0
isAdult                 0
startYear           79478
endYear           1154785
runtimeMinutes     428083
genres              25784
dtype: int64

In [14]:
# Cast the startYear column to float; every other column keeps its dtype
basics = basics.astype({'startYear': float})

In [15]:
# Restrict basics to rows whose startYear lies in 2000-2022 (both ends included)
in_year_range = basics['startYear'].between(2000, 2022)
basics = basics[in_year_range]
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
33802,tt0034413,short,Youth Gets a Break,Youth Gets a Break,0,2001.0,,20,Short
34802,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
39544,tt0040241,short,Color Rhapsodie,Color Rhapsodie,0,2021.0,,6,Short
43546,tt0044326,short,Abstronic,Abstronic,0,2021.0,,6,Short
49493,tt0050396,short,Final Curtain,Final Curtain,0,2012.0,,20,"Horror,Short"


In [16]:
# Keep only the rows whose titleType is exactly 'movie', then summarise the frame
is_movie = basics['titleType'].eq('movie')
basics = basics[is_movie]
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 123658 entries, 34802 to 8390196
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          123658 non-null  object 
 1   titleType       123658 non-null  object 
 2   primaryTitle    123658 non-null  object 
 3   originalTitle   123658 non-null  object 
 4   isAdult         123658 non-null  object 
 5   startYear       123658 non-null  float64
 6   endYear         0 non-null       object 
 7   runtimeMinutes  105364 non-null  object 
 8   genres          120937 non-null  object 
dtypes: float64(1), object(8)
memory usage: 9.4+ MB


In [17]:
# Drop every row that is missing runtimeMinutes or genres,
# then re-count the missing values left in each column
required_columns = ['runtimeMinutes', 'genres']
basics = basics.dropna(subset=required_columns)
basics.isna().sum()

tconst                 0
titleType              0
primaryTitle           0
originalTitle          0
isAdult                0
startYear              0
endYear           103986
runtimeMinutes         0
genres                 0
dtype: int64

In [18]:
# Boolean mask: True where the genres string contains 'documentary' (any letter case).
# This only builds the mask; no rows are removed from basics here.
filter_documentaries = basics['genres'].str.contains(
    'documentary',
    case=False,
)

In [19]:
# Keep only the rows that the documentary mask does NOT flag
basics = basics.loc[~filter_documentaries]

In [20]:
# Keep only the ratings rows whose tconst is still present in basics
kept_title_ids = basics['tconst']
filter_basics = ratings['tconst'].isin(kept_title_ids)
ratings = ratings.loc[filter_basics]
ratings

Unnamed: 0,tconst,averageRating,numVotes
17961,tt0035423,6.4,87153
40764,tt0062336,6.4,175
46645,tt0069049,6.7,7754
63640,tt0088751,5.2,336
69953,tt0096056,5.6,846
...,...,...,...
1181975,tt6353976,5.0,197
1181982,tt6354056,3.6,285
1181987,tt6354108,6.7,488
1181996,tt6354270,5.9,22


In [21]:
# Summarise basics after filtering: index range, per-column non-null counts,
# dtypes and memory usage
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 74616 entries, 34802 to 8390196
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          74616 non-null  object 
 1   titleType       74616 non-null  object 
 2   primaryTitle    74616 non-null  object 
 3   originalTitle   74616 non-null  object 
 4   isAdult         74616 non-null  object 
 5   startYear       74616 non-null  float64
 6   endYear         0 non-null      object 
 7   runtimeMinutes  74616 non-null  object 
 8   genres          74616 non-null  object 
dtypes: float64(1), object(8)
memory usage: 5.7+ MB


In [22]:
# Summarise akas: index range, per-column non-null counts, dtypes and memory usage
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1266554 entries, 0 to 1266553
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1266554 non-null  object
 1   ordering         1266554 non-null  int64 
 2   title            1266554 non-null  object
 3   region           1266554 non-null  object
 4   language         3517 non-null     object
 5   types            848594 non-null   object
 6   attributes       43787 non-null    object
 7   isOriginalTitle  1265289 non-null  object
dtypes: int64(1), object(7)
memory usage: 77.3+ MB


In [23]:
# Write the filtered basics table to CSV, leaving the DataFrame index out of the file
basics.to_csv(
    "Data/title-basics.csv",
    index=False,
)

In [24]:
# Write the filtered ratings table to CSV, leaving the DataFrame index out of the file
ratings.to_csv(
    "Data/title-ratings.csv",
    index=False,
)