In [1]:
# Imports
import os

import numpy as np
import pandas as pd

In [2]:
# Download locations of the three IMDb dataset files (gzipped TSV)
basics_url, ratings_url, akas_url = (
    'https://datasets.imdbws.com/title.basics.tsv.gz',
    'https://datasets.imdbws.com/title.ratings.tsv.gz',
    'https://datasets.imdbws.com/title.akas.tsv.gz',
)

In [3]:
# Load Data: read each tab-separated IMDb file into its own dataframe,
# in the order basics -> ratings -> akas.
basics, ratings, akas = (
    pd.read_csv(source_url, sep='\t', low_memory=False)
    for source_url in (basics_url, ratings_url, akas_url)
)

# Filtering/Cleaning

## Replace "\N" (missing values) with np.nan and make them permanent

In [4]:
# Count the NaN entries in each column of 'basics'.
# NOTE: the "\N" placeholders are only converted to NaN in a later cell,
# so they are not included in this count.
basics.isnull().sum()

tconst             0
titleType          0
primaryTitle      11
originalTitle     11
isAdult            0
startYear          0
endYear            0
runtimeMinutes     0
genres            11
dtype: int64

In [5]:
# Count the NaN entries in each column of 'ratings'
ratings.isnull().sum()

tconst           0
averageRating    0
numVotes         0
dtype: int64

In [6]:
# Count the NaN entries in each column of 'akas'
akas.isnull().sum()

titleId              0
ordering             0
title                5
region             111
language             0
types                0
attributes           0
isOriginalTitle      0
dtype: int64

In [7]:
# Turn the "\N" placeholder strings into real NaN values in every dataframe.
# Results are reassigned instead of using inplace=True, so no frame is mutated
# in place; 'ratings' is included so all three frames are handled the same way.
basics = basics.replace({'\\N': np.nan})
ratings = ratings.replace({'\\N': np.nan})
akas = akas.replace({'\\N': np.nan})

## Eliminate movies that are null for runtimeMinutes (in 'basics' df)

In [8]:
# Remove the 'basics' rows that have no value in the 'runtimeMinutes' column
required_columns = ['runtimeMinutes']
basics = basics.dropna(subset=required_columns)

In [9]:
# Confirm that no null 'runtimeMinutes' values remain
basics['runtimeMinutes'].isnull().sum()

0

## Eliminate movies that are null for genres (in 'basics' df)

In [10]:
# Remove the 'basics' rows that have no value in the 'genres' column,
# then re-check the null count of every column
basics = basics.dropna(subset=['genres'])
basics.isnull().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 0
startYear          161586
endYear           2732651
runtimeMinutes          0
genres                  0
dtype: int64

## Keep only titleType == 'movie'

In [11]:
# Show how many rows fall under each 'titleType' value
title_types = basics['titleType']
title_types.value_counts()

tvEpisode       1400784
short            596339
movie            379928
video            179640
tvMovie           91184
tvSeries          89916
tvSpecial         17891
tvMiniSeries      17008
tvShort            8727
videoGame           318
Name: titleType, dtype: int64

In [12]:
# Keep only the rows whose titleType is 'movie', then confirm it is the only category left
is_movie = basics['titleType'].eq('movie')
basics = basics[is_movie]
basics['titleType'].value_counts()

movie    379928
Name: titleType, dtype: int64

## Keep startYear 2000-2022

In [13]:
# Count the distinct values present in the 'startYear' column
start_years = basics['startYear']
start_years.nunique()

129

In [14]:
# Keep only titles with a startYear from 2000 through 2022 (inclusive),
# then show how many titles remain for each year.
# The range test runs on a numeric view of the column rather than a hand-typed list
# of year strings, so it works whether 'startYear' holds text or numbers;
# values that cannot be parsed become NaN and are excluded.
FIRST_YEAR, LAST_YEAR = 2000, 2022
start_year_num = pd.to_numeric(basics['startYear'], errors='coerce')
basics = basics[start_year_num.between(FIRST_YEAR, LAST_YEAR)]
basics['startYear'].value_counts()

2017    14361
2018    14305
2019    14037
2016    13938
2015    13455
2014    13081
2022    12572
2013    12367
2021    12298
2012    11613
2020    11541
2011    10764
2010    10199
2009     9336
2008     8140
2007     6951
2006     6501
2005     5814
2004     5188
2003     4577
2002     4126
2001     3855
2000     3633
Name: startYear, dtype: int64

## Eliminate movies that include "Documentary" in genre

In [15]:
# Drop every title whose 'genres' text contains "documentary" (case-insensitive),
# then show the counts of the remaining genre combinations
not_documentary = ~basics['genres'].str.contains('documentary', case=False)
basics = basics[not_documentary]
basics['genres'].value_counts()

Drama                         35933
Comedy                        13423
Comedy,Drama                   6448
Horror                         5780
Drama,Romance                  4301
                              ...  
Action,Animation,Game-Show        1
Family,Musical,Sport              1
Horror,Music,Mystery              1
Comedy,History,Mystery            1
Crime,Fantasy,Sci-Fi              1
Name: genres, Length: 969, dtype: int64

## Keep only US movies 

In [16]:
# Flag the 'basics' titles that have at least one US-region entry in 'akas'.
# The region test is applied here explicitly: comparing against every titleId in
# 'akas' would match titles from all regions and therefore keep non-US movies.
us_title_ids = akas.loc[akas['region'] == 'US', 'titleId']
keepers = basics['tconst'].isin(us_title_ids)
keepers


34803      True
61116      True
67669      True
77964      True
86801      True
           ... 
9730755    True
9730764    True
9730803    True
9730848    True
9730932    True
Name: tconst, Length: 147118, dtype: bool

In [17]:
# Apply the 'keepers' boolean mask to 'basics' and display the result
basics = basics.loc[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
77964,tt0079644,movie,November 1828,November 1828,0,2001,,140,"Drama,War"
86801,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
...,...,...,...,...,...,...,...,...,...
9730755,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
9730764,tt9916190,movie,Safeguard,Safeguard,0,2020,,95,"Action,Adventure,Thriller"
9730803,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0,2020,,84,Thriller
9730848,tt9916362,movie,Coven,Akelarre,0,2020,,92,"Drama,History"


In [18]:
# Flag the 'ratings' rows whose title has at least one US-region entry in 'akas'.
# The region test is applied here explicitly: comparing against every titleId in
# 'akas' would match titles from all regions and therefore keep non-US ratings.
us_rated_ids = akas.loc[akas['region'] == 'US', 'titleId']
keepers2 = ratings['tconst'].isin(us_rated_ids)
keepers2

0           True
1           True
2           True
3           True
4           True
           ...  
1295014     True
1295015     True
1295016    False
1295017    False
1295018    False
Name: tconst, Length: 1295019, dtype: bool

In [19]:
# Apply the 'keepers2' boolean mask to 'ratings' and display the result
ratings = ratings.loc[keepers2]
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1967
1,tt0000002,5.8,263
2,tt0000003,6.5,1803
3,tt0000004,5.6,179
4,tt0000005,6.2,2603
...,...,...,...
1294998,tt9916460,9.4,18
1295001,tt9916538,8.6,7
1295002,tt9916544,6.9,62
1295014,tt9916730,8.3,10


In [20]:
# Reduce 'akas' to the rows whose region is 'US', then show the remaining 'region' column
is_us_region = akas['region'].eq('US')
akas = akas[is_us_region]
akas['region']

5           US
14          US
33          US
36          US
41          US
            ..
35413575    US
35413645    US
35413734    US
35413777    US
35413793    US
Name: region, Length: 1425682, dtype: object

## Save each dataframe to 'Data' Folder

In [21]:
# Save each filtered dataframe as a gzip-compressed CSV inside the 'Data' folder.
# The folder is created first because to_csv does not create missing directories;
# exist_ok=True makes this a no-op when the folder is already there.
os.makedirs("Data", exist_ok=True)
basics.to_csv("Data/title_basics.csv.gz", compression='gzip', index=False)
ratings.to_csv("Data/title_ratings.csv.gz", compression='gzip', index=False)
akas.to_csv("Data/title_akas.csv.gz", compression='gzip', index=False)

In [22]:
# Read the saved 'basics' file back in and preview its first five rows
saved_basics_path = "Data/title_basics.csv.gz"
basics = pd.read_csv(saved_basics_path, low_memory=False)
basics.head(5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0079644,movie,November 1828,November 1828,0,2001,,140,"Drama,War"
4,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"


In [23]:
# Read the saved 'ratings' file back in and preview its first five rows
saved_ratings_path = "Data/title_ratings.csv.gz"
ratings = pd.read_csv(saved_ratings_path, low_memory=False)
ratings.head(5)

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1967
1,tt0000002,5.8,263
2,tt0000003,6.5,1803
3,tt0000004,5.6,179
4,tt0000005,6.2,2603


In [24]:
# Read the saved 'akas' file back in and preview its first five rows
saved_akas_path = "Data/title_akas.csv.gz"
akas = pd.read_csv(saved_akas_path, low_memory=False)
akas.head(5)

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0
