In [1]:
# Imports
import os

import numpy as np
import pandas as pd

In [2]:
# Remote locations of the three IMDb dataset files used in this notebook.
# Each one is a .tsv.gz file that is read directly from its URL below.
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"    # title metadata
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"  # rating / vote counts
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"        # per-region alternate titles

In [3]:
# Load the three IMDb tables straight from their URLs.
# The files are tab-separated; low_memory=False makes pandas parse each file
# in a single pass rather than in chunks, avoiding mixed-type column inference.
tsv_options = {'sep': '\t', 'low_memory': False}
basics = pd.read_csv(basics_url, **tsv_options)
ratings = pd.read_csv(ratings_url, **tsv_options)
akas = pd.read_csv(akas_url, **tsv_options)

# Filtering/Cleaning

## Replace "\N" (the placeholder for missing values) with np.nan and make the change permanent

In [4]:
# Count the values pandas already treats as missing in each 'basics' column.
# Note: this runs before the '\N' placeholders are replaced, so those
# placeholder strings are not included in the counts shown here.
basics.isnull().sum()

tconst             0
titleType          0
primaryTitle      11
originalTitle     11
isAdult            0
startYear          0
endYear            0
runtimeMinutes     0
genres            12
dtype: int64

In [5]:
# Count the values pandas treats as missing in each 'ratings' column.
ratings.isnull().sum()

tconst           0
averageRating    0
numVotes         0
dtype: int64

In [6]:
# Count the values pandas already treats as missing in each 'akas' column
# ('\N' placeholder strings are not counted yet; they are replaced later).
akas.isnull().sum()

titleId              0
ordering             0
title                5
region             111
language             0
types                0
attributes           0
isOriginalTitle      0
dtype: int64

In [7]:
# Turn every '\N' placeholder string into a real NaN so that isna()/dropna()
# recognise it as missing. The frames are modified in place.
basics.replace(to_replace={'\\N': np.nan}, inplace=True)
akas.replace(to_replace={'\\N': np.nan}, inplace=True)

## Eliminate movies that are null for runtimeMinutes (in 'basics' df)

In [8]:
# Keep only the 'basics' rows that have a runtimeMinutes value.
has_runtime = basics['runtimeMinutes'].notna()
basics = basics[has_runtime]

In [9]:
# Sanity check: the number of missing runtimeMinutes values should now be 0.
basics['runtimeMinutes'].isnull().sum()

0

## Eliminate movies that are null for genres

In [10]:
# Drop rows in 'basics' whose 'genres' value is missing, then re-display the
# per-column null counts to confirm.
# Uses assignment rather than inplace=True so the cell matches the
# runtimeMinutes cleanup and does not mutate a frame shown by an earlier cell.
basics = basics.dropna(subset=['genres'])
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 0
startYear          164498
endYear           2748991
runtimeMinutes          0
genres                  0
dtype: int64

## Keep only titleType == 'movie'

In [11]:
# Show how many rows fall under each distinct 'titleType' value.
basics.loc[:, 'titleType'].value_counts()

tvEpisode       1414994
short            597606
movie            380533
video            179840
tvMovie           91261
tvSeries          89981
tvSpecial         17935
tvMiniSeries      17042
tvShort            8778
videoGame           319
Name: titleType, dtype: int64

In [12]:
# Restrict 'basics' to rows whose titleType is exactly 'movie', then show the
# remaining titleType counts to confirm only that category is left.
is_movie = basics['titleType'].eq('movie')
basics = basics[is_movie]
basics['titleType'].value_counts()

movie    380533
Name: titleType, dtype: int64

## Keep startYear 2000-2022

In [13]:
# How many distinct (non-missing) startYear values are present?
basics['startYear'].nunique(dropna=True)

130

In [14]:
# Keep only titles whose startYear lies in 2000-2022 (inclusive) and show the
# number of titles per year.
# The bounds are named so the window is easy to change, and the comparison is
# done numerically: pd.to_numeric(errors='coerce') converts year strings to
# numbers and turns missing/unparseable entries into NaN, which fail both
# comparisons and are therefore dropped. This works whether startYear is
# stored as strings or as numbers; the column itself is left unchanged.
FIRST_YEAR, LAST_YEAR = 2000, 2022
start_year = pd.to_numeric(basics['startYear'], errors='coerce')
basics = basics[(start_year >= FIRST_YEAR) & (start_year <= LAST_YEAR)]
basics['startYear'].value_counts()

2017    14368
2018    14312
2019    14040
2016    13939
2015    13462
2014    13093
2022    12635
2013    12372
2021    12304
2012    11617
2020    11552
2011    10765
2010    10200
2009     9340
2008     8142
2007     6954
2006     6504
2005     5820
2004     5192
2003     4582
2002     4127
2001     3856
2000     3636
Name: startYear, dtype: int64

## Eliminate movies that include "Documentary" in genre

In [15]:
# Remove every title whose 'genres' string mentions "documentary"
# (case-insensitive), then show the remaining genre combinations.
# na=False makes a missing genres value count as "not a documentary", so the
# mask is always strictly boolean and safe to invert with ~.
is_documentary = basics['genres'].str.contains('documentary', case=False, na=False)
basics = basics[~is_documentary]
basics['genres'].value_counts()

Drama                     35955
Comedy                    13423
Comedy,Drama               6451
Horror                     5780
Drama,Romance              4302
                          ...  
Adult,Crime,Mystery           1
Family,Musical,Sport          1
Horror,Music,Mystery          1
Comedy,History,Mystery        1
Crime,Fantasy,Sci-Fi          1
Name: genres, Length: 969, dtype: int64

## Keep only US movies 

In [16]:
# Restrict 'akas' to the rows whose region is 'US' and display the region
# column to confirm.
is_us_region = akas['region'].eq('US')
akas = akas[is_us_region]
akas['region']

5           US
14          US
33          US
36          US
41          US
            ..
35553568    US
35553638    US
35553727    US
35553770    US
35553786    US
Name: region, Length: 1428693, dtype: object

In [17]:
# Build a boolean mask over 'basics': True where the title's tconst appears
# among the titleIds of the (US-only) 'akas' frame.
us_title_ids = akas['titleId']
keepers = basics['tconst'].isin(us_title_ids)
keepers


34803       True
61116       True
67669       True
77964      False
86801       True
           ...  
9763772     True
9763781     True
9763820    False
9763865     True
9763949    False
Name: tconst, Length: 147215, dtype: bool

In [18]:
# Apply the mask so 'basics' keeps only titles that have a US entry in 'akas'.
basics = basics.loc[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
86801,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
93938,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama
...,...,...,...,...,...,...,...,...,...
9763237,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019,,74,Drama
9763632,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019,,97,"Comedy,Drama,Fantasy"
9763772,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
9763781,tt9916190,movie,Safeguard,Safeguard,0,2020,,95,"Action,Adventure,Thriller"


In [19]:
# Build a boolean mask over 'ratings': True where the rating's tconst appears
# among the titleIds of the (US-only) 'akas' frame.
# Note: 'akas' was filtered by region only, so this keeps ratings for every
# title with a US entry, not just the movies retained in 'basics'.
rating_ids = ratings['tconst']
keepers2 = rating_ids.isin(akas['titleId'])
keepers2

0           True
1           True
2          False
3          False
4           True
           ...  
1299872    False
1299873    False
1299874    False
1299875    False
1299876    False
Name: tconst, Length: 1299877, dtype: bool

In [20]:
# Apply the mask so 'ratings' keeps only titles that have a US entry in 'akas'.
ratings = ratings.loc[keepers2]
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1965
1,tt0000002,5.8,262
4,tt0000005,6.2,2604
5,tt0000006,5.1,179
6,tt0000007,5.4,816
...,...,...,...
1299841,tt9916200,8.1,227
1299842,tt9916204,8.2,260
1299849,tt9916348,8.3,18
1299850,tt9916362,6.4,5277


## Save each dataframe to 'Data' Folder

In [22]:
# Save the filtered dataframes as gzip-compressed CSV files in the Data folder.
# Create the folder first: to_csv does not create missing directories, so on a
# fresh checkout the writes would otherwise fail. exist_ok=True makes the cell
# safe to re-run.
os.makedirs("Data", exist_ok=True)
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)
ratings.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)
akas.to_csv("Data/title_akas.csv.gz",compression='gzip',index=False)

In [23]:
# Read the saved 'basics' file back in and preview the first rows to confirm
# the export worked.
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory=False)
basics.head(n=5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
3,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
4,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,,126,Drama


In [24]:
# Read the saved 'ratings' file back in and preview the first rows to confirm
# the export worked.
ratings = pd.read_csv("Data/title_ratings.csv.gz", low_memory=False)
ratings.head(n=5)

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1965
1,tt0000002,5.8,262
2,tt0000005,6.2,2604
3,tt0000006,5.1,179
4,tt0000007,5.4,816


In [25]:
# Read the saved 'akas' file back in and preview the first rows to confirm
# the export worked.
akas = pd.read_csv("Data/title_akas.csv.gz", low_memory=False)
akas.head(n=5)

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,,imdbDisplay,,0.0
1,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0.0
2,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0.0
3,tt0000005,1,Blacksmithing Scene,US,,alternative,,0.0
4,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0.0
