In [1]:
import pandas as pd
import numpy as np

# Download locations of the three gzip-compressed TSV files used below.
basics_url, akas_url, ratings_url = (
    "https://datasets.imdbws.com/title.basics.tsv.gz",
    "https://datasets.imdbws.com/title.akas.tsv.gz",
    "https://datasets.imdbws.com/title.ratings.tsv.gz",
)



## Loading the data from the URLs into DataFrames

In [2]:
# All three files are tab-separated. low_memory=False asks pandas to parse
# each file in one pass instead of in internal chunks.
tsv_options = {'sep': '\t', 'low_memory': False}
basics = pd.read_csv(basics_url, **tsv_options)
akas = pd.read_csv(akas_url, **tsv_options)
ratings = pd.read_csv(ratings_url, **tsv_options)

## Preprocessing

### Title Basics

#### Replacing "\N" with np.nan

In [3]:
# The literal two-character string \N is mapped to NaN across the whole frame.
missing_marker = {'\\N': np.nan}
basics = basics.replace(missing_marker)

#### Eliminating titles with a null runtimeMinutes

In [4]:
# Tally missing (True) versus present (False) values in runtimeMinutes
basics['runtimeMinutes'].isna().value_counts()

True     6920490
False    2890911
Name: runtimeMinutes, dtype: int64

In [5]:
# Keep only the rows that have a runtimeMinutes value.
# subset is given as a list: older pandas releases only accept a list-like
# here, whereas the list form works on every version.
basics = basics.dropna(subset=['runtimeMinutes'])

In [6]:
# Verify that no missing runtimeMinutes values remain (only False expected)
basics['runtimeMinutes'].isna().value_counts()


False    2890911
Name: runtimeMinutes, dtype: int64

#### Eliminating titles with null genres

In [7]:
# Tally missing (True) versus present (False) values in the genres column
basics['genres'].isna().value_counts()

False    2814331
True       76580
Name: genres, dtype: int64

In [8]:
# Keep only the rows that have a genres value.
# subset is given as a list: older pandas releases only accept a list-like
# here, whereas the list form works on every version.
basics = basics.dropna(subset=['genres'])

In [9]:
# Verify that no missing genres values remain (only False expected)
basics['genres'].isna().value_counts()

False    2814331
Name: genres, dtype: int64

#### Keeping only titleType == 'movie'

In [10]:
# Frequency of each titleType value, most common first
basics.loc[:, 'titleType'].value_counts()

tvEpisode       1427044
short            599511
movie            381585
video            180194
tvMovie           91444
tvSeries          90243
tvSpecial         18062
tvMiniSeries      17134
tvShort            8792
videoGame           322
Name: titleType, dtype: int64

In [11]:
# Select the rows whose titleType is exactly 'movie'
is_movie = basics['titleType'].eq('movie')
titleType_movie = basics.loc[is_movie]

#### Keeping only movies with startYear from 2000-2022

In [12]:
# Inspect the dtype of each column before filtering on startYear
titleType_movie.dtypes

tconst            object
titleType         object
primaryTitle      object
originalTitle     object
isAdult           object
startYear         object
endYear           object
runtimeMinutes    object
genres            object
dtype: object

In [13]:
# Per-column count of missing values; startYear is the column of interest
titleType_movie.isna().sum()

tconst                 0
titleType              0
primaryTitle           0
originalTitle          0
isAdult                0
startYear           6422
endYear           381585
runtimeMinutes         0
genres                 0
dtype: int64

In [14]:
# Keep only the rows that have a startYear value.
# subset is given as a list: older pandas releases only accept a list-like
# here, whereas the list form works on every version.
titleType_movie = titleType_movie.dropna(subset=['startYear'])

In [15]:
# Re-check the per-column missing-value counts after dropping null startYear rows
titleType_movie.isna().sum()

tconst                 0
titleType              0
primaryTitle           0
originalTitle          0
isAdult                0
startYear              0
endYear           375163
runtimeMinutes         0
genres                 0
dtype: int64

In [16]:
# Convert startYear to float so it can be compared numerically.
# .assign returns a new DataFrame instead of writing a column into a frame
# that was derived by filtering `basics`, which is the usual trigger for
# pandas' SettingWithCopyWarning.
titleType_movie = titleType_movie.assign(
    startYear=titleType_movie['startYear'].astype(float)
)

In [17]:
# Confirm that startYear now has a float dtype after the conversion
titleType_movie.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 375163 entries, 8 to 9811351
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          375163 non-null  object 
 1   titleType       375163 non-null  object 
 2   primaryTitle    375163 non-null  object 
 3   originalTitle   375163 non-null  object 
 4   isAdult         375163 non-null  object 
 5   startYear       375163 non-null  float64
 6   endYear         0 non-null       object 
 7   runtimeMinutes  375163 non-null  object 
 8   genres          375163 non-null  object 
dtypes: float64(1), object(8)
memory usage: 28.6+ MB


In [18]:
# Retain rows whose startYear lies in the closed interval [2000, 2022]
in_year_range = titleType_movie['startYear'].between(2000, 2022)
titleType_movie_2000_2022 = titleType_movie.loc[in_year_range]

#### Eliminating movies that include "Documentary" in genre

In [19]:
# Flag every row whose genres text contains "documentary" (any letter case),
# then keep the rows that are not flagged.
genre_text = titleType_movie_2000_2022['genres']
is_documentary = genre_text.str.contains('documentary', case=False)
movie_2000_2022_nodoc = titleType_movie_2000_2022.loc[~is_documentary]

#### Keeping only US movies

In [20]:
# Keep only titles that have at least one akas entry with region "US".
# The region restriction is applied here explicitly because `akas` may still
# be the unfiltered table when this cell runs; matching against every titleId
# would keep titles from all regions.
us_title_ids = akas.loc[akas['region'] == 'US', 'titleId']
keepers = movie_2000_2022_nodoc['tconst'].isin(us_title_ids)
keepers



34803      True
61116      True
67669      True
77964      True
86801      True
           ... 
9811074    True
9811083    True
9811122    True
9811167    True
9811251    True
Name: tconst, Length: 147367, dtype: bool

In [21]:
# Rebind basics to the rows of the filtered movie table selected by the keepers mask
basics = movie_2000_2022_nodoc.loc[keepers]

### Title AKAs

#### Replacing "\N" with np.nan

In [22]:
# The literal two-character string \N is mapped to NaN across the whole frame.
missing_marker = {'\\N': np.nan}
akas = akas.replace(missing_marker)

#### Keeping only US movies

In [23]:
# Select the akas rows whose region is exactly "US"
is_us_region = akas['region'].eq("US")
akas_US = akas.loc[is_us_region]

In [24]:
# Rebind akas to the US-only rows so that later cells using akas see only US entries
akas = akas_US

In [25]:
# Display the US-only akas table
akas

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0
...,...,...,...,...,...,...,...,...
35743767,tt9916560,1,March of Dimes Presents: Once Upon a Dime,US,,imdbDisplay,,0
35743837,tt9916620,1,The Copeland Case,US,,imdbDisplay,,0
35743926,tt9916702,1,Loving London: The Playground,US,,,,0
35743969,tt9916756,1,Pretty Pretty Black Girl,US,,imdbDisplay,,0


### Title Ratings

#### Replacing "\N" with np.nan

In [26]:
# The literal two-character string \N is mapped to NaN across the whole frame.
missing_marker = {'\\N': np.nan}
ratings = ratings.replace(missing_marker)

#### Keeping only US movies

In [27]:
# Flag the ratings rows whose tconst appears among the titleIds of the akas table
akas_title_ids = akas['titleId']
keepers2 = ratings['tconst'].isin(akas_title_ids)


In [28]:
# Rebind ratings to the rows selected by the keepers2 mask
ratings = ratings.loc[keepers2]

In [29]:
# Display the ratings that remain after filtering against the akas titleIds
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1966
1,tt0000002,5.8,263
4,tt0000005,6.2,2607
5,tt0000006,5.2,181
6,tt0000007,5.4,816
...,...,...,...
1306377,tt9916200,8.1,229
1306378,tt9916204,8.1,262
1306385,tt9916348,8.1,18
1306386,tt9916362,6.4,5307


## Exporting the csv files

In [30]:
# Write each table to an uncompressed CSV file, omitting the index.
# NOTE(review): the destination is an absolute, user-specific directory —
# consider making it configurable.
for table, csv_path in (
    (basics, r'/Users/faris/Documents/DataEnrichment/csvs/basics.csv'),
    (akas, r'/Users/faris/Documents/DataEnrichment/csvs/akas.csv'),
    (ratings, r'/Users/faris/Documents/DataEnrichment/csvs/ratings.csv'),
):
    table.to_csv(csv_path, index=False)

In [31]:
# Write a gzip-compressed CSV copy of each table, omitting the index.
gzip_options = dict(compression='gzip', index=False)
basics.to_csv("/Users/faris/Documents/DataEnrichment/csvs/basics.csv.gz", **gzip_options)
akas.to_csv("/Users/faris/Documents/DataEnrichment/csvs/akas.csv.gz", **gzip_options)
ratings.to_csv("/Users/faris/Documents/DataEnrichment/csvs/ratings.csv.gz", **gzip_options)



In [32]:
# Preview the first rows of the filtered basics table
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020.0,,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
77964,tt0079644,movie,November 1828,November 1828,0,2001.0,,140,"Drama,War"
86801,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"
