In [40]:
import pandas as pd
import numpy as np

# Locations of the IMDb dataset dumps (gzip-compressed, tab-separated files).
basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"
akas_url = "https://datasets.imdbws.com/title.akas.tsv.gz"
ratings_url = "https://datasets.imdbws.com/title.ratings.tsv.gz"



## Loading the data from the URLs into DataFrames

In [41]:
# Read the three tab-separated dumps straight from their URLs.
# read_table uses a tab as its default separator; low_memory=False makes the
# parser process each file in one pass instead of in chunks.
basics = pd.read_table(basics_url, low_memory=False)
akas = pd.read_table(akas_url, low_memory=False)
ratings = pd.read_table(ratings_url, low_memory=False)

## Preprocessing

### Title Basics

#### Replacing "\N" with np.nan

In [42]:
# Turn the literal "\N" placeholder into a real missing value (NaN)
# so the null checks and dropna calls below can see it.
basics = basics.replace(to_replace={'\\N': np.nan})

#### Eliminating movies that are null for runtimeMinutes

In [43]:
# Count how many rows have (True) / do not have (False) a missing runtimeMinutes
basics['runtimeMinutes'].isna().value_counts()

True     6918513
False    2890457
Name: runtimeMinutes, dtype: int64

In [44]:
# Remove every row whose runtimeMinutes is missing
basics = basics.dropna(subset=['runtimeMinutes'])

In [45]:
# Confirm that no missing runtimeMinutes values remain (only False expected)
basics['runtimeMinutes'].isna().value_counts()


False    2890457
Name: runtimeMinutes, dtype: int64

#### Eliminating movies that are null for genres

In [46]:
# Count how many rows have (True) / do not have (False) a missing genres value
basics['genres'].isna().value_counts()

False    2813878
True       76579
Name: genres, dtype: int64

In [47]:
# Remove every row whose genres value is missing
basics = basics.dropna(subset=['genres'])

In [48]:
# Confirm that no missing genres values remain (only False expected)
basics['genres'].isna().value_counts()

False    2813878
Name: genres, dtype: int64

#### Keeping only titleType == 'movie'

In [49]:
# Number of remaining rows per titleType, to see what the 'movie' filter will keep
basics.loc[:, 'titleType'].value_counts()

tvEpisode       1426839
short            599384
movie            381525
video            180172
tvMovie           91437
tvSeries          90224
tvSpecial         18057
tvMiniSeries      17127
tvShort            8791
videoGame           322
Name: titleType, dtype: int64

In [50]:
# Keep only the rows whose titleType is exactly 'movie'
titleType_movie = basics.loc[basics['titleType'].eq('movie')]

#### Keeping only movies with startYear from 2000-2022

In [51]:
# Checking the dtype of every column before filtering on startYear:
# the year-range comparison further down needs startYear to be numeric.
titleType_movie.dtypes

tconst            object
titleType         object
primaryTitle      object
originalTitle     object
isAdult           object
startYear         object
endYear           object
runtimeMinutes    object
genres            object
dtype: object

In [52]:
# Per-column count of missing values; startYear is the column of interest here
titleType_movie.isna().sum()

tconst                 0
titleType              0
primaryTitle           0
originalTitle          0
isAdult                0
startYear           6425
endYear           381525
runtimeMinutes         0
genres                 0
dtype: int64

In [53]:
# Remove every movie whose startYear is missing
titleType_movie = titleType_movie.dropna(subset=['startYear'])

In [54]:
# Confirm that startYear no longer has any missing values
titleType_movie.isna().sum()

tconst                 0
titleType              0
primaryTitle           0
originalTitle          0
isAdult                0
startYear              0
endYear           375100
runtimeMinutes         0
genres                 0
dtype: int64

In [55]:
# Converting the startYear column to the int data type.
# titleType_movie was derived by slicing/filtering another DataFrame, so
# writing into one of its columns in place can trigger pandas'
# SettingWithCopyWarning. .assign() returns a new DataFrame holding the
# converted column instead of writing into the slice, and re-running this
# cell gives the same result.
titleType_movie = titleType_movie.assign(
    startYear=titleType_movie['startYear'].astype(int)
)

In [56]:
# Double-checking the conversion: startYear should now be listed with an
# integer dtype and no missing values.
titleType_movie.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 375100 entries, 8 to 9808920
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          375100 non-null  object
 1   titleType       375100 non-null  object
 2   primaryTitle    375100 non-null  object
 3   originalTitle   375100 non-null  object
 4   isAdult         375100 non-null  object
 5   startYear       375100 non-null  int64 
 6   endYear         0 non-null       object
 7   runtimeMinutes  375100 non-null  object
 8   genres          375100 non-null  object
dtypes: int64(1), object(8)
memory usage: 28.6+ MB


In [57]:
# Keep movies whose startYear lies in 2000..2022 (both end points included)
in_year_range = titleType_movie['startYear'].between(2000, 2022)
titleType_movie_2000_2022 = titleType_movie.loc[in_year_range]

#### Eliminating movies that include "Documentary" in genre

In [58]:
# Flag every movie whose genres string mentions "documentary"
# (case-insensitive), then keep only the rows that were not flagged.
is_documentary = titleType_movie_2000_2022['genres'].str.contains(
    'documentary',
    case=False,
)
movie_2000_2022_nodoc = titleType_movie_2000_2022.loc[~is_documentary]

#### Keeping only US movies

In [59]:
# Building a boolean mask that keeps only the movies released in the US.
# At this point akas has not been narrowed to US rows yet (that happens in the
# "Title AKAs" section below), so testing membership against every titleId
# would keep any movie that has an AKA in *any* region. Restrict the ids to
# region == "US" first. This stays correct if akas has already been
# filtered, e.g. when the cell is re-run later.
us_title_ids = akas.loc[akas['region'] == 'US', 'titleId']
keepers = movie_2000_2022_nodoc['tconst'].isin(us_title_ids)
keepers



34803      True
61116      True
67669      True
77964      True
86801      True
           ... 
9808643    True
9808652    True
9808691    True
9808736    True
9808820    True
Name: tconst, Length: 147358, dtype: bool

In [60]:
# Apply the keepers mask; the result becomes the final basics DataFrame
basics = movie_2000_2022_nodoc.loc[keepers]

### Title AKAs

#### Replacing "\N" with np.nan

In [61]:
# Turn the literal "\N" placeholder into a real missing value (NaN)
akas = akas.replace(to_replace={'\\N': np.nan})

#### Keeping only US movies

In [62]:
# Keep only the alternative-title rows whose region is "US"
akas_US = akas.loc[akas['region'].eq("US")]

In [63]:
# Rebind akas to the US-only rows so the later cells that read akas
# (the ratings filter and the CSV export) use the filtered table.
akas = akas_US

In [64]:
# Display the US-only akas table (pandas renders a truncated preview)
akas

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0
...,...,...,...,...,...,...,...,...
35736144,tt9916560,1,March of Dimes Presents: Once Upon a Dime,US,,imdbDisplay,,0
35736214,tt9916620,1,The Copeland Case,US,,imdbDisplay,,0
35736303,tt9916702,1,Loving London: The Playground,US,,,,0
35736346,tt9916756,1,Pretty Pretty Black Girl,US,,imdbDisplay,,0


### Title Ratings

#### Replacing "\N" with np.nan

In [65]:
# Turn the literal "\N" placeholder into a real missing value (NaN)
ratings = ratings.replace(to_replace={'\\N': np.nan})

#### Keeping only ratings for US titles

In [66]:
# Boolean mask over the ratings table: True where the rated title also has a
# row in akas (which was narrowed to region == "US" in the Title AKAs section).
akas_title_ids = akas['titleId']
keepers2 = ratings['tconst'].isin(akas_title_ids)


In [67]:
# Apply the keepers2 mask to the ratings DataFrame
ratings = ratings.loc[keepers2]

In [68]:
# Display the filtered ratings table (pandas renders a truncated preview)
ratings

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1966
1,tt0000002,5.8,263
4,tt0000005,6.2,2607
5,tt0000006,5.2,181
6,tt0000007,5.4,816
...,...,...,...
1305894,tt9916200,8.1,229
1305895,tt9916204,8.1,262
1305902,tt9916348,8.1,18
1305903,tt9916362,6.4,5307


## Exporting the csv files

In [72]:
# Write the three cleaned tables as plain CSV files, without the index column.
# NOTE(review): the destination is an absolute, machine-specific path; the
# directory must already exist on the machine running this notebook.
basics.to_csv(
    r'/Users/faris/Documents/DataEnrichment/csvs/basics.csv',
    index=False,
)
akas.to_csv(
    r'/Users/faris/Documents/DataEnrichment/csvs/akas.csv',
    index=False,
)
ratings.to_csv(
    r'/Users/faris/Documents/DataEnrichment/csvs/ratings.csv',
    index=False,
)

In [73]:
## Save each cleaned dataframe again as a gzip-compressed CSV (no index column).
# NOTE(review): the destination is an absolute, machine-specific path; the
# directory must already exist on the machine running this notebook.
basics.to_csv(
    "/Users/faris/Documents/DataEnrichment/csvs/basics.csv.gz",
    index=False,
    compression='gzip',
)

akas.to_csv(
    "/Users/faris/Documents/DataEnrichment/csvs/akas.csv.gz",
    index=False,
    compression='gzip',
)

ratings.to_csv(
    "/Users/faris/Documents/DataEnrichment/csvs/ratings.csv.gz",
    index=False,
    compression='gzip',
)



In [74]:
# Preview the first five rows of the final basics table
basics.head(5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34803,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61116,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
67669,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
77964,tt0079644,movie,November 1828,November 1828,0,2001,,140,"Drama,War"
86801,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
