In [1]:
import pandas as pd
import numpy as np
import os

# Import Data:
    * title.basics.tsv.gz
    * title.ratings.tsv.gz
    * title.akas.tsv.gz

## Title Ratings

In [34]:
df_TitleRatings = pd.read_table('data_downloaded/title.ratings.tsv.gz',compression='gzip', header=0, sep='\t')
df_TitleRatings.info()
df_TitleRatings.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1262969 entries, 0 to 1262968
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1262969 non-null  object 
 1   averageRating  1262969 non-null  float64
 2   numVotes       1262969 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 28.9+ MB


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1905
1,tt0000002,5.9,256
2,tt0000003,6.5,1702
3,tt0000004,5.7,168
4,tt0000005,6.2,2517


## Title Basics

In [3]:
%%time
df_TitleBasics = pd.read_table('data_downloaded/title.basics.tsv.gz',compression='gzip', header=0, sep='\t',low_memory=False)
df_TitleBasics.info()
df_TitleBasics.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9146679 entries, 0 to 9146678
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 628.1+ MB
Wall time: 55.9 s


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


## Title AKAS

In [4]:
%%time
df_TitleAKAS = pd.read_table('data_downloaded/title.akas.tsv.gz',compression='gzip', header=0, sep='\t',low_memory=False)
df_TitleAKAS.info()
df_TitleAKAS.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32845092 entries, 0 to 32845091
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 2.0+ GB
Wall time: 1min 34s


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


# Processing
    * Filter and clean tables per requirements

## Title Basics:
* Replace "\N" with np.nan
* Eliminate movies that are null for runtimeMinutes
* Eliminate movies that are null for genre
* keep only titleType==Movie
* keep startYear 2000-2022
* Eliminate movies that include  "Documentary" in genre 

In [5]:
#Replace "\N" with np.nan
df_TitleBasics.replace({'\\N':np.nan},inplace=True)

In [6]:
#Eliminate movies that are null for runtimeMinutes
#Eliminate movies that are null for genre
df_TitleBasics.dropna(subset=['runtimeMinutes','genres'],inplace=True)

In [8]:
#keep only titleType==Movie
filter_movie = df_TitleBasics['titleType'] == 'movie'

#keep startYear 2000-2022
#format as int , must change nan to 0 first
df_TitleBasics['startYear'] = df_TitleBasics['startYear'].fillna(0)
df_TitleBasics['startYear'] = df_TitleBasics['startYear'].astype(int)
filter_years = (df_TitleBasics['startYear'] >= 2000) & (df_TitleBasics['startYear'] <= 2022)

#Eliminate movies that include "Documentary" in genre
filter_documentary = df_TitleBasics['genres'].str.contains('documentary',case=False)                                                     
                                                       
#apply filters to documnetaries                                                                                  
df_TitleBasics = df_TitleBasics.loc[filter_years & filter_movie & ~filter_documentary]
                            

In [9]:
df_TitleBasics.shape

(142018, 9)

## AKAs:
   * keep only US entries.
   * Replace "\N" with np.nan

In [10]:
#Replace "\N" with np.nan
df_TitleAKAS.replace({'\\N':np.nan},inplace=True)

In [11]:
#keep only US entries.
df_TitleAKAS = df_TitleAKAS.loc[df_TitleAKAS['region']=='US']

In [12]:
df_TitleAKAS.shape

(1342190, 8)

## Ratings:
   * Replace "\N" with np.nan (if any)

In [35]:
#Replace "\N" with np.nan
df_TitleRatings.replace({'\\N':np.nan},inplace=True)

## Filter Basics with AKA filtered (US)

In [14]:
#since AKAS is filtered on US, filter using the titleId to tconst
filter_us = df_TitleBasics['tconst'].isin(df_TitleAKAS['titleId'])
df_TitleBasics = df_TitleBasics[filter_us]

In [15]:
df_TitleBasics.shape

(82086, 9)

In [36]:
# might as well filter the ratings for us movies. 
filter_us_ratings = df_TitleRatings['tconst'].isin(df_TitleAKAS['titleId'])
df_TitleRatings = df_TitleRatings[filter_us_ratings]

In [37]:
df_TitleRatings.shape

(475727, 3)

# Save the Files to Repository
* Create a "Data" folder.
* Save Dataframes as Compressed .csv.gz Files

In [27]:
os.makedirs('data/',exist_ok=True) 
os.listdir("data/")

['.ipynb_checkpoints',
 'README.md',
 'title_akas.csv.gz',
 'title_basics.csv.gz',
 'title_ratings.csv.gz']

In [28]:
df_TitleBasics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 82086 entries, 34790 to 9146444
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   tconst          82086 non-null  object
 1   titleType       82086 non-null  object
 2   primaryTitle    82086 non-null  object
 3   originalTitle   82086 non-null  object
 4   isAdult         82086 non-null  object
 5   startYear       82086 non-null  int32 
 6   endYear         0 non-null      object
 7   runtimeMinutes  82086 non-null  object
 8   genres          82086 non-null  object
dtypes: int32(1), object(8)
memory usage: 5.9+ MB


In [29]:
df_TitleBasics.to_csv("data/title_basics.csv.gz",compression='gzip',index=False)

In [30]:
df_TitleAKAS.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1342190 entries, 5 to 32844836
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1342190 non-null  object
 1   ordering         1342190 non-null  int64 
 2   title            1342190 non-null  object
 3   region           1342190 non-null  object
 4   language         3676 non-null     object
 5   types            963271 non-null   object
 6   attributes       44717 non-null    object
 7   isOriginalTitle  1340815 non-null  object
dtypes: int64(1), object(7)
memory usage: 92.2+ MB


In [31]:
df_TitleAKAS.to_csv("data/title_akas.csv.gz",compression='gzip',index=False)

In [38]:
df_TitleRatings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 475727 entries, 0 to 1262965
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         475727 non-null  object 
 1   averageRating  475727 non-null  float64
 2   numVotes       475727 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 14.5+ MB


In [39]:
df_TitleRatings.to_csv("data/title_ratings.csv.gz",compression='gzip',index=False)