In [2]:
import pandas as pd
import numpy as np

# Make sure the local Data/ folder exists (no error if it is already there),
# then list its contents to confirm.
import os

os.makedirs('Data/', exist_ok=True)
os.listdir('Data/')



['data (3) (1).csv',
 '.DS_Store',
 'title.ratings.tsv.gz.csv.gz',
 'title.basics.tsv.gz',
 'title_basics.csv.gz',
 'data (5) (1).csv',
 'data (4) (1).csv',
 'title.ratings.tsv.gz',
 'title.basics.csv.gz',
 'title-akas-us-only.csv',
 'data (6) (1).csv',
 'title_akas.csv.gz',
 'title.basics.tsv.gz.csv.gz',
 'title_ratings.csv.gz']

In [None]:
# Locations of the three gzip-compressed TSV files on datasets.imdbws.com.
url_akas = "https://datasets.imdbws.com/title.akas.tsv.gz"
url_basics = "https://datasets.imdbws.com/title.basics.tsv.gz"
url_ratings = "https://datasets.imdbws.com/title.ratings.tsv.gz"

# Read the files one after another (akas, basics, ratings), each into its own
# dataframe, using identical parser settings for all three.
akas_df, basics_df, ratings_df = (
    pd.read_csv(source, compression='gzip', sep='\t', low_memory=False)
    for source in (url_akas, url_basics, url_ratings)
)

# Peek at the first rows of every table to understand their structure.
akas_df.head(), basics_df.head(), ratings_df.head()


In [4]:
# Keep only the alternate-title rows whose region is exactly 'US'.
akas_df = akas_df.loc[akas_df['region'].eq('US')]

In [9]:
# Turn the literal '\N' placeholder strings into real missing values (NaN).
akas_df = akas_df.replace({'\\N': np.nan})

# Preview the first rows of the cleaned frame.
akas_df.head()



Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,,imdbDisplay,,0
14,tt0000002,7,The Clown and His Dogs,US,,,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,,imdbDisplay,,0
36,tt0000005,1,Blacksmithing Scene,US,,alternative,,0
41,tt0000005,6,Blacksmith Scene #1,US,,alternative,,0


In [6]:
# Boolean mask over basics_df: True where the title's tconst also appears as a
# titleId in akas_df, i.e. the title has at least one row in the akas table.
keepers1 = basics_df.tconst.isin(akas_df.titleId)
keepers1



0            True
1            True
2           False
3           False
4            True
            ...  
10135601    False
10135602    False
10135603    False
10135604    False
10135605    False
Name: tconst, Length: 10135606, dtype: bool

In [7]:
# Convert the literal '\N' placeholder strings into real missing values (NaN).
# The result is reassigned rather than mutated with inplace=True; the row index
# is unchanged, so boolean masks built from this frame still line up with it.
basics_df = basics_df.replace({'\\N': np.nan})

# Only the last expression of a cell is rendered automatically, so a bare
# `basics_df.head()` placed before `.info()` is silently discarded.
# display() (provided by the Jupyter/IPython kernel) renders the preview
# explicitly, followed by the column/dtype summary.
display(basics_df.head())
basics_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10135606 entries, 0 to 10135605
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 696.0+ MB


In [10]:
# Apply the keepers1 boolean mask: only rows flagged True survive.
basics_df = basics_df.loc[keepers1]
basics_df



Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,,1,"Short,Sport"
...,...,...,...,...,...,...,...,...,...
10135467,tt9916560,tvMovie,March of Dimes Presents: Once Upon a Dime,March of Dimes Presents: Once Upon a Dime,0,1963,,58,Family
10135496,tt9916620,movie,The Copeland Case,The Copeland Case,0,,,,Drama
10135534,tt9916702,short,Loving London: The Playground,Loving London: The Playground,0,,,,"Drama,Short"
10135557,tt9916756,short,Pretty Pretty Black Girl,Pretty Pretty Black Girl,0,2019,,,Short


In [11]:
# Narrow basics_df in a single pipeline:
#   1. drop rows missing runtimeMinutes, genres or startYear,
#   2. keep only rows whose titleType is 'movie',
#   3. store startYear as float.
basics_df = (
    basics_df
    .dropna(subset=['runtimeMinutes', 'genres', 'startYear'])
    .loc[lambda frame: frame['titleType'] == 'movie']
    .astype({'startYear': float})
)

# Confirm the resulting column dtypes.
basics_df.dtypes

tconst             object
titleType          object
primaryTitle       object
originalTitle      object
isAdult            object
startYear         float64
endYear            object
runtimeMinutes     object
genres             object
dtype: object

In [12]:
# Render the current state of basics_df as this cell's output.
basics_df

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894.0,,45,Romance
144,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897.0,,100,"Documentary,News,Sport"
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906.0,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907.0,,90,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908.0,,120,"Adventure,Fantasy"
...,...,...,...,...,...,...,...,...,...
10134961,tt9915436,movie,Vida em Movimento,Vida em Movimento,0,2019.0,,70,Documentary
10135139,tt9915872,movie,The Last White Witch,Boku no kanojo wa mahoutsukai,0,2019.0,,97,"Comedy,Drama,Fantasy"
10135279,tt9916170,movie,The Rehearsal,O Ensaio,0,2019.0,,51,Drama
10135288,tt9916190,movie,Safeguard,Safeguard,0,2020.0,,95,"Action,Adventure,Thriller"


In [13]:
# Keep titles whose startYear falls in 2000-2021; between() includes both endpoints.
basics_df = basics_df[basics_df['startYear'].between(2000, 2021)]
basics_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 114623 entries, 34800 to 10135372
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   tconst          114623 non-null  object 
 1   titleType       114623 non-null  object 
 2   primaryTitle    114623 non-null  object 
 3   originalTitle   114623 non-null  object 
 4   isAdult         114623 non-null  object 
 5   startYear       114623 non-null  float64
 6   endYear         0 non-null       object 
 7   runtimeMinutes  114623 non-null  object 
 8   genres          114623 non-null  object 
dtypes: float64(1), object(8)
memory usage: 8.7+ MB


In [15]:
# Flag every title whose genres string mentions "documentary" (case-insensitive),
# then keep only the rows that are NOT flagged.
is_documentary = basics_df['genres'].str.contains(pat='documentary', case=False)
basics_df = basics_df.loc[~is_documentary]
basics_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 82041 entries, 34800 to 10135372
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          82041 non-null  object 
 1   titleType       82041 non-null  object 
 2   primaryTitle    82041 non-null  object 
 3   originalTitle   82041 non-null  object 
 4   isAdult         82041 non-null  object 
 5   startYear       82041 non-null  float64
 6   endYear         0 non-null      object 
 7   runtimeMinutes  82041 non-null  object 
 8   genres          82041 non-null  object 
dtypes: float64(1), object(8)
memory usage: 6.3+ MB


In [16]:
# Turn the literal '\N' placeholder strings into real missing values (NaN).
ratings_df = ratings_df.replace({'\\N': np.nan})

In [21]:
# Boolean mask over ratings_df: True for ratings whose tconst is present in
# the basics_df frame.
keepers2 = ratings_df.tconst.isin(basics_df.tconst)
keepers2


0          False
1          False
2          False
3          False
4          False
           ...  
1345295    False
1345296    False
1345297    False
1345298    False
1345299    False
Name: tconst, Length: 1345300, dtype: bool

In [22]:
# Apply the keepers2 boolean mask: only rows flagged True survive.
ratings_df = ratings_df.loc[keepers2]
ratings_df

Unnamed: 0,tconst,averageRating,numVotes
17896,tt0035423,6.4,87470
40653,tt0062336,6.4,180
46366,tt0068865,5.4,74
46523,tt0069049,6.7,7812
58014,tt0082328,5.9,1747
...,...,...,...
1345219,tt9914942,6.6,181
1345246,tt9915872,6.4,9
1345259,tt9916170,7.0,7
1345260,tt9916190,3.7,243


In [18]:
# Persist akas_df as a gzip-compressed CSV, leaving out the row index.
akas_df.to_csv("Data/title_akas.csv.gz", compression='gzip', index=False)

# Read the file straight back so the saved copy can be inspected.
akas_df = pd.read_csv("Data/title_akas.csv.gz", low_memory=False)

# Only the last expression of a cell is rendered automatically, so a bare
# `akas_df.head()` ahead of `.info()` is silently discarded. display()
# (provided by the Jupyter/IPython kernel) renders the preview explicitly.
display(akas_df.head())
akas_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1463841 entries, 0 to 1463840
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   titleId          1463841 non-null  object 
 1   ordering         1463841 non-null  int64  
 2   title            1463841 non-null  object 
 3   region           1463841 non-null  object 
 4   language         4126 non-null     object 
 5   types            983442 non-null   object 
 6   attributes       47431 non-null    object 
 7   isOriginalTitle  1462499 non-null  float64
dtypes: float64(1), int64(1), object(6)
memory usage: 89.3+ MB


In [19]:
# Persist basics_df as a gzip-compressed CSV, leaving out the row index.
basics_df.to_csv("Data/title_basics.csv.gz", compression='gzip', index=False)

# Read the file straight back so the saved copy can be inspected.
basics_df = pd.read_csv("Data/title_basics.csv.gz", low_memory=False)

# Only the last expression of a cell is rendered automatically, so a bare
# `basics_df.head()` ahead of `.info()` is silently discarded. display()
# (provided by the Jupyter/IPython kernel) renders the preview explicitly.
display(basics_df.head())
basics_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 82041 entries, 0 to 82040
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          82041 non-null  object 
 1   titleType       82041 non-null  object 
 2   primaryTitle    82041 non-null  object 
 3   originalTitle   82041 non-null  object 
 4   isAdult         82041 non-null  int64  
 5   startYear       82041 non-null  float64
 6   endYear         0 non-null      float64
 7   runtimeMinutes  82041 non-null  int64  
 8   genres          82041 non-null  object 
dtypes: float64(2), int64(2), object(5)
memory usage: 5.6+ MB


In [23]:
# Persist ratings_df as a gzip-compressed CSV, leaving out the row index.
ratings_df.to_csv("Data/title_ratings.csv.gz", compression='gzip', index=False)

# Read the file straight back so the saved copy can be inspected.
ratings_df = pd.read_csv("Data/title_ratings.csv.gz", low_memory=False)

# Only the last expression of a cell is rendered automatically, so a bare
# `ratings_df.head()` ahead of `.info()` is silently discarded. display()
# (provided by the Jupyter/IPython kernel) renders the preview explicitly.
display(ratings_df.head())
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68189 entries, 0 to 68188
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   tconst         68189 non-null  object 
 1   averageRating  68189 non-null  float64
 2   numVotes       68189 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 1.6+ MB
