In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# creating variables for the three data urls
basics_url = 'https://datasets.imdbws.com/title.basics.tsv.gz'

akas_url = 'https://datasets.imdbws.com/title.akas.tsv.gz'

ratings_url = 'https://datasets.imdbws.com/title.ratings.tsv.gz'

In [3]:
# Creating basics df and adding, tab separated and low_memory parameters
# using head to show first 5 row. 

basics = pd.read_csv(basics_url, sep='\t', low_memory=False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [4]:
# Creating title akas df and adding, tab separated and low_memory parameters
# using head to show first 5 row. 

akas = pd.read_csv(akas_url, sep='\t', low_memory=False)
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [5]:
# Creating ratings df and adding, tab separated and low_memory parameters
# using head to show first 5 row. 

ratings = pd.read_csv(ratings_url, sep='\t', low_memory=False)
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1910
1,tt0000002,5.8,256
2,tt0000003,6.5,1714
3,tt0000004,5.6,169
4,tt0000005,6.2,2529


In [6]:
# replacing \N values with NaN
basics.replace({'\\N':np.nan}, inplace=True)

akas.replace({'\\N':np.nan}, inplace=True)

ratings.replace({'\\N':np.nan}, inplace=True)

In [7]:
# Creating filters for null run time minutes and null genre
run_time = basics['runtimeMinutes'].isna()
no_genre = basics['genres'].isna() 



In [8]:
# dropping the movies that don't have a run time minutes

basics.drop(basics[run_time].index, inplace=True)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


In [9]:
# dropping the movies with no genre
basics.drop(basics[no_genre].index, inplace=True)
basics.head()

  basics.drop(basics[no_genre].index, inplace=True)


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


In [10]:
# Creating filter for start year

start_year = basics['startYear'] < '2000'

basics[start_year].head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


In [11]:
# Creating filter for documentary type

doc = basics['genres'].str.contains('documentary', case=False)
basics = basics[~doc]

In [12]:
# dropping  any movie that had start year before 2000

basics.drop(basics[start_year].index, inplace=True)
basics.head()

  basics.drop(basics[start_year].index, inplace=True)


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
33792,tt0034413,short,Youth Gets a Break,Youth Gets a Break,0,2001,,20,Short
34792,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
39534,tt0040241,short,Color Rhapsodie,Color Rhapsodie,0,2021,,6,Short
43538,tt0044326,short,Abstronic,Abstronic,0,2021,,6,Short
44080,tt0044879,short,Mandala,Mandala,0,2021,,3,Short


In [13]:
# filter for movie title type

film = basics['titleType'] != 'movie'
basics[film].head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
33792,tt0034413,short,Youth Gets a Break,Youth Gets a Break,0,2001,,20,Short
39534,tt0040241,short,Color Rhapsodie,Color Rhapsodie,0,2021,,6,Short
43538,tt0044326,short,Abstronic,Abstronic,0,2021,,6,Short
44080,tt0044879,short,Mandala,Mandala,0,2021,,3,Short
55741,tt0056840,short,Aufsätze,Aufsätze,0,2021,,10,Short


In [14]:
# dropping all non movie title types
basics.drop(basics[film].index, inplace=True)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34792,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
60162,tt0061366,movie,Around the World,Around the World,0,,,178,"Comedy,Romance"
61093,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020.0,,70,Drama
67639,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
77933,tt0079644,movie,November 1828,November 1828,0,2001.0,,140,"Drama,War"


In [16]:
keepers = basics['tconst'].isin(akas['titleId'])
keepers

34792      True
60162      True
61093      True
67639      True
77933      True
           ... 
9228410    True
9228419    True
9228458    True
9228503    True
9228587    True
Name: tconst, Length: 147615, dtype: bool

In [17]:
basics = basics[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34792,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
60162,tt0061366,movie,Around the World,Around the World,0,,,178,"Comedy,Romance"
61093,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
67639,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
77933,tt0079644,movie,November 1828,November 1828,0,2001,,140,"Drama,War"
...,...,...,...,...,...,...,...,...,...
9228410,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
9228419,tt9916190,movie,Safeguard,Safeguard,0,2020,,95,"Action,Adventure,Thriller"
9228458,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0,2020,,84,Thriller
9228503,tt9916362,movie,Coven,Akelarre,0,2020,,92,"Drama,History"


In [18]:
## Save current dataframe to file.
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

## Save current dataframe to file.
akas.to_csv("Data/title_akas.csv.gz",compression='gzip',index=False)

## Save current dataframe to file.
ratings.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)



In [19]:
# Open saved file and preview again
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
basics.head()



Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0061366,movie,Around the World,Around the World,0,,,178,"Comedy,Romance"
2,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020.0,,70,Drama
3,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
4,tt0079644,movie,November 1828,November 1828,0,2001.0,,140,"Drama,War"


In [20]:
# Open saved file and preview again
akas = pd.read_csv("Data/title_akas.csv.gz", low_memory = False)
akas.head()



Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,,imdbDisplay,,0.0
1,tt0000001,2,Carmencita,DE,,,literal title,0.0
2,tt0000001,3,Carmencita - spanyol tánc,HU,,imdbDisplay,,0.0
3,tt0000001,4,Καρμενσίτα,GR,,imdbDisplay,,0.0
4,tt0000001,5,Карменсита,RU,,imdbDisplay,,0.0


In [21]:
# Open saved file and preview again
ratings = pd.read_csv("Data/title_ratings.csv.gz", low_memory = False)
ratings.head()



Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1910
1,tt0000002,5.8,256
2,tt0000003,6.5,1714
3,tt0000004,5.6,169
4,tt0000005,6.2,2529
