In [53]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [54]:
basics_url="https://datasets.imdbws.com/title.basics.tsv.gz"

In [55]:
basics = pd.read_csv(basics_url,sep='\t', low_memory=False)

In [56]:
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [57]:
ratings_url="https://datasets.imdbws.com/title.ratings.tsv.gz"

In [58]:
ratings = pd.read_csv(ratings_url,sep='\t', low_memory=False)

In [59]:
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1905
1,tt0000002,5.9,256
2,tt0000003,6.5,1702
3,tt0000004,5.7,168
4,tt0000005,6.2,2517


In [60]:
akas_url="https://datasets.imdbws.com/title.akas.tsv.gz"

In [61]:
akas = pd.read_csv(akas_url,sep='\t', low_memory=False)

In [62]:
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [63]:
basics = basics.replace({'\\N':np.nan})

In [64]:
akas = akas.replace({'\\N':np.nan})

In [65]:
ratings = ratings.replace({'\\N':np.nan})

In [66]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9154356 entries, 0 to 9154355
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 628.6+ MB


In [67]:
basics['runtimeMinutes'].value_counts()

30                         131013
60                         102591
22                          92309
44                          69125
45                          58328
                            ...  
569                             1
670                             1
924                             1
Animation,Comedy,Family         1
2088                            1
Name: runtimeMinutes, Length: 873, dtype: int64

In [68]:
basics.duplicated().sum()

0

In [69]:
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle           10
originalTitle          10
isAdult                 1
startYear         1212014
endYear           9059143
runtimeMinutes    6693342
genres             416297
dtype: int64

In [70]:
basics.dropna(subset=['runtimeMinutes'], inplace=True)
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 1
startYear           37615
endYear           2414479
runtimeMinutes          0
genres              67310
dtype: int64

In [71]:
basics.dropna(subset=['genres'], inplace=True)
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 0
startYear           36269
endYear           2348738
runtimeMinutes          0
genres                  0
dtype: int64

In [72]:
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 0
startYear           30755
endYear           1978752
runtimeMinutes          0
genres                  0
dtype: int64

In [73]:
basics['titleType'].value_counts()

tvEpisode       952139
short           483225
movie           279107
video           139536
tvSeries         73952
tvMovie          56054
tvSpecial        13864
tvMiniSeries     11602
tvShort           7013
videoGame          280
Name: titleType, dtype: int64

In [74]:
basics = basics[basics["titleType"].str.contains("movie")==True]
basics['titleType'].value_counts()

movie    279107
Name: titleType, dtype: int64

In [75]:
basics['startYear'].value_counts()

2018    9564
2017    9383
2019    9291
2016    8983
2015    8538
        ... 
1906       1
1903       1
1908       1
2027       1
1894       1
Name: startYear, Length: 124, dtype: int64

In [76]:
basics.dropna(subset=['startYear'], inplace=True)
basics.isna().sum()

tconst                 0
titleType              0
primaryTitle           0
originalTitle          0
isAdult                0
startYear              0
endYear           274821
runtimeMinutes         0
genres                 0
dtype: int64

In [77]:
basics = basics[basics['startYear'] >= '2000']
basics['startYear'].value_counts()

2018    9564
2017    9383
2019    9291
2016    8983
2015    8538
2014    8142
2021    7970
2013    7755
2020    7472
2012    7267
2011    6735
2010    6337
2009    5945
2022    5490
2008    5180
2007    4601
2006    4366
2005    3880
2004    3504
2003    3213
2002    2966
2001    2842
2000    2714
2023     261
2024      28
2025       6
2026       2
2027       1
Name: startYear, dtype: int64

In [78]:
basics = basics[basics['startYear'] <= '2021']
basics['startYear'].value_counts()

2018    9564
2017    9383
2019    9291
2016    8983
2015    8538
2014    8142
2021    7970
2013    7755
2020    7472
2012    7267
2011    6735
2010    6337
2009    5945
2008    5180
2007    4601
2006    4366
2005    3880
2004    3504
2003    3213
2002    2966
2001    2842
2000    2714
Name: startYear, dtype: int64

In [79]:
keepers =basics['tconst'].isin(akas['titleId'])
keepers

34790      True
61089      True
67634      True
77928      True
86765      True
           ... 
9154028    True
9154037    True
9154076    True
9154121    True
9154205    True
Name: tconst, Length: 136648, dtype: bool

In [80]:
basics = basics[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34790,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61089,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
67634,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
77928,tt0079644,movie,November 1828,November 1828,0,2001,,140,"Drama,War"
86765,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
...,...,...,...,...,...,...,...,...,...
9154028,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
9154037,tt9916190,movie,Safeguard,Safeguard,0,2020,,95,"Action,Adventure,Thriller"
9154076,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0,2020,,84,Thriller
9154121,tt9916362,movie,Coven,Akelarre,0,2020,,92,"Drama,History"


In [82]:
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,,imdbDisplay,,0
1,tt0000001,2,Carmencita,DE,,,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,,imdbDisplay,,0
3,tt0000001,4,Καρμενσίτα,GR,,imdbDisplay,,0
4,tt0000001,5,Карменсита,RU,,imdbDisplay,,0


In [84]:
akas['region'].value_counts()

FR    3922819
JP    3922398
DE    3905831
IN    3850455
ES    3846016
       ...   
TV          1
NU          1
PW          1
NR          1
TC          1
Name: region, Length: 246, dtype: int64

In [86]:
akas = akas[akas["region"].str.contains("US")==True]
akas['region'].value_counts()

US    1342706
Name: region, dtype: int64

In [87]:
akas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1342706 entries, 5 to 32863389
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1342706 non-null  object
 1   ordering         1342706 non-null  int64 
 2   title            1342706 non-null  object
 3   region           1342706 non-null  object
 4   language         3681 non-null     object
 5   types            963269 non-null   object
 6   attributes       44738 non-null    object
 7   isOriginalTitle  1341331 non-null  object
dtypes: int64(1), object(7)
memory usage: 92.2+ MB


In [88]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 136038 entries, 34790 to 9154205
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          136038 non-null  object
 1   titleType       136038 non-null  object
 2   primaryTitle    136038 non-null  object
 3   originalTitle   136038 non-null  object
 4   isAdult         136038 non-null  object
 5   startYear       136038 non-null  object
 6   endYear         0 non-null       object
 7   runtimeMinutes  136038 non-null  object
 8   genres          136038 non-null  object
dtypes: object(9)
memory usage: 10.4+ MB


In [89]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1263582 entries, 0 to 1263581
Data columns (total 3 columns):
 #   Column         Non-Null Count    Dtype  
---  ------         --------------    -----  
 0   tconst         1263582 non-null  object 
 1   averageRating  1263582 non-null  float64
 2   numVotes       1263582 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 28.9+ MB


In [91]:
# example making new folder with os
import os
os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")

[]

In [92]:
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

In [93]:
akas.to_csv("Data/title_akas.csv.gz",compression='gzip',index=False)

In [94]:
ratings.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)