In [209]:
import pandas as pd
import numpy as np

In [210]:
# Download locations of the three IMDb tables used below (gzip-compressed TSV).
IMDB_DATASETS = 'https://datasets.imdbws.com'
basics_url = f'{IMDB_DATASETS}/title.basics.tsv.gz'
akas_url = f'{IMDB_DATASETS}/title.akas.tsv.gz'
ratings_url = f'{IMDB_DATASETS}/title.ratings.tsv.gz'

In [211]:
# Load the title table. read_table uses a tab separator by default;
# low_memory=False parses each column in one pass to avoid mixed-type chunks.
basics = pd.read_table(basics_url, low_memory=False)

In [212]:
# Load the alternate-titles table. read_table uses a tab separator by default;
# low_memory=False parses each column in one pass to avoid mixed-type chunks.
akas = pd.read_table(akas_url, low_memory=False)

In [213]:
# Load the ratings table. read_table uses a tab separator by default;
# low_memory=False parses each column in one pass to avoid mixed-type chunks.
ratings = pd.read_table(ratings_url, low_memory=False)

In [214]:
# Treat the literal two-character string '\N' as a missing value in every
# table so that isna()/dropna() below can see it.
basics = basics.replace(to_replace='\\N', value=np.nan)
akas = akas.replace(to_replace='\\N', value=np.nan)
ratings = ratings.replace(to_replace='\\N', value=np.nan)

In [215]:
# Summarise the freshly loaded title table: columns, dtypes and memory use.
basics.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9033256 entries, 0 to 9033255
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 620.3+ MB


In [216]:
# Preview the first rows to sanity-check the parse.
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


In [217]:
# Count titles that have no runtimeMinutes value.
basics['runtimeMinutes'].isna().sum()

6601846

In [218]:
# Keep only titles that have a runtime recorded.
has_runtime = basics['runtimeMinutes'].notna()
basics = basics.loc[has_runtime]

In [219]:
# Verify no missing runtimes remain (expected: 0).
basics['runtimeMinutes'].isna().sum()

0

In [220]:
# Count titles that have no genres value.
basics['genres'].isna().sum()

66959

In [221]:
# Keep only titles that have at least one genre recorded.
has_genres = basics['genres'].notna()
basics = basics.loc[has_genres]

In [222]:
# Verify no missing genres remain (expected: 0).
basics['genres'].isna().sum()

0

In [223]:
# Row count per titleType before restricting the table to movies.
basics['titleType'].value_counts()

tvEpisode       1043893
short            568572
movie            363210
video            173606
tvMovie           87821
tvSeries          85712
tvSpecial         16249
tvMiniSeries      15772
tvShort            9326
videoGame           290
Name: titleType, dtype: int64

In [224]:
# True for every row that is NOT a movie. Despite the name, this mask marks
# the rows to discard; the next cell removes them.
movie_filter = basics['titleType'].ne('movie')

In [225]:
# Keep only the rows NOT flagged by movie_filter (i.e. the movies).
# Selecting with the negated boolean mask works row-by-row, so it stays correct
# even if index labels are duplicated, and it avoids building the unwanted
# sub-frame just to collect its labels for drop().
basics = basics.loc[~movie_filter]

In [226]:
# Verify that 'movie' is the only titleType left.
basics['titleType'].value_counts()

movie    363210
Name: titleType, dtype: int64

In [236]:
# Inclusive startYear bounds for the analysis window (stored as strings).
start_date, end_date = '2000', '2022'

In [244]:
# Compare years numerically rather than as strings: lexicographic comparison is
# only right while every startYear is a 4-digit string, and it raises TypeError
# if the column is ever loaded with a numeric dtype. Missing or non-numeric
# years become NaN and therefore evaluate to False.
after_start_date = pd.to_numeric(basics['startYear'], errors='coerce') >= int(start_date)

In [245]:
# Compare years numerically rather than as strings: lexicographic comparison is
# only right while every startYear is a 4-digit string, and it raises TypeError
# if the column is ever loaded with a numeric dtype. Missing or non-numeric
# years become NaN and therefore evaluate to False.
before_end_date = pd.to_numeric(basics['startYear'], errors='coerce') <= int(end_date)

In [246]:
# A row is in range only when it satisfies both bounds.
between_two_dates = before_end_date & after_start_date

In [247]:
# Select the rows whose startYear falls inside the window.
filtered_dates = basics[between_two_dates]

In [248]:
# Show the year-filtered rows to eyeball the result.
filtered_dates

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
13082,tt0013274,movie,Istoriya grazhdanskoy voyny,Istoriya grazhdanskoy voyny,0,2021,,133,Documentary
34805,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61119,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
67672,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
77968,tt0079644,movie,November 1828,November 1828,0,2001,,140,"Drama,War"
...,...,...,...,...,...,...,...,...,...
9033021,tt9916362,movie,Coven,Akelarre,0,2020,,92,"Drama,History"
9033105,tt9916538,movie,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,0,2019,,123,Drama
9033146,tt9916622,movie,Rodolpho Teóphilo - O Legado de um Pioneiro,Rodolpho Teóphilo - O Legado de um Pioneiro,0,2015,,57,Documentary
9033173,tt9916680,movie,De la ilusión al desconcierto: cine colombiano...,De la ilusión al desconcierto: cine colombiano...,0,2007,,100,Documentary


In [251]:
# From this point on `basics` is the year-filtered subset; this rebinds the
# name to the same object as filtered_dates (no copy is made).
basics= filtered_dates

In [252]:
# Re-check row count and non-null counts after the year filter.
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 212445 entries, 13082 to 9033206
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          212445 non-null  object
 1   titleType       212445 non-null  object
 2   primaryTitle    212445 non-null  object
 3   originalTitle   212445 non-null  object
 4   isAdult         212445 non-null  object
 5   startYear       212445 non-null  object
 6   endYear         0 non-null       object
 7   runtimeMinutes  212445 non-null  object
 8   genres          212445 non-null  object
dtypes: object(9)
memory usage: 24.3+ MB


In [253]:
# Flag titles whose genre list mentions "documentary" (case-insensitive) and
# drop them.
# regex=False performs a plain substring search (the pattern contains no regex
# metacharacters), and na=False makes a missing genre count as "not a
# documentary" so the mask is always boolean and can be negated with ~.
is_documentary = basics['genres'].str.contains(
    'documentary', case=False, regex=False, na=False
)
basics = basics[~is_documentary]


In [256]:
# Number of titles flagged as documentaries (the mask was computed before
# those rows were removed from `basics`).
is_documentary.sum()

72023

In [254]:
# Confirm the row count after removing documentaries.
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 140422 entries, 34805 to 9033105
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          140422 non-null  object
 1   titleType       140422 non-null  object
 2   primaryTitle    140422 non-null  object
 3   originalTitle   140422 non-null  object
 4   isAdult         140422 non-null  object
 5   startYear       140422 non-null  object
 6   endYear         0 non-null       object
 7   runtimeMinutes  140422 non-null  object
 8   genres          140422 non-null  object
dtypes: object(9)
memory usage: 10.7+ MB


In [257]:
# Flag the basics rows whose tconst appears among the titleId values in akas.
# NOTE(review): in the cells shown here akas is used exactly as loaded (no row
# filtering), so this can only remove titles with no akas entry at all --
# confirm that is the intended selection.
akas_title_ids = akas['titleId']
keepers = basics['tconst'].isin(akas_title_ids)
keepers

34805      True
61119      True
67672      True
77968      True
86806      True
           ... 
9032928    True
9032937    True
9032976    True
9033021    True
9033105    True
Name: tconst, Length: 140422, dtype: bool

In [258]:
# Keep only the titles flagged in `keepers`, then display the result.
basics = basics.loc[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
34805,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
61119,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020,,70,Drama
67672,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
77968,tt0079644,movie,November 1828,November 1828,0,2001,,140,"Drama,War"
86806,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"
...,...,...,...,...,...,...,...,...,...
9032928,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
9032937,tt9916190,movie,Safeguard,Safeguard,0,2020,,90,"Action,Adventure,Thriller"
9032976,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0,2020,,84,Thriller
9033021,tt9916362,movie,Coven,Akelarre,0,2020,,92,"Drama,History"
