# Data Clean on IMDB Title Basics Data Set

In [1]:
# import libraries
import pandas as pd
import pickle

In [2]:
# load the title basics dataset
tBasics = pd.read_csv('./data/title.basics.tsv', sep='\t',low_memory=False)

# display row & column counts
tBasics.shape

(5139702, 9)

In [3]:
# view data frame
tBasics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,\N,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,Short


In [4]:
# drop all adult titles
tBasicsSub = tBasics[tBasics.isAdult == 0]

# display row & column counts
tBasicsSub.shape

(4982679, 9)

In [5]:
# drop all title types except movies
tBasicsSub = tBasicsSub[tBasicsSub.titleType == 'movie']

# display row & column counts
tBasicsSub.shape

(482649, 9)

In [6]:
# view data frame
tBasicsSub.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,\N,45,Romance
145,tt0000147,movie,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,0,1897,\N,20,"Documentary,News,Sport"
332,tt0000335,movie,Soldiers of the Cross,Soldiers of the Cross,0,1900,\N,\N,"Biography,Drama"
499,tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N
571,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,\N,70,"Biography,Crime,Drama"


In [7]:
# drop all movies that have no runtime value
tBasicsSub = tBasicsSub[tBasicsSub.runtimeMinutes != '\\N']

# display row & column counts
tBasicsSub.shape

(301622, 9)

In [8]:
# convert runtime minutes to an integer type
tBasicsSub.runtimeMinutes = pd.to_numeric(tBasicsSub.runtimeMinutes , errors='coerce')

Drop any movie title that has a runtime of 79 minutes or less. The Screen Actor Guild (SAG) defines a feature film at 80 minutes or greater in runtime length. 
https://screenwriting.io/what-is-a-feature-film/
https://en.wikipedia.org/wiki/Feature_film

In [9]:
# select all movies that are 80-minutes or more in runtime minutes
tBasicsSub = tBasicsSub[tBasicsSub.runtimeMinutes > 79]

# display row & column counts
tBasicsSub.shape

(204512, 9)

In [10]:
# view data frame
tBasicsSub.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
499,tt0000502,movie,Bohemios,Bohemios,0,1905,\N,100,\N
673,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,\N,120,"Adventure,Fantasy"
1740,tt0001756,movie,Lucha por la herencia,Lucha por la herencia,0,1911,\N,92,\N
2077,tt0002101,movie,Cleopatra,Cleopatra,0,1912,\N,100,"Drama,History"
2397,tt0002423,movie,Madame DuBarry,Madame DuBarry,0,1919,\N,85,"Biography,Drama"


In [11]:
# save cleaned data set to a pickle file
tBasicsSub.to_pickle('v1_tBasicsCleaned.pkl')