In [51]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import math
from statsmodels.stats import diagnostic
import statsmodels.stats as st
from scipy import stats
import networkx as nx
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge
from sklearn.ensemble import RandomForestClassifier
# from sklearn.preprocessing import OneHotEncoder
# from pandas.plotting import scatter_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error, auc, roc_curve

from director_scrap import director_scrap

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Crew Database

Database of the directors and writers associated with each film

In [52]:
# Read the tab-separated crew table and preview its first row.
df_crew = pd.read_csv('data/crew.tsv', sep='\t')
df_crew.iloc[:1]

Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,


# Name Database

Database of the names of people who contributed to a movie (director, writer, actor, cosplayer...)

In [54]:
# Read the tab-separated name table and preview its first row.
df_name = pd.read_csv('data/name.tsv', sep='\t')
df_name.iloc[:1]

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,nm0000001,Fred Astaire,1899.0,1987.0,"soundtrack,actor,miscellaneous","tt0072308,tt0053137,tt0031983,tt0050419"


# Database Title

Database of movie titles and other related information

In [55]:
# Read the tab-separated title table and preview its first row.
df_title = pd.read_csv('data/title.tsv', sep='\t')
df_title.iloc[:1]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0.0,1894.0,,1.0,"Documentary,Short"


# Database Imdb_Rating

Database of movie ratings

In [56]:
# Read the tab-separated IMDb rating table and preview its first row.
df_rating = pd.read_csv('data/imdb_rating.tsv', sep='\t')
df_rating.iloc[:1]

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2004


# Processing

All additional processing done to the data

In [57]:
# Turn every comma-separated `knownForTitles` string into a list of title ids.
# Entries that are not plain `str` objects (e.g. missing values) become NaN,
# so `split` stays aligned position-by-position with the rows of `df_name`.
titles_string = df_name['knownForTitles'].values
split = [
    entry.split(',') if type(entry) is str else np.nan
    for entry in titles_string
]

# Merging Step

Merge the above databases and keep only the useful information.

In [76]:
# Attach ratings to the crew rows; the inner join keeps only titles
# whose `tconst` appears in both tables.
df_crew_rating = pd.merge(df_crew, df_rating, on='tconst', how='inner')

In [79]:
# Preview the first two rows of the crew/rating join.
df_crew_rating.iloc[:2]

Unnamed: 0,tconst,directors,writers,averageRating,numVotes
0,tt0000001,nm0005690,,5.7,2004
1,tt0000002,nm0721526,,5.8,269


In [78]:
# Add the title metadata; again only `tconst` values present on both sides survive.
df_cr_title = pd.merge(df_crew_rating, df_title, on='tconst', how='inner')
df_cr_title.iloc[:2]

Unnamed: 0,tconst,directors,writers,averageRating,numVotes,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,nm0005690,,5.7,2004,short,Carmencita,Carmencita,0.0,1894.0,,1.0,"Documentary,Short"
1,tt0000002,nm0721526,,5.8,269,short,Le clown et ses chiens,Le clown et ses chiens,0.0,1892.0,,5.0,"Animation,Short"


In [80]:
# Resolve the director id to the person's record in the name table.
# Inner join: rows whose `directors` value has no exact match in
# `df_name.nconst` are dropped.
df_crt_dir = pd.merge(
    df_cr_title,
    df_name,
    how='inner',
    left_on='directors',
    right_on='nconst',
)
df_crt_dir.iloc[:2]

Unnamed: 0,tconst,directors,writers,averageRating,numVotes,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,nconst,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,tt0000001,nm0005690,,5.7,2004,short,Carmencita,Carmencita,0.0,1894.0,,1.0,"Documentary,Short",nm0005690,William K.L. Dickson,1860.0,1935.0,"cinematographer,director,producer","tt1428455,tt0219560,tt0308254,tt1496763"
1,tt0000005,nm0005690,,6.2,2685,short,Blacksmith Scene,Blacksmith Scene,0.0,1893.0,,1.0,"Comedy,Short",nm0005690,William K.L. Dickson,1860.0,1935.0,"cinematographer,director,producer","tt1428455,tt0219560,tt0308254,tt1496763"


In [97]:
# Discard the identifier and metadata columns that are not carried forward.
df_crtd = df_crt_dir.drop(
    columns=['tconst', 'directors', 'numVotes', 'primaryProfession', 'isAdult']
)
df_crtd.head(5)

Unnamed: 0,writers,averageRating,titleType,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,genres,nconst,primaryName,birthYear,deathYear,knownForTitles
0,,5.7,short,Carmencita,Carmencita,1894.0,,1.0,"Documentary,Short",nm0005690,William K.L. Dickson,1860.0,1935.0,"tt1428455,tt0219560,tt0308254,tt1496763"
1,,6.2,short,Blacksmith Scene,Blacksmith Scene,1893.0,,1.0,"Comedy,Short",nm0005690,William K.L. Dickson,1860.0,1935.0,"tt1428455,tt0219560,tt0308254,tt1496763"
2,,5.0,short,Chinese Opium Den,Chinese Opium Den,1894.0,,1.0,Short,nm0005690,William K.L. Dickson,1860.0,1935.0,"tt1428455,tt0219560,tt0308254,tt1496763"
3,,5.4,short,Edison Kinetoscopic Record of a Sneeze,Edison Kinetoscopic Record of a Sneeze,1894.0,,1.0,"Documentary,Short",nm0005690,William K.L. Dickson,1860.0,1935.0,"tt1428455,tt0219560,tt0308254,tt1496763"
4,nm0410331,4.4,short,Awakening of Rip,Awakening of Rip,1896.0,,1.0,"Drama,Short",nm0005690,William K.L. Dickson,1860.0,1935.0,"tt1428455,tt0219560,tt0308254,tt1496763"


In [104]:
# Give the director-related and rating/runtime columns explicit names.
# NOTE(review): this maps `runtimeMinutes` to `runtime_min`, but the saved cell
# outputs in this notebook show the column as `runtime` -- confirm which name
# downstream code expects before re-running.
df_crtd = df_crtd.rename(
    {
        'primaryName': 'director',
        'birthYear': 'dir_birth',
        'deathYear': 'dir_death',
        'runtimeMinutes': 'runtime_min',
        'averageRating': 'imdb_rating',
        'knownForTitles': 'dir_known_titles',
    },
    axis='columns',
)
df_crtd.head(5)

Unnamed: 0,writers,imdb_rating,titleType,primaryTitle,originalTitle,startYear,endYear,runtime,genres,nconst,director,dir_birth,dir_death,dir_known_titles
0,,5.7,short,Carmencita,Carmencita,1894.0,,1.0,"Documentary,Short",nm0005690,William K.L. Dickson,1860.0,1935.0,"tt1428455,tt0219560,tt0308254,tt1496763"
1,,6.2,short,Blacksmith Scene,Blacksmith Scene,1893.0,,1.0,"Comedy,Short",nm0005690,William K.L. Dickson,1860.0,1935.0,"tt1428455,tt0219560,tt0308254,tt1496763"
2,,5.0,short,Chinese Opium Den,Chinese Opium Den,1894.0,,1.0,Short,nm0005690,William K.L. Dickson,1860.0,1935.0,"tt1428455,tt0219560,tt0308254,tt1496763"
3,,5.4,short,Edison Kinetoscopic Record of a Sneeze,Edison Kinetoscopic Record of a Sneeze,1894.0,,1.0,"Documentary,Short",nm0005690,William K.L. Dickson,1860.0,1935.0,"tt1428455,tt0219560,tt0308254,tt1496763"
4,nm0410331,4.4,short,Awakening of Rip,Awakening of Rip,1896.0,,1.0,"Drama,Short",nm0005690,William K.L. Dickson,1860.0,1935.0,"tt1428455,tt0219560,tt0308254,tt1496763"


In [105]:
# Resolve the writer id to the person's record in the name table.
# Left join: titles without a matching writer are kept, with the writer
# columns left empty. Both inputs carry an `nconst` column, so pandas
# suffixes them as `nconst_x` (director) and `nconst_y` (writer).
df_fin = pd.merge(
    df_crtd,
    df_name,
    how='left',
    left_on='writers',
    right_on='nconst',
)
df_fin.head(5)

Unnamed: 0,writers,imdb_rating,titleType,primaryTitle,originalTitle,startYear,endYear,runtime,genres,nconst_x,director,dir_birth,dir_death,dir_known_titles,nconst_y,primaryName,birthYear,deathYear,primaryProfession,knownForTitles
0,,5.7,short,Carmencita,Carmencita,1894.0,,1.0,"Documentary,Short",nm0005690,William K.L. Dickson,1860.0,1935.0,"tt1428455,tt0219560,tt0308254,tt1496763",,,,,,
1,,6.2,short,Blacksmith Scene,Blacksmith Scene,1893.0,,1.0,"Comedy,Short",nm0005690,William K.L. Dickson,1860.0,1935.0,"tt1428455,tt0219560,tt0308254,tt1496763",,,,,,
2,,5.0,short,Chinese Opium Den,Chinese Opium Den,1894.0,,1.0,Short,nm0005690,William K.L. Dickson,1860.0,1935.0,"tt1428455,tt0219560,tt0308254,tt1496763",,,,,,
3,,5.4,short,Edison Kinetoscopic Record of a Sneeze,Edison Kinetoscopic Record of a Sneeze,1894.0,,1.0,"Documentary,Short",nm0005690,William K.L. Dickson,1860.0,1935.0,"tt1428455,tt0219560,tt0308254,tt1496763",,,,,,
4,nm0410331,4.4,short,Awakening of Rip,Awakening of Rip,1896.0,,1.0,"Drama,Short",nm0005690,William K.L. Dickson,1860.0,1935.0,"tt1428455,tt0219560,tt0308254,tt1496763",nm0410331,Washington Irving,1783.0,1859.0,"writer,script_department,miscellaneous","tt0051312,tt9899970,tt3055374,tt0162661"


In [116]:
# Remove the join keys and leftover profession column, then give the
# writer- and movie-related columns their final names.
# NOTE(review): `writer_know_titles` is spelled differently from
# `dir_known_titles`; kept as-is because it is an exported column name.
df = (
    df_fin
    .drop(columns=['nconst_x', 'nconst_y', 'primaryProfession', 'writers'])
    .rename(
        columns={
            'primaryName': 'writer',
            'birthYear': 'writer_birth',
            'deathYear': 'writer_death',
            'knownForTitles': 'writer_know_titles',
            'titleType': 'type',
            'primaryTitle': 'popular_title',
            'originalTitle': 'original_title',
            'startYear': 'movie_start_year',
            'endYear': 'movie_end_year',
        }
    )
)

In [117]:
# Preview the first five rows of the merged, renamed table.
df.head(5)

Unnamed: 0,imdb_rating,type,popular_title,original_title,movie_start_year,movie_end_year,runtime,genres,director,dir_birth,dir_death,dir_known_titles,writer,writer_birth,writer_death,writer_know_titles
0,5.7,short,Carmencita,Carmencita,1894.0,,1.0,"Documentary,Short",William K.L. Dickson,1860.0,1935.0,"tt1428455,tt0219560,tt0308254,tt1496763",,,,
1,6.2,short,Blacksmith Scene,Blacksmith Scene,1893.0,,1.0,"Comedy,Short",William K.L. Dickson,1860.0,1935.0,"tt1428455,tt0219560,tt0308254,tt1496763",,,,
2,5.0,short,Chinese Opium Den,Chinese Opium Den,1894.0,,1.0,Short,William K.L. Dickson,1860.0,1935.0,"tt1428455,tt0219560,tt0308254,tt1496763",,,,
3,5.4,short,Edison Kinetoscopic Record of a Sneeze,Edison Kinetoscopic Record of a Sneeze,1894.0,,1.0,"Documentary,Short",William K.L. Dickson,1860.0,1935.0,"tt1428455,tt0219560,tt0308254,tt1496763",,,,
4,4.4,short,Awakening of Rip,Awakening of Rip,1896.0,,1.0,"Drama,Short",William K.L. Dickson,1860.0,1935.0,"tt1428455,tt0219560,tt0308254,tt1496763",Washington Irving,1783.0,1859.0,"tt0051312,tt9899970,tt3055374,tt0162661"


In [None]:
# Report the row count before and after removing rows that are
# identical across every column.
print(df.shape[0])
df = df.drop_duplicates(keep='first')
print(df.shape[0])

In [122]:
# Keep only the first row for each `popular_title`.
# NOTE(review): `tconst` was dropped earlier, so different titles that share
# the same `popular_title` collapse into a single row here -- confirm this is
# intended (e.g. to make the title usable as a key).
df = df.drop_duplicates(subset='popular_title', keep='first')
df.shape[0]

734498

In [123]:
# Preview the first five rows after de-duplication.
df.head(5)

Unnamed: 0,imdb_rating,type,popular_title,original_title,movie_start_year,movie_end_year,runtime,genres,director,dir_birth,dir_death,dir_known_titles,writer,writer_birth,writer_death,writer_know_titles
0,5.7,short,Carmencita,Carmencita,1894.0,,1.0,"Documentary,Short",William K.L. Dickson,1860.0,1935.0,"tt1428455,tt0219560,tt0308254,tt1496763",,,,
1,6.2,short,Blacksmith Scene,Blacksmith Scene,1893.0,,1.0,"Comedy,Short",William K.L. Dickson,1860.0,1935.0,"tt1428455,tt0219560,tt0308254,tt1496763",,,,
2,5.0,short,Chinese Opium Den,Chinese Opium Den,1894.0,,1.0,Short,William K.L. Dickson,1860.0,1935.0,"tt1428455,tt0219560,tt0308254,tt1496763",,,,
3,5.4,short,Edison Kinetoscopic Record of a Sneeze,Edison Kinetoscopic Record of a Sneeze,1894.0,,1.0,"Documentary,Short",William K.L. Dickson,1860.0,1935.0,"tt1428455,tt0219560,tt0308254,tt1496763",,,,
4,4.4,short,Awakening of Rip,Awakening of Rip,1896.0,,1.0,"Drama,Short",William K.L. Dickson,1860.0,1935.0,"tt1428455,tt0219560,tt0308254,tt1496763",Washington Irving,1783.0,1859.0,"tt0051312,tt9899970,tt3055374,tt0162661"


# Export Database to tsv

**Don't run the cell below unless you want to overwrite the imdb_data.tsv file**

In [124]:
# Write the final table as a tab-separated file, without the index column.
# This overwrites any existing `data/imdb_data.tsv`.
df.to_csv(path_or_buf='data/imdb_data.tsv', sep='\t', index=False)