In [343]:
import pandas as pd
from imdb import IMDb
import datetime 
import time
import re
import numpy as np
import sklearn.svm

Import Movie Data  
make sure we are using the same movies across all three data sets

In [344]:
# Load the three movie sources and the IMDb people table, de-duplicating each one.

# TMDb: only the movie id is needed to decide which movies to keep.
df_tm = pd.read_csv('./datasets/input/tmdb_data_trim.csv',
                    encoding='utf-8',
                    usecols=['id'])
df_tm = df_tm.dropna(axis=0, how='any').drop_duplicates()

# MovieLens: the id map between MovieLens, TMDb and IMDb.
df_ml = pd.read_csv('./datasets/input/ml_data.csv',
                    encoding='utf-8',
                    usecols=['tmdbId', 'mlId', 'imdbId'])
df_ml = df_ml.dropna(axis=0, how='any').drop_duplicates()
# Drop every row whose imdbId maps to more than one entry (keep=False flags
# all members of a duplicate group, not just the repeats).
df_ml = df_ml[~df_ml.duplicated(subset=['imdbId'], keep=False)]

# IMDb movies: read columns 2..14 of the file, drop rows that are entirely empty.
df_im = pd.read_csv('./datasets/input/imdb_movies.csv', encoding='utf-8', usecols=range(2, 15))
df_im = df_im.dropna(axis=0, how='all')
# drop_duplicates() returns a new frame; the result must be assigned to take effect.
df_im = df_im.drop_duplicates()
# De-duplicate by imdb_id, keeping the first occurrence of each id.
df_im = df_im.drop_duplicates(subset=['imdb_id'], keep='first')

# IMDb people: one row per (movie, person, role) record as stored in the file.
df_im_people = pd.read_csv('./datasets/input/imdb_people.csv', encoding='utf-8')
df_im_people = df_im_people.drop_duplicates()

# Row counts after cleaning: TMDb, MovieLens, IMDb movies, IMDb people.
print (len(df_tm.index))
print (len(df_ml.index))
print (len(df_im.index))
print (len(df_im_people.index))

36129
37720
37967
2023923


Filter IMDb to match ids in TMDb

In [345]:
# Get the id triples (imdbId, tmdbId, mlId) for movies present in both
# MovieLens and TMDb: inner join of the TMDb ids with the MovieLens id map.
df_ml_ids = df_tm.merge(df_ml , left_on='id', right_on='tmdbId')
df_ml_ids = df_ml_ids[['imdbId','tmdbId','mlId']].drop_duplicates()

# Limit IMDb to movies that are in both MovieLens and TMDb.
df_im = df_ml_ids.merge(df_im , left_on='imdbId', right_on='imdb_id')
# The join leaves two copies of the IMDb key ('imdbId' from the left frame and
# 'imdb_id' from the right); drop the left one by name instead of by position.
df_im = df_im.drop('imdbId', axis=1)

print ('TMDb  : ',str(len(df_tm.index)))
print ('M Lens: ',str(len(df_ml.index)))
print ('IMDb  : ',str(len(df_im.index)))

('TMDb  : ', '36129')
('M Lens: ', '37720')
('IMDb  : ', '36007')


Filter the people

In [346]:
# Restrict the people table to the movies that survived the filtering above.
# The movie key is exposed under the name 'movieid' for the join.
df_imdb_id = df_im[['imdb_id']].rename(columns={'imdb_id': 'movieid'})

# Inner join: only people attached to a retained movie remain.
# Note the people file spells its key column 'imbd_id'.
df_im_people = df_imdb_id.merge(df_im_people, left_on='movieid', right_on='imbd_id')

# Keep the four columns of interest and correct the key's spelling on the way out.
df_im_people = (
    df_im_people[['imbd_id', 'name', 'person_id', 'role_id']]
    .rename(columns={'imbd_id': 'imdb_id'})
)

df_im_people.to_csv('./datasets/output/imdb_people_trim.csv', encoding='utf-8', index=False)

clean imdb movie file

In [347]:
# Reduce the runtime string to its first run of digits (the minutes) and make it numeric.
# The pattern must use '+' rather than '*': '[0-9]*' happily matches the empty
# string at position 0, so any value that starts with a non-digit prefix
# (presumably something like 'USA:81' -- TODO confirm against the raw file)
# would be extracted as '' and end up as NaN instead of the minutes.
# Values that already start with digits yield the same result as before.
df_im['runtime'] = pd.to_numeric(df_im['runtime'].str.extract('([0-9]+)', expand=False))

# Flag shorts as a feature of the movie rather than treating 'Short' as a genre.
# Rows with a missing genres value keep a missing flag (str.contains propagates NaN).
df_im["is_short"] = df_im.genres.str.contains('Short').tolist()
df_im.head(1)

Unnamed: 0,tmdbId,mlId,country_codes,cover_url,cover_url_full,genres,imdb_id,kind,language_codes,plot,plot_outline,rating,runtime,title,votes,is_short
0,862.0,1,us,https://images-na.ssl-images-amazon.com/images...,https://images-na.ssl-images-amazon.com/images...,Animation|Adventure|Comedy|Family|Fantasy,114709,movie,en,A little boy named Andy loves to be in his roo...,A cowboy doll is profoundly threatened and jea...,8.3,81.0,Toy Story,667412.0,False


Create multi-label response

In [348]:
# Start the response table with the id key (for linkage) and the raw genre string.
# .copy() gives y its own data so that adding columns to it never touches,
# or warns about touching, a slice of df_im.
y = df_im[['imdb_id', 'genres']].copy()

# Multi-hot encode the pipe-delimited genres column in a single step:
#   "Romance|Comedy" -> Romance=1, Comedy=1, every other genre column 0.
# Rows with a missing genres value get 0 in every genre column.
# This replaces the split / per-position one-hot / column-by-column merge loop,
# which relied on Python 2's xrange and on the removed DataFrame.ix indexer.
genre_flags = df_im['genres'].str.get_dummies(sep='|')

# Append one indicator column per genre after the two key columns, so that
# everything from column index 2 onward in y is a genre indicator.
y = pd.concat([y, genre_flags], axis=1)

In [349]:
# Prepend the genre indicator columns (everything in y after the 'imdb_id' and
# 'genres' key columns) to the movie table. .iloc is the positional indexer;
# DataFrame.ix has been removed from pandas.
df_im = pd.concat([y.iloc[:, 2:], df_im], axis=1)

In [350]:
# Genre indicator columns, in the order they should appear in the output file.
genre_columns = [
    'Action', 'Adult', 'Adventure', 'Animation', 'Biography', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Family', 'Fantasy', 'Film-Noir',
    'History', 'Horror', 'Music', 'Musical', 'Mystery', 'Romance',
    'Sci-Fi', 'Short', 'Sport', 'Talk-Show', 'Thriller', 'War',
    'Western', 'Game-Show', 'News', 'Reality-TV',
]

# Identifier and descriptive columns that follow the genre indicators.
info_columns = [
    'tmdbId', 'mlId', 'imdb_id', 'title',
    'genres', 'kind',
    'country_codes', 'language_codes',
    'rating', 'votes', 'runtime',
    'is_short', 'plot', 'plot_outline',
]

# Select and order the final columns, then write the trimmed movie table.
df_im = df_im[genre_columns + info_columns]

df_im.to_csv('./datasets/output/imdb_movies_trim.csv', encoding='utf-8', index=False)