In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import json
import ast

In [2]:


def _load_table(filename):
    """Read one CSV from the working directory and print a one-line summary.

    Prints the file name, the frame's shape and its column index, then
    returns the loaded DataFrame.
    """
    table = pd.read_csv(filename)
    print("{0} {1} : {2}".format(filename, table.shape, table.columns))
    return table


# Plain relative names instead of ".\name.csv": the backslash form only
# resolves on Windows, and "\c", "\k", "\l", "\m" are invalid string escapes
# that newer Python versions warn about.
#Credits
credits = _load_table("credits.csv")
#Keywords
keywords = _load_table("keywords.csv")
#Links
links = _load_table("links.csv")
#MetaData
metadata = _load_table("movies_metadata.csv")

def check_int(value):
    """Flag values that cannot be interpreted as an integer.

    Returns NaN when ``int(value)`` succeeds and the original value when the
    conversion fails, so ``Series.apply(check_int).dropna()`` leaves only the
    offending entries.
    """
    try:
        int(value)
    except (ValueError, TypeError, OverflowError):
        # ValueError: non-numeric text or NaN; TypeError: None and other
        # non-numeric objects; OverflowError: infinite floats.
        return value
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    return np.nan
    
def check_bool(value):
    """Flag values that are not a recognisable boolean.

    Follows the same convention as ``check_int``: returns NaN for a valid
    boolean and the original value otherwise.  Accepted are real booleans
    (Python or NumPy) and the strings "True" / "False" in any letter case,
    ignoring surrounding whitespace.  Anything else, including the numbers
    0 and 1, is returned unchanged so it can be inspected.

    ``bool(value)`` cannot serve as the validity test: it succeeds for
    practically every object (any non-empty string is truthy), so no value
    would ever be flagged.
    """
    if isinstance(value, (bool, np.bool_)):
        return np.nan
    if isinstance(value, str) and value.strip().lower() in ("true", "false"):
        return np.nan
    return value

credits.csv (45476, 3) : Index(['cast', 'crew', 'id'], dtype='object')
keywords.csv (46419, 2) : Index(['id', 'keywords'], dtype='object')
links.csv (45843, 3) : Index(['movieId', 'imdbId', 'tmdbId'], dtype='object')
movies_metadata.csv (45466, 24) : Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')


  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


### 1. The tables contain several kinds of ids; verify that 'id' is the right feature to merge on

In [3]:
# Work on just the identifying columns of the metadata table.
sub = metadata[['imdb_id', 'id', 'title']]

# 'id' could not be cast to int directly, so collect the entries that fail
# the integer check; after dropna() only the non-integer ids remain.
invalid = sub['id'].apply(check_int).dropna()
has_invalid_id = sub['id'].isin(invalid)

print("Invalid Ids")
print(sub.loc[has_invalid_id])

# Keep the rows with a usable id, cast it to int64, then join credits and
# keywords on it.
cleansub = sub.loc[~has_invalid_id].astype({'id': 'int64'})
credit_meta = cleansub.merge(credits, on='id')
credit_meta_keywords = credit_meta.merge(keywords, on='id')

print()
print("Merged Tables")
print(credit_meta_keywords.dtypes)
print(credit_meta_keywords.columns)

Invalid Ids
      imdb_id          id title
19730       0  1997-08-20   NaN
29503       0  2012-09-29   NaN
35587       0  2014-01-01   NaN

Merged Tables
imdb_id     object
id           int64
title       object
cast        object
crew        object
keywords    object
dtype: object
Index(['imdb_id', 'id', 'title', 'cast', 'crew', 'keywords'], dtype='object')


##### Verified that the tables can be merged on 'id', which means 'links.csv' is not needed

### 2. Extract the important features from each CSV and cast them to appropriate types

In [4]:
def _flag_column(column):
    """Convert a True/False column to real booleans by value, not truthiness.

    ``astype('bool')`` on an object column treats every non-empty string,
    including the text 'False', as True, and also turns NaN into True.  Here
    a cell is True only when its text form is 'true' (any letter case);
    everything else, including missing cells, becomes False.
    """
    return column.astype(str).str.strip().str.lower().eq('true')


def _parse_literal(cell):
    """Evaluate a stringified Python literal; non-string cells (e.g. NaN) give []."""
    return ast.literal_eval(cell) if isinstance(cell, str) else []


metaSubset = metadata[['adult', 'budget', 'genres', 'id',
                       'original_language', 'popularity', 'production_companies',
                       'production_countries', 'release_date', 'revenue', 'runtime',
                       'spoken_languages', 'status', 'title', 'video',
                       'vote_average', 'vote_count']]

# Drop the rows whose id failed the integer check.  The mask is built from
# this frame's own 'id' column, and .copy() gives an independent frame so the
# column assignments below do not raise SettingWithCopyWarning.
metaSubset = metaSubset.loc[~metaSubset['id'].isin(invalid)].copy()

metaSubset = metaSubset.astype({'budget': 'float',
                                'id': 'int64',
                                'original_language': 'string',
                                'popularity': 'float',
                                'runtime': 'float',
                                'title': 'string'})
# Boolean flags are converted by value (see _flag_column) rather than astype.
metaSubset['adult'] = _flag_column(metaSubset['adult'])
metaSubset['video'] = _flag_column(metaSubset['video'])

metaSubset['release_date'] = pd.to_datetime(metaSubset['release_date'])

#cut unreleased films, then remove column
metaSubset = metaSubset[metaSubset['status'] == 'Released'].drop(columns='status')

#revenue is missing (<= 0) for most of the movies in the dataset, remove column
print("Number of missing revenue values: {0}".format(int((metaSubset['revenue'] <= 0).sum())))
metaSubset = metaSubset.drop(columns='revenue')

# Positional access: label 0 is not guaranteed to survive the filters above.
print(metaSubset['production_countries'].iloc[0])

#genres, production_companies, production_countries come in as strings;
#interpret them as objects and pull the identifying key of every entry into
#its own list column.
for source_col, ids_col, id_key in [('genres', 'genreIds', 'id'),
                                    ('production_companies', 'productionCompIds', 'id'),
                                    ('production_countries', 'productionCountryId', 'iso_3166_1')]:
    metaSubset[source_col] = metaSubset[source_col].apply(_parse_literal)
    metaSubset[ids_col] = metaSubset[source_col].apply(
        lambda entries, id_key=id_key: [entry[id_key] for entry in entries])

Number of missing revenue values: 37619
[{'iso_3166_1': 'US', 'name': 'United States of America'}]


### credits table
##### (this one takes some time)

In [20]:
# Independent copy of the three credits columns, so that columns can later be
# overwritten on creditsSub without pandas' SettingWithCopyWarning.
creditsSub = credits[['cast', 'crew', 'id']].copy()


def ParseCrew(crewString):
    """Parse a stringified crew list and keep only selected roles.

    Parameters
    ----------
    crewString : str
        Python-literal text of a list of dicts, one per crew member.

    Returns
    -------
    list of dict or float
        The members whose 'job' is 'Director', 'Screenplay' or 'Editor', each
        reduced to its 'id', 'job' and 'name' keys (an empty list when no
        member matches).  NaN when the input is not a string (a missing cell)
        or parses to an empty list.
    """
    if not isinstance(crewString, str):
        # Missing cells arrive as NaN/None; ast.literal_eval would raise on them.
        return np.nan

    members = ast.literal_eval(crewString)
    if len(members) == 0:
        return np.nan

    wanted_jobs = ('Director', 'Screenplay', 'Editor')
    kept_keys = ('id', 'job', 'name')
    # .get() so an entry without a 'job' key is skipped instead of raising KeyError.
    return [
        {key: value for key, value in member.items() if key in kept_keys}
        for member in members
        if member.get('job') in wanted_jobs
    ]

# Replace the raw crew strings with the parsed, role-filtered records and
# preview the first few rows.
crew_parsed = creditsSub['crew'].apply(ParseCrew)
creditsSub['crew'] = crew_parsed
print(crew_parsed.head())



def ParseCast(castString):
    """Parse a stringified cast list, keeping only each member's id and name.

    Parameters
    ----------
    castString : str
        Python-literal text of a list of dicts, one per cast member.

    Returns
    -------
    list of dict or float
        One dict per cast member containing only its 'id' and 'name' keys.
        NaN when the input is not a string (a missing cell) or parses to an
        empty list.
    """
    if not isinstance(castString, str):
        # Missing cells arrive as NaN/None; ast.literal_eval would raise on them.
        return np.nan

    members = ast.literal_eval(castString)
    if len(members) == 0:
        return np.nan

    kept_keys = ('id', 'name')
    return [
        {key: value for key, value in member.items() if key in kept_keys}
        for member in members
    ]
    
# Replace the raw cast strings with the parsed id/name records.
cast_parsed = creditsSub['cast'].apply(ParseCast)
creditsSub['cast'] = cast_parsed


0    [{'id': 7879, 'job': 'Director', 'name': 'John...
1    [{'id': 876, 'job': 'Screenplay', 'name': 'Jon...
2    [{'id': 26502, 'job': 'Director', 'name': 'How...
3    [{'id': 2178, 'job': 'Director', 'name': 'Fore...
4    [{'id': 17698, 'job': 'Screenplay', 'name': 'N...
Name: crew, dtype: object


In [21]:
# Preview the first five rows of the parsed credits table.
creditsSub.head(5)

Unnamed: 0,cast,crew,id
0,"[{'id': 31, 'name': 'Tom Hanks'}, {'id': 12898...","[{'id': 7879, 'job': 'Director', 'name': 'John...",862
1,"[{'id': 2157, 'name': 'Robin Williams'}, {'id'...","[{'id': 876, 'job': 'Screenplay', 'name': 'Jon...",8844
2,"[{'id': 6837, 'name': 'Walter Matthau'}, {'id'...","[{'id': 26502, 'job': 'Director', 'name': 'How...",15602
3,"[{'id': 8851, 'name': 'Whitney Houston'}, {'id...","[{'id': 2178, 'job': 'Director', 'name': 'Fore...",31357
4,"[{'id': 67773, 'name': 'Steve Martin'}, {'id':...","[{'id': 17698, 'job': 'Screenplay', 'name': 'N...",11862


### keywords table

In [30]:
# Take an independent copy so the parsed column can be assigned without
# pandas' SettingWithCopyWarning.
keywordsSub = keywords[['id', 'keywords']].copy()
# The keywords arrive as text; evaluate each cell into the Python object it spells.
keywordsSub['keywords'] = keywordsSub['keywords'].apply(ast.literal_eval)
keywordsSub.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


## Merge Tables

In [32]:
# Join the cleaned metadata with the parsed credits (creditsSub) rather than
# the raw `credits` frame, so cast/crew arrive as parsed records, not strings.
# The row set is the same because both frames carry the same 'id' values.
# NOTE(review): keywordsSub is not joined here; confirm whether it should be.
allData = metaSubset.merge(creditsSub, on='id')

# Number of merged rows whose budget is recorded as 0.
int((allData['budget'] == 0).sum())


36200