In [3]:
import pandas as pd
import json
import numpy as np
import seaborn as sn
from matplotlib import pyplot as plt
from io import StringIO
%matplotlib inline

In [4]:
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import auc, roc_auc_score, roc_curve, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer

In [5]:
###Data Cleanup and EDA####

In [6]:
df_movies =  pd.read_csv('5kMovies.csv')
df_credits = pd.read_csv('5kCredits.csv')

In [7]:
df_tmdb = pd.concat([df_movies, df_credits], axis=1)

In [9]:
new_column_order = ['id', 'title', 'original_language', 'cast','genres', 'release_date','runtime', 'popularity', 'vote_count', 'vote_average', 'budget', 'revenue']

In [None]:
genres_list = ['Action', 'Adventure', 'Fantasy', 'Science Fiction', 'Thriller', 'Comedy']

In [10]:
def data_cleanup(df, new_columns):
    df_tmdb_new = df[new_column_order]
    df_tmdb_new = df_tmdb_new.loc[:, ~df_tmdb_new.columns.duplicated()]
    
    categorical= list(df_tmdb_new.dtypes[df_tmdb_new.dtypes=='object'].index)
    numerical = list(df_tmdb_new.dtypes[df_tmdb_new.dtypes!='object'].index)
    
    for c in categorical:
        df_tmdb_new.loc[:,c] = df_tmdb_new.loc[:,c].fillna('NA')

    for c in numerical:
        df_tmdb_new.loc[:,c] = df_tmdb_new.loc[:,c].fillna(0)

    
    df_tmdb_new.columns = ['id', 'title', 'language', 'cast','genres', 'release_date','runtime', 'popularity', 'vote_count', 'vote_avg', 'budget', 'revenue']
    
    df_filtered = df_tmdb_new[df_tmdb_new['genres'].str.len() > 2]
    return df_filtered

In [11]:
def jsontocsv(jsonstr:list):
    if len(jsonstr) < 3:
        return 'NA'
    df = pd.read_json(StringIO(jsonstr))
    csv_string = ', '.join(df['name'].apply(str))
    return csv_string

In [12]:
def genres_json_to_csv(row):
    return jsontocsv(row['genres'])

In [27]:
def normalize_genres(df_movies):
    df_movies['genres_csv'] = df_movies.apply(genres_json_to_csv, axis=1)
    for g in genres_list:
        df_movies[g] = df_movies['genres_csv'].apply(lambda x: g in x)
    df_movies = df_movies.drop('genres_csv', axis=1)
    df_movies = df_movies.drop('genres', axis=1)
    df_movies.columns = ['id', 'title', 'language', 'cast','Action', 'Adventure', 'Fantasy', 'Science Fiction', 'Thriller', 'Comedy', 'release_date','runtime', 'popularity', 'vote_count', 'vote_avg', 'budget', 'revenue']
    return df_movies

In [28]:
df_movies = data_cleanup(df_tmdb, new_column_order)
df_movies = normalize_genres(df_movies)

In [29]:
df_movies.head(10)

Unnamed: 0,id,title,language,cast,Action,Adventure,Fantasy,Science Fiction,Thriller,Comedy,release_date,runtime,popularity,vote_count,vote_avg,budget,revenue
0,19995,Avatar,en,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...",2009-12-10,162.0,150.437577,11800,7.2,237000000,2787965087,True,True,True,True,False,False
1,285,Pirates of the Caribbean: At World's End,en,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...",2007-05-19,169.0,139.082615,4500,6.9,300000000,961000000,True,True,True,False,False,False
2,206647,Spectre,en,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...",2015-10-26,148.0,107.376788,4466,6.3,245000000,880674609,True,True,False,False,False,False
3,49026,The Dark Knight Rises,en,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...",2012-07-16,165.0,112.31295,9106,7.6,250000000,1084939099,True,False,False,False,True,False
4,49529,John Carter,en,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...",2012-03-07,132.0,43.926995,2124,6.1,260000000,284139100,True,True,False,True,False,False
5,559,Spider-Man 3,en,"[{""cast_id"": 30, ""character"": ""Peter Parker / ...",2007-05-01,139.0,115.699814,3576,5.9,258000000,890871626,True,True,True,False,False,False
6,38757,Tangled,en,"[{""cast_id"": 34, ""character"": ""Flynn Rider (vo...",2010-11-24,100.0,48.681969,3330,7.4,260000000,591794936,False,False,False,False,False,False
7,99861,Avengers: Age of Ultron,en,"[{""cast_id"": 76, ""character"": ""Tony Stark / Ir...",2015-04-22,141.0,134.279229,6767,7.3,280000000,1405403694,True,True,False,True,False,False
8,767,Harry Potter and the Half-Blood Prince,en,"[{""cast_id"": 3, ""character"": ""Harry Potter"", ""...",2009-07-07,153.0,98.885637,5293,7.4,250000000,933959197,False,True,True,False,False,False
9,209112,Batman v Superman: Dawn of Justice,en,"[{""cast_id"": 18, ""character"": ""Bruce Wayne / B...",2016-03-23,151.0,155.790452,7004,5.7,250000000,873260194,True,True,True,False,False,False
