In [123]:
import pandas as pd
import numpy as np
import ast
import requests
from sklearn.preprocessing import OneHotEncoder

In [124]:
api_key_auth3 = 'fad9ac13c7b36b3e05f6b63be16e74f0'

# Get genre ids

In [125]:
genre_dictionary = {}

response = requests.get(f"https://api.themoviedb.org/3/genre/movie/list?api_key={api_key_auth3}&language=en-US")
response = response.json()

for genre in response['genres']:
    genre_dictionary[genre['id']] = genre['name']

# Load in dataset and process

In [126]:
feature_df = pd.read_csv(r'./MovieDataEnhanced.csv', encoding = "ISO-8859-1")
feature_df['box_office'] = feature_df['domestic_box_office'] + feature_df['international_box_office']
feature_df.head()

Unnamed: 0,id,genre_ids,original_language,adult,popularity,vote_average,vote_count,movie_name,production_year,production_budget,domestic_box_office,international_box_office,genre,running_time,box_office
0,16781,"[18, 35, 10749]",en,False,9.536,6.374,179,Madea's+Family+Reunion,2006,10000000,63257940,62581,Comedy,,63320521
1,32740,"[28, 878]",hi,False,9.654,6.378,193,Krrish,2006,10000000,1430721,31000000,Action,,32430721
2,9526,"[18, 35, 10402]",en,False,10.004,6.3,261,A+Prairie+Home+Companion,2006,10000000,20342852,6373339,Comedy,105.0,26716191
3,214,"[27, 53, 80]",en,False,36.265,6.382,3825,Saw+III,2006,10000000,80238724,83638091,Horror,,163876815
4,9794,"[35, 10749]",en,False,12.064,5.537,531,Employee+of+the+Month,2006,10000000,28444855,9920000,Comedy,108.0,38364855


In [127]:
feature_df.describe()

Unnamed: 0,id,popularity,vote_average,vote_count,production_year,production_budget,domestic_box_office,international_box_office,running_time,box_office
count,1407.0,1407.0,1407.0,1407.0,1407.0,1407.0,1407.0,1407.0,1308.0,1407.0
mean,122444.1,26.298989,6.288824,3151.733475,2010.823028,52954810.0,63414920.0,93269070.0,109.764526,156684000.0
std,162214.0,28.205945,1.220506,4153.45673,3.522023,52147770.0,79707900.0,150283900.0,20.046836,221513300.0
min,35.0,0.6,0.0,0.0,2006.0,10000000.0,0.0,0.0,0.0,9069.0
25%,11381.0,11.0265,5.8725,526.5,2008.0,19850000.0,11349430.0,9226397.0,97.0,27233380.0
50%,41233.0,17.835,6.402,1588.0,2010.0,35000000.0,37384050.0,38400000.0,108.0,76235000.0
75%,218413.5,31.0475,6.951,4008.5,2014.0,65000000.0,81654880.0,101884500.0,121.0,181032000.0
max,1110807.0,298.358,10.0,33502.0,2018.0,425000000.0,760507600.0,2015838000.0,201.0,2776345000.0


## Group by box office revenue

In [128]:
# Group by box_office
# Groups numerical revenue, currently 6 bins
num = [0,1,2,3,4,5]

revenue_in = pd.qcut(feature_df['box_office'], q=6, labels=num)

enc = OneHotEncoder().fit(np.array(revenue_in).reshape(-1,1))
df = pd.DataFrame(enc.transform(np.array(revenue_in).reshape(-1,1)).toarray())
df['combine'] = df.values.tolist()
feature_df['box_office_group'] = df['combine']

## Convert genres list to one hot array

In [129]:
feature_df['genre_ids'] = [ast.literal_eval(x) for x in feature_df['genre_ids']]
feature_df['genres'] = [[genre_dictionary[v] for v in g_array] for g_array in feature_df['genre_ids']]

In [130]:
feature_df.head()

Unnamed: 0,id,genre_ids,original_language,adult,popularity,vote_average,vote_count,movie_name,production_year,production_budget,domestic_box_office,international_box_office,genre,running_time,box_office,box_office_group,genres
0,16781,"[18, 35, 10749]",en,False,9.536,6.374,179,Madea's+Family+Reunion,2006,10000000,63257940,62581,Comedy,,63320521,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]","[Drama, Comedy, Romance]"
1,32740,"[28, 878]",hi,False,9.654,6.378,193,Krrish,2006,10000000,1430721,31000000,Action,,32430721,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]","[Action, Science Fiction]"
2,9526,"[18, 35, 10402]",en,False,10.004,6.3,261,A+Prairie+Home+Companion,2006,10000000,20342852,6373339,Comedy,105.0,26716191,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]","[Drama, Comedy, Music]"
3,214,"[27, 53, 80]",en,False,36.265,6.382,3825,Saw+III,2006,10000000,80238724,83638091,Horror,,163876815,"[0.0, 0.0, 0.0, 0.0, 1.0, 0.0]","[Horror, Thriller, Crime]"
4,9794,"[35, 10749]",en,False,12.064,5.537,531,Employee+of+the+Month,2006,10000000,28444855,9920000,Comedy,108.0,38364855,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]","[Comedy, Romance]"


In [131]:
exploded_df = feature_df[['id', 'genres']].explode(column='genres')
exploded_df = pd.get_dummies(exploded_df, columns=['genres']).groupby('id', as_index=False).sum()

In [132]:
select_features = ['popularity', 'vote_average'
                   , 'vote_count', 'production_budget', 'running_time']

## Get final dataset with select features

In [140]:
merged_features = pd.merge(left=feature_df[['id', 'movie_name', 'box_office_group'] + select_features]
                        , right=exploded_df
                        , left_on='id'
                        , right_on='id')

In [141]:
merged_features.describe()

Unnamed: 0,id,popularity,vote_average,vote_count,production_budget,running_time,genres_Action,genres_Adventure,genres_Animation,genres_Comedy,...,genres_History,genres_Horror,genres_Music,genres_Mystery,genres_Romance,genres_Science Fiction,genres_TV Movie,genres_Thriller,genres_War,genres_Western
count,1407.0,1407.0,1407.0,1407.0,1407.0,1308.0,1407.0,1407.0,1407.0,1407.0,...,1407.0,1407.0,1407.0,1407.0,1407.0,1407.0,1407.0,1407.0,1407.0,1407.0
mean,122444.1,26.298989,6.288824,3151.733475,52954810.0,109.764526,0.276475,0.221748,0.102345,0.364606,...,0.069652,0.078181,0.041222,0.074627,0.151386,0.107321,0.005686,0.246624,0.035537,0.015636
std,162214.0,28.205945,1.220506,4153.45673,52147770.0,20.046836,0.453728,0.425715,0.312452,0.484436,...,0.25465,0.268551,0.198875,0.262882,0.358552,0.314191,0.075217,0.437747,0.185198,0.135083
min,35.0,0.6,0.0,0.0,10000000.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,11381.0,11.0265,5.8725,526.5,19850000.0,97.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,41233.0,17.835,6.402,1588.0,35000000.0,108.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,218413.5,31.0475,6.951,4008.5,65000000.0,121.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1110807.0,298.358,10.0,33502.0,425000000.0,201.0,2.0,2.0,2.0,2.0,...,1.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,2.0


In [142]:
#normalize
merged_features[select_features]=(merged_features[select_features]-merged_features[select_features].min())/(merged_features[select_features].max()-merged_features[select_features].min())

In [143]:
merged_features.head(3)

Unnamed: 0,id,movie_name,box_office_group,popularity,vote_average,vote_count,production_budget,running_time,genres_Action,genres_Adventure,...,genres_History,genres_Horror,genres_Music,genres_Mystery,genres_Romance,genres_Science Fiction,genres_TV Movie,genres_Thriller,genres_War,genres_Western
0,16781,Madea's+Family+Reunion,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0]",0.030011,0.6374,0.005343,0.0,,0,0,...,0,0,0,0,1,0,0,0,0,0
1,32740,Krrish,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",0.030407,0.6378,0.005761,0.0,,1,0,...,0,0,0,0,0,1,0,0,0,0
2,9526,A+Prairie+Home+Companion,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0]",0.031583,0.63,0.007791,0.0,0.522388,0,0,...,0,0,1,0,0,0,0,0,0,0


In [144]:
merged_features.describe()

Unnamed: 0,id,popularity,vote_average,vote_count,production_budget,running_time,genres_Action,genres_Adventure,genres_Animation,genres_Comedy,...,genres_History,genres_Horror,genres_Music,genres_Mystery,genres_Romance,genres_Science Fiction,genres_TV Movie,genres_Thriller,genres_War,genres_Western
count,1407.0,1407.0,1407.0,1407.0,1407.0,1308.0,1407.0,1407.0,1407.0,1407.0,...,1407.0,1407.0,1407.0,1407.0,1407.0,1407.0,1407.0,1407.0,1407.0,1407.0
mean,122444.1,0.086308,0.628882,0.094076,0.103506,0.546092,0.276475,0.221748,0.102345,0.364606,...,0.069652,0.078181,0.041222,0.074627,0.151386,0.107321,0.005686,0.246624,0.035537,0.015636
std,162214.0,0.094728,0.122051,0.123976,0.125657,0.099736,0.453728,0.425715,0.312452,0.484436,...,0.25465,0.268551,0.198875,0.262882,0.358552,0.314191,0.075217,0.437747,0.185198,0.135083
min,35.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,11381.0,0.035017,0.58725,0.015715,0.023735,0.482587,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,41233.0,0.057883,0.6402,0.0474,0.060241,0.537313,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,218413.5,0.102256,0.6951,0.11965,0.13253,0.60199,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1110807.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,...,1.0,1.0,1.0,1.0,1.0,2.0,1.0,2.0,1.0,2.0


In [145]:
merged_features.to_csv('./cleaned_features.csv', index=False)