In [15]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import statsmodels.formula.api as smf



In [16]:
#read in movie data
df = pd.read_csv('movie_metadata.csv')


## Data Cleaning

In [17]:
#remove rows having genres with combiend revenue = 0 (film_noir,,game_shwo, reality_tv)

df = df[df['genres'] != 'film_noir']
df = df[df['genres'] != 'game_show']
df = df[df['genres'] != 'reality_tv']

#create profit column
df['profit'] = df['gross'] - df['budget']


In [18]:
#remove missing values
df = df.drop(columns=['color'])
n_original = len(df)
df.dropna(inplace=True)
n_null_removed = len(df)
print("original size, new size: "+ str(n_original)+", " +str(n_null_removed))


original size, new size: 5043, 3757


In [19]:
#remove outliers of profit within 3 standard deviations
df = df[df['profit'] < 3*df['profit'].std()]
df = df[df['profit'] > -3*df['profit'].std()]

In [20]:
df["title_year"] = df["title_year"].astype(int)


## EDA

In [21]:
# create bins based on quartiles of the "gross" variable
df['gross_bin'] = pd.qcut(df['gross'], 4, labels=False)
# normalize "gross_bins" to have values in the range [0, 1]
df['gross_bin'] = df['gross_bin'] / df['gross_bin'].max()


In [22]:
df['gross_bin'].value_counts()

1.000000    937
0.666667    937
0.000000    937
0.333333    937
Name: gross_bin, dtype: int64

In [23]:
df["title_year"] = df["title_year"].astype(int)


### Genres

In [24]:
# create binary variables for each genre
genres_df = df['genres'].str.get_dummies(sep='|')
df = pd.concat([df, genres_df], axis=1)


### Actors


In [25]:
df_people = df[["director_name", "director_facebook_likes",  "actor_1_name", "actor_1_facebook_likes","actor_2_name", "actor_2_facebook_likes", "actor_3_name","actor_3_facebook_likes", "cast_total_facebook_likes",  "profit"]]
yay = df_people.groupby(["director_name","actor_1_name", "actor_2_name", "actor_3_name"])[["profit"]].agg("mean").sort_values(by = "profit", ascending=False)
df_people.groupby(["director_name","actor_1_name", "actor_2_name", "actor_3_name"]).agg("count")
director = df_people.groupby(["director_name"])[["profit"]].agg("mean").sort_values(by = "profit", ascending=False).reset_index()
actor_1 = df_people.groupby(["actor_1_name"])[["profit"]].agg("mean").sort_values(by = "profit", ascending=False).reset_index()
actor_2 = df_people.groupby(["actor_2_name"])[["profit"]].agg("mean").sort_values(by = "profit", ascending=False).reset_index()
actor_3 = df_people.groupby(["actor_3_name"])[["profit"]].agg("mean").sort_values(by = "profit", ascending=False).reset_index()
director_high = [x for x in director['director_name'].iloc[:(round(len(director)/2))]]
director_low = [x for x in director['director_name'].iloc[(round(len(director)/2)):len(director)]]
actor_1_high = [x for x in actor_1['actor_1_name'].iloc[:(round(len(actor_1)/2))]]
actor_1_low = [x for x in actor_1['actor_1_name'].iloc[(round(len(actor_1)/2)):len(actor_1)]]
actor_2_high = [x for x in actor_2['actor_2_name'].iloc[:(round(len(actor_2)/2))]]
actor_2_low = [x for x in actor_2['actor_2_name'].iloc[(round(len(actor_2)/2)):len(actor_2)]]
actor_3_high = [x for x in actor_3['actor_3_name'].iloc[:(round(len(actor_3)/2))]]
actor_3_low = [x for x in actor_3['actor_3_name'].iloc[(round(len(actor_3)/2)):len(actor_3)]]
df["director_high"] = [1 if director_high.count(x) > 0 else 0 for x in df["director_name"]]
df["director_low"] = [1 if director_low.count(x) > 0 else 0 for x in df["director_name"]]
df["actor_1_high"] = [1 if actor_1_high.count(x) > 0 else 0 for x in df["actor_1_name"]]
df["actor_1_low"] = [1 if actor_1_low.count(x) > 0 else 0 for x in df["actor_1_name"]]
df["actor_2_high"] = [1 if actor_2_high.count(x) > 0 else 0 for x in df["actor_2_name"]]
df["actor_2_low"] = [1 if actor_2_low.count(x) > 0 else 0 for x in df["actor_2_name"]]
df["actor_2_high"] = [1 if actor_2_high.count(x) > 0 else 0 for x in df["actor_2_name"]]
df["actor_2_low"] = [1 if actor_2_low.count(x) > 0 else 0 for x in df["actor_2_name"]]
df["actor_3_high"] = [1 if actor_3_high.count(x) > 0 else 0 for x in df["actor_3_name"]]
df["actor_3_low"] = [1 if actor_3_low.count(x) > 0 else 0 for x in df["actor_3_name"]]