In [None]:
""" Build 2 regression models, which predict 1) the return on investment 2) the IMDB rating of a movie.
Inspect the feature importance to find out how to most efficiently manipulate these targets. """

In [None]:
# imports
%reset -f
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
sns.set()
import os
import numpy as np
from scipy.stats import uniform

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# Load the pre-cleaned movie table from the local ./output directory
# (presumably produced by an earlier cleaning notebook — TODO confirm).
movies_cleaned_directors_filtered =  pd.read_csv("./output/movies_cleaned_directors_filtered.csv")

## Pre-Processing and Feature Selection

In [None]:
# create return on investment feature: net profit relative to budget, (revenue - budget) / budget
# NOTE(review): a budget of 0 produces inf/NaN here; such rows are only removed by the
# minimum-budget filter that runs in the following cell.
movies_cleaned_directors_filtered['return']  = (movies_cleaned_directors_filtered['revenue']-movies_cleaned_directors_filtered['budget']) / movies_cleaned_directors_filtered['budget'] # return expressed as a multiple of the budget (0 = break-even)

In [None]:
# Keep only movies with a plausible budget and a non-extreme return.
min_budget = 50000  # smallest budget a movie needs to be considered here
max_return = 450    # largest return a movie may have to be kept
budget_ok = movies_cleaned_directors_filtered['budget'] >= min_budget
return_ok = movies_cleaned_directors_filtered['return'] <= max_return
# Both conditions are evaluated on the same rows, so one combined mask selects
# exactly the rows that two successive filters would keep.
movies_cleaned_directors_filtered = movies_cleaned_directors_filtered[budget_ok & return_ok]

In [None]:
# Sanity check: show the five movies with the highest remaining return.
movies_cleaned_directors_filtered.sort_values('return',ascending=False).head(5)

In [None]:
# Columns used for modelling: raw features plus both targets ('averageRating', 'return').
feature_columns = ['runtimeMinutes', 'genres', 'averageRating', 'numVotes', 'directors',
                   'writers', 'original_language', 'budget', 'return']
# Take an explicit copy: later cells add, overwrite and drop columns on `df`, which on a
# plain slice of the parent frame triggers SettingWithCopyWarning and ambiguous semantics.
df = movies_cleaned_directors_filtered[feature_columns].copy()

In [None]:
# Number of missing values per selected column.
df.isna().sum()

In [None]:
# drop rows with insufficient data
# Rebind `df` to a fresh frame instead of using inplace=True: the in-place variant mutates a
# slice of the parent frame (SettingWithCopyWarning). The index is renumbered 0..n-1 so that
# later positional/label access lines up.
df = df.dropna(axis=0).reset_index(drop=True)

In [None]:
# Collect the distinct genre labels found in the comma-separated "genres" column.
# They become one-hot columns in the next cell.
genre_parts = df['genres'].str.split(',', expand=True)
# Flatten column by column (all first genres, then all second genres, ...). This keeps the
# first-seen ordering of labels and works for however many genre columns the split produced,
# instead of assuming exactly three.
seen_labels = pd.unique(genre_parts.to_numpy().ravel(order='F'))
# remove uninteresting genres - very few movies here
excluded_genres = {'Film-Noir', 'News', 'Sport'}
# Movies with fewer genres leave None/NaN placeholders in the split; keep real strings only.
# Filtering (rather than list.remove) does not fail when an excluded genre is absent.
genre_list = [label for label in seen_labels
              if isinstance(label, str) and label not in excluded_genres]

In [None]:
# create new genres columns (1 if the movie carries the genre, else 0)
# str.get_dummies splits on ',' and matches whole labels, so 'Musical' no longer sets the
# 'Music' flag the way a substring test does. It is also vectorised and does not depend on
# the index being 0..n-1.
genre_flags = (
    df['genres']
    .str.get_dummies(sep=',')
    # keep only the selected genres, in genre_list order; genres never seen become all-zero
    .reindex(columns=genre_list, fill_value=0)
)
# Replace the raw text column by the indicator columns (appended at the end).
df = pd.concat([df.drop(columns='genres'), genre_flags], axis=1)

## Feature Engineering

In [None]:
## Derived features

# 'directors' / 'writers' hold comma-separated entries; replace each by the entry count
for people_col in ('directors', 'writers'):
    df[people_col] = [len(names.split(',')) for names in df[people_col]]

# flag movies whose original language is not English (1 = foreign, 0 = 'en')
df['foreign_language'] = [0 if lang == 'en' else 1 for lang in df['original_language']]
df = df.drop(columns='original_language')

In [None]:
# Renumber the index 0..n-1.
# NOTE(review): the index was already reset after dropna and no rows have been removed
# since, so this looks redundant — confirm before deleting.
df.reset_index(drop=True, inplace=True)

In [None]:
# Re-check missing values per column after encoding and feature engineering.
df.isna().sum()

In [None]:
# Summary statistics of all numeric columns.
df.describe()

In [None]:
# the data is highly skewed, we need to remove outliers
sns.displot(data=df['return'])

In [None]:
# Trim unusually high returns with a fence built from the 0% and 97% quantiles.
returns = df['return']
lower_q = returns.quantile(0.00)
upper_q = returns.quantile(0.97)
spread = upper_q - lower_q

# A row is an outlier when it falls more than 1.5 * spread outside [lower_q, upper_q].
is_outlier = (returns < lower_q - 1.5 * spread) | (returns > upper_q + 1.5 * spread)
df = df[~is_outlier]

In [None]:
# Return per row after trimming (presumably plotted against the frame index — TODO confirm).
sns.lineplot(data=df['return'])

In [None]:
# Distribution of the return after the high-return rows were removed.
sns.displot(data=df['return'])

In [None]:
# Display the final modelling frame.
df

In [None]:
# Average rating per row (presumably plotted against the frame index — TODO confirm).
sns.lineplot(data=df['averageRating'])

In [None]:
# Distribution of the average rating.
# sns.distplot is deprecated; use displot like the other distribution plots in this
# notebook. kde=True keeps the density curve that distplot drew by default.
sns.displot(data=df['averageRating'], kde=True)

# 2. Model Building

## 2.1 Predicting Return on Investment

In [None]:
# Target: return on investment; features: every remaining column.
X = df.drop(columns=['return'])
y = df['return']
# hold out 25% of the rows as test set (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

In [None]:
# Result collectors, filled by the model cells below (one entry per model, same order).
model_label = []          # short model names, used as x tick labels in the comparison plot
estimator = []            # best fitted pipeline of each search
best_cv_score = []        # best cross-validation score of each search (negative MSE)
best_metric_score = []    # mean squared error of the best pipeline on X_test
example_prob_result = []  # NOTE(review): never filled in the cells shown in this notebook
nonsense_prob_result = [] # NOTE(review): never filled in the cells shown in this notebook
# define parameters for ALL grid searches
n_iter = 75               # number of sampled parameter settings per randomized search
scoring = 'neg_mean_squared_error' 
cv = 5  
verbose = 1
return_train_score = True
random_state = 0          # seed handed to the randomized searches
n_jobs = -1               # presumably the number of parallel workers for the searches (-1 = all cores)

### Linear Regression

In [None]:
# Linear regression baseline for the return target.
model_label.append('linear')
regressor = LinearRegression()
pipe = Pipeline(steps=[("linear", regressor)])

# set up param distributions for grid search
param_dist = {
    "linear__fit_intercept": [False, True],
}
# Only two candidates exist; cap n_iter so the search does not warn about a parameter
# space smaller than n_iter. n_jobs runs the CV fits in parallel.
search = RandomizedSearchCV(pipe, param_dist,
                            n_iter=min(n_iter, len(param_dist["linear__fit_intercept"])),
                            scoring=scoring, cv=cv, verbose=verbose,
                            return_train_score=return_train_score,
                            random_state=random_state, n_jobs=n_jobs)

search.fit(X_train, y_train)

print("Best parameter for grid search (CV score=%0.4f):" % search.best_score_)
print(search.best_params_)
# The evaluation below uses the held-out test set, so label it accordingly.
print("Test Set performance: ")
y_pred = search.best_estimator_.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
print(test_mse)
# save data for model comparison later
estimator.append(search.best_estimator_)
best_cv_score.append(search.best_score_)
best_metric_score.append(test_mse)

### Lasso

In [None]:
# Lasso regression for the return target.
model_label.append('lasso')
regressor = Lasso(max_iter=2000)
pipe = Pipeline(steps=[("lasso", regressor)])

# set up param distributions for grid search
param_dist = {
    "lasso__fit_intercept": [False, True],
    # Hand the distribution itself to the search: it is then sampled with the search's
    # random_state, so the tried alphas are reproducible (a bare .rvs() call is unseeded).
    "lasso__alpha": uniform(loc=0, scale=5),
}
search = RandomizedSearchCV(pipe, param_dist, n_iter=n_iter, scoring=scoring,
                            cv=cv, verbose=verbose, return_train_score=return_train_score,
                            random_state=random_state, n_jobs=n_jobs)

search.fit(X_train, y_train)

print("Best parameter for grid search (CV score=%0.4f):" % search.best_score_)
print(search.best_params_)
# The evaluation below uses the held-out test set, so label it accordingly.
print("Test Set performance: ")
y_pred = search.best_estimator_.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
print(test_mse)
# save data for model comparison later
estimator.append(search.best_estimator_)
best_cv_score.append(search.best_score_)
best_metric_score.append(test_mse)

### SVM

In [None]:
# model_label.append('svr')
# regressor = SVR(max_iter=-1)
# pipe = Pipeline(steps=[("svr", regressor)])

# # set up param distributions for grid search
# param_dist = {
#     "svr__C": uniform(loc=0, scale=3).rvs(size=50),
#     "svr__kernel": ['linear', 'poly', 'rbf', 'sigmoid'],
#     "svr__degree": [2, 3, 4],
#     "svr__epsilon": uniform(loc=0, scale=3).rvs(size=50),
# }
# search = RandomizedSearchCV(pipe, param_dist, n_iter=n_iter, scoring=scoring,
#                             cv=cv, verbose=verbose, return_train_score=return_train_score,
#                             random_state=random_state)

# search.fit(X_train, y_train)

# print("Best parameter for grid search (CV score=%0.4f):" % search.best_score_)
# print(search.best_params_)
# print("Training Set performance: ")
# y_pred = search.best_estimator_.predict(X_test)
# print(mean_squared_error(y_test, y_pred))
# #save data for model comparison later
# estimator.append(search.best_estimator_)
# best_cv_score.append(search.best_score_)
# best_metric_score.append(mean_squared_error(y_test, y_pred))

### Decision Tree

In [None]:
# Decision tree regressor for the return target.
model_label.append('decisiontree')
# seed the tree so that tie-breaking between equally good splits is repeatable
regressor = DecisionTreeRegressor(random_state=random_state)
pipe = Pipeline(steps=[("decisiontree", regressor)])

# set up param distributions for grid search
depth = list(range(1,10,1))
depth.append(None)  # None = grow the tree without a depth limit
param_dist = {
    # NOTE(review): trees are grown on absolute error while the search scores MSE — confirm intended
    "decisiontree__criterion": ['absolute_error'],
    "decisiontree__max_depth": depth,
    "decisiontree__min_samples_split": list(range(2,200,1)),
}
search = RandomizedSearchCV(pipe, param_dist, n_iter=n_iter, scoring=scoring,
                            cv=cv, verbose=verbose, return_train_score=return_train_score,
                            random_state=random_state, n_jobs=n_jobs)

search.fit(X_train, y_train)

print("Best parameter for grid search (CV score=%0.4f):" % search.best_score_)
print(search.best_params_)
# The evaluation below uses the held-out test set, so label it accordingly.
print("Test Set performance: ")
y_pred = search.best_estimator_.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
print(test_mse)
#save data for model comparison later
estimator.append(search.best_estimator_)
best_cv_score.append(search.best_score_)
best_metric_score.append(test_mse)


### Random Forest

In [None]:
# Random forest regressor for the return target.
model_label.append('randomforest')
# seed the forest: bootstrap and feature sampling are random, so unseeded runs differ
regressor = RandomForestRegressor(random_state=random_state)
pipe = Pipeline(steps=[("randomforest", regressor)])

# set up param distributions for grid search
depth = list(range(1,10,1))
depth.append(None)  # None = grow trees without a depth limit
param_dist = {
    "randomforest__n_estimators": list(range(1,1500,10)),
    "randomforest__max_depth": depth,
}
# n_jobs parallelises the CV fits, which matters for the large forests sampled here
search = RandomizedSearchCV(pipe, param_dist, n_iter=n_iter, scoring=scoring,
                            cv=cv, verbose=verbose, return_train_score=return_train_score,
                            random_state=random_state, n_jobs=n_jobs)

search.fit(X_train, y_train)

print("Best parameter for grid search (CV score=%0.4f):" % search.best_score_)
print(search.best_params_)
# The evaluation below uses the held-out test set, so label it accordingly.
print("Test Set performance: ")
y_pred = search.best_estimator_.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
print(test_mse)
#save data for model comparison later
estimator.append(search.best_estimator_)
best_cv_score.append(search.best_score_)
best_metric_score.append(test_mse)

# NOTE(review): unexplained figures kept from the original cell, presumably scores of an earlier run
# 21.8 28.26

### XGBoost

In [None]:
# Gradient-boosted trees (XGBoost) for the return target.
model_label.append('xgboost')
# The booster already uses all cores, so the search itself is left single-process
# to avoid oversubscribing the CPU.
regressor = XGBRegressor(n_jobs=-1, random_state=random_state)
pipe = Pipeline(steps=[("xgboost", regressor)])

# set up param distributions for grid search
depth = list(range(1,10,1))
depth.append(None)  # None = leave max_depth at the library default
param_dist = {
    "xgboost__booster": ['gbtree'],
    "xgboost__n_estimators": list(range(1,300,1)),
    "xgboost__max_depth": depth,
    "xgboost__min_child_weight": list(range(0,150,1)),
    # Hand the distribution itself to the search: it is then sampled with the search's
    # random_state, so the tried learning rates are reproducible (a bare .rvs() is unseeded).
    "xgboost__learning_rate": uniform(loc=0, scale=0.2),
}
# this search deliberately samples more settings (200) than the shared n_iter
search = RandomizedSearchCV(pipe, param_dist, n_iter=200, scoring=scoring,
                            cv=cv, verbose=verbose, return_train_score=return_train_score,
                            random_state=random_state)

search.fit(X_train, y_train)

print("Best parameter for grid search (CV score=%0.4f):" % search.best_score_)
print(search.best_params_)
# The evaluation below uses the held-out test set, so label it accordingly.
print("Test Set performance: ")
y_pred = search.best_estimator_.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
print(test_mse)
#save data for model comparison later
estimator.append(search.best_estimator_)
best_cv_score.append(search.best_score_)
best_metric_score.append(test_mse)

In [None]:
# draw histogram of importances of each feature (XGBoost model for the return target)
# Select the step by name: if `search` currently holds another model's search (cells run
# out of order), this fails loudly instead of plotting the wrong model's importances.
model = search.best_estimator_.named_steps['xgboost']
importances = pd.DataFrame({
    'Importance': model.feature_importances_,
    'Feature': model.feature_names_in_,
}).sort_values(by='Importance', ascending=False)
plt.figure(figsize=(40, 20))
sns.barplot(x='Feature', y='Importance', data=importances)

# persist the ranking for later use
importances.to_csv("./output/importances_return.csv", index=False)

### Model Comparison

In [None]:
# Compare all fitted models: best cross-validation error (top) and test-set error (bottom).
fig, axs = plt.subplots(2, 1, figsize=(14,14), sharey=True)
colors = sns.color_palette("pastel")
positions = range(len(model_label))

# The searches score with 'neg_mean_squared_error'. Negate into a NEW list so that
# re-running this cell does not flip the sign of best_cv_score again.
cv_mse = [-score for score in best_cv_score]

axs[0].set_title("Model score comparisons", fontsize=16)
axs[0].bar(x=positions, height=cv_mse, width=0.5, color=colors)
axs[1].bar(x=positions, height=best_metric_score, width=0.5, color=colors)

for ax in axs:
    ax.set_xticks(positions)
    ax.set_xticklabels(labels=model_label, fontsize=13, rotation=0)

axs[1].set_xlabel("Model", fontsize=16)

# both panels show mean squared error (lower is better), not an accuracy
axs[0].set_ylabel("Best CV MSE", fontsize=16)
axs[1].set_ylabel("Test MSE", fontsize=16)

#axs[0].set_ylim([0.45,1.05])

plt.show()

## 2.2 Predicting Rating

In [None]:
# Target: IMDB average rating; features: every remaining column.
X = df.drop(columns=['averageRating'])
y = df['averageRating']
# hold out 25% of the rows as test set (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

In [None]:
# Reset the result collectors for the rating models (one entry per model, same order).
model_label = []          # short model names, used as x tick labels in the comparison plot
estimator = []            # best fitted pipeline of each search
best_cv_score = []        # best cross-validation score of each search (negative MSE)
best_metric_score = []    # mean squared error of the best pipeline on X_test
example_prob_result = []  # NOTE(review): never filled in the cells shown in this notebook
nonsense_prob_result = [] # NOTE(review): never filled in the cells shown in this notebook
# define parameters for ALL grid searches
n_iter = 75               # number of sampled parameter settings per randomized search
scoring = 'neg_mean_squared_error' 
cv = 5  
verbose = 1
return_train_score = True
random_state = 0          # seed handed to the randomized searches
n_jobs = -1               # presumably the number of parallel workers for the searches (-1 = all cores)

### Linear Regression

In [None]:
# Linear regression baseline for the rating target.
model_label.append('linear')
regressor = LinearRegression()
pipe = Pipeline(steps=[("linear", regressor)])

# set up param distributions for grid search
param_dist = {
    "linear__fit_intercept": [False, True],
}
# Only two candidates exist; cap n_iter so the search does not warn about a parameter
# space smaller than n_iter. n_jobs runs the CV fits in parallel.
search = RandomizedSearchCV(pipe, param_dist,
                            n_iter=min(n_iter, len(param_dist["linear__fit_intercept"])),
                            scoring=scoring, cv=cv, verbose=verbose,
                            return_train_score=return_train_score,
                            random_state=random_state, n_jobs=n_jobs)

search.fit(X_train, y_train)

print("Best parameter for grid search (CV score=%0.4f):" % search.best_score_)
print(search.best_params_)
# The evaluation below uses the held-out test set, so label it accordingly.
print("Test Set performance: ")
y_pred = search.best_estimator_.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
print(test_mse)
# save data for model comparison later
estimator.append(search.best_estimator_)
best_cv_score.append(search.best_score_)
best_metric_score.append(test_mse)

### Lasso

In [None]:
# Lasso regression for the rating target.
model_label.append('lasso')
regressor = Lasso(max_iter=2000)
pipe = Pipeline(steps=[("lasso", regressor)])

# set up param distributions for grid search
param_dist = {
    "lasso__fit_intercept": [False, True],
    # Hand the distribution itself to the search: it is then sampled with the search's
    # random_state, so the tried alphas are reproducible (a bare .rvs() call is unseeded).
    "lasso__alpha": uniform(loc=0, scale=5),
}
search = RandomizedSearchCV(pipe, param_dist, n_iter=n_iter, scoring=scoring,
                            cv=cv, verbose=verbose, return_train_score=return_train_score,
                            random_state=random_state, n_jobs=n_jobs)

search.fit(X_train, y_train)

print("Best parameter for grid search (CV score=%0.4f):" % search.best_score_)
print(search.best_params_)
# The evaluation below uses the held-out test set, so label it accordingly.
print("Test Set performance: ")
y_pred = search.best_estimator_.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
print(test_mse)
# save data for model comparison later
estimator.append(search.best_estimator_)
best_cv_score.append(search.best_score_)
best_metric_score.append(test_mse)

### SVM

In [None]:
# model_label.append('svr')
# regressor = SVR(max_iter=-1)
# pipe = Pipeline(steps=[("svr", regressor)])

# # set up param distributions for grid search
# param_dist = {
#     "svr__C": uniform(loc=0, scale=3).rvs(size=50),
#     "svr__kernel": ['linear', 'poly', 'rbf', 'sigmoid'],
#     "svr__degree": [2, 3, 4],
#     "svr__epsilon": uniform(loc=0, scale=3).rvs(size=50),
# }
# search = RandomizedSearchCV(pipe, param_dist, n_iter=n_iter, scoring=scoring,
#                             cv=cv, verbose=verbose, return_train_score=return_train_score,
#                             random_state=random_state)

# search.fit(X_train, y_train)

# print("Best parameter for grid search (CV score=%0.4f):" % search.best_score_)
# print(search.best_params_)
# print("Training Set performance: ")
# y_pred = search.best_estimator_.predict(X_test)
# print(mean_squared_error(y_test, y_pred))
# #save data for model comparison later
# estimator.append(search.best_estimator_)
# best_cv_score.append(search.best_score_)
# best_metric_score.append(mean_squared_error(y_test, y_pred))

### Decision Tree

In [None]:
# Decision tree regressor for the rating target.
model_label.append('decisiontree')
# seed the tree so that tie-breaking between equally good splits is repeatable
regressor = DecisionTreeRegressor(random_state=random_state)
pipe = Pipeline(steps=[("decisiontree", regressor)])

# set up param distributions for grid search
depth = list(range(1,10,1))
depth.append(None)  # None = grow the tree without a depth limit
param_dist = {
    # NOTE(review): trees are grown on absolute error while the search scores MSE — confirm intended
    "decisiontree__criterion": ['absolute_error'],
    "decisiontree__max_depth": depth,
    "decisiontree__min_samples_split": list(range(2,200,1)),
}
search = RandomizedSearchCV(pipe, param_dist, n_iter=n_iter, scoring=scoring,
                            cv=cv, verbose=verbose, return_train_score=return_train_score,
                            random_state=random_state, n_jobs=n_jobs)

search.fit(X_train, y_train)

print("Best parameter for grid search (CV score=%0.4f):" % search.best_score_)
print(search.best_params_)
# The evaluation below uses the held-out test set, so label it accordingly.
print("Test Set performance: ")
y_pred = search.best_estimator_.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
print(test_mse)
#save data for model comparison later
estimator.append(search.best_estimator_)
best_cv_score.append(search.best_score_)
best_metric_score.append(test_mse)


### Random Forest

In [None]:
# Random forest regressor for the rating target.
model_label.append('randomforest')
# seed the forest: bootstrap and feature sampling are random, so unseeded runs differ
regressor = RandomForestRegressor(random_state=random_state)
pipe = Pipeline(steps=[("randomforest", regressor)])

# set up param distributions for grid search
depth = list(range(1,10,1))
depth.append(None)  # None = grow trees without a depth limit
param_dist = {
    "randomforest__n_estimators": list(range(1,1500,10)),
    "randomforest__max_depth": depth,
}
# n_jobs parallelises the CV fits, which matters for the large forests sampled here
search = RandomizedSearchCV(pipe, param_dist, n_iter=n_iter, scoring=scoring,
                            cv=cv, verbose=verbose, return_train_score=return_train_score,
                            random_state=random_state, n_jobs=n_jobs)

search.fit(X_train, y_train)

print("Best parameter for grid search (CV score=%0.4f):" % search.best_score_)
print(search.best_params_)
# The evaluation below uses the held-out test set, so label it accordingly.
print("Test Set performance: ")
y_pred = search.best_estimator_.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
print(test_mse)
#save data for model comparison later
estimator.append(search.best_estimator_)
best_cv_score.append(search.best_score_)
best_metric_score.append(test_mse)

# NOTE(review): unexplained figures kept from the original cell, presumably scores of an earlier run
# 21.8 28.26

### XGBoost

In [None]:
# Gradient-boosted trees (XGBoost) for the rating target.
model_label.append('xgboost')
# The booster already uses all cores, so the search itself is left single-process
# to avoid oversubscribing the CPU.
regressor = XGBRegressor(n_jobs=-1, random_state=random_state)
pipe = Pipeline(steps=[("xgboost", regressor)])

# set up param distributions for grid search
depth = list(range(1,10,1))
depth.append(None)  # None = leave max_depth at the library default
param_dist = {
    "xgboost__booster": ['gbtree'],
    "xgboost__n_estimators": list(range(1,300,1)),
    "xgboost__max_depth": depth,
    "xgboost__min_child_weight": list(range(0,150,1)),
    # Hand the distribution itself to the search: it is then sampled with the search's
    # random_state, so the tried learning rates are reproducible (a bare .rvs() is unseeded).
    "xgboost__learning_rate": uniform(loc=0, scale=0.2),
}
# this search deliberately samples more settings (200) than the shared n_iter
search = RandomizedSearchCV(pipe, param_dist, n_iter=200, scoring=scoring,
                            cv=cv, verbose=verbose, return_train_score=return_train_score,
                            random_state=random_state)

search.fit(X_train, y_train)

print("Best parameter for grid search (CV score=%0.4f):" % search.best_score_)
print(search.best_params_)
# The evaluation below uses the held-out test set, so label it accordingly.
print("Test Set performance: ")
y_pred = search.best_estimator_.predict(X_test)
test_mse = mean_squared_error(y_test, y_pred)
print(test_mse)
#save data for model comparison later
estimator.append(search.best_estimator_)
best_cv_score.append(search.best_score_)
best_metric_score.append(test_mse)

In [None]:
# draw histogram of importances of each feature (XGBoost model for the rating target)
# Select the step by name: if `search` currently holds another model's search (cells run
# out of order), this fails loudly instead of plotting the wrong model's importances.
model = search.best_estimator_.named_steps['xgboost']
importances = pd.DataFrame({
    'Importance': model.feature_importances_,
    'Feature': model.feature_names_in_,
}).sort_values(by='Importance', ascending=False)
plt.figure(figsize=(40, 20))
sns.barplot(x='Feature', y='Importance', data=importances)

# persist the ranking for later use
importances.to_csv("./output/importances_rating.csv", index=False)

### Model Comparison

In [None]:
# Compare all fitted models: best cross-validation error (top) and test-set error (bottom).
fig, axs = plt.subplots(2, 1, figsize=(14,14), sharey=True)
colors = sns.color_palette("pastel")
positions = range(len(model_label))

# The searches score with 'neg_mean_squared_error'. Negate into a NEW list so that
# re-running this cell does not flip the sign of best_cv_score again.
cv_mse = [-score for score in best_cv_score]

axs[0].set_title("Model score comparisons", fontsize=16)
axs[0].bar(x=positions, height=cv_mse, width=0.5, color=colors)
axs[1].bar(x=positions, height=best_metric_score, width=0.5, color=colors)

for ax in axs:
    ax.set_xticks(positions)
    ax.set_xticklabels(labels=model_label, fontsize=13, rotation=0)

axs[1].set_xlabel("Model", fontsize=16)

# both panels show mean squared error (lower is better), not an accuracy
axs[0].set_ylabel("Best CV MSE", fontsize=16)
axs[1].set_ylabel("Test MSE", fontsize=16)

#axs[0].set_ylim([0.45,1.05])

plt.show()