# INFO-4604 Final Project 

* Created by Garrett Glissmann on November 17, 2017

In [65]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

dfMovies = pd.read_csv('tmdb_5000_movies_out_final.csv', sep=',', header=0, encoding='latin1')
dfMovies.head()

Unnamed: 0,budget,homepage,id,original_title,overview,popularity,release_date,revenue,runtime,status,...,Keyword_gang,Keyword_psychopath,Keyword_neighbor,Keyword_mother daughter relationship,Keyword_drug dealer,Keyword_faith,Keyword_mutant,Keyword_hero,Keyword_rivalry,Keyword_california
0,237000000,http://www.avatarmovie.com/,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,12/10/09,2787965087,162.0,Released,...,0,0,0,0,0,0,0,0,0,0
1,300000000,http://disney.go.com/disneypictures/pirates/,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,5/19/07,961000000,169.0,Released,...,0,0,0,0,0,0,0,0,0,0
2,245000000,http://www.sonypictures.com/movies/spectre/,206647,Spectre,A cryptic message from BondÛªs past sends him...,107.376788,10/26/15,880674609,148.0,Released,...,0,0,0,0,0,0,0,0,0,0
3,250000000,http://www.thedarkknightrises.com/,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,7/16/12,1084939099,165.0,Released,...,0,0,0,0,0,0,0,1,0,0
4,260000000,http://movies.disney.com/john-carter,49529,John Carter,"John Carter is a war-weary, former military ca...",43.926995,3/7/12,284139100,132.0,Released,...,0,0,0,0,0,0,0,0,0,0


In [66]:
dfMoviesSelect = dfMovies.drop([
    'homepage', 'id', 'original_title', 'overview', 'release_date', 'title', 'status', 'tagline'
], axis=1)
dfMoviesSelect.fillna(0, inplace=True)

df_revenue_X = dfMoviesSelect.loc[:, dfMoviesSelect.columns != 'revenue']
df_revenue_Y = dfMoviesSelect.loc[:, dfMoviesSelect.columns == 'revenue']

df_vote_X = dfMoviesSelect.loc[:, dfMoviesSelect.columns != 'vote_average']
df_vote_Y = dfMoviesSelect.loc[:, dfMoviesSelect.columns == 'vote_average']

dfMoviesSelect.head()

Unnamed: 0,budget,popularity,revenue,runtime,vote_average,vote_count,genres_action,genres_adventure,genres_fantasy,genres_science_fiction,...,Keyword_gang,Keyword_psychopath,Keyword_neighbor,Keyword_mother daughter relationship,Keyword_drug dealer,Keyword_faith,Keyword_mutant,Keyword_hero,Keyword_rivalry,Keyword_california
0,237000000,150.437577,2787965087,162.0,7.2,11800,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,300000000,139.082615,961000000,169.0,6.9,4500,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,245000000,107.376788,880674609,148.0,6.3,4466,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,250000000,112.31295,1084939099,165.0,7.6,9106,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,260000000,43.926995,284139100,132.0,6.1,2124,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0


In [67]:
from sklearn import preprocessing, linear_model
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.svm import SVR
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.neural_network import MLPRegressor

class RegressionSteps:
    def __init__(self, all_x, all_y, test_size=[0.2], do_std=False, percentiles=[1,5,10], C_param=[0.1, 1, 10], kernel_param=['rbf'], layer_param=[(200)]):
        self.all_x = all_x
        self.all_y = all_y
        self.random_state = 42
        self.test_size = test_size
        self.do_std = do_std
        self.percentiles = percentiles
        self.C_param = C_param
        self.kernel_param = kernel_param
        self.layer_param = layer_param
        
    def start(self):
        for ts in self.test_size:
            print("Starting data split: {}/{}".format(int(100*(1 - ts)), int(ts * 100)))
            self.start_set(ts)
    
    def start_set(self, test_size):
        self.X_train_orig, self.X_test_orig, self.Y_train, self.Y_test = train_test_split(
            self.all_x, 
            self.all_y, 
            test_size=test_size, 
            random_state=self.random_state
        )
        
        # Standardization
        if (self.do_std):
            min_max_scaler = preprocessing.MinMaxScaler()
            self.X_train_orig = min_max_scaler.fit_transform(self.X_train_orig)
            self.X_test_orig = min_max_scaler.transform(self.X_test_orig)
            
        self.do_tests()
        
    # Feature selection (percentile)
    def set_selection(self, per):
        selection = SelectPercentile(percentile=per, score_func=chi2)
        self.X_train = selection.fit_transform(self.X_train_orig, self.Y_train)
        self.X_test = selection.transform(self.X_test_orig)
    
    def fit_and_predict (self, model, params = {}):
        if (len(params) > 0):
            # Performs 5-fold cross-validation with the above classifier and parameter options
            gs = GridSearchCV(model, params, cv=5)
            # Train the model using the training sets
            gs.fit(self.X_train, self.Y_train)
            # Make predictions using the testing set
            y_pred = gs.predict(self.X_test)
            print("\t\tBest parameter settings:", gs.best_params_)
            return gs, y_pred
        else:
            # Train the model using the training sets
            model.fit(self.X_train, self.Y_train)
            # Make predictions using the testing set
            y_pred = model.predict(self.X_test)
            return model, y_pred

    def show_metrics (self, tup):
        model, y_pred = tup
        # The mean squared error
        print("\t\tMean squared error: %.2f" % mean_squared_error(self.Y_test, y_pred))
        # Explained variance score: 1 is perfect prediction
        print("\t\tVariance score: %.2f" % r2_score(self.Y_test, y_pred))
        # Different?
        print("\t\tExplained variance score: %.2f" % explained_variance_score(self.Y_test, y_pred))
    
    def do_tests(self):
        for p in self.percentiles:
            print("Percentile: {}".format(p))
            self.set_selection(p)
            self.test_selection()

    def test_selection(self):
        # Simple linear regression model
        print("\tLinear Regression:")
        regr = linear_model.LinearRegression()
        self.show_metrics(self.fit_and_predict(regr))

        # Support Vector Regression
        print("\tSupport Vector Regression:")
        svr_regr = SVR(gamma=0.1)
        self.show_metrics(self.fit_and_predict(svr_regr, {
            'C': self.C_param,
            'kernel': self.kernel_param
        }))

        # Multi-layer Perceptron Regressor
        print("\tMulti-layer Perceptron Regressor:")
        mlp_regr = MLPRegressor(random_state=123)
        self.show_metrics(self.fit_and_predict(mlp_regr, {
            'hidden_layer_sizes': self.layer_param
        }))
        

In [None]:
# Revenue ML

df_revenue_X_mtx = df_revenue_X.as_matrix()
df_revenue_Y_arr = df_revenue_Y['revenue'].values

regr = RegressionSteps(df_revenue_X_mtx, df_revenue_Y_arr,
                       percentiles = [1,5,10,20,40,60,80,100],
                       C_param=[0.001, 0.01, 0.1, 1.0, 10, 100, 1000],
                       layer_param=[(200), (300), (100,100), (300,100)]
                      )
regr.start()

Starting data split: 80/20
Percentile: 1
	Linear Regression:
		Mean squared error: 6512572838733131.00
		Variance score: 0.75
		Explained variance score: 0.75
	Support Vector Regression:
		Best parameter settings: {'C': 1000, 'kernel': 'rbf'}
		Mean squared error: 30326492826145536.00
		Variance score: -0.16
		Explained variance score: 0.00
	Multi-layer Perceptron Regressor:
		Best parameter settings: {'hidden_layer_sizes': 200}
		Mean squared error: 9453789112746722.00
		Variance score: 0.64
		Explained variance score: 0.64
Percentile: 5
	Linear Regression:
		Mean squared error: 6515237972215048.00
		Variance score: 0.75
		Explained variance score: 0.75
	Support Vector Regression:
		Best parameter settings: {'C': 1000, 'kernel': 'rbf'}
		Mean squared error: 30326493046920228.00
		Variance score: -0.16
		Explained variance score: 0.00
	Multi-layer Perceptron Regressor:
		Best parameter settings: {'hidden_layer_sizes': 200}
		Mean squared error: 9440303533448342.00
		Variance score: 0.6

In [None]:
# Vote ML

df_vote_X_mtx = df_vote_X.as_matrix()
df_vote_Y_arr = df_vote_Y['vote_average'].values * 10

regr = RegressionSteps(df_vote_X_mtx, df_vote_Y_arr,
                       percentiles = [1,5,10,20,40,60,80,100],
                       C_param=[0.001, 0.01, 0.1, 1.0, 10, 100, 1000],
                       layer_param=[(200), (300), (100,100), (300,100)]
                      )
regr.start()

Starting data split: 80/20
Percentile: 1
	Linear Regression:
		Mean squared error: 114.67
		Variance score: 0.24
		Explained variance score: 0.24
	Support Vector Regression:
		Best parameter settings: {'C': 100, 'kernel': 'rbf'}
		Mean squared error: 99.55
		Variance score: 0.34
		Explained variance score: 0.34
	Multi-layer Perceptron Regressor:
		Best parameter settings: {'hidden_layer_sizes': 200}
		Mean squared error: 2432670155.28
		Variance score: -16133503.59
		Explained variance score: -13749494.77
Percentile: 5
	Linear Regression:
		Mean squared error: 109.38
		Variance score: 0.27
		Explained variance score: 0.28
	Support Vector Regression:
		Best parameter settings: {'C': 100, 'kernel': 'rbf'}
		Mean squared error: 98.13
		Variance score: 0.35
		Explained variance score: 0.35
	Multi-layer Perceptron Regressor:
		Best parameter settings: {'hidden_layer_sizes': (100, 100)}
		Mean squared error: 611803942.61
		Variance score: -4057491.83
		Explained variance score: -3937580.40
P