# INFO-4604 Final Project 

* Created by Garrett Glissmann on November 17, 2017

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

dfMovies = pd.read_csv('tmdb_5000_movies_out_final.csv', sep=',', header=0, encoding='latin1')
dfMovies.head()

Unnamed: 0,budget,homepage,id,original_title,overview,popularity,release_date,revenue,runtime,status,...,Keyword_gang,Keyword_psychopath,Keyword_neighbor,Keyword_mother daughter relationship,Keyword_drug dealer,Keyword_faith,Keyword_mutant,Keyword_hero,Keyword_rivalry,Keyword_california
0,237000000,http://www.avatarmovie.com/,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,12/10/09,2787965087,162.0,Released,...,0,0,0,0,0,0,0,0,0,0
1,300000000,http://disney.go.com/disneypictures/pirates/,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,5/19/07,961000000,169.0,Released,...,0,0,0,0,0,0,0,0,0,0
2,245000000,http://www.sonypictures.com/movies/spectre/,206647,Spectre,A cryptic message from BondÛªs past sends him...,107.376788,10/26/15,880674609,148.0,Released,...,0,0,0,0,0,0,0,0,0,0
3,250000000,http://www.thedarkknightrises.com/,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,7/16/12,1084939099,165.0,Released,...,0,0,0,0,0,0,0,1,0,0
4,260000000,http://movies.disney.com/john-carter,49529,John Carter,"John Carter is a war-weary, former military ca...",43.926995,3/7/12,284139100,132.0,Released,...,0,0,0,0,0,0,0,0,0,0


In [2]:
dfMoviesSelect = dfMovies.drop([
    'homepage', 'id', 'original_title', 'overview', 'release_date', 'title', 'status', 'tagline'
], axis=1)
dfMoviesSelect.fillna(0, inplace=True)

df_revenue_X = dfMoviesSelect.loc[:, dfMoviesSelect.columns != 'revenue']
df_revenue_Y = dfMoviesSelect.loc[:, dfMoviesSelect.columns == 'revenue']

dfMoviesSelect.head()

Unnamed: 0,budget,popularity,revenue,runtime,vote_average,vote_count,genres_action,genres_adventure,genres_fantasy,genres_science_fiction,...,Keyword_gang,Keyword_psychopath,Keyword_neighbor,Keyword_mother daughter relationship,Keyword_drug dealer,Keyword_faith,Keyword_mutant,Keyword_hero,Keyword_rivalry,Keyword_california
0,237000000,150.437577,2787965087,162.0,7.2,11800,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,300000000,139.082615,961000000,169.0,6.9,4500,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,245000000,107.376788,880674609,148.0,6.3,4466,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,250000000,112.31295,1084939099,165.0,7.6,9106,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,260000000,43.926995,284139100,132.0,6.1,2124,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Revenue ML
from sklearn import preprocessing, linear_model
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, explained_variance_score
from sklearn.svm import SVR
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.neural_network import MLPRegressor

df_revenue_X_mtx = df_revenue_X.as_matrix()
df_revenue_Y_arr = df_revenue_Y['revenue'].values

# 80/20 split
X_train, X_test, Y_train, Y_test = train_test_split(
    df_revenue_X_mtx, df_revenue_Y_arr, 
    test_size=0.20, random_state=42
)

# Feature standardization
# min_max_scaler = preprocessing.MinMaxScaler()
# X_train_minmax = min_max_scaler.fit_transform(X_train)
# X_test_minmax = min_max_scaler.transform(X_test)
# print(X_train_minmax)
X_train_minmax, X_test_minmax = X_train, X_test

# Feature selection (percentile)
def get_selection(per):
    # Global: X_train_minmax, Y_train, X_test_minmax
    selection = SelectPercentile(percentile=per, score_func=chi2)
    X_train_selected = selection.fit_transform(X_train_minmax, Y_train)
    X_test_selected = selection.transform(X_test_minmax)
    return X_train_selected, X_test_selected

def fit_and_predict (model, train_test_tup, params = {}):
    # Global: Y_train
    x_train, x_test = train_test_tup
    if (len(params) > 0):
        # Performs 5-fold cross-validation with the above classifier and parameter options
        gs = GridSearchCV(model, params, cv=5)
        # Train the model using the training sets
        gs.fit(x_train, Y_train)
        # Make predictions using the testing set
        y_pred = gs.predict(x_test)
        print("\t\tBest parameter settings:", gs.best_params_)
        return gs, y_pred
    else:
        # Train the model using the training sets
        model.fit(x_train, Y_train)
        # Make predictions using the testing set
        y_pred = model.predict(x_test)
        return model, y_pred

def show_metrics (tup):
    # Global: Y_test
    model, y_pred = tup
    # The mean squared error
    print("\t\tMean squared error: %.2f" % mean_squared_error(Y_test, y_pred))
    # Explained variance score: 1 is perfect prediction
    print("\t\tVariance score: %.2f" % r2_score(Y_test, y_pred))
    # Different?
    print("\t\tExplained variance score: %.2f" % explained_variance_score(Y_test, y_pred))
    print("\n")
    

percentiles = [1,5,10,20,40,60,80,100]
C_param = [0.001, 0.01, 0.1, 1.0, 10, 100, 1000]
Layer_param = [(200), (300), (100,100), (300,100)]
Kernel_param = ['rbf'] #,'linear','poly'

for p in percentiles:
    print("Percentile: {}".format(p))
    select_data = get_selection(p)
    
    # Simple linear regression model
    print("\tLinear Regression:")
    regr = linear_model.LinearRegression()
    show_metrics(fit_and_predict(regr, select_data))

    # Support Vector Regression
    print("\tSupport Vector Regression:")
    svr_regr = SVR(gamma=0.1)
    show_metrics(fit_and_predict(svr_regr, select_data, {
        'C': C_param,
        'kernel': Kernel_param
    }))
    
    # Multi-layer Perceptron Regressor
    print("\tMulti-layer Perceptron Regressor:")
    mlp_regr = MLPRegressor(random_state=123)
    show_metrics(fit_and_predict(mlp_regr, select_data, {
        'hidden_layer_sizes': Layer_param
    }))


Percentile: 1
	Linear Regression:
		Mean squared error: 6512572838733131.00
		Variance score: 0.75
		Explained variance score: 0.75


	Support Vector Regression:
		Best parameter settings: {'C': 1000, 'kernel': 'rbf'}
		Mean squared error: 30326492826145536.00
		Variance score: -0.16
		Explained variance score: 0.00


	Multi-layer Perceptron Regressor:
		Best parameter settings: {'hidden_layer_sizes': 200}
		Mean squared error: 9453789112746722.00
		Variance score: 0.64
		Explained variance score: 0.64


Percentile: 5
	Linear Regression:
		Mean squared error: 6515237972215048.00
		Variance score: 0.75
		Explained variance score: 0.75


	Support Vector Regression:
		Best parameter settings: {'C': 1000, 'kernel': 'rbf'}
		Mean squared error: 30326493046920228.00
		Variance score: -0.16
		Explained variance score: 0.00


	Multi-layer Perceptron Regressor:
		Best parameter settings: {'hidden_layer_sizes': 200}
		Mean squared error: 9440303533448342.00
		Variance score: 0.64
		Explained var