In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_score, KFold, RepeatedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.externals import joblib
from keras import models
from keras import layers
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn import preprocessing
from sklearn import svm
from sklearn import metrics
from xgboost import XGBClassifier
pd.options.mode.chained_assignment = None  # default='warn'
import datetime, time
import warnings
warnings.filterwarnings('ignore')
print('Libraries Imported')

# Set random seed
np.random.seed(17)

  from numpy.core.umath_tests import inner1d
Using TensorFlow backend.


## Data

In [14]:
i_path = 'D:\\Data\\Box-Office-Forecasting'
m = pd.read_csv(os.path.join(i_path, 'movie-master-final.csv'), header=0, sep=';', engine='python', encoding= 'utf8')

#target_variable = 'revenue_range'; problem_type = 'MULTICLASS'
target_variable = 'is_profitable'; problem_type = 'BINARY'

all_features = ['mpaa', 'budget', 'seasonality', 'is_sequel', 'screen_count', 'runtime']
yc_features = ['like_ratio', 'polarity_tb', 'polarity_sia']
# all_features = all_features + yc_features
data = m[all_features + [target_variable]]

# like_ratio column has some 'infinity' values, we replace them with one.
data = data.replace(np.Inf, 1)

# Factorize revenue_range to get numbers instead of labels
factor = pd.factorize(data[target_variable])
data[target_variable] = factor[0]
definitions = factor[1]

# Use LabelEncoder to convert textual classifications to numeric. We will use the same encoder later to convert them back.
encoder = preprocessing.LabelEncoder()
if 'mpaa' in all_features:
    data['mpaa'] = encoder.fit_transform(data['mpaa'].astype(str))
if 'genre' in all_features:
    data['genre'] = encoder.fit_transform(data['genre'].astype(str))

# Split columns into independent/predictor variables vs dependent/response/outcome variable
X = np.array(data.drop([target_variable], 1))
y = np.array(data[target_variable])

# Scale the data. We will use the same scaler later for scoring function
scaler = preprocessing.StandardScaler().fit(X)
X = scaler.transform(X)

# Training - Test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20, random_state=17)

# 5 fold stratified cross validation
kf = KFold(n_splits=5, shuffle=True, random_state=17)

## Model

In [15]:
# Create file for results
date_part = datetime.datetime.fromtimestamp(time.time()).strftime('%Y%m%d-%H%M%S')
file_name = "D:\\Master\\Thesis Related\\Results\\" + problem_type + "_Results_{0}.txt".format(date_part)
f = open(file_name, "w+")
f.write("Model Training has started.\n")
f.write("Number of Features: {0}\n".format(len(all_features)))
f.write("Selected Features: {0}\n".format(all_features))
f.close()

In [16]:
def classification_report_df(report):
    report_data = []
    lines = report.split('\n')
    for line in lines[2:-3]:
        row = {}
        row_data = line.split('      ')
        row['f1_score'] = float(row_data[3])
        report_data.append(float(row_data[3]))
    #dataframe = pd.DataFrame.from_dict(report_data)
    return report_data

# Defining fit_algorithm function
def fit_algorithm(alg, X_train, X_test, y_train, y_test, parameters, cv = 5):
    """
    This function will split our dataset into training and testing subsets, fit cross-validated 
    GridSearch object, test it on the holdout set and return some statistics
    """
    f = open(file_name, "a")
    
    start_time = time.time()
    f.write("============================================================================\n")
    print("Applying {0}...".format(alg.__class__.__name__))
    f.write("Applying {0}...\n".format(alg.__class__.__name__))

    grid = GridSearchCV(alg, parameters, cv = cv, n_jobs=-1, scoring = 'accuracy')
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)
        
    # Get Detailed Cross-Validation Scores
    d = []
    n_param_sets = len(grid.cv_results_['params'])
    for i in range(0, n_param_sets):
        d.append({'CV Score': grid.cv_results_['mean_test_score'][i], 'Parameter Set': grid.cv_results_['params'][i]})
    param_scores = pd.DataFrame(d)
    
    elapsed_time = time.time() - start_time
    print("Fitting is complete in {0} \n".format(elapsed_time)); f.write("Fitting is complete in {0} \n\n".format(elapsed_time));
    print("Results"); f.write("Results\n")
    print("Best CV Score: {0}".format(grid.best_score_)); f.write("Best CV Score: {0}\n".format(grid.best_score_))
    print("Test Accuracy: {0}".format(accuracy_score(y_test, y_pred))); print("")
    f.write("Test Accuracy: {0}".format(accuracy_score(y_test, y_pred))); f.write("\n\n")
    
    f.write("Confusion Matrix: \n")
    f.write(str(pd.crosstab(y_test, y_pred, rownames=['Actual Labels'], colnames=['Predicted Labels'])))
    f.write("\n\n")
    f.write("Detailed CV Scores: \n")
    f.write(str(param_scores)); f.write("\n\n");
    f.write("Classification Report: \n")
    f.write(metrics.classification_report(y_test, y_pred))
    f.write("\n\n")
    
    report = metrics.classification_report(y_test, y_pred)
    classification_report = classification_report_df(report)
    values = {
        "model": alg.__class__.__name__, 
        "cv_acc": np.around(grid.best_score_, decimals=3).astype(str),
        "test_acc": np.around(accuracy_score(y_test, y_pred), decimals=3).astype(str),
        "test_f1": np.around(f1_score(y_pred, y_test, average='weighted'),decimals=3).astype(str),
        #"f1_labels": np.around(f1_score(y_pred, y_test, average='weighted'),decimals=3).astype(str),
        "best_params": [grid.best_params_],
        }
    for i in range(0, len(classification_report)):
        values["f1_" + str(i)] = classification_report[i]
    summary = pd.DataFrame(pd.Series(values)).transpose()
    
    f.write("Algorithm Summary: \n")
    f.write(str(summary)); f.write("\n\n");
    f.close()
    return summary 

In [17]:
# Decision Tree
params = {'max_depth': [4, 5, 6], 'max_features': [4, 5, 6]}
model = fit_algorithm(DecisionTreeClassifier(), X_train, X_test, y_train, y_test, params, kf)
models = model

Applying DecisionTreeClassifier...
Fitting is complete in 3.651630401611328 

Results
Best CV Score: 0.756152972358955
Test Accuracy: 0.7549167927382754



In [18]:
# kNN
params = {'n_neighbors': np.arange(4, 5, 6).tolist()}
model = fit_algorithm(KNeighborsClassifier(), X_train, X_test, y_train, y_test, params, kf)
models = models.append(model, ignore_index=True)

Applying KNeighborsClassifier...
Fitting is complete in 2.7359657287597656 

Results
Best CV Score: 0.6925407042786823
Test Accuracy: 0.7276853252647504



In [19]:
# SVM
params = {'C': [0.001, 0.01, 0.1, 1, 10] , 'gamma': [0.001, 0.01, 0.1, 1], 
              'kernel': ['rbf', 'poly'], 'degree': [2, 3], #degree parameter ignored for kernels other than polynomial.
              'class_weight': ['balanced']}
model = fit_algorithm(svm.SVC(probability=True), X_train, X_test, y_train, y_test, params, kf)
models = models.append(model, ignore_index=True)

Applying SVC...
Fitting is complete in 387.0362477302551 

Results
Best CV Score: 0.7663763725861417
Test Accuracy: 0.7715582450832073



In [20]:
# Multilayer Perceptron
params = {'solver': ['adam'] , 'alpha': [0.001], 
          'hidden_layer_sizes': [(5,3), (4,2), (5,2)], 'activation': ['tanh']}
model = fit_algorithm(MLPClassifier(), X_train, X_test, y_train, y_test, params, kf)
models = models.append(model, ignore_index=True)

Applying MLPClassifier...
Fitting is complete in 6.808453321456909 

Results
Best CV Score: 0.7656190836804241
Test Accuracy: 0.7776096822995462



In [21]:
# Random Forest
params = { 'n_estimators': [200, 500], 'max_features': ['auto', 'sqrt', 'log2'], 'max_depth' : [4, 5, 6],
              'criterion' :['gini', 'entropy']}
model = fit_algorithm(RandomForestClassifier(), X_train, X_test, y_train, y_test, params, kf)
models = models.append(model, ignore_index=True)

Applying RandomForestClassifier...
Fitting is complete in 38.76346468925476 

Results
Best CV Score: 0.7731919727375994
Test Accuracy: 0.7791225416036308



In [22]:
# XGB
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [4, 5, 6]}
model = fit_algorithm(XGBClassifier(), X_train, X_test, y_train, y_test, params, kf)
models = models.append(model, ignore_index=True)

Applying XGBClassifier...
Fitting is complete in 113.62389874458313 

Results
Best CV Score: 0.794774706550549
Test Accuracy: 0.8245083207261724



In [23]:
# Artifical Neural Networks
number_of_features = len(all_features)
# Create function returning a compiled network
def create_network(optimizer='rmsprop'):
    
    # Start neural network
    network = models.Sequential()

    # Add fully connected layer with a ReLU activation function
    network.add(layers.Dense(units=16, activation='relu', input_shape=(number_of_features,)))

    # Add fully connected layer with a ReLU activation function
    network.add(layers.Dense(units=16, activation='relu'))

    # Add fully connected layer with a sigmoid activation function
    network.add(layers.Dense(units=1, activation='sigmoid'))

    # Compile neural network
    network.compile(loss='binary_crossentropy', # Cross-entropy
                    optimizer=optimizer, # Optimizer
                    metrics=['accuracy']) # Accuracy performance metric
    
    # Return compiled network
    return network

# Wrap Keras model so it can be used by scikit-learn
neural_network = KerasClassifier(build_fn=create_network, verbose=0)
# params = { 'epochs':[5, 10], 'batches' : [32, 128, 512], 'momentum' : [0.5, 0.7, 0.9], 'optimizers' : ['rmsprop', 'adam']}
params = { 'epochs':[10], 'batches' : [32]}
# model = fit_algorithm(neural_network, X_train, X_test, y_train, y_test, params) #no kf
# models = models.append(model, ignore_index=True)

In [24]:
models

Unnamed: 0,model,cv_acc,test_acc,test_f1,best_params,f1_0,f1_1
0,DecisionTreeClassifier,0.756,0.755,0.756,"[{'max_depth': 6, 'max_features': 4}]",0.71,0.8
1,KNeighborsClassifier,0.693,0.728,0.727,[{'n_neighbors': 4}],0.74,0.71
2,SVC,0.766,0.772,0.772,"[{'C': 10, 'class_weight': 'balanced', 'degree...",0.72,0.82
3,MLPClassifier,0.766,0.778,0.78,"[{'activation': 'tanh', 'alpha': 0.001, 'hidde...",0.68,0.87
4,RandomForestClassifier,0.773,0.779,0.781,"[{'criterion': 'gini', 'max_depth': 6, 'max_fe...",0.68,0.86
5,XGBClassifier,0.795,0.825,0.825,"[{'colsample_bytree': 1.0, 'gamma': 1, 'max_de...",0.78,0.86


#### Save Models to Excel

In [25]:
models = models.sort_values(by='cv_acc', ascending=False)
models.to_excel(file_name.replace('txt', 'xlsx'), encoding= 'utf8', header=True, index=False)