In [1]:
# Imports
import pandas as pd
import sqlite3
import numpy as np
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import statsmodels.api as sm
from matplotlib import pyplot as plt
import ast

%matplotlib inline

# Machine Learning Tools, Utilities, and Scoring Metrics
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

# Suite of Machine Learning Algorithms
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import xgboost as xgb

# Measure time and memory usage
import time
import psutil
import os

# Setup to Ignore Version Errors and Deprecations
import warnings
warnings.filterwarnings("ignore")

## Machine Learning Extension

In [2]:
df_relevant = pd.read_csv('data/machine_learning_data.csv')
df_relevant.head()

Unnamed: 0,primary_title,runtime_minutes,genres,averagerating,release_date,production_budget,revenue,roi,release_month
0,Foodfight!,91.0,"['Action', 'Animation', 'Comedy']",1.9,"Dec 31, 2012",45000000.0,73706.0,-99.836209,Dec
1,The Secret Life of Walter Mitty,114.0,"['Adventure', 'Comedy', 'Drama']",7.3,"Dec 25, 2013",91000000.0,187861200.0,106.44086,Dec
2,A Walk Among the Tombstones,114.0,"['Action', 'Crime', 'Drama']",6.5,"Sep 19, 2014",28000000.0,62108590.0,121.816382,Sep
3,Jurassic World,124.0,"['Action', 'Adventure', 'Sci-Fi']",7.0,"Jun 12, 2015",215000000.0,1648855000.0,666.909239,Jun
4,The Rum Diary,119.0,"['Comedy', 'Drama']",6.2,"Oct 28, 2011",45000000.0,21544730.0,-52.122818,Oct


In [3]:
df_relevant.isna().sum()

primary_title         0
runtime_minutes      80
genres                5
averagerating         0
release_date          0
production_budget     0
revenue               0
roi                   0
release_month         0
dtype: int64

In [4]:
df_relevant.dropna(inplace=True)

In [5]:
df_relevant['genres']

0         ['Action', 'Animation', 'Comedy']
1          ['Adventure', 'Comedy', 'Drama']
2              ['Action', 'Crime', 'Drama']
3         ['Action', 'Adventure', 'Sci-Fi']
4                       ['Comedy', 'Drama']
                       ...                 
2362    ['Adventure', 'Biography', 'Drama']
2363                        ['Documentary']
2365                              ['Drama']
2366                             ['Comedy']
2367                        ['Documentary']
Name: genres, Length: 2286, dtype: object

In [6]:
df_relevant['genres'] = df_relevant['genres'].str.split(',')
dummies = df_relevant['genres'].str.join('|').str.get_dummies()
dummies.head()

Unnamed: 0,'Adventure','Adventure'],'Animation','Animation'],'Biography','Biography'],'Comedy','Comedy'],'Crime','Crime'],...,['Mystery',['Mystery'],['Romance',['Romance'],['Sci-Fi',['Sci-Fi'],['Sport'],['Thriller'],['War'],['Western']
0,0,0,1,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
ml_prep = pd.concat([df_relevant, dummies], axis=1)
ml_prep.head()

Unnamed: 0,primary_title,runtime_minutes,genres,averagerating,release_date,production_budget,revenue,roi,release_month,'Adventure',...,['Mystery',['Mystery'],['Romance',['Romance'],['Sci-Fi',['Sci-Fi'],['Sport'],['Thriller'],['War'],['Western']
0,Foodfight!,91.0,"[['Action', 'Animation', 'Comedy']]",1.9,"Dec 31, 2012",45000000.0,73706.0,-99.836209,Dec,0,...,0,0,0,0,0,0,0,0,0,0
1,The Secret Life of Walter Mitty,114.0,"[['Adventure', 'Comedy', 'Drama']]",7.3,"Dec 25, 2013",91000000.0,187861200.0,106.44086,Dec,0,...,0,0,0,0,0,0,0,0,0,0
2,A Walk Among the Tombstones,114.0,"[['Action', 'Crime', 'Drama']]",6.5,"Sep 19, 2014",28000000.0,62108590.0,121.816382,Sep,0,...,0,0,0,0,0,0,0,0,0,0
3,Jurassic World,124.0,"[['Action', 'Adventure', 'Sci-Fi']]",7.0,"Jun 12, 2015",215000000.0,1648855000.0,666.909239,Jun,1,...,0,0,0,0,0,0,0,0,0,0
4,The Rum Diary,119.0,"[['Comedy', 'Drama']]",6.2,"Oct 28, 2011",45000000.0,21544730.0,-52.122818,Oct,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
"""
Bin roi as movie_success so we can focus use a classifier instead of regressor

0 = Flop
1 = Breakeven
2 = Blockbuster
"""


def movie_success_labeler(roi):
    if roi < 100:
        return 0
    elif 100 <= roi < 200:
        return 1
    else:
        return 2

In [9]:
ml_prep.loc[:, 'movie_success'] = ml_prep['roi'].map(movie_success_labeler)
ml_prep.head()

Unnamed: 0,primary_title,runtime_minutes,genres,averagerating,release_date,production_budget,revenue,roi,release_month,'Adventure',...,['Mystery'],['Romance',['Romance'],['Sci-Fi',['Sci-Fi'],['Sport'],['Thriller'],['War'],['Western'],movie_success
0,Foodfight!,91.0,"[['Action', 'Animation', 'Comedy']]",1.9,"Dec 31, 2012",45000000.0,73706.0,-99.836209,Dec,0,...,0,0,0,0,0,0,0,0,0,0
1,The Secret Life of Walter Mitty,114.0,"[['Adventure', 'Comedy', 'Drama']]",7.3,"Dec 25, 2013",91000000.0,187861200.0,106.44086,Dec,0,...,0,0,0,0,0,0,0,0,0,1
2,A Walk Among the Tombstones,114.0,"[['Action', 'Crime', 'Drama']]",6.5,"Sep 19, 2014",28000000.0,62108590.0,121.816382,Sep,0,...,0,0,0,0,0,0,0,0,0,1
3,Jurassic World,124.0,"[['Action', 'Adventure', 'Sci-Fi']]",7.0,"Jun 12, 2015",215000000.0,1648855000.0,666.909239,Jun,1,...,0,0,0,0,0,0,0,0,0,2
4,The Rum Diary,119.0,"[['Comedy', 'Drama']]",6.2,"Oct 28, 2011",45000000.0,21544730.0,-52.122818,Oct,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
ml_prep['release_month'] = ml_prep['release_month'].map({
    'Jan': 1,
    'Feb': 2,
    'Mar': 3,
    'Apr': 4,
    'May': 5,
    'Jun': 6,
    'Jul': 7,
    'Aug': 8,
    'Sep': 9,
    'Oct': 10,
    'Nov': 11,
    'Dec': 12
})

In [11]:
ml_prep['release_month'].value_counts()

release_month
12    281
10    227
8     196
11    191
3     190
9     189
4     188
6     187
7     180
2     161
5     149
1     147
Name: count, dtype: int64

In [12]:
# Drop primary_title, genres, release_date, production_budget, revenue, roi
try: ml_prep.drop(columns=['primary_title', 'genres', 'release_date', 'production_budget', 'revenue', 'roi'], inplace=True)
except: pass
ml_prep.head()

Unnamed: 0,runtime_minutes,averagerating,release_month,'Adventure','Adventure'],'Animation','Animation'],'Biography','Biography'],'Comedy',...,['Mystery'],['Romance',['Romance'],['Sci-Fi',['Sci-Fi'],['Sport'],['Thriller'],['War'],['Western'],movie_success
0,91.0,1.9,12,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,114.0,7.3,12,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,114.0,6.5,9,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,124.0,7.0,6,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
4,119.0,6.2,10,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
# Standardize runtime_minutes, averagerating
scaled_ml_prep = ml_prep.copy()
scaler = StandardScaler()
columns_to_scale = ['runtime_minutes', 'averagerating']
scaled_ml_prep[columns_to_scale] = scaler.fit_transform(scaled_ml_prep[columns_to_scale])
scaled_ml_prep.head()

Unnamed: 0,runtime_minutes,averagerating,release_month,'Adventure','Adventure'],'Animation','Animation'],'Biography','Biography'],'Comedy',...,['Mystery'],['Romance',['Romance'],['Sci-Fi',['Sci-Fi'],['Sport'],['Thriller'],['War'],['Western'],movie_success
0,-0.65958,-3.793672,12,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.493479,0.920929,12,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,0.493479,0.22247,9,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0.994809,0.659007,6,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2
4,0.744144,-0.039453,10,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
# Set roi as target; others as features
X, y = scaled_ml_prep.drop(columns='movie_success').values, scaled_ml_prep['movie_success']
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    train_size=0.8,
                                                    test_size=0.2,
                                                    random_state=42)

In [15]:
def test_classification_models(X_train, y_train):
    # Adjustments
    CV = 10
    SCORING = 'accuracy'
    
    
    # Define the models to be tested
    models = {
        "Logistic Regression": LogisticRegression(random_state=42),
        "K-Nearest Neighbors": KNeighborsClassifier(),
        "Naive Bayes": GaussianNB(),
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "Random Forest": RandomForestClassifier(random_state=42),
        "Gradient Boosting": GradientBoostingClassifier(random_state=42),
        "XGBoost": xgb.XGBClassifier(random_state=42),
        "Support Vector Machine": SVC(random_state=42)
    }
    
    # Initialize results dictionary to store metrics for each model
    results = {
        "Model": [],
        "Accuracy (%)": [],
        "Spread (std)": [],
        "Train Time (s)": [],
        "Memory Usage (MB)": []
    }
    
    # Perform model evaluation for each model
    for model_name, model in models.items():
        # Measure training time
        start_train_time = time.time()

        # Perform 10-fold cross-validation to evaluate the model on the training data
        cv_scores = cross_val_score(model, X_train, y_train, cv=CV, scoring=SCORING)

        end_train_time = time.time()
        train_time = end_train_time - start_train_time

        # Measure memory usage (in MB)
        memory_usage = psutil.Process(os.getpid()).memory_info().rss / (1024 * 1024)
        
        # Store the metrics in the results dictionary
        results["Model"].append(model_name)
        results["Accuracy (%)"].append(round(np.mean(cv_scores) * 100, 2))
        results["Spread (std)"].append(round(np.std(cv_scores), 4))
        results["Train Time (s)"].append(round(train_time, 4))
        results["Memory Usage (MB)"].append(round(memory_usage, 0))
        
    # Create a DataFrame to display the results
    results_df = pd.DataFrame(results)
    
    # Print the results
    return results_df

In [16]:
results_df = test_classification_models(X_train, y_train)
results_df

Unnamed: 0,Model,Accuracy (%),Spread (std),Train Time (s),Memory Usage (MB)
0,Logistic Regression,57.66,0.0267,0.6849,205.0
1,K-Nearest Neighbors,52.79,0.0267,0.2446,207.0
2,Naive Bayes,15.37,0.0157,0.0404,207.0
3,Decision Tree,44.69,0.0439,0.2589,206.0
4,Random Forest,58.1,0.0238,6.7254,204.0
5,Gradient Boosting,58.97,0.0173,12.0333,207.0
6,XGBoost,56.78,0.0251,2.9984,215.0
7,Support Vector Machine,55.8,0.0101,3.3701,217.0


In [17]:
results_df.sort_values(['Accuracy (%)'], ascending=False)

Unnamed: 0,Model,Accuracy (%),Spread (std),Train Time (s),Memory Usage (MB)
5,Gradient Boosting,58.97,0.0173,12.0333,207.0
4,Random Forest,58.1,0.0238,6.7254,204.0
0,Logistic Regression,57.66,0.0267,0.6849,205.0
6,XGBoost,56.78,0.0251,2.9984,215.0
7,Support Vector Machine,55.8,0.0101,3.3701,217.0
1,K-Nearest Neighbors,52.79,0.0267,0.2446,207.0
3,Decision Tree,44.69,0.0439,0.2589,206.0
2,Naive Bayes,15.37,0.0157,0.0404,207.0


### Model Selection
- Gradient Boosting shows best average accuracy and has a relatively low spread.
- Logistic Regression is earning comparable accuracy at a fraction of the training time

I will tune each of these models to see which has the best overall performance.