In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import sqlite3
from pathlib import Path
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import export_graphviz
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.utils import plot_model
import keras_tuner as kt
from joblib import dump, load
import graphviz
import xgboost as xgb


In [2]:
# Connect to DB and import data into a pandas dataframe
connection = sqlite3.connect('../data/db.sqlite')
ratings = pd.read_sql_query("SELECT * FROM ratings", connection)
clean = pd.read_sql_query("SELECT * FROM clean", connection)
dummies = pd.read_sql_query("SELECT * FROM dummies", connection)
clean2 = pd.read_sql_query("SELECT * FROM clean2", connection)
dummies2 = pd.read_sql_query("SELECT * FROM dummies2", connection)

connection.close()

## Neural Network Machine Learning

In [3]:
# Deep Learning Model Function
def preprocess_data(df, sampling=None):
    X = df.drop(['Binary Rating'], axis=1)
    y = df['Binary Rating']
    
    if sampling == 'over':
        oversample = RandomOverSampler(sampling_strategy='minority')
        X_resampled, y_resampled = oversample.fit_resample(X, y)
        print(y.describe())
    elif sampling == 'under':
        undersample = RandomUnderSampler(sampling_strategy='majority')
        X_resampled, y_resampled = undersample.fit_resample(X, y)
        print(y.describe())
    elif sampling == 'smote':
        smt = SMOTE()
        X_resampled, y_resampled = smt.fit_resample(X, y)
        print(y.describe())
    else:
        X_resampled, y_resampled = X, y
    
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=42)
    scaler = StandardScaler()
    X_scaler = scaler.fit(X_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)
    return X_train_scaled, X_test_scaled, y_train, y_test

def build_model(X_train_scaled, num_layers, first_layer_neurons, middle_layer_neurons, last_layer_neurons, first_layer_activation, middle_layer_activation, last_layer_activation):
    number_input_features = len(X_train_scaled[0])
    nn = tf.keras.models.Sequential()
    # First layer
    nn.add(tf.keras.layers.Dense(units=first_layer_neurons, input_dim=number_input_features, activation=first_layer_activation))
    # Middle layers
    for _ in range(num_layers - 2):
        nn.add(tf.keras.layers.Dense(units=middle_layer_neurons, activation=middle_layer_activation))
    # Last layer
    nn.add(tf.keras.layers.Dense(units=last_layer_neurons, activation=last_layer_activation))
    nn.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return nn

def train_model(nn, X_train_scaled, y_train, EPOCHS, model):
    mini_path = f'../models/deeplearning/checkpoints/model{model}'
    checkpoint_filepath = mini_path + 'weights.epoch_{epoch:02d}.hdf5'
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
        filepath=checkpoint_filepath,
        save_weights_only=True,
        monitor='val_accuracy',
        mode='max',
        save_freq=5,
        verbose=True)
    fit_model = nn.fit(X_train_scaled, y_train, epochs=EPOCHS, callbacks=[model_checkpoint_callback])
    return fit_model

def evaluate_model(nn, X_test_scaled, y_test, model):
    model_loss, model_accuracy = nn.evaluate(X_test_scaled, y_test, verbose=2)
    print(f"Model {model}-- Loss: {model_loss}, Accuracy: {model_accuracy}")
    
    predictions = nn.predict(X_test_scaled)
    predictions = np.argmax(predictions, axis=1)
    
    cm = confusion_matrix(y_test, predictions)
    print("Confusion Matrix:")
    print(cm)
    
    # Calculate precision, recall, and F1-score
    print("\nClassification Report:")
    print(classification_report(y_test, predictions))
    
    return cm

def plot_confusion_matrix(cm, model):
    sns.set()
    plt.figure(figsize=(10, 7))
    sns.heatmap(cm, annot=True, fmt="d", cmap='Blues', cbar=False)
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.title('Confusion Matrix')
    plt.savefig(f'../img/models/deeplearning/model{model}_confusion_matrix.png')
    plt.show()

def save_model_plot(nn, model):
    plot_model(nn, to_file=f'../img/models/deeplearning/model{model}_plot.png', show_shapes=True, show_layer_names=True)

def save_model(nn, model):
    nn.save(f'../models/deeplearning/model{model}/model.h5')

def pipeline(df, num_layers, first_layer_neurons, middle_layer_neurons, last_layer_neurons,
             first_layer_activation, middle_layer_activation, last_layer_activation,
             EPOCHS, model, sampling=None):
    X_train_scaled, X_test_scaled, y_train, y_test = preprocess_data(df, sampling)
    nn = build_model(X_train_scaled, num_layers, first_layer_neurons, middle_layer_neurons, last_layer_neurons,
                     first_layer_activation, middle_layer_activation, last_layer_activation)
    trained_model = train_model(nn, X_train_scaled, y_train, EPOCHS, model)
    cm = evaluate_model(nn, X_test_scaled, y_test, model)
    plot_confusion_matrix(cm, model)
    save_model_plot(nn, model)
    save_model(nn, model)


Running it

In [4]:
# Model 1 Testing
pipeline(df=dummies2,
         num_layers=3,
         first_layer_neurons=64,
         middle_layer_neurons=32,
         last_layer_neurons=1,
         first_layer_activation='relu',
         middle_layer_activation='relu',
         last_layer_activation='sigmoid',
         EPOCHS=10,
         model=1,
         sampling='smote')

count    7805.000000
mean        0.653299
std         0.475950
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         1.000000
Name: Binary Rating, dtype: float64
Epoch 1/10


2024-03-24 22:44:53.032500: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


  1/239 [..............................] - ETA: 44s - loss: 0.6608 - accuracy: 0.4062
Epoch 1: saving model to ../models/deeplearning/checkpoints/model1weights.epoch_01.hdf5

Epoch 1: saving model to ../models/deeplearning/checkpoints/model1weights.epoch_01.hdf5

Epoch 1: saving model to ../models/deeplearning/checkpoints/model1weights.epoch_01.hdf5

Epoch 1: saving model to ../models/deeplearning/checkpoints/model1weights.epoch_01.hdf5

Epoch 1: saving model to ../models/deeplearning/checkpoints/model1weights.epoch_01.hdf5

Epoch 1: saving model to ../models/deeplearning/checkpoints/model1weights.epoch_01.hdf5

Epoch 1: saving model to ../models/deeplearning/checkpoints/model1weights.epoch_01.hdf5
 35/239 [===>..........................] - ETA: 0s - loss: 0.6665 - accuracy: 0.6429 
Epoch 1: saving model to ../models/deeplearning/checkpoints/model1weights.epoch_01.hdf5

Epoch 1: saving model to ../models/deeplearning/checkpoints/model1weights.epoch_01.hdf5

Epoch 1: saving model to ../

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  plt.show()


In [5]:
# Model 2 Testing
pipeline(df=dummies2,
         num_layers=2,
         first_layer_neurons=8,
         middle_layer_neurons=4,
         last_layer_neurons=1,
         first_layer_activation='relu',
         middle_layer_activation='relu',
         last_layer_activation='sigmoid',
         EPOCHS=75,
         model=2,
         sampling='none')

Epoch 1/75
  1/183 [..............................] - ETA: 28s - loss: 0.6794 - accuracy: 0.6562
Epoch 1: saving model to ../models/deeplearning/checkpoints/model2weights.epoch_01.hdf5

Epoch 1: saving model to ../models/deeplearning/checkpoints/model2weights.epoch_01.hdf5

Epoch 1: saving model to ../models/deeplearning/checkpoints/model2weights.epoch_01.hdf5

Epoch 1: saving model to ../models/deeplearning/checkpoints/model2weights.epoch_01.hdf5

Epoch 1: saving model to ../models/deeplearning/checkpoints/model2weights.epoch_01.hdf5

Epoch 1: saving model to ../models/deeplearning/checkpoints/model2weights.epoch_01.hdf5

Epoch 1: saving model to ../models/deeplearning/checkpoints/model2weights.epoch_01.hdf5

Epoch 1: saving model to ../models/deeplearning/checkpoints/model2weights.epoch_01.hdf5

Epoch 1: saving model to ../models/deeplearning/checkpoints/model2weights.epoch_01.hdf5
Epoch 1: saving model to ../models/deeplearning/checkpoints/model2weights.epoch_01.hdf5

Epoch 1: savin

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  plt.show()


In [6]:
# Model 3 Testing
pipeline(df=dummies2,
         num_layers=3,
         first_layer_neurons=64,
         middle_layer_neurons=32,
         last_layer_neurons=1,
         first_layer_activation='relu',
         middle_layer_activation='relu',
         last_layer_activation='sigmoid',
         EPOCHS=10,
         model=3,
         sampling='over')

count    7805.000000
mean        0.653299
std         0.475950
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         1.000000
Name: Binary Rating, dtype: float64
Epoch 1/10
  1/239 [..............................] - ETA: 35s - loss: 0.7801 - accuracy: 0.3438
Epoch 1: saving model to ../models/deeplearning/checkpoints/model3weights.epoch_01.hdf5

Epoch 1: saving model to ../models/deeplearning/checkpoints/model3weights.epoch_01.hdf5

Epoch 1: saving model to ../models/deeplearning/checkpoints/model3weights.epoch_01.hdf5

Epoch 1: saving model to ../models/deeplearning/checkpoints/model3weights.epoch_01.hdf5

Epoch 1: saving model to ../models/deeplearning/checkpoints/model3weights.epoch_01.hdf5

Epoch 1: saving model to ../models/deeplearning/checkpoints/model3weights.epoch_01.hdf5

Epoch 1: saving model to ../models/deeplearning/checkpoints/model3weights.epoch_01.hdf5

Epoch 1: saving model to ../models/deeplearning/checkpoints/model3weights.ep

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  plt.show()


In [7]:
# Model 3 Testing
pipeline(df=dummies2,
         num_layers=3,
         first_layer_neurons=64,
         middle_layer_neurons=32,
         last_layer_neurons=1,
         first_layer_activation='relu',
         middle_layer_activation='relu',
         last_layer_activation='sigmoid',
         EPOCHS=10,
         model=3,
         sampling='under')

count    7805.000000
mean        0.653299
std         0.475950
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         1.000000
Name: Binary Rating, dtype: float64
Epoch 1/10
  1/127 [..............................] - ETA: 21s - loss: 0.6945 - accuracy: 0.5312
Epoch 1: saving model to ../models/deeplearning/checkpoints/model3weights.epoch_01.hdf5

Epoch 1: saving model to ../models/deeplearning/checkpoints/model3weights.epoch_01.hdf5

Epoch 1: saving model to ../models/deeplearning/checkpoints/model3weights.epoch_01.hdf5

Epoch 1: saving model to ../models/deeplearning/checkpoints/model3weights.epoch_01.hdf5

Epoch 1: saving model to ../models/deeplearning/checkpoints/model3weights.epoch_01.hdf5

Epoch 1: saving model to ../models/deeplearning/checkpoints/model3weights.epoch_01.hdf5

Epoch 1: saving model to ../models/deeplearning/checkpoints/model3weights.epoch_01.hdf5
Epoch 1: saving model to ../models/deeplearning/checkpoints/model3weights.epo

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  plt.show()


In [8]:
# Model 4 Testing
pipeline(df=dummies2,
         num_layers=3,
         first_layer_neurons=64,
         middle_layer_neurons=32,
         last_layer_neurons=1,
         first_layer_activation='sigmoid',
         middle_layer_activation='sigmoid',
         last_layer_activation='sigmoid',
         EPOCHS=10,
         model=4,
         sampling='none')

Epoch 1/10
  1/183 [..............................] - ETA: 34s - loss: 0.8105 - accuracy: 0.3125
Epoch 1: saving model to ../models/deeplearning/checkpoints/model4weights.epoch_01.hdf5

Epoch 1: saving model to ../models/deeplearning/checkpoints/model4weights.epoch_01.hdf5

Epoch 1: saving model to ../models/deeplearning/checkpoints/model4weights.epoch_01.hdf5

Epoch 1: saving model to ../models/deeplearning/checkpoints/model4weights.epoch_01.hdf5

Epoch 1: saving model to ../models/deeplearning/checkpoints/model4weights.epoch_01.hdf5
 27/183 [===>..........................] - ETA: 0s - loss: 0.6808 - accuracy: 0.5498 
Epoch 1: saving model to ../models/deeplearning/checkpoints/model4weights.epoch_01.hdf5

Epoch 1: saving model to ../models/deeplearning/checkpoints/model4weights.epoch_01.hdf5

Epoch 1: saving model to ../models/deeplearning/checkpoints/model4weights.epoch_01.hdf5

Epoch 1: saving model to ../models/deeplearning/checkpoints/model4weights.epoch_01.hdf5

Epoch 1: saving m

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  plt.show()


The confusion matrix is suggesting that either the embalance in the data or the model design is making this model fail. Adjusting with over sampling, undersampling, and SMOTE techniques are not addressing the issue. Embalance in the features may also be contributing. 

## Random Forest Model

In [9]:
def preprocess_data(df, sampling=None):
    X = df.drop(['Binary Rating'], axis=1)
    y = df['Binary Rating']
    
    if sampling == 'over':
        oversample = RandomOverSampler(sampling_strategy='minority')
        X_resampled, y_resampled = oversample.fit_resample(X, y)
        print(y.describe())
    elif sampling == 'under':
        undersample = RandomUnderSampler(sampling_strategy='majority')
        X_resampled, y_resampled = undersample.fit_resample(X, y)
        print(y.describe())
    elif sampling == 'smote':
        smt = SMOTE()
        X_resampled, y_resampled = smt.fit_resample(X, y)
        print(y.describe())
    else:
        X_resampled, y_resampled = X, y
    
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=42)
    scaler = StandardScaler()
    X_scaler = scaler.fit(X_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)
    return X_train_scaled, X_test_scaled, y_train, y_test, X.columns.tolist()

def train_random_forest(X_train_scaled, y_train, n_estimators=500, random_state=78):
    rf_model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state)
    rf_model = rf_model.fit(X_train_scaled, y_train)
    return rf_model

def evaluate_random_forest(rf_model, X_test_scaled, y_test):
    predictions = rf_model.predict(X_test_scaled)
    cm = confusion_matrix(y_test, predictions)
    acc_score = accuracy_score(y_test, predictions)
    print("Confusion Matrix:")
    print(pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]))
    print(f"Accuracy Score: {acc_score}")
    print("Classification Report:")
    print(classification_report(y_test, predictions))
    return cm

def plot_confusion_matrix(cm, model):
    sns.set()
    plt.figure(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title('Confusion Matrix')
    plt.xlabel('Predicted Labels')
    plt.ylabel('Actual Labels')
    plt.savefig(f'../img/models/random_forest/model{model}_confusion_matrix.png')
    plt.show()

def plot_feature_importances(rf_model, feature_names, model):
    importances = rf_model.feature_importances_
    importances_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    importances_df = importances_df.sort_values(by='Importance', ascending=True)  
    plt.figure(figsize=(10, 6))
    plt.barh(importances_df['Feature'], importances_df['Importance'], color='lightgreen')
    plt.title(f"Model {model}'s Features Importances")
    plt.xlabel('Importance')
    plt.ylabel('Features')
    plt.savefig(f'../img/models/random_forest/model{model}_importances_plot.png', bbox_inches='tight')
    plt.show()

def export_random_forest_tree(rf_model, X, model):
    single_tree = rf_model.estimators_[0]
    dot_data = export_graphviz(single_tree, out_file=None, 
                               feature_names=X.columns, 
                               class_names=['Class 0', 'Class 1'], 
                               filled=True, rounded=True, 
                               special_characters=True)
    graph = graphviz.Source(dot_data)
    graph.view(filename=f'../img/models/random_forest/model{model}_random_tree')
    dump(rf_model, f'../models/random_forest/model{model}.joblib')

def run_forest_pipeline(df, sampling=None, n_estimators=500, random_state=78, model=1):
    X_train_scaled, X_test_scaled, y_train, y_test, feature_names = preprocess_data(df, sampling)
    rf_model = train_random_forest(X_train_scaled, y_train, n_estimators, random_state)
    cm = evaluate_random_forest(rf_model, X_test_scaled, y_test)
    plot_confusion_matrix(cm, model)
    plot_feature_importances(rf_model, feature_names, model)
    export_random_forest_tree(rf_model, df.drop('Binary Rating', axis=1), model)

In [10]:
run_forest_pipeline(
    df=clean2, 
    sampling='none', 
    n_estimators=1000, 
    random_state=42, 
    model=1)

Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0          581           69
Actual 1           62         1240
Accuracy Score: 0.9328893442622951
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.89      0.90       650
           1       0.95      0.95      0.95      1302

    accuracy                           0.93      1952
   macro avg       0.93      0.92      0.92      1952
weighted avg       0.93      0.93      0.93      1952



  plt.show()
  plt.show()


In [11]:
run_forest_pipeline(
    df=clean2, 
    sampling='over', 
    n_estimators=1000, 
    random_state=42, 
    model=2)

count    7805.000000
mean        0.653299
std         0.475950
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         1.000000
Name: Binary Rating, dtype: float64
Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0         1201           60
Actual 1           88         1201
Accuracy Score: 0.9419607843137255
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.95      0.94      1261
           1       0.95      0.93      0.94      1289

    accuracy                           0.94      2550
   macro avg       0.94      0.94      0.94      2550
weighted avg       0.94      0.94      0.94      2550



  plt.show()
  plt.show()


Test

Blah

## SVC

In [12]:

def train_svc(X_train_scaled, y_train, kernel='rbf', C=1.0, gamma='scale', random_state=None):
    svc_model = SVC(kernel=kernel, C=C, gamma=gamma, random_state=random_state)
    svc_model.fit(X_train_scaled, y_train)
    return svc_model

def preprocess_data(df, sampling=None):
    X = df.drop(['Binary Rating'], axis=1)
    y = df['Binary Rating']
    
    if sampling == 'over':
        oversample = RandomOverSampler(sampling_strategy='minority')
        X_resampled, y_resampled = oversample.fit_resample(X, y)
        print(y.describe())
    elif sampling == 'under':
        undersample = RandomUnderSampler(sampling_strategy='majority')
        X_resampled, y_resampled = undersample.fit_resample(X, y)
        print(y.describe())
    elif sampling == 'smote':
        smt = SMOTE()
        X_resampled, y_resampled = smt.fit_resample(X, y)
        print(y.describe())
    else:
        X_resampled, y_resampled = X, y
    
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, random_state=42)
    scaler = StandardScaler()
    X_scaler = scaler.fit(X_train)
    X_train_scaled = X_scaler.transform(X_train)
    X_test_scaled = X_scaler.transform(X_test)
    return X_train_scaled, X_test_scaled, y_train, y_test, X.columns.tolist()

def evaluate_svc(svc_model, X_test_scaled, y_test):
    predictions = svc_model.predict(X_test_scaled)
    cm = confusion_matrix(y_test, predictions)
    acc_score = accuracy_score(y_test, predictions)
    print("Confusion Matrix:")
    print(pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]))
    print(f"Accuracy Score: {acc_score}")
    print("Classification Report:")
    print(classification_report(y_test, predictions))
    return cm

def save_svc_model(svc_model, model_number):
    dump(svc_model, f'../models/svc/model{model_number}.joblib')

def svc_pipeline(data_frame, sampling=None, model_number=1, kernel='rbf', C=1.0, gamma='scale', random_state=None):
    X_train_scaled, X_test_scaled, y_train, y_test, X = preprocess_data(data_frame, sampling)
    svc_model = train_svc(X_train_scaled, y_train, kernel=kernel, C=C, gamma=gamma, random_state=random_state)
    cm = evaluate_svc(svc_model, X_test_scaled, y_test)
    plot_confusion_matrix(cm, model_number)
    plt.savefig(f'../img/models/svc/model{model_number}_confusion_matrix.png') 
    save_svc_model(svc_model, model_number)


In [13]:
# Model 1
svc_pipeline(
    data_frame=clean2, 
    sampling='none',
    model_number=1)

Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0          361          289
Actual 1           84         1218
Accuracy Score: 0.8089139344262295
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.56      0.66       650
           1       0.81      0.94      0.87      1302

    accuracy                           0.81      1952
   macro avg       0.81      0.75      0.76      1952
weighted avg       0.81      0.81      0.80      1952



  plt.show()


In [14]:
# Model 2
svc_pipeline(
    data_frame=clean2, 
    sampling='over',
    model_number=2)

count    7805.000000
mean        0.653299
std         0.475950
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         1.000000
Name: Binary Rating, dtype: float64
Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0          916          345
Actual 1          249         1040
Accuracy Score: 0.7670588235294118
Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.73      0.76      1261
           1       0.75      0.81      0.78      1289

    accuracy                           0.77      2550
   macro avg       0.77      0.77      0.77      2550
weighted avg       0.77      0.77      0.77      2550



  plt.show()


In [15]:
# Model 3
svc_pipeline(
    data_frame=clean2, 
    sampling='under',
    model_number=3)

count    7805.000000
mean        0.653299
std         0.475950
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         1.000000
Name: Binary Rating, dtype: float64
Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0          522          184
Actual 1          123          524
Accuracy Score: 0.7730968218773097
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.74      0.77       706
           1       0.74      0.81      0.77       647

    accuracy                           0.77      1353
   macro avg       0.77      0.77      0.77      1353
weighted avg       0.78      0.77      0.77      1353



  plt.show()


Blah

In [16]:


def train_gbm(X_train_scaled, y_train, n_estimators=100, learning_rate=0.1, max_depth=3, random_state=None):
    gbm_model = GradientBoostingClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth, random_state=random_state)
    gbm_model.fit(X_train_scaled, y_train)
    return gbm_model

def evaluate_gbm(gbm_model, X_test_scaled, y_test):
    predictions = gbm_model.predict(X_test_scaled)
    cm = confusion_matrix(y_test, predictions)
    acc_score = accuracy_score(y_test, predictions)
    print("Confusion Matrix:")
    print(pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]))
    print(f"Accuracy Score: {acc_score}")
    print("Classification Report:")
    print(classification_report(y_test, predictions))
    return cm

def save_gbm_model(gbm_model, model_number):
    dump(gbm_model, f'../models/gbm/model{model_number}.joblib')

def gbm_pipeline(data_frame, sampling=None, model_number=1, n_estimators=100, learning_rate=0.1, max_depth=3, random_state=None):
    X_train_scaled, X_test_scaled, y_train, y_test, X = preprocess_data(data_frame, sampling)
    gbm_model = train_gbm(X_train_scaled, y_train, n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth, random_state=random_state)
    cm = evaluate_gbm(gbm_model, X_test_scaled, y_test)
    plot_confusion_matrix(cm, model_number)
    
    # Explicitly display the plot before saving
    plt.show()
    
    # Now save the figure
    plt.savefig(f'../img/models/gbm/model{model_number}_confusion_matrix.png')  
    save_gbm_model(gbm_model, model_number)

In [17]:
# Model 1
gbm_pipeline(data_frame=clean2, 
             model_number=1,
             sampling='none')

Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0          457          193
Actual 1           91         1211
Accuracy Score: 0.8545081967213115
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.70      0.76       650
           1       0.86      0.93      0.90      1302

    accuracy                           0.85      1952
   macro avg       0.85      0.82      0.83      1952
weighted avg       0.85      0.85      0.85      1952



  plt.show()
  plt.show()


In [18]:
# Model 2
gbm_pipeline(data_frame=clean2, 
             model_number=2,
             sampling='over')

count    7805.000000
mean        0.653299
std         0.475950
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         1.000000
Name: Binary Rating, dtype: float64
Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0         1025          236
Actual 1          161         1128
Accuracy Score: 0.8443137254901961
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.81      0.84      1261
           1       0.83      0.88      0.85      1289

    accuracy                           0.84      2550
   macro avg       0.85      0.84      0.84      2550
weighted avg       0.85      0.84      0.84      2550



  plt.show()
  plt.show()


In [19]:
# Model 3
gbm_pipeline(data_frame=clean2, 
             model_number=3,
             sampling='under')

count    7805.000000
mean        0.653299
std         0.475950
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         1.000000
Name: Binary Rating, dtype: float64
Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0          570          136
Actual 1           91          556
Accuracy Score: 0.8322246858832225
Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.81      0.83       706
           1       0.80      0.86      0.83       647

    accuracy                           0.83      1353
   macro avg       0.83      0.83      0.83      1353
weighted avg       0.83      0.83      0.83      1353



  plt.show()
  plt.show()


Blah

## XGBoost

To install use conda install -c conda-forge py-xgboost

In [20]:


def train_xgboost(X_train_scaled, y_train, n_estimators=100, learning_rate=0.1, max_depth=3, random_state=None):
    xgb_model = xgb.XGBClassifier(n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth, random_state=random_state)
    xgb_model.fit(X_train_scaled, y_train)
    return xgb_model

def evaluate_xgboost(xgb_model, X_test_scaled, y_test):
    predictions = xgb_model.predict(X_test_scaled)
    cm = confusion_matrix(y_test, predictions)
    acc_score = accuracy_score(y_test, predictions)
    print("Confusion Matrix:")
    print(pd.DataFrame(cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]))
    print(f"Accuracy Score: {acc_score}")
    print("Classification Report:")
    print(classification_report(y_test, predictions))
    return cm

def save_xgboost_model(xgb_model, model_number):
    dump(xgb_model, f'../models/xgboost/model{model_number}.joblib')


def plot_confusion_matrix(cm, model_number):
    sns.set()
    plt.figure(figsize=(8, 6))
    ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=False, xticklabels=['Predicted 0', 'Predicted 1'])
    ax.set_title('Confusion Matrix')
    ax.set_xlabel('Predicted Labels')
    ax.set_ylabel('Actual Labels')
    ax.set_yticklabels(['Actual 0', 'Actual 1'], rotation=0)     
    plt.savefig(f'../img/models/xgboost/model{model_number}_confusion_matrix.png')
    plt.show()

def xgboost_pipeline(data_frame, sampling, model_number, n_estimators=100, learning_rate=0.1, max_depth=3, random_state=None):
    X_train_scaled, X_test_scaled, y_train, y_test, X = preprocess_data(data_frame, sampling)
    xgb_model = train_xgboost(X_train_scaled, y_train, n_estimators=n_estimators, learning_rate=learning_rate, max_depth=max_depth, random_state=random_state)
    cm = evaluate_xgboost(xgb_model, X_test_scaled, y_test)
    plot_confusion_matrix(cm, model_number)
    plt.savefig(f'../img/models/xgboost/model{model_number}_confusion_matrix.png') 
    save_xgboost_model(xgb_model, model_number)



In [21]:
# Model 1
xgboost_pipeline(
    data_frame=clean2, 
    model_number=1, 
    sampling='none')

Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0          450          200
Actual 1           91         1211
Accuracy Score: 0.850922131147541
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.69      0.76       650
           1       0.86      0.93      0.89      1302

    accuracy                           0.85      1952
   macro avg       0.85      0.81      0.82      1952
weighted avg       0.85      0.85      0.85      1952



  plt.show()


In [22]:
# Model 2
xgboost_pipeline(
    data_frame=clean2, 
    model_number=2, 
    sampling='over')

count    7805.000000
mean        0.653299
std         0.475950
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         1.000000
Name: Binary Rating, dtype: float64
Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0         1016          245
Actual 1          147         1142
Accuracy Score: 0.8462745098039216
Classification Report:
              precision    recall  f1-score   support

           0       0.87      0.81      0.84      1261
           1       0.82      0.89      0.85      1289

    accuracy                           0.85      2550
   macro avg       0.85      0.85      0.85      2550
weighted avg       0.85      0.85      0.85      2550



  plt.show()


In [23]:
# Model 3
xgboost_pipeline(
    data_frame=clean2, 
    model_number=3, 
    sampling='under')

count    7805.000000
mean        0.653299
std         0.475950
min         0.000000
25%         0.000000
50%         1.000000
75%         1.000000
max         1.000000
Name: Binary Rating, dtype: float64
Confusion Matrix:
          Predicted 0  Predicted 1
Actual 0          553          153
Actual 1           77          570
Accuracy Score: 0.8300073909830007
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.78      0.83       706
           1       0.79      0.88      0.83       647

    accuracy                           0.83      1353
   macro avg       0.83      0.83      0.83      1353
weighted avg       0.84      0.83      0.83      1353



  plt.show()
