# Classical/Traditional ML Algorithms

In [9]:
import os
from scipy.sparse import load_npz
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import multiprocessing
import pickle

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelBinarizer

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import roc_curve, auc
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import RocCurveDisplay
import matplotlib.pyplot as plt
from itertools import cycle

DATASET_COLUMNS = ['Id', 'Review', 'Sentiment']
senti_labels_dict = {1: 'Negative', 2: 'Neutral', 3: 'Positive'}
senti_labels = list(senti_labels_dict.values())
NUM_of_CLASSES = 3

In [10]:
script_dir = os.path.dirname(os.path.abspath('classical_ml.ipynb'))
data_path = os.path.join(script_dir, 'Thesis_Jupyter_Final/src/')
os.getcwd()
print(data_path)

input_folder_path = os.path.join(data_path, 'input')
processed_folder_path = os.path.join(data_path, 'input/processed')
results_folder_path = "results"

# Create the folder if it doesn't exist
if not os.path.exists(results_folder_path):
    os.makedirs(results_folder_path)

/home2/s3985113/Thesis_Jupyter_Final/src/


In [11]:
def load_tfidf_data():
    train = pd.read_csv(os.path.join(input_folder_path, "train.csv"))
    val = pd.read_csv(os.path.join(input_folder_path, "val.csv"))
    test = pd.read_csv(os.path.join(input_folder_path, "test.csv"))
    y_train = train['y'].values
    y_val = val['y'].values
    y_test = test['y'].values

    x_train = load_npz(os.path.join(processed_folder_path, "train_tfidf.npz"))
    x_val = load_npz(os.path.join(processed_folder_path, "val_tfidf.npz"))
    x_test = load_npz(os.path.join(processed_folder_path, "test_tfidf.npz"))

    return x_train, y_train, x_val, y_val, x_test, y_test

x_train_tfidf, y_train, x_val_tfidf, y_val, x_test_tfidf, y_test = load_tfidf_data()
print(x_train_tfidf)
print(y_train)
print(x_train_tfidf.shape, y_train.shape)
print(x_val_tfidf.shape, y_val.shape)
print(x_test_tfidf.shape, y_test.shape)


  (0, 7674)	0.395003810820927
  (0, 3005)	0.34143327148057967
  (0, 1288)	0.27328925675237814
  (0, 1188)	0.2829202064426357
  (0, 1101)	0.2536964070332996
  (0, 1049)	0.27732852008681114
  (0, 991)	0.26382586223050886
  (0, 415)	0.23324650632724564
  (0, 290)	0.20200629366260997
  (0, 277)	0.21488490008358963
  (0, 251)	0.1974681787157983
  (0, 228)	0.21928861491756943
  (0, 216)	0.22843796632232524
  (0, 139)	0.21136211720563308
  (0, 93)	0.19095121354511638
  (1, 1588)	0.5953189332250153
  (1, 586)	0.43462336394189666
  (1, 475)	0.4942271599285218
  (1, 450)	0.46090933343624846
  (2, 2300)	0.44977045753398254
  (2, 992)	0.4105805964320454
  (2, 880)	0.3947925563428013
  (2, 681)	0.39686466837363094
  (2, 471)	0.33840318259967433
  (2, 307)	0.33679611495704714
  :	:
  (40997, 184)	0.18125419221060537
  (40997, 156)	0.18858806671003178
  (40997, 113)	0.1777082593605502
  (40997, 21)	0.17117366078775023
  (40997, 8)	0.16351368155517368
  (40997, 5)	0.15315447164729
  (40997, 3)	0.14651

## Evaluation Functions

In [12]:
def one_hot_encode(y):
    y_encoded = np.zeros((len(y), NUM_of_CLASSES))
    for i, label in enumerate(y):
        y_encoded[i, label - 1] = 1

    return y_encoded

def calculate_metrics(y, y_pred):
    accuracy = accuracy_score(y, y_pred)
    precision = precision_score(y, y_pred, average='weighted', labels=np.unique(y_pred))
    recall = recall_score(y, y_pred, average='weighted')
    f1 = f1_score(y, y_pred, average='weighted', labels=np.unique(y_pred))

    print(f"Accuracy: {accuracy:.2f}%, Precision: {precision:.2f}, Recall: {recall:.2f}, f1-score: {f1:.2f}")
    
    return accuracy, precision, recall, f1

def calculate_classification_report(y, y_pred):
    return classification_report(y, y_pred)

def plot_confusion_matrix(y_true, y_pred, labels, res_path):
    cnf_mat = confusion_matrix(y_true, y_pred)
    mat_disp = ConfusionMatrixDisplay(confusion_matrix=cnf_mat, display_labels=labels)
    mat_disp = mat_disp.plot(cmap='Blues', xticks_rotation='vertical')
    plt.title(f'Confusion Matrix')
    plt.savefig(os.path.join(res_path, "confusion_matrix.png"))
    plt.close()

def plot_roc_curve(prob_test_vec, y_test, labels, res_path):
    fig, ax = plt.subplots(figsize=(10, 10))
    labels = labels
    colors = cycle(['limegreen', 'dodgerblue', 'red'])
    for senti, color in zip(range(NUM_of_CLASSES), colors):
        RocCurveDisplay.from_predictions(
            y_test[:, senti],
            prob_test_vec[:, senti],
            name=f"ROC curve for {labels[senti]}",
            color=color,
            ax=ax,
        )
    plt.savefig(os.path.join(res_path, "roc_curve.png"))
    plt.close()
        
def calculate_OvR_roc_auc_score(model, model_name, x, y, x_test, y_test, labels, res_path): #average??
    y = one_hot_encode(y)
    y_test = one_hot_encode(y_test)

    ovr_model = OneVsRestClassifier(model).fit(x, y)
    prob_test_vec = ovr_model.predict_proba(x_test)
    
    fpr, tpr, thresholds, auc_score = [], [], [], []
    for _ in range(NUM_of_CLASSES):
        fpr.append(0)
        tpr.append(0)
        thresholds.append(0)
        auc_score.append(0)
    
    for i in range(NUM_of_CLASSES):
        fpr[i], tpr[i], thresholds[i] = roc_curve(y_test[:, i], prob_test_vec[:, i])
        auc_score[i] = auc(fpr[i], tpr[i])

    averaged_auc_score = (sum(auc_score) / NUM_of_CLASSES)
    # Save AUC to results.txt
    with open(os.path.join(res_path, f"{model_name}_results.txt"), "a") as f:
        f.write(f"AUC score: {auc_score}\n")
        f.write(f"Averaged AUC score: {averaged_auc_score:.2f}\n")

    plot_roc_curve(prob_test_vec, y_test, labels, res_path=res_path)


# TODO:
def plot_feature_imp(model, res_path):
    processed_folder_path = "./pls/Thesis_Jupyter_Final/src/input/processed"
    vect_file_path = os.path.join(processed_folder_path, 'tfidf_vectorizer.joblib')
    loaded_tfidf_vectorizer = load(vect_file_path)
    importances = model.feature_importances_
    feature_names = loaded_tfidf_vectorizer.get_feature_names_out()
    feature_importances = pd.Series(importances, index=feature_names)

    fig, ax = plt.subplots()
    feature_importances.nlargest(20).plot.bar(ax=ax)
    ax.set_title("Top 20 Most Predictive Features")
    ax.set_xlabel('Feature')
    ax.set_ylabel('Importance')
    fig.tight_layout()
    plt.savefig(os.path.join(res_path, "feature_importance.png"))
    plt.close()


def evaluate_model(y_pred, model_name, x, y, params, labels, res_path, only_metrics, model=None):
    if not os.path.exists(res_path):
        os.makedirs(res_path)

    with open(os.path.join(res_path, f"{model_name}_results.txt"), "w") as f:
        f.write(f"*{model_name}\n")
        f.write(f"Params: {params}\n\n")

        accuracy, precision, recall, f1 = calculate_metrics(y, y_pred)
        f.write(f"Accuracy: {accuracy:.2f}%\n")
        f.write(f"Precision: {precision:.2f}\n")
        f.write(f"Recall: {recall:.2f}\n")
        f.write(f"f1-score: {f1:.2f}\n\n")

        if not only_metrics:
            report = calculate_classification_report(y, y_pred)
            f.write("Classification Report:\n")
            f.write(report)
            f.write("\n")

            plot_confusion_matrix(y, y_pred, labels=labels, res_path=res_path)

            if model_name == 'RF':
                plot_feature_imp(model, res_path)

In [13]:
def print_top3_models(top3_models):    
    # Print the sorted list of mean test scores and standard deviation of test scores
    print("\nTop 3 parameter combinations ranked by performance (from best to worst):")
    for index, row in top3_models.iterrows():
        mean_score = row['mean_test_score']
        std_score = row['std_test_score']
        params = row['params']
        print(f"Mean Test Score: {mean_score:.4f} (±{std_score:.4f}) for {params}")

## ML Methods

In [14]:
def print_top3_models(top3_models):  
    print("*Printing top 3 models...")
    # Print the sorted list of mean test scores and standard deviation of test scores
    print("Top 3 parameter combinations ranked by performance (from best to worst):")
    for index, row in top3_models.iterrows():
        mean_score = row['mean_test_score']
        std_score = row['std_test_score']
        params = row['params']
        print(f"Mean Test Score: {mean_score:.4f} (±{std_score:.4f}) for {params}")


def perform_grid_search(model, param_grid, x_train, y_train):
    print("*Performing grid search...")
    # Perform grid search
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=5)
    grid_search.fit(x_train, y_train)

    # Get the mean test scores and standard deviations of test scores for all parameter combinations
    results_df = pd.DataFrame(grid_search.cv_results_)
    sorted_results = results_df.sort_values(by=['mean_test_score', 'std_test_score'], ascending=[False, True])
    top3_models = sorted_results[:3]
    print_top3_models(top3_models)
    top3_params = top3_models['params'].values

    return top3_params


def train_and_evaluate_models(model_type, top3_params, x_train, y_train, x_val, y_val):
    print("*Training and Evaluating Top 3 Models...")
    trained_models = []
    best_accuracy = 0
    for i in range(3):
        if model_type == "RF":
            model = RandomForestClassifier(**top3_params[i])
        elif model_type == "NB":
            model = MultinomialNB(**top3_params[i])
        elif model_type == "SVM":
            model = SVC(**top3_params[i])
        else:
            print(f"Unknown model type: {model_type}")
            return
        model.fit(x_train, y_train)
        # Get accuracy for the validation set (.score calls .predict() internally)
        val_accuracy = model.score(x_val, y_val)
        if val_accuracy > best_accuracy:
            # Store the best model
            best_model = model
            best_params = top3_params[i]
            best_accuracy = val_accuracy
            idx = i
        trained_models.append((model, top3_params[i]))
    
    print(f"Model {idx}-{best_params} gives highest validation accuracy {best_accuracy:.2f}%")

    # Return the fitted models and their respective params for more in-depth evaluation
    return trained_models, best_model, best_params

## 1. Naive Bayes

In [15]:
nb = MultinomialNB()

# Define the parameter grid for grid search
nb_param_grid = {
    'alpha': [0.001, 0.01, 0.1],  # Smoothing parameter for MultinomialNB
    'fit_prior': [True, False]
}


top3_params = perform_grid_search(nb, nb_param_grid, x_train_tfidf, y_train) # Save top 3 models

# Fit the top 3 models, find the model among top 3 with the highest validation accuracy, and store it
trained_models, nb_best_model, nb_best_params = train_and_evaluate_models("NB", top3_params, x_train_tfidf, y_train, x_val_tfidf, y_val)

# Evaluate and print metrics for each model in fitted models for more in-depth analysis
subfolder_path = "NB_results/NB_trained"
res_path = os.path.join(results_folder_path, subfolder_path)
for i, (model, params) in enumerate(trained_models):
    y_pred = model.predict(x_train_tfidf)
    print(i)
    print("-Training: ")
    evaluate_model(y_pred, f"Training-NB-{i}", x_train_tfidf, y_train, params, senti_labels, res_path, only_metrics=True)
    y_pred = model.predict(x_val_tfidf)
    print("-Validation:")
    evaluate_model(y_pred, f"Validation-NB-{i}", x_val_tfidf, y_val, params, senti_labels, res_path, only_metrics=True)
print()

# Use the best model to evaluate on the test set
print(f"*Best model: {nb_best_model}")
y_pred = nb_best_model.predict(x_test_tfidf)
print(np.bincount(y_pred))
subfolder_path = "NB_results/NB_best"
res_path = os.path.join(results_folder_path, "NB_results/NB_best")
model_type = "NB-best"
evaluate_model(y_pred, model_type, x_test_tfidf, y_test, nb_best_params, senti_labels, res_path, only_metrics=False)
calculate_OvR_roc_auc_score(nb_best_model, model_type, x_train_tfidf, y_train, x_test_tfidf, y_test, senti_labels, res_path)

*Performing grid search...
Fitting 3 folds for each of 6 candidates, totalling 18 fits


*Printing top 3 models...
Top 3 parameter combinations ranked by performance (from best to worst):
Mean Test Score: 0.7908 (±0.0084) for {'alpha': 0.01, 'fit_prior': True}
Mean Test Score: 0.7905 (±0.0096) for {'alpha': 0.001, 'fit_prior': True}
Mean Test Score: 0.7863 (±0.0077) for {'alpha': 0.001, 'fit_prior': False}
*Training and Evaluating Top 3 Models...
Model 0-{'alpha': 0.01, 'fit_prior': True} gives highest validation accuracy 0.82%
0
-Training: 
Accuracy: 0.83%, Precision: 0.83, Recall: 0.83, f1-score: 0.83
-Validation:
Accuracy: 0.82%, Precision: 0.85, Recall: 0.82, f1-score: 0.83
1
-Training: 
Accuracy: 0.83%, Precision: 0.83, Recall: 0.83, f1-score: 0.83
-Validation:
Accuracy: 0.82%, Precision: 0.84, Recall: 0.82, f1-score: 0.83
2
-Training: 
Accuracy: 0.83%, Precision: 0.84, Recall: 0.83, f1-score: 0.83
-Validation:
Accuracy: 0.71%, Precision: 0.85, Recall: 0.71, f1-score: 0.76

*Best model: MultinomialNB(alpha=0.01)
[   0 1096 1106 9697]
Accuracy: 0.81%, Precision: 0.82, 

## 2. SVM

In [17]:
svm = SVC()

# Define the parameter grid for grid search
svm_param_grid = {
        'C': [0.1, 1, 10],
        'kernel': ['linear', 'rbf'],
        'gamma': [0.1, 1, 'scale']
}

top3_params = perform_grid_search(svm, svm_param_grid, x_train_tfidf, y_train) # Save top 3 models

# Fit the top 3 models, find the model among top 3 with the highest validation accuracy, and store it
trained_models, svm_best_model, svm_best_params = train_and_evaluate_models("SVM", top3_params, x_train_tfidf, y_train, x_val_tfidf, y_val)

# Evaluate and print metrics for each model in fitted models for more in-depth analysis
subfolder_path = "SVM_results/SVM_trained"
res_path = os.path.join(results_folder_path, subfolder_path)
for i, (model, params) in enumerate(trained_models):
    y_pred = model.predict(x_train_tfidf)
    print(i)
    print("-Training: ")
    evaluate_model(y_pred, f"Training-SVM-{i}", x_train_tfidf, y_train, params, senti_labels, res_path, only_metrics=True)
    y_pred = model.predict(x_val_tfidf)
    print("-Validation:")
    evaluate_model(y_pred, f"Validation-SVM-{i}", x_val_tfidf, y_val, params, senti_labels, res_path, only_metrics=True)
print()

# Use the best model to evaluate on the test set
print(f"*Best model: {svm_best_model}")
y_pred = svm_best_model.predict(x_test_tfidf)
print(np.bincount(y_pred))
subfolder_path = "SVM_results/SVM_best"
res_path = os.path.join(results_folder_path, subfolder_path)
model_type = "SVM-best"
evaluate_model(y_pred, model_type, x_test_tfidf, y_test, svm_best_params, senti_labels, res_path, only_metrics=False)
#calculate_OvR_roc_auc_score(svm_best_model, model_type, x_train_tfidf, y_train, x_test_tfidf, y_test, senti_labels, res_path)

*Performing grid search...
Fitting 3 folds for each of 18 candidates, totalling 54 fits


KeyboardInterrupt: 

In [None]:
#calculate_OvR_roc_auc_score(svm_best_model, model_type, x_train_tfidf, y_train, x_test_tfidf, y_test, senti_labels, res_path)

: 

## 3. Random Forest

In [None]:
rf = RandomForestClassifier()

# Define the parameter grid for grid search
rf_param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 3, 5, 7, 10],  # Limit maximum depth of the trees
        'min_samples_split': [2, 5, 10, 20],  # Higher values will prevent a model from learning relations which might be highly specific to the particular sample selected for a tree.
        'min_samples_leaf': [1, 2, 5, 10, 15],  # Higher values prevent a model from getting too complex
}

top3_params = perform_grid_search(rf, rf_param_grid, x_train_tfidf, y_train) # Save top 3 models

# Fit the top 3 models, find the model among top 3 with the highest validation accuracy, and store it
trained_models, rf_best_model, rf_best_params = train_and_evaluate_models("RF", top3_params, x_train_tfidf, y_train, x_val_tfidf, y_val)

# Evaluate and print metrics for each model in fitted models for more in-depth analysis
subfolder_path = "RF_results/RF_trained"
res_path = os.path.join(results_folder_path, subfolder_path)
for i, (model, params) in enumerate(trained_models):
    y_pred = model.predict(x_train_tfidf)
    print(i)
    print("-Training: ")
    evaluate_model(y_pred, f"Training-RF-{i}", x_train_tfidf, y_train, params, senti_labels, res_path, only_metrics=True, model=model)
    y_pred = model.predict(x_val_tfidf)
    print("-Validation:")
    evaluate_model(y_pred, f"Validation-RF-{i}", x_val_tfidf, y_val, params, senti_labels, res_path, only_metrics=True, model=model)
print()

# Use the best model to evaluate on the test set
print(f"*Best model: {rf_best_model}")
y_pred = rf_best_model.predict(x_test_tfidf)
print(np.bincount(y_pred))
subfolder_path =  "RF_results/RF_best"
res_path = os.path.join(results_folder_path, subfolder_path)
model_type = "RF-best"
evaluate_model(y_pred, model_type, x_test_tfidf, y_test, rf_best_params, senti_labels, res_path, only_metrics=False, model=rf_best_model)
calculate_OvR_roc_auc_score(rf_best_model, model_type, x_train_tfidf, y_train, x_test_tfidf, y_test, senti_labels, res_path)

: 