# Classical/Traditional ML Algorithms

In [None]:
import os
from scipy.sparse import load_npz
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
import multiprocessing
import pickle

from sklearn.base import clone
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelBinarizer

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import roc_curve, auc
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import RocCurveDisplay
import matplotlib.pyplot as plt
from itertools import cycle

In [None]:
script_dir = os.path.dirname(os.path.abspath('classical_ml.ipynb'))
data_path = os.path.join(script_dir, 'Thesis_Jupyter_Final/src/')
os.getcwd()
print(data_path)

input_folder_path = os.path.join(data_path, 'input')
processed_folder_path = os.path.join(data_path, 'input/processed/neg_tagged')
results_folder_path =  os.path.join(data_path, "results")

# Create the folder if it doesn't exist
if not os.path.exists(results_folder_path):
    os.makedirs(results_folder_path)

/home2/s3985113/Thesis_Jupyter_Final/src/


In [None]:
senti_labels_dict = {1: 'Negative', 2: 'Neutral', 3: 'Positive'}
senti_labels_names = list(senti_labels_dict.values())
senti_labels_nums = list(senti_labels_dict.keys())
NUM_of_CLASSES = 3

In [None]:
def load_tfidf_data():
    train = pd.read_csv(os.path.join(processed_folder_path, "train.csv"))
    val = pd.read_csv(os.path.join(processed_folder_path, "val.csv"))
    test = pd.read_csv(os.path.join(processed_folder_path, "test.csv"))
    y_train = train['y'].values
    y_val = val['y'].values
    y_test = test['y'].values

    with open(os.path.join(processed_folder_path, "train_tfidf.pkl"), "rb") as file:
        x_train_tfidf = pickle.load(file)
    with open(os.path.join(processed_folder_path, "val_tfidf.pkl"), "rb") as file:
        x_val_tfidf = pickle.load(file)
    with open(os.path.join(processed_folder_path, "test_tfidf.pkl"), "rb") as file:
        x_test_tfidf = pickle.load(file)

    return x_train_tfidf, y_train, x_val_tfidf, y_val, x_test_tfidf, y_test

x_train_tfidf, y_train, x_val_tfidf, y_val, x_test_tfidf, y_test = load_tfidf_data()
print(x_train_tfidf)
print(y_train)
print(x_train_tfidf.shape, y_train.shape)
print(x_val_tfidf.shape, y_val.shape)
print(x_test_tfidf.shape, y_test.shape)


  (0, 4944)	0.333611549281506
  (0, 2133)	0.2961339731660829
  (0, 1962)	0.2790813018609294
  (0, 1829)	0.2845491982794516
  (0, 1738)	0.2471451738558906
  (0, 1627)	0.27254171122483684
  (0, 784)	0.24895427348562024
  (0, 581)	0.2096952161674913
  (0, 438)	0.21087359433125996
  (0, 400)	0.21511850812086866
  (0, 337)	0.22498142205493177
  (0, 307)	0.22522954040809726
  (0, 223)	0.21748277299714208
  (0, 207)	0.2005277841281215
  (0, 41)	0.16507539015093728
  (0, 37)	0.3065855262224691
  (1, 3132)	0.48745999184528344
  (1, 1117)	0.3794568593358208
  (1, 1065)	0.41963408220421816
  (1, 811)	0.43121465736774295
  (1, 38)	0.2770028355754138
  (1, 22)	0.27822837453512
  (1, 20)	0.3197105022731662
  (2, 3942)	0.3961231024356585
  (2, 2937)	0.3742032094232427
  :	:
  (40997, 8925)	0.4335846690233216
  (40997, 4246)	0.3889950728579958
  (40997, 2499)	0.36388494758151557
  (40997, 1887)	0.34690683112213216
  (40997, 1080)	0.32385818576697817
  (40997, 665)	0.29932936951200767
  (40997, 211)	0.

## Evaluation Functions

In [None]:
def one_hot_encode(y):
    y_encoded = np.zeros((len(y), NUM_of_CLASSES))
    for i, label in enumerate(y):
        y_encoded[i, label - 1] = 1

    return y_encoded

def calculate_metrics(y, y_pred):
    accuracy = accuracy_score(y, y_pred)
    precision = precision_score(y, y_pred, average='weighted', labels=np.unique(y_pred))
    recall = recall_score(y, y_pred, average='weighted')
    f1 = f1_score(y, y_pred, average='weighted', labels=np.unique(y_pred))

    print(f"Accuracy: {accuracy:.2f}%, Precision: {precision:.2f}, Recall: {recall:.2f}, f1-score: {f1:.2f}")
    
    return accuracy, precision, recall, f1

def calculate_classification_report(y, y_pred):
    return classification_report(y, y_pred, labels=senti_labels_nums, target_names=senti_labels_names)

def plot_confusion_matrix(y_true, y_pred, res_path):
    cnf_mat = confusion_matrix(y_true, y_pred)
    mat_disp = ConfusionMatrixDisplay(confusion_matrix=cnf_mat, display_labels=senti_labels_names)
    mat_disp = mat_disp.plot(cmap='Blues', xticks_rotation='vertical')
    plt.title(f'Confusion Matrix')
    plt.tight_layout()
    plt.savefig(os.path.join(res_path, "confusion_matrix.png"))
    plt.close()

def plot_roc_curve(model_name, prob_test_vec, y_test, res_path):
    print("Plotting ROC...")
    fig, ax = plt.subplots(figsize=(10, 10))
    colors = cycle(['limegreen', 'dodgerblue', 'red'])
    for senti, color in zip(range(NUM_of_CLASSES), colors):
        RocCurveDisplay.from_predictions(
            y_test[:, senti],
            prob_test_vec[:, senti],
            name=f"ROC curve for {senti_labels_names[senti]}",
            color=color,
            ax=ax,
        )
    plt.savefig(os.path.join(res_path, f"{model_name}_roc_curve.png"))
    plt.close()
        
def calculate_OvR_roc_auc_score(ovr_model, model_name, x_test, y_test, res_path): #average??
    prob_test_vec = ovr_model.predict_proba(x_test)
    
    fpr, tpr, thresholds, auc_score = [], [], [], []
    for _ in range(NUM_of_CLASSES):
        fpr.append(0)
        tpr.append(0)
        thresholds.append(0)
        auc_score.append(0)
    
    # Determine class proportions which will be the weights for the average
    lb = LabelBinarizer()
    y_test_bin = lb.fit_transform(y_test)
    class_proportions = y_test_bin.mean(axis=0)
    
    for i in range(NUM_of_CLASSES):
        fpr[i], tpr[i], thresholds[i] = roc_curve(y_test[:, i], prob_test_vec[:, i])
        auc_score[i] = auc(fpr[i], tpr[i])

    weighted_avg_auc_score = np.average(auc_score, weights=class_proportions)
    # Save AUC to results.txt
    with open(os.path.join(res_path, f"{model_name}_results.txt"), "a") as f:
        auc_score_str = ', '.join(f'{score:.2f}' for score in auc_score)
        f.write(f"AUC score: [{auc_score_str}]\n")
        f.write(f"Weighted average AUC score: {weighted_avg_auc_score:.2f}\n")
        print("Saved AUC score results...")

    plot_roc_curve(model_name, prob_test_vec, y_test, res_path)

def plot_feature_imp(model, res_path):
    vect_file_path = os.path.join(processed_folder_path, 'tfidf_vectorizer.joblib')
    loaded_tfidf_vectorizer = joblib.load(vect_file_path)
    importances = model.feature_importances_
    feature_names = loaded_tfidf_vectorizer.get_feature_names_out()
    feature_importances = pd.Series(importances, index=feature_names)

    fig, ax = plt.subplots()
    feature_importances.nlargest(20).plot.bar(ax=ax)
    ax.set_title("Top 20 Most Predictive Features")
    ax.set_xlabel('Feature')
    ax.set_ylabel('Importance')
    fig.tight_layout()
    plt.savefig(os.path.join(res_path, "feature_importance.png"))
    plt.close()

def get_results(y_pred, y, model_name, params, res_path, only_metrics, model=None):
    if not os.path.exists(res_path):
        os.makedirs(res_path)

    with open(os.path.join(res_path, f"{model_name}_results.txt"), "w") as f:
        f.write(f"*{model_name}\n")
        f.write(f"Params: {params}\n\n")

        accuracy, precision, recall, f1 = calculate_metrics(y, y_pred)
        f.write(f"Accuracy: {accuracy:.2f}%\n")
        f.write(f"Precision: {precision:.2f}\n")
        f.write(f"Recall: {recall:.2f}\n")
        f.write(f"f1-score: {f1:.2f}\n\n")

        if not only_metrics:
            report = calculate_classification_report(y, y_pred)
            f.write("Classification Report:\n")
            f.write(report)
            f.write("\n")

            plot_confusion_matrix(y, y_pred, res_path)

            if model_name == 'RF':
                plot_feature_imp(model, res_path)

## ML Methods

In [None]:
def print_top3_models(top3_models):  
    # Print the sorted list of mean test scores and standard deviation of test scores
    print("Top 3 Param Combinations (on training set) (best to worst):")
    for index, row in top3_models.iterrows():
        mean_score = row['mean_test_score']
        std_score = row['std_test_score']
        params = row['params']
        print(f"Mean Test Score: {mean_score:.4f} (±{std_score:.4f}) for {params}")
    print()


def perform_grid_search(model, param_grid, x_train, y_train):
    print("*Performing grid search...")
    # Perform grid search
    grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2)
    grid_search.fit(x_train, y_train)

    # Get the mean test scores and standard deviations of test scores for all parameter combinations
    results_df = pd.DataFrame(grid_search.cv_results_)
    sorted_results = results_df.sort_values(by=['mean_test_score', 'std_test_score'], ascending=[False, True])
    top3_models = sorted_results[:3]
    print_top3_models(top3_models)
    top3_params = top3_models['params'].values

    return top3_params


def train_and_evaluate_models(model_type, top3_params, x_train, y_train, x_val, y_val):
    print("*Train with Top 3 Params and Evaluate:")
    trained_models = []
    best_accuracy = 0
    for i in range(3):
        if model_type == "RF":
            model = RandomForestClassifier(**top3_params[i])
        elif model_type == "NB":
            model = MultinomialNB(**top3_params[i])
        elif model_type == "SVM":
            model = SVC(**top3_params[i])
        else:
            print(f"Unknown model type: {model_type}")
            return
        model.fit(x_train, y_train)
        # Get accuracy for the validation set (.score calls .predict() internally)
        val_accuracy = model.score(x_val, y_val)
        if val_accuracy > best_accuracy:
            # Store the best model
            best_model = model
            best_params = top3_params[i]
            best_accuracy = val_accuracy
            idx = i
        trained_models.append((model, top3_params[i]))
    
    print(f"--> Model {idx} - {best_params} gives highest val accuracy {best_accuracy:.2f}%\n")

    # Return the fitted models and their respective params for more in-depth evaluation
    return trained_models, best_model, best_params

In [7]:
def setup_ml(model, model_name, param_grid, x_train, y_train, x_val, y_val, x_test, y_test):
    '''
    # Perform grid search and save top 3 models
    top3_params = perform_grid_search(model, param_grid, x_train, y_train) 

    # Fit the top 3 models, get the model with highest validation accuracy
    trained_models, best_model, best_params = train_and_evaluate_models(model_name, top3_params, x_train, y_train, x_val, y_val)

    # Get training and validation results for models with top 3 params for observation only
    subfolder_path = f"{model_name}_results/{model_name}_trained"
    res_path = os.path.join(results_folder_path, subfolder_path)

    for i, (model, params) in enumerate(trained_models):
        y_pred_train = model.predict(x_train)
        y_pred_val = model.predict(x_val)

        print(f"Model {i}: {params}")
        print("- Training: ")
        get_results(y_pred_train, y_train, f"Training-{model_name}-{i}", params, res_path, only_metrics=True)
        print("- Validation:")
        get_results(y_pred_val, y_val, f"Validation-{model_name}-{i}", params, res_path, only_metrics=True)
        print()

    # Print the params of the best model that had the highest val accuracy
    for i, (model, params) in enumerate(trained_models):
        if model == best_model:
            print(f"Best Model = model {i} with {params}")
            print()
            break
   
    best_model = model #TODO: delete
    best_params = best_model.get_params() #TODO: delete
    print("fitting model...")
    best_model.fit(x_train, y_train) #TODO: delete
    print("fit complete...")
     '''
    
    # Get test set results for the best model 
    subfolder_path = f"{model_name}_results/{model_name}_best"
    res_path = os.path.join(results_folder_path, subfolder_path)
    model_type = f"{model_name}-best"

    #joblib.dump(best_model, os.path.join(res_path, f'{model_name}_best_model.joblib'))  # Save the best model
    
    best_model = joblib.load( os.path.join(res_path, f'{model_name}_best_model.joblib'))    #TODO: delete
    best_params = best_model.get_params() #TODO: delete

    print("predicting starts now...")
    y_pred = best_model.predict(x_test)
    print("accuracy: ", accuracy_score(y_test, y_pred)) #TODO: delete

    '''
    print(f"Class Predictions: {np.bincount(y_pred)}")
    print("Test Evaluation: ")
    get_results(y_pred, y_test, model_type, best_params, res_path, only_metrics=False, model=best_model)
    '''

    # AUC and ROC curve
    print("auc roc calculation starting now...")
    y_train_one_hot = one_hot_encode(y_train)
    y_test_one_hot = one_hot_encode(y_test)
    
    if model_name == "SVM":
        best_model = clone(best_model)
        best_model.probability = True
        print("training best model again...")
        best_model.fit(x_train, y_train)
        print("train complete...")

    ovr_model = OneVsRestClassifier(best_model).fit(x_train, y_train_one_hot)
    calculate_OvR_roc_auc_score(ovr_model, model_type, x_test, y_test_one_hot, res_path)

    if model_name == "RF":
        plot_feature_imp(best_model, res_path)

   

## 1. Naive Bayes

In [8]:
nb = MultinomialNB()
nb_param_grid = {
    'alpha': [0.001, 0.01, 0.1],  # Smoothing parameter for MultinomialNB
    'fit_prior': [True, False]
}
setup_ml(nb, "NB", nb_param_grid, x_train_tfidf, y_train, x_val_tfidf, y_val, x_test_tfidf, y_test)

*Performing grid search...
Fitting 3 folds for each of 6 candidates, totalling 18 fits


Top 3 Param Combinations (on training set) (best to worst):
Mean Test Score: 0.7752 (±0.0079) for {'alpha': 0.001, 'fit_prior': False}
Mean Test Score: 0.7683 (±0.0074) for {'alpha': 0.01, 'fit_prior': False}
Mean Test Score: 0.7549 (±0.0055) for {'alpha': 0.001, 'fit_prior': True}

*Train with Top 3 Params and Evaluate:
--> Model 2 - {'alpha': 0.001, 'fit_prior': True} gives highest val accuracy 0.84%

Model 0: {'alpha': 0.001, 'fit_prior': False}
- Training: 
Accuracy: 0.81%, Precision: 0.81, Recall: 0.81, f1-score: 0.81
- Validation:
Accuracy: 0.80%, Precision: 0.86, Recall: 0.80, f1-score: 0.82

Model 1: {'alpha': 0.01, 'fit_prior': False}
- Training: 
Accuracy: 0.80%, Precision: 0.80, Recall: 0.80, f1-score: 0.80
- Validation:
Accuracy: 0.79%, Precision: 0.86, Recall: 0.79, f1-score: 0.82

Model 2: {'alpha': 0.001, 'fit_prior': True}
- Training: 
Accuracy: 0.78%, Precision: 0.78, Recall: 0.78, f1-score: 0.78
- Validation:
Accuracy: 0.84%, Precision: 0.85, Recall: 0.84, f1-score: 0

## 2. SVM

In [None]:
svm = SVC(probability=True)
svm_param_grid = {
        'C': [0.1, 1, 10],
        'gamma': [0.01, 0.1, 1, 'scale'],
        'kernel': ['linear', 'rbf'],
}
svm.set_params(C=1.0, kernel='rbf', gamma='scale')
setup_ml(svm, "SVM", svm_param_grid, x_train_tfidf, y_train, x_val_tfidf, y_val, x_test_tfidf, y_test)

predicting starts now...
accuracy:  0.8295445013002265
auc roc calculation starting now...
training best model again...
train complete...


## 3. Random Forest

In [9]:
rf = RandomForestClassifier()

# Define the parameter grid for grid search
rf_param_grid = {
        'n_estimators': [50, 100, 200],
        'max_depth': [None, 3, 5, 7, 10],  # Limit maximum depth of the trees
        'min_samples_split': [2, 5, 10, 20],  # Higher values will prevent a model from learning relations which might be highly specific to the particular sample selected for a tree.
        'min_samples_leaf': [1, 2, 5, 10, 15],  # Higher values prevent a model from getting too complex
}
setup_ml(rf, "RF", rf_param_grid, x_train_tfidf, y_train, x_val_tfidf, y_val, x_test_tfidf, y_test)

*Performing grid search...
Fitting 3 folds for each of 300 candidates, totalling 900 fits
[CV] END .......................alpha=0.001, fit_prior=False; total time=   0.0s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=50; total time=  24.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time= 1.6min
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=50; total time=  17.7s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=100; total time=  36.1s
[CV] END max_depth=None, min_samples_leaf=1, min_samples_split=10, n_estimators=200; total time= 1.2min
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=50; total time=  11.3s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=  22.6s
[CV] END max_depth=None, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=  44.3s
[CV] 