### IS4242 Group 8 Project

### Baseline Models 
- Multinomial Naive Bayes
- Passive-Aggressive
- XGBoost
- LightGBM

In [17]:
# importing libraries
import os
import pickle
import sys
import time
from contextlib import redirect_stdout

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier, SGDClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

import warnings
# Ignore every warning for the rest of the session. This keeps cell output short,
# but it also hides warnings raised by the libraries used below.
warnings.filterwarnings('ignore')
# Plot styling for all figures created in this notebook.
# NOTE(review): sns.set_theme() also updates matplotlib's rcParams, so it presumably
# overrides part of the 'ggplot' style selected on the previous line — confirm which
# look is intended before reordering or removing either call.
plt.style.use('ggplot')
sns.set_theme()


In [2]:
# Experiment-wide settings (tune here, not inside the cells below)
# TEST_SIZE: presumably the hold-out fraction for a train/test split; it is not
# referenced by the cells visible in this part of the notebook — TODO confirm usage.
TEST_SIZE: float = 0.2
# RANDOM_STATE: seed handed to the shuffled KFold splitter in model_generator_cv.
RANDOM_STATE: int = 420


In [4]:
# Read the processed, combined dataset from the data directory (path is relative
# to this notebook's folder). The bare `df` on the last line renders the frame as
# the cell output.
df = pd.read_csv(
    filepath_or_buffer='../../data/processed/df_combined_processed_no_reddit_title_2023-04-09.csv',
)
df


Unnamed: 0,body,hate,privacy,sexual,impersonation,illegal,advertisement,ai
0,kalleron design refus collect recycl site hear...,0,0,0,0,0,0,0
1,hotmeringue2880 hi econom graduat year experi ...,0,0,0,0,0,0,0
2,milenakowalska hey year finish bachelor degre ...,0,0,0,0,0,0,0
3,rstonerphd studi make employe want leav job im...,0,0,0,0,0,1,0
4,temporarysection50 hi accept job offer earlier...,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
49093,contract mobil 11 mnth latest motorola nokia e...,0,0,0,0,0,1,0
49094,want latest video handset 750 anytim network m...,0,0,0,0,0,1,0
49095,week free membership 100000 prize jackpot wwwd...,0,0,0,0,0,1,0
49096,get 250 pound free call credit detail great of...,0,0,0,0,0,1,0


In [24]:
def model_generator_cv(clf_model, X, y):
    '''
    Cross-validate a TF-IDF + multi-label classifier, report its metrics, then fit and save it.

    The base estimator is wrapped in a MultiOutputClassifier and placed behind a
    TfidfVectorizer in a Pipeline. Out-of-fold predictions from a shuffled 5-fold
    split are used to print a classification report and a per-label ROC AUC table,
    and to save one ROC plot per model. The pipeline is then fitted on all of X and y
    so that the returned (and pickled) objects can actually be used for prediction.

    Parameters:
    clf_model: Unfitted scikit-learn compatible classifier, applied to every label column
    X: Raw text documents (handed directly to TfidfVectorizer)
    y: DataFrame with one column per label (presumably 0/1 indicators — TODO confirm)

    Returns:
    model: MultiOutputClassifier fitted on the TF-IDF features of the full data
    vectorizer: TfidfVectorizer fitted on the full data

    Side effects:
    Prints the reports to stdout, writes roc_curves/<ModelName>_roc_curve.png and
    models/<ModelName>_model.pkl (both directories are created when missing).
    '''
    start = time.time()
    model_name = clf_model.__class__.__name__

    vectorizer = TfidfVectorizer()
    model = MultiOutputClassifier(clf_model)
    pipeline = Pipeline([('vectorizer', vectorizer), ('classifier', model)])

    # Shuffled 5-fold split. This is plain KFold, so the folds are NOT stratified by label.
    cv = KFold(n_splits=5, random_state=RANDOM_STATE, shuffle=True)

    # Out-of-fold predictions: each row is predicted by a pipeline copy that was
    # trained on the other folds. Because the vectorizer lives inside the pipeline,
    # it is re-fitted per fold and never sees the held-out text.
    y_pred = cross_val_predict(pipeline, X, y, cv=cv)

    print(f'Classification report for {model_name} model: ')
    print(classification_report(y, y_pred, zero_division=0, target_names=y.columns.tolist()))

    # Per-label ROC AUC, printed as a small table.
    # NOTE(review): y_pred holds predicted labels (cross_val_predict defaults to
    # method='predict'), not scores or probabilities. The AUC values and the curves
    # below are therefore based on a single operating point per label; consider
    # switching to probability/decision scores for estimators that provide them.
    print(f'ROC AUC score for {model_name} model: ')
    roc_auc = pd.DataFrame(columns=['label', 'roc_auc_score'])
    for i in range(y.shape[1]):
        roc_auc.loc[i] = [y.columns[i], roc_auc_score(y.iloc[:, i], y_pred[:, i])]
    print(roc_auc)
    print()

    # One ROC line per label on a shared figure, saved to disk instead of shown inline.
    os.makedirs('roc_curves', exist_ok=True)
    fig, ax = plt.subplots(figsize=(10, 8))
    for i in range(y.shape[1]):
        fpr, tpr, _ = roc_curve(y.iloc[:, i], y_pred[:, i])
        sns.lineplot(x=fpr, y=tpr, label=y.columns[i], ax=ax)
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'ROC Curve for {model_name} model')
    ax.legend()
    fig.savefig(f'roc_curves/{model_name}_roc_curve.png')
    plt.close(fig)

    # Cross-validation only fits clones of the pipeline, leaving the objects created
    # above untouched. Fit on the full data so what we save and return is trained.
    pipeline.fit(X, y)
    vectorizer = pipeline.named_steps['vectorizer']
    model = pipeline.named_steps['classifier']

    # saving model
    os.makedirs('models', exist_ok=True)
    with open(f'models/{model_name}_model.pkl', 'wb') as f:
        pickle.dump(model, f)

    # time (covers cross-validation, plotting and the final fit)
    end = time.time()
    print(f'Time taken to train {model_name} model: {round(end-start, 3)} seconds')
    print()
    print("=============================================================")
    print()

    return model, vectorizer

In [25]:
# Features are the text in 'body'; every remaining column is treated as a label.
X = df['body']
# 'Unnamed: 0' is the name pandas assigns to an unnamed CSV column (typically an index
# that was written out with the data). It must not become a target, so drop it when
# present; errors='ignore' makes this a no-op when the column does not exist.
y = df.drop(columns=['body', 'Unnamed: 0'], errors='ignore')

# Seed the estimators that accept a random_state so repeated runs are comparable.
baseline_models = [
    MultinomialNB(),
    PassiveAggressiveClassifier(random_state=RANDOM_STATE),
    xgb.XGBClassifier(random_state=RANDOM_STATE),
]

# The per-model dictionaries below are written into this directory.
os.makedirs('models', exist_ok=True)

trained_models = []

# Send everything printed during training/evaluation to a report file. The context
# managers restore sys.stdout and close the file even if a model raises, so a failed
# run cannot leave the kernel's output redirected.
with open("baseline_model_performance_cv.txt", "w") as report_file, redirect_stdout(report_file):
    for model in baseline_models:
        trained_model, vectorizer = model_generator_cv(model, X, y)
        # Bundle what is needed at inference time: classifier, its vectorizer, label order.
        model_dict = {"model": trained_model, "vectorizer": vectorizer, "target_list": y.columns.tolist()}
        # saving model
        with open(f'models/{model.__class__.__name__}_model_cv_dict.pkl', 'wb') as f:
            pickle.dump(model_dict, f)
        trained_models.append(trained_model)