### IS4242 Group 8 Project

### Baseline Models 
- Multinomial Naive Bayes
- PassiveAggressive
- XGBoost
- LightGBM
- SGDClassifier (linear model trained with stochastic gradient descent)

In [1]:
# importing libraries
import os
import pickle
import sys
import time
import warnings

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier, SGDClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.naive_bayes import MultinomialNB
# Suppress warnings of every category from this point on (no category filter is given).
warnings.filterwarnings('ignore')
# Plot styling: the matplotlib 'ggplot' style sheet is applied first, then
# seaborn's theme is set afterwards; the order of these two calls matters.
plt.style.use('ggplot')
sns.set_theme()


In [2]:
# Constants
# Seed handed to train_test_split so the train/test partition is repeatable.
RANDOM_STATE = 420
# Fraction of the rows held out as the test set.
TEST_SIZE = 0.2

In [3]:
# Read the combined, preprocessed dataset from disk and preview it
DATASET_PATH = 'data/df_combined_processed_2023-03-30.csv'
df = pd.read_csv(DATASET_PATH)
df


Unnamed: 0,body,hate,privacy,sexual,impersonation,illegal,advertisement,ai
0,career career kalleron design refus collect re...,0,0,0,0,0,0,0
1,career career hotmeringue2880 hi econom gradua...,0,0,0,0,0,0,0
2,career career milenakowalska hey year finish b...,0,0,0,0,0,0,0
3,career career rstonerphd studi make employe wa...,0,0,0,0,0,1,0
4,career career temporarysection50 hi accept job...,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
41198,you muthafin lie 8220lifeask 20pearl coreyeman...,1,0,0,0,0,0,0
41199,youv gone broke wrong heart babi drove redneck...,0,0,0,0,0,0,0
41200,young buck wanna eat dat nigguh like aint fuck...,1,0,0,0,0,0,0
41201,youu got wild bitch tellin lie,1,0,0,0,0,0,0


In [4]:
# Features are the text column; targets are every column that remains after
# removing the text and the 'privacy' / 'impersonation' columns.
X = df['body']
y = df.drop(columns=['body', 'privacy', 'impersonation'])

# Hold out TEST_SIZE of the rows for evaluation, seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

# TF-IDF vocabulary and weights are learned from the training text only;
# the test text is transformed with that already-fitted vectorizer.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [5]:
def _positive_class_scores(estimator, X, fallback):
    '''
    Return continuous positive-class scores from one fitted binary estimator.

    Probabilities are preferred, then decision-function margins. When the
    estimator exposes neither (or was fitted on a single class, so there is
    no positive-class column), the supplied hard predictions are returned.
    '''
    if hasattr(estimator, 'predict_proba'):
        proba = estimator.predict_proba(X)
        # Two columns means both classes were seen in training; column 1 is
        # the larger class label (the positive class for 0/1 targets).
        if proba.ndim == 2 and proba.shape[1] == 2:
            return proba[:, 1]
        return fallback
    if hasattr(estimator, 'decision_function'):
        return np.ravel(estimator.decision_function(X))
    return fallback


def model_generator(clf_model, X_train, y_train, X_test, y_test):
    '''
    Function to generate a model and print the classification report

    The base estimator is wrapped in a MultiOutputClassifier (one fitted copy
    per label column), trained and evaluated on the test split. The
    classification report, the per-label ROC AUC table and the training time
    are printed; the ROC curves are saved under ``roc_curves/`` and the fitted
    model is pickled under ``models/``. Both directories are created if missing.

    Parameters:
    clf_model: Unfitted estimator to be used for classification of each label
    X_train: Training data
    y_train: Training labels, one column per label
    X_test: Testing data
    y_test: Testing labels as a DataFrame, one column per label

    Returns:
    model: Trained MultiOutputClassifier
    '''
    model_name = clf_model.__class__.__name__
    labels = y_test.columns.tolist()

    # Time only the fit so the reported figure matches the "train" message.
    fit_start = time.time()
    model = MultiOutputClassifier(clf_model)
    model.fit(X_train, y_train)
    fit_seconds = time.time() - fit_start

    y_pred = model.predict(X_test)
    print(f'Classification report for {model_name} model: ')
    # Label names come from y_test itself rather than from a notebook global.
    print(classification_report(y_test, y_pred, zero_division=0, target_names=labels))

    # One-vs-Rest ROC AUC for multiple columns, printing in a table.
    # ROC metrics need continuous scores: thresholded 0/1 predictions collapse
    # every curve to a single operating point and understate the AUC.
    print(f'ROC AUC score for {model_name} model: ')
    auc_rows = []
    curves = []
    for i, label in enumerate(labels):
        y_true = y_test.iloc[:, i]
        if y_true.nunique() < 2:
            # ROC is undefined when the test labels contain a single class.
            auc_rows.append({'label': label, 'roc_auc_score': float('nan')})
            continue
        scores = _positive_class_scores(model.estimators_[i], X_test, y_pred[:, i])
        auc_rows.append({'label': label, 'roc_auc_score': roc_auc_score(y_true, scores)})
        fpr, tpr, _ = roc_curve(y_true, scores)
        curves.append((label, fpr, tpr))
    roc_auc = pd.DataFrame(auc_rows, columns=['label', 'roc_auc_score'])
    print(roc_auc)
    print()

    # Generating ROC curves for each column, save to png.
    # Plain matplotlib lines are used because a ROC curve repeats x values
    # (vertical steps), which seaborn's lineplot would aggregate by default.
    fig, ax = plt.subplots(figsize=(10, 8))
    for label, fpr, tpr in curves:
        ax.plot(fpr, tpr, label=label)
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title(f'ROC Curve for {model_name} model')
    ax.legend()
    os.makedirs('roc_curves', exist_ok=True)
    fig.savefig(f'roc_curves/{model_name}_roc_curve.png')
    plt.close(fig)

    # saving model
    os.makedirs('models', exist_ok=True)
    with open(f'models/{model_name}_model.pkl', 'wb') as f:
        pickle.dump(model, f)

    # time
    print(f'Time taken to train {model_name} model: {round(fit_seconds, 3)} seconds')
    print()

    return model

In [6]:
# Every estimator that accepts a seed gets RANDOM_STATE so a fresh
# Restart & Run All reproduces the same fitted models and scores.
baseline_models = [
    MultinomialNB(),
    PassiveAggressiveClassifier(random_state=RANDOM_STATE),
    xgb.XGBClassifier(random_state=RANDOM_STATE),
    SGDClassifier(random_state=RANDOM_STATE),
    lgb.LGBMClassifier(random_state=RANDOM_STATE),
]

trained_models = []

# The pickles below are written into models/; create it if it is missing.
os.makedirs('models', exist_ok=True)

# Everything model_generator prints is captured in model_performance.txt.
# The try/finally restores the original stdout, and the with-block closes the
# report file, even when training one of the models raises.
stdoutOrigin = sys.stdout
with open("model_performance.txt", "w") as report_file:
    sys.stdout = report_file
    try:
        for model in baseline_models:
            trained_model = model_generator(model, X_train, y_train, X_test, y_test)
            # Bundle the fitted model with the vectorizer and label names it was trained with.
            model_dict = {"model": trained_model, "vectorizer": vectorizer, "target_list": y.columns.tolist()}
            # saving model
            with open(f'models/{model.__class__.__name__}_model_dict.pkl', 'wb') as f:
                pickle.dump(model_dict, f)
            trained_models.append(trained_model)
    finally:
        sys.stdout = stdoutOrigin