# Training and testing phase

#### This phase takes the pre-processed dataset from a CSV file and uses it to train and test several models

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import NearestCentroid
from xgboost import XGBClassifier

# Load the preprocessed complaints. The path is relative, so the CSV has to sit
# in the directory the notebook is run from; otherwise read_csv raises
# FileNotFoundError.
DATA_FILE = 'complaints_preprocessed.csv'
df = pd.read_csv(DATA_FILE)

FileNotFoundError: [Errno 2] No such file or directory: 'complaints_preprocessed.csv'

### First step: Train-test split

The first step is to split the dataset into training and testing sets. Here, we decided on an 80-20 split.

In [None]:
# Split the dataset (80% training, 20% testing). Stratifying on 'Target' keeps
# the class proportions the same in both partitions; the fixed random_state
# makes the split reproducible.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['Target'])

# Replace missing narratives with empty strings so the text vectorizer receives
# only str values. `.assign` builds new frames instead of writing a column into
# the frames returned by train_test_split, which avoids pandas'
# SettingWithCopyWarning.
train_df = train_df.assign(narrative=train_df['narrative'].fillna(""))
test_df = test_df.assign(narrative=test_df['narrative'].fillna(""))

# Separate features (X) and target labels (y)
X_train, y_train = train_df['narrative'], train_df['Target']
X_test, y_test = test_df['narrative'], test_df['Target']

### Second step: TF-IDF conversion

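Here, we limit the TF-IDF vocabulary to 10,000 terms (`max_features=10000`, which keeps the terms with the highest frequency across the corpus) because, with a larger vocabulary, the models would take too long to train.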

In [None]:
# Turn the narratives into TF-IDF vectors. The vectorizer is fitted on the
# training narratives only and then reused, already fitted, on the test
# narratives, so the test set plays no part in building the vocabulary.
tfidf = TfidfVectorizer(max_features=10000)
X_train_tfidf, X_test_tfidf = tfidf.fit_transform(X_train), tfidf.transform(X_test)

### Third step: Using the models

We used several models to compare results, and although we explored a few more during the semester, the ones we ended up using were:
- Logistic Regression
- Naïve Bayes
- Stochastic Gradient Descent
- LightGBM
- Ridge Classifier
- XGBoost
- Nearest Centroid

In [None]:
# Candidate classifiers, keyed by the display name used in the printed reports
# and on the comparison plot. The insertion order here is the order in which
# the bars are labelled on the plot.
# XGBClassifier comes from the `xgboost` package and must be imported alongside
# the other estimators in the import cell.
models = {
    "Logistic Regression": LogisticRegression(),
    "Naive Bayes": MultinomialNB(),
    "Stochastic Gradient Descent": SGDClassifier(loss='hinge', random_state=42),
    "LightGBM": LGBMClassifier(n_estimators=50),
    "Ridge Classifier": RidgeClassifier(),
    "XGBoost": XGBClassifier(n_estimators=50, random_state=42),
    "Nearest Centroid": NearestCentroid(metric='euclidean')
}

# Variables to store model results.
# `model_names` is a concrete list (not a live dict view) so it can be used
# directly as tick labels. The four metric lists are positional: entry i belongs
# to model_names[i]. Re-running this cell resets all collected results.
model_names = list(models)
accuracies = []
f1_scores = []
precision_scores = []
recall_scores = []

In [None]:
# Logistic Regression: fit on the training TF-IDF matrix, predict the test set.
model_name = "Logistic Regression"
models[model_name].fit(X_train_tfidf, y_train)
y_pred = models[model_name].predict(X_test_tfidf)

acc = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred, output_dict=True)
weighted_avg = class_report['weighted avg']

# Write each metric at this model's position in `models` instead of appending:
# re-running the cell then overwrites the previous entry rather than adding a
# duplicate that would misalign the lists with the plot labels. Positions of
# models that have not been evaluated yet are padded with NaN.
slot = list(models).index(model_name)
for metric_list, metric_value in (
    (accuracies, acc),
    (f1_scores, weighted_avg['f1-score']),
    (precision_scores, weighted_avg['precision']),
    (recall_scores, weighted_avg['recall']),
):
    metric_list.extend([np.nan] * (slot + 1 - len(metric_list)))
    metric_list[slot] = metric_value

print("Logistic Regression")
print("Accuracy:", acc)
print(classification_report(y_test, y_pred))

In [None]:
# Naive Bayes: fit on the training TF-IDF matrix, predict the test set.
model_name = "Naive Bayes"
models[model_name].fit(X_train_tfidf, y_train)
y_pred = models[model_name].predict(X_test_tfidf)

acc = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred, output_dict=True)
weighted_avg = class_report['weighted avg']

# Write each metric at this model's position in `models` instead of appending:
# re-running the cell then overwrites the previous entry rather than adding a
# duplicate that would misalign the lists with the plot labels. Positions of
# models that have not been evaluated yet are padded with NaN.
slot = list(models).index(model_name)
for metric_list, metric_value in (
    (accuracies, acc),
    (f1_scores, weighted_avg['f1-score']),
    (precision_scores, weighted_avg['precision']),
    (recall_scores, weighted_avg['recall']),
):
    metric_list.extend([np.nan] * (slot + 1 - len(metric_list)))
    metric_list[slot] = metric_value

print("Naive Bayes")
print("Accuracy:", acc)
print(classification_report(y_test, y_pred))

In [None]:
# Stochastic Gradient Descent (hinge loss): fit on the training TF-IDF matrix,
# predict the test set.
model_name = "Stochastic Gradient Descent"
models[model_name].fit(X_train_tfidf, y_train)
y_pred = models[model_name].predict(X_test_tfidf)

acc = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred, output_dict=True)
weighted_avg = class_report['weighted avg']

# Write each metric at this model's position in `models` instead of appending:
# re-running the cell then overwrites the previous entry rather than adding a
# duplicate that would misalign the lists with the plot labels. Positions of
# models that have not been evaluated yet are padded with NaN.
slot = list(models).index(model_name)
for metric_list, metric_value in (
    (accuracies, acc),
    (f1_scores, weighted_avg['f1-score']),
    (precision_scores, weighted_avg['precision']),
    (recall_scores, weighted_avg['recall']),
):
    metric_list.extend([np.nan] * (slot + 1 - len(metric_list)))
    metric_list[slot] = metric_value

print("Stochastic Gradient Descent")
print("Accuracy:", acc)
print(classification_report(y_test, y_pred))

In [None]:
# LightGBM: fit on the training TF-IDF matrix, predict the test set.
model_name = "LightGBM"
models[model_name].fit(X_train_tfidf, y_train)
y_pred = models[model_name].predict(X_test_tfidf)

acc = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred, output_dict=True)
weighted_avg = class_report['weighted avg']

# Write each metric at this model's position in `models` instead of appending:
# re-running the cell then overwrites the previous entry rather than adding a
# duplicate that would misalign the lists with the plot labels. Positions of
# models that have not been evaluated yet are padded with NaN.
slot = list(models).index(model_name)
for metric_list, metric_value in (
    (accuracies, acc),
    (f1_scores, weighted_avg['f1-score']),
    (precision_scores, weighted_avg['precision']),
    (recall_scores, weighted_avg['recall']),
):
    metric_list.extend([np.nan] * (slot + 1 - len(metric_list)))
    metric_list[slot] = metric_value

print("LightGBM")
print("Accuracy:", acc)
print(classification_report(y_test, y_pred))

In [None]:
# Ridge Classifier: fit on the training TF-IDF matrix, predict the test set.
model_name = "Ridge Classifier"
models[model_name].fit(X_train_tfidf, y_train)
y_pred = models[model_name].predict(X_test_tfidf)

acc = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred, output_dict=True)
weighted_avg = class_report['weighted avg']

# Write each metric at this model's position in `models` instead of appending:
# re-running the cell then overwrites the previous entry rather than adding a
# duplicate that would misalign the lists with the plot labels. Positions of
# models that have not been evaluated yet are padded with NaN.
slot = list(models).index(model_name)
for metric_list, metric_value in (
    (accuracies, acc),
    (f1_scores, weighted_avg['f1-score']),
    (precision_scores, weighted_avg['precision']),
    (recall_scores, weighted_avg['recall']),
):
    metric_list.extend([np.nan] * (slot + 1 - len(metric_list)))
    metric_list[slot] = metric_value

print("Ridge Classifier")
print("Accuracy:", acc)
print(classification_report(y_test, y_pred))

In [None]:
# XGBoost: fit on the training TF-IDF matrix, predict the test set.
# NOTE(review): recent xgboost releases expect class labels encoded as integers
# 0..k-1; confirm that 'Target' is already encoded that way, otherwise fit()
# may reject y_train.
model_name = "XGBoost"
models[model_name].fit(X_train_tfidf, y_train)
y_pred = models[model_name].predict(X_test_tfidf)

acc = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred, output_dict=True)
weighted_avg = class_report['weighted avg']

# Write each metric at this model's position in `models` instead of appending:
# re-running the cell then overwrites the previous entry rather than adding a
# duplicate that would misalign the lists with the plot labels. Positions of
# models that have not been evaluated yet are padded with NaN.
slot = list(models).index(model_name)
for metric_list, metric_value in (
    (accuracies, acc),
    (f1_scores, weighted_avg['f1-score']),
    (precision_scores, weighted_avg['precision']),
    (recall_scores, weighted_avg['recall']),
):
    metric_list.extend([np.nan] * (slot + 1 - len(metric_list)))
    metric_list[slot] = metric_value

print("XGBoost")
print("Accuracy:", acc)
print(classification_report(y_test, y_pred))

In [None]:
# Nearest Centroid: fit on the training TF-IDF matrix, predict the test set.
model_name = "Nearest Centroid"
models[model_name].fit(X_train_tfidf, y_train)
y_pred = models[model_name].predict(X_test_tfidf)

acc = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred, output_dict=True)
weighted_avg = class_report['weighted avg']

# Write each metric at this model's position in `models` instead of appending:
# re-running the cell then overwrites the previous entry rather than adding a
# duplicate that would misalign the lists with the plot labels. Positions of
# models that have not been evaluated yet are padded with NaN.
slot = list(models).index(model_name)
for metric_list, metric_value in (
    (accuracies, acc),
    (f1_scores, weighted_avg['f1-score']),
    (precision_scores, weighted_avg['precision']),
    (recall_scores, weighted_avg['recall']),
):
    metric_list.extend([np.nan] * (slot + 1 - len(metric_list)))
    metric_list[slot] = metric_value

print("Nearest Centroid")
print("Accuracy:", acc)
print(classification_report(y_test, y_pred))

### Fourth step: plot the results

In this cell, we create a grouped bar chart that compares the accuracy and the weighted-average F1-score, precision and recall of all the models.

In [None]:
# Grouped bar chart: for every model, one cluster of four bars (one per metric).
fig, ax = plt.subplots(figsize=(12, 6))
bar_width = 0.2
x = np.arange(len(model_names))

# Each entry: (offset from the cluster centre in bar widths, values, legend
# label, colour). The offsets place the four bars side by side around each tick.
metric_series = [
    (-1.5, accuracies, 'Accuracy', 'skyblue'),
    (-0.5, f1_scores, 'F1-Score', 'lightcoral'),
    (0.5, precision_scores, 'Precision', 'lightgreen'),
    (1.5, recall_scores, 'Recall', 'gold'),
]
for offset, values, label, color in metric_series:
    ax.bar(x + offset * bar_width, values, bar_width, label=label, color=color)

# Axis text, one tick per model (labels tilted so long names do not overlap),
# then legend and horizontal guide lines.
ax.set_xlabel("Models")
ax.set_ylabel("Scores")
ax.set_title("Model Performance Comparison")
ax.set_xticks(x)
ax.set_xticklabels(model_names, rotation=30, ha='right')
ax.legend()
ax.grid(axis='y', linestyle='--', alpha=0.7)
plt.show()