# Spam Detection - Model Training

- Add the project's root directory (two levels up) to the Python path so the modules can be imported, even if they aren't in the current working directory:

In [None]:
import sys
import os

# Resolve the directory two levels above the current working directory to an
# absolute path and register it on sys.path (once) so `src.*` imports resolve.
parent_hops = ('..', '..')
project_root = os.path.abspath(os.path.join(*parent_hops))
already_registered = project_root in sys.path
if not already_registered:
    sys.path.append(project_root)

- Import the required libraries and modules, as well as our utility functions:

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import joblib

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from src.utils import load_config, get_project_root, get_confusion_matrix

- Load the config using the utility function. Get paths to relevant folders/files needed to save and retrieve files:

In [None]:
# Read the project configuration and pull out the task-1 locations.
config = load_config()
task1_cfg = config['data']['task1']

train_path = task1_cfg['processed']['train']
model_path = task1_cfg['models']

# Any '/' in the configured paths is converted to the OS separator before the
# path is joined onto the project root.
processed_train_path = os.path.join(
    get_project_root(),
    train_path.replace('/', os.sep),
    "spam_detection_train_processed_features.csv",
)
selected_model_path = os.path.join(
    get_project_root(),
    model_path.replace('/', os.sep),
)

# Load the processed training features into a DataFrame.
train_df = pd.read_csv(processed_train_path)

- Specify the columns we want to use as features. These already exist in the processed, feature-engineered data files; we just need to specify which ones to use when training the models:

In [None]:
# Names of the columns passed to the models as input features.
feature_cols = [
    'text_length',
    'word_count',
    'special_char_count',
    'exclamation_density',
    'uppercase_ratio',
    'avg_sentence_length',
    'punctuation_density',
    'vocabulary_richness',
    'marketing_keyword_count',
]

- Split the data into training and validation sets. Holding out a validation set allows us to assess each model using accuracy, precision, recall and F1:

In [None]:
# Feature matrix and target column.
X, y = train_df[feature_cols], train_df['label']

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# the same between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=15)
X_train, X_val, y_train, y_val = split_parts

- Specify models in a dictionary - this is for clear organisation but also so we can easily iterate through it and test them all:

In [None]:
# Candidate classifiers keyed by display name; the name is used in the printed
# reports and as the 'Model' column of the results table.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    # Seeded so the fitted tree is reproducible: features are randomly
    # permuted at each split, so tied splits may otherwise differ per run.
    'Decision Tree': DecisionTreeClassifier(max_depth=5, random_state=2),
    'Bernoulli Naive Bayes': BernoulliNB(),
    'SVC': SVC(kernel='sigmoid'),
    'Random Forest': RandomForestClassifier(random_state=2),
}

- Create a list to store the results, and variables to help us choose the best model based primarily on F1 score. Although the model is selected using only the F1 score, I will check that the choice is sensible using a bar chart that compares all of the results:

In [None]:
# Per-model metric rows collected during evaluation.
results = list()

# Running best: the estimator with the highest F1 seen so far and its score.
best_model, best_f1 = None, 0

- Iterate through the dictionary, fit each model on the training set, make predictions on the validation set, evaluate those predictions, and append the metrics to our results list along with the name of the model (as a dict):

In [None]:
# Fit every candidate on the training split, score it on the validation split,
# record its metrics, and keep track of the model with the highest F1.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    # One row per model; these keys become the columns of the results table.
    results.append({'Model': name, 'Accuracy': acc, 'Precision': prec, 'Recall': rec, 'F1': f1})

    print(f"\n{name} Classification Report:")
    print(classification_report(y_val, y_pred))

    conf_matrix = get_confusion_matrix(y_val, y_pred)
    print(f"\nConfusion Matrix for {name}:\n{conf_matrix}")

    # Accept the first candidate unconditionally: with a strict `>` against an
    # initial best of 0, a run where every model scores F1 == 0 would leave
    # best_model as None, and None would then be saved as the "best model".
    if best_model is None or f1 > best_f1:
        best_f1 = f1
        best_model = model

- Make a dataframe for easy plotting. The dataframe is easy to use as our results list is constructed in a sensible way. Plot a bar chart with all metrics:

In [None]:
# Tabulate the collected metrics, one row per model.
results_df = pd.DataFrame(results)

# Grouped bar chart: models along the x-axis, one bar per metric column.
metrics_by_model = results_df.set_index('Model')
metrics_by_model.plot(
    kind='bar',
    figsize=(12, 6),
    title='Model Comparison',
)

# Tilt the model names, add grid lines, and tidy the layout before rendering.
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()

- Print out which model is best - we will choose this one. It is already trained, so ready to save:

In [None]:
# Report the selected estimator (shown via its string representation).
best_model_summary = f"Best Model: {best_model}"
print(best_model_summary)

- Save the best model to the chosen file location. This will allow us to access it when it comes to testing:

In [None]:
# Destination file for the serialized best model.
filename = "best_model.pkl"
full_path = os.path.join(selected_model_path, filename)

# Create the models directory if it is missing, then persist the estimator.
os.makedirs(selected_model_path, exist_ok=True)
joblib.dump(best_model, full_path)

print(f"Model Saved: {full_path}")