# Spam Detection - Model Training

In [None]:
import sys
import os

# Resolve the directory two levels above the current working directory and
# register it on the import path (once) so the `src` package can be imported.
project_root = os.path.abspath(os.path.join(os.pardir, os.pardir))
if sys.path.count(project_root) == 0:
    sys.path.append(project_root)

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from src.utils import load_config, get_project_root, get_confusion_matrix

In [None]:
# Read the project configuration and pull out the Task 1 locations it names.
config = load_config()
task1_cfg = config['data']['task1']

train_path = task1_cfg['processed']['train']
model_path = task1_cfg['models']

# Config paths are written with forward slashes; convert them to the
# platform separator before joining them onto the project root.
processed_train_path = os.path.join(
    get_project_root(),
    train_path.replace('/', os.sep),
    "spam_detection_train_processed_features.csv",
)
selected_model_path = os.path.join(
    get_project_root(),
    model_path.replace('/', os.sep),
)

# Load the processed training features into a DataFrame.
train_df = pd.read_csv(processed_train_path)

In [None]:
# Columns of the processed training frame used as model inputs; the target is
# the `label` column.
feature_cols = [
    'text_length', 'word_count', 'special_char_count', 'exclamation_density',
    'uppercase_ratio', 'avg_sentence_length', 'punctuation_density',
    'vocabulary_richness', 'marketing_keyword_count'
]
X = train_df[feature_cols]
y = train_df['label']

# Hold out 20% of the rows for validation with a fixed seed. Stratifying on the
# label keeps the class proportions the same in both splits, so the validation
# metrics are not skewed by an unlucky draw of the classes.
# NOTE(review): stratification requires at least two rows per class.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Candidate classifiers, keyed by the display name used in reports and plots.
# Every estimator with a random component is seeded so repeated runs of the
# notebook select the same model.
# NOTE(review): the sigmoid-kernel SVC is fitted on the raw feature columns;
# confirm whether the features should be scaled before this model is compared.
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    # Seeded: the tree permutes features at each split, so an unseeded fit
    # can differ from run to run.
    'Decision Tree': DecisionTreeClassifier(max_depth=5, random_state=42),
    'Bernoulli Naive Bayes': BernoulliNB(),
    'SVC': SVC(kernel='sigmoid'),
    'Random Forest': RandomForestClassifier(random_state=2)
}

In [None]:
# Accumulators for model comparison: one metrics row per model, plus the best
# estimator found so far and its F1 score.
results, best_model, best_f1 = [], None, 0

In [None]:
# Fit every candidate on the training split, score it on the validation split,
# record its metrics, and keep the estimator with the highest validation F1.

# Reset the accumulators here so that re-running this cell does not append
# duplicate rows to `results` or compare against a stale `best_f1`.
results = []
best_model = None
best_f1 = 0

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    prec = precision_score(y_val, y_pred)
    rec = recall_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)

    results.append({'Model': name, 'Accuracy': acc, 'Precision': prec, 'Recall': rec, 'F1': f1})

    print(f"\n{name} Classification Report:")
    print(classification_report(y_val, y_pred))

    conf_matrix = get_confusion_matrix(y_val, y_pred)
    print(f"\nConfusion Matrix for {name}:\n{conf_matrix}")

    # The `best_model is None` check lets the first candidate win even when its
    # F1 is 0, so `best_model` is never left as None once a model was fitted.
    if best_model is None or f1 > best_f1:
        best_f1 = f1
        best_model = model

In [None]:
# Tabulate the collected metrics and draw them as grouped bars, one group per
# model, so the candidates can be compared side by side.
results_df = pd.DataFrame(results)
results_df.set_index('Model').plot.bar(
    figsize=(12, 6),
    title='Model Comparison',
)
plt.xticks(rotation=45)
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Show the estimator that achieved the highest validation F1 (printed via its
# string representation; None if no model beat the initial best_f1).
print(f"Best Model: {best_model}")

In [None]:
# Persist the selected estimator under the configured models directory.
filename = "best_model.pkl"

# Fail loudly rather than writing a pickle that only contains None, which
# would surface much later as a confusing error when the model is loaded.
if best_model is None:
    raise RuntimeError("No best model was selected; nothing to save.")

# Create the target directory (and any missing parents) if needed.
os.makedirs(selected_model_path, exist_ok=True)

full_path = os.path.join(selected_model_path, filename)

joblib.dump(best_model, full_path)
print(f"Model Saved: {full_path}")