In [1]:
# Spam SMS Detection - Model Training
# CodSoft ML Internship - Task 4
# Author: Chandan Kumar

import json
import warnings
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (classification_report, confusion_matrix,
                             accuracy_score, precision_score, recall_score,
                             f1_score, roc_auc_score, roc_curve)
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
# Suppress every warning for the rest of the session so the training log
# below contains only the notebook's own messages.
warnings.filterwarnings('ignore')

# Notebook banner: the title framed by two 70-character rules.
banner_rule = "=" * 70
for banner_line in (banner_rule, "SPAM SMS DETECTION - MODEL TRAINING", banner_rule):
    print(banner_line)

SPAM SMS DETECTION - MODEL TRAINING


In [2]:
# Read the processed dataset from disk and report its (rows, columns) shape.
print("\nüìÇ Loading processed data...")
processed_csv_path = '../data/spam_processed.csv'
df = pd.read_csv(processed_csv_path)
dataset_shape = df.shape
print(f"‚úÖ Dataset loaded: {dataset_shape}")


üìÇ Loading processed data...
‚úÖ Dataset loaded: (5169, 11)


In [3]:
# Pick the text feature and the encoded target, then summarise class balance.
X, y = df['cleaned_message'], df['label_encoded']

n_messages = len(X)
n_spam = y.sum()                    # reported below as the spam count
spam_pct = n_spam / len(y) * 100    # share of rows counted as spam, in percent

print(f"   Messages: {n_messages:,}")
print(f"   Spam: {n_spam:,} ({spam_pct:.2f}%)")

   Messages: 5,169
   Spam: 653 (12.63%)


In [4]:
# Hold out 20% of the rows for testing. The split is stratified on the target
# and seeded so that re-running the cell reproduces the same partition.
split_parts = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

n_train, n_test = len(X_train), len(X_test)
print(f"\n‚úÖ Split: Train={n_train:,}, Test={n_test:,}")


‚úÖ Split: Train=4,135, Test=1,034


In [5]:
# TF-IDF Vectorization
print("\nüîÑ Cleaning data and applying TF-IDF...")

# Replace missing messages with empty strings in both splits before vectorizing.
X_train, X_test = X_train.fillna(''), X_test.fillna('')

# Vectorizer settings collected in one place: vocabulary capped at 3000 terms,
# unigrams and bigrams, with the document-frequency bounds given below.
tfidf_settings = {
    'max_features': 3000,
    'ngram_range': (1, 2),
    'min_df': 2,
    'max_df': 0.8,
}
vectorizer = TfidfVectorizer(**tfidf_settings)

# The vectorizer is fitted on the training split only; the test split is
# transformed with that already-fitted vocabulary.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

train_matrix_shape = X_train_tfidf.shape
print(f"‚úÖ TF-IDF shape: {train_matrix_shape}")


üîÑ Cleaning data and applying TF-IDF...
‚úÖ TF-IDF shape: (4135, 3000)


In [6]:
# Save vectorizer
# Create the target directory first: joblib.dump raises FileNotFoundError
# when the directory does not exist yet.
Path('../artifacts').mkdir(parents=True, exist_ok=True)
# Kept as the cell's last expression so the list of written files is displayed.
joblib.dump(vectorizer, '../artifacts/tfidf_vectorizer.pkl')

['../artifacts/tfidf_vectorizer.pkl']

In [7]:
# Baseline Models
# Registry of evaluation results, keyed by a display name for each model run.
results = dict()

def train_evaluate(model, name, X_tr, y_tr, X_te, y_te):
    """Fit ``model`` on the training data and score it on the evaluation data.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit`` and ``predict``; ``predict_proba`` and
        ``decision_function`` are used when present.
    name : str
        Label shown in the progress message.
    X_tr, y_tr
        Training features and labels passed to ``model.fit``.
    X_te, y_te
        Evaluation features and the true labels they are scored against.

    Returns
    -------
    dict
        Keys ``'model'`` (the fitted estimator), ``'accuracy'``,
        ``'precision'``, ``'recall'``, ``'f1_score'``, ``'roc_auc'``,
        ``'predictions'`` and ``'probabilities'``.
        ``'probabilities'`` is column 1 of ``predict_proba`` or ``None`` when
        the model has no ``predict_proba``. ``'roc_auc'`` is computed from the
        probabilities when available, otherwise from ``decision_function``
        scores, and is ``None`` only when the model offers neither.
    """
    print(f"\nüîÑ Training {name}...")
    model.fit(X_tr, y_tr)
    y_pred = model.predict(X_te)
    y_proba = model.predict_proba(X_te)[:, 1] if hasattr(model, 'predict_proba') else None

    # ROC AUC only needs a ranking score, so margin-based models without
    # predict_proba (e.g. LinearSVC) are scored via decision_function instead
    # of silently reporting no AUC at all.
    if y_proba is not None:
        y_score = y_proba
    elif hasattr(model, 'decision_function'):
        y_score = model.decision_function(X_te)
    else:
        y_score = None

    acc = accuracy_score(y_te, y_pred)
    prec = precision_score(y_te, y_pred)
    rec = recall_score(y_te, y_pred)
    f1 = f1_score(y_te, y_pred)
    auc = roc_auc_score(y_te, y_score) if y_score is not None else None

    print(f"   Accuracy: {acc:.4f} | Precision: {prec:.4f} | Recall: {rec:.4f} | F1: {f1:.4f}")

    return {'model': model, 'accuracy': acc, 'precision': prec,
            'recall': rec, 'f1_score': f1, 'roc_auc': auc,
            'predictions': y_pred, 'probabilities': y_proba}

# Train baseline models
# Each entry: (key stored in `results`, label shown while training, estimator).
baseline_specs = [
    ('Naive Bayes (Baseline)', "Naive Bayes",
     MultinomialNB()),
    ('Logistic Regression (Baseline)', "Logistic Regression",
     LogisticRegression(max_iter=1000, random_state=42)),
    ('Linear SVM (Baseline)', "Linear SVM",
     LinearSVC(max_iter=1000, random_state=42)),
]

# Fit and score the baselines in the listed order on the same TF-IDF features.
for result_key, display_name, estimator in baseline_specs:
    results[result_key] = train_evaluate(
        estimator, display_name, X_train_tfidf, y_train, X_test_tfidf, y_test)


üîÑ Training Naive Bayes...
   Accuracy: 0.9681 | Precision: 0.9900 | Recall: 0.7557 | F1: 0.8571

üîÑ Training Logistic Regression...
   Accuracy: 0.9652 | Precision: 0.9897 | Recall: 0.7328 | F1: 0.8421

üîÑ Training Linear SVM...
   Accuracy: 0.9787 | Precision: 0.9739 | Recall: 0.8550 | F1: 0.9106


In [8]:
# Hyperparameter Tuning
section_rule = "=" * 70
print("\n" + section_rule)
print("HYPERPARAMETER TUNING")
print(section_rule)

# Settings shared by both searches: 3-fold CV scored on F1, two workers,
# and one progress line per search.
shared_search_options = {'cv': 3, 'scoring': 'f1', 'n_jobs': 2, 'verbose': 1}

# Naive Bayes: search over the smoothing parameter alpha.
nb_grid = GridSearchCV(
    MultinomialNB(),
    {'alpha': [0.1, 0.5, 1.0, 2.0]},
    **shared_search_options)

# Logistic Regression: search over C with an L2 penalty.
lr_grid = GridSearchCV(
    LogisticRegression(max_iter=1000, random_state=42),
    {'C': [0.1, 1, 10], 'penalty': ['l2']},
    **shared_search_options)

# Each entry: (progress message, key stored in `results`, grid search to run).
tuning_plan = [
    ("\nüîç Tuning Naive Bayes...", 'Naive Bayes (Tuned)', nb_grid),
    ("\nüîç Tuning Logistic Regression...", 'Logistic Regression (Tuned)', lr_grid),
]

for announcement, tuned_key, search in tuning_plan:
    print(announcement)
    search.fit(X_train_tfidf, y_train)
    print(f"‚úÖ Best: {search.best_params_}, F1: {search.best_score_:.4f}")

    # train_evaluate fits the best estimator again on the training split
    # before scoring it on the test split.
    tuned_result = train_evaluate(
        search.best_estimator_, tuned_key,
        X_train_tfidf, y_train, X_test_tfidf, y_test)
    tuned_result['best_params'] = search.best_params_
    results[tuned_key] = tuned_result


HYPERPARAMETER TUNING

üîç Tuning Naive Bayes...
Fitting 3 folds for each of 4 candidates, totalling 12 fits
‚úÖ Best: {'alpha': 0.1}, F1: 0.9130

üîÑ Training Naive Bayes (Tuned)...
   Accuracy: 0.9797 | Precision: 0.9741 | Recall: 0.8626 | F1: 0.9150

üîç Tuning Logistic Regression...
Fitting 3 folds for each of 3 candidates, totalling 9 fits
‚úÖ Best: {'C': 10, 'penalty': 'l2'}, F1: 0.9007

üîÑ Training Logistic Regression (Tuned)...
   Accuracy: 0.9768 | Precision: 0.9735 | Recall: 0.8397 | F1: 0.9016


In [9]:
# Select best model
# Rank the tuned candidates by the cross-validated F1 found by their grid
# searches on the training split, not by their test-set F1. Picking the winner
# on the test set would make the test metrics printed below optimistically
# biased, because the same data would be used to choose and to evaluate.
tuned_searches = {
    'Naive Bayes (Tuned)': nb_grid,
    'Logistic Regression (Tuned)': lr_grid,
}
tuned_names = [k for k in results.keys() if 'Tuned' in k]

# Fail loudly if a tuned model was added to `results` without registering
# its grid search here, rather than quietly leaving it out of the ranking.
unregistered = [name for name in tuned_names if name not in tuned_searches]
if unregistered:
    raise KeyError(f"No grid search registered for tuned model(s): {unregistered}")

best_name = max(tuned_names, key=lambda name: tuned_searches[name].best_score_)
best_model = results[best_name]['model']
best_metrics = results[best_name]

# The figures below are the winner's metrics on the held-out test split.
print(f"\nüèÜ BEST MODEL: {best_name}")
print(f"   F1-Score: {best_metrics['f1_score']:.4f}")
print(f"   Precision: {best_metrics['precision']:.4f}")
print(f"   Recall: {best_metrics['recall']:.4f}")


üèÜ BEST MODEL: Naive Bayes (Tuned)
   F1-Score: 0.9150
   Precision: 0.9741
   Recall: 0.8626


In [10]:
# Save best model
# Create the target directory first: joblib.dump raises FileNotFoundError
# when the directory does not exist yet.
Path('../models').mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, '../models/spam_detector_model.pkl')
print("\n‚úÖ Model saved: ../models/spam_detector_model.pkl")


‚úÖ Model saved: ../models/spam_detector_model.pkl


In [11]:
# Confusion Matrix
class_names = ['Ham', 'Spam']

# Pin the label order so rows/columns always line up with the tick labels,
# even if one class were missing from the predictions.
# NOTE(review): assumes label_encoded uses 0 = ham and 1 = spam, consistent
# with y.sum() being reported as the spam count — confirm against the
# preprocessing step.
cm = confusion_matrix(y_test, best_metrics['predictions'], labels=[0, 1])

# Explicit figure/axes objects instead of the implicit pyplot "current figure".
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_title(f'Confusion Matrix - {best_name}')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
fig.tight_layout()

# Create the output directory first so savefig does not fail when it is missing.
Path('../images').mkdir(parents=True, exist_ok=True)
fig.savefig('../images/confusion_matrix_spam.png', dpi=300)
# Close the figure: it is only written to disk, not displayed in the notebook.
plt.close(fig)

In [12]:
# Save metrics
metrics_path = Path('../artifacts/training_metrics.json')
# Create the target directory first so open() does not fail when it is missing.
metrics_path.parent.mkdir(parents=True, exist_ok=True)

# Keep only the scalar numeric entries of each tuned model's result. The
# fitted model, prediction arrays, best_params dict and any None value are
# not JSON-serialisable as metrics and are filtered out by the type check.
tuned_metrics = {
    model_name: {metric: float(value)
                 for metric, value in result.items()
                 if isinstance(value, (int, float))}
    for model_name, result in results.items() if 'Tuned' in model_name
}

with open(metrics_path, 'w', encoding='utf-8') as f:
    json.dump({'best_model': best_name, 'metrics': tuned_metrics}, f, indent=4)

print("\n‚úÖ Training completed! All files saved.")


‚úÖ Training completed! All files saved.
