# Assignment 1 - train.ipynb
Train, validate, and compare three benchmark models for SMS spam classification.

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    average_precision_score,
    confusion_matrix,
)

## Functions

In [2]:
def build_vectorizer() -> TfidfVectorizer:
    """Create a new, unfitted TF-IDF vectorizer for the message text.

    Returns:
        A ``TfidfVectorizer`` configured with the built-in English stop-word
        list and ``max_features=5000``.
    """
    vectorizer_options = {'stop_words': 'english', 'max_features': 5000}
    return TfidfVectorizer(**vectorizer_options)


def get_model(model_name: str):
    """Return a new, unfitted classifier for the given benchmark key.

    Args:
        model_name: One of ``'nb'`` (MultinomialNB), ``'lr'``
            (LogisticRegression) or ``'svm'`` (LinearSVC).

    Returns:
        A freshly constructed, unfitted estimator.

    Raises:
        ValueError: If ``model_name`` is not one of the known keys.
    """
    # Map each key to a zero-argument factory so that only the requested
    # estimator is constructed, instead of instantiating all three on every
    # call (including calls that end in the ValueError below).
    model_factories = {
        'nb': lambda: MultinomialNB(),
        'lr': lambda: LogisticRegression(max_iter=1000, random_state=42),
        'svm': lambda: LinearSVC(max_iter=2000, random_state=42),
    }
    if model_name not in model_factories:
        raise ValueError(
            f'Unknown model_name={model_name}. Use one of {list(model_factories.keys())}.'
        )
    return model_factories[model_name]()


def fit_model(model_name: str, X_train_text: pd.Series, y_train: pd.Series) -> dict:
    """Fit a TF-IDF vectorizer and the requested classifier on training data.

    Args:
        model_name: Key passed to ``get_model`` to choose the classifier.
        X_train_text: Raw training messages.
        y_train: Training labels aligned with ``X_train_text``.

    Returns:
        Dict with keys ``'model_name'``, ``'model'`` (the fitted classifier)
        and ``'vectorizer'`` (the vectorizer fitted on ``X_train_text``).
    """
    tfidf = build_vectorizer()
    # The vocabulary is learned from the training text passed in here only.
    train_matrix = tfidf.fit_transform(X_train_text)
    classifier = get_model(model_name)
    classifier.fit(train_matrix, y_train)
    return dict(model_name=model_name, model=classifier, vectorizer=tfidf)


def _score_values(model, X_vec):
    if hasattr(model, 'predict_proba'):
        return model.predict_proba(X_vec)[:, 1]
    if hasattr(model, 'decision_function'):
        return model.decision_function(X_vec)
    return None


def evaluate_predictions(y_true, y_pred, labels=(0, 1)) -> dict:
    """Return binary confusion-matrix counts as plain Python ints.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        labels: ``(negative, positive)`` label values used to lay out the
            matrix. Defaults to ``(0, 1)`` -- assumes the target column is
            0/1 encoded; pass the actual pair if the encoding differs.

    Returns:
        Dict with keys ``'tn'``, ``'fp'``, ``'fn'`` and ``'tp'``.
    """
    # Pin the label set so the matrix is always 2x2. Without it, a split in
    # which only one class appears in y_true/y_pred produces a 1x1 matrix and
    # the four-way unpacking below raises ValueError.
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=list(labels)).ravel()
    return {'tn': int(tn), 'fp': int(fp), 'fn': int(fn), 'tp': int(tp)}


def score_model(bundle: dict, X_text: pd.Series, y_true: pd.Series) -> dict:
    """Score a fitted bundle on one labelled split.

    Args:
        bundle: Dict holding a fitted ``'model'`` and ``'vectorizer'``.
        X_text: Raw messages to classify.
        y_true: Ground-truth labels aligned with ``X_text``.

    Returns:
        Dict with ``accuracy``, ``precision``, ``recall``, ``f1`` and
        ``aucpr``, followed by the ``tn``/``fp``/``fn``/``tp`` counts.
        ``aucpr`` is NaN when the model exposes no continuous score.
    """
    classifier = bundle['model']
    features = bundle['vectorizer'].transform(X_text)

    predictions = classifier.predict(features)
    scores = _score_values(classifier, features)

    results = {'accuracy': accuracy_score(y_true, predictions)}
    # zero_division=0 reports 0 instead of warning when a metric's
    # denominator is empty.
    thresholded_metrics = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1', f1_score),
    )
    for metric_name, metric_fn in thresholded_metrics:
        results[metric_name] = metric_fn(y_true, predictions, zero_division=0)

    if scores is None:
        results['aucpr'] = np.nan
    else:
        results['aucpr'] = average_precision_score(y_true, scores)

    results.update(evaluate_predictions(y_true, predictions))
    return results


def validate_model(bundle: dict, train_df: pd.DataFrame, validation_df: pd.DataFrame) -> pd.DataFrame:
    """Score a fitted bundle on the train and validation splits side by side.

    Args:
        bundle: Fitted bundle accepted by ``score_model``.
        train_df: Training split with ``'message'`` and ``'target'`` columns.
        validation_df: Validation split with the same two columns.

    Returns:
        DataFrame with one row of metrics per split, indexed ``'train'`` and
        ``'validation'``.
    """
    splits = (('train', train_df), ('validation', validation_df))
    per_split_metrics = [
        score_model(bundle, frame['message'], frame['target'])
        for _, frame in splits
    ]
    return pd.DataFrame(per_split_metrics, index=[split_name for split_name, _ in splits])

## Load Splits

In [3]:
# Read the train / validation / test splits from CSV files in the working directory.
train_df, validation_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'validation.csv', 'test.csv')
)

# Show (rows, columns) for each split as a quick sanity check.
print(train_df.shape, validation_df.shape, test_df.shape)

(3900, 2) (836, 2) (836, 2)


## Fit, Score, Evaluate (Train + Validation)

In [4]:
# Keys accepted by get_model(); each one is fitted and scored in turn.
benchmark_models = ['nb', 'lr', 'svm']
validation_rows = []
all_bundles = {}

for model_name in benchmark_models:
    # Fit on the training split and keep the fitted bundle for later cells.
    bundle = fit_model(model_name, train_df['message'], train_df['target'])
    all_bundles[model_name] = bundle

    # Train-vs-validation metrics table for this model.
    model_summary = validate_model(bundle, train_df, validation_df)
    print(f"\n===== {model_name.upper()} =====")
    display(model_summary)

    # Validation-only metrics feed the cross-model ranking below.
    val_metrics = score_model(bundle, validation_df['message'], validation_df['target'])
    validation_rows.append(dict(model=model_name, **val_metrics))

# Rank by validation AUCPR, breaking ties on F1 (both descending).
validation_results = (
    pd.DataFrame(validation_rows)
    .sort_values(['aucpr', 'f1'], ascending=False)
    .reset_index(drop=True)
)
print('Validation ranking (model selection based on AUCPR):')
display(validation_results[['model', 'aucpr', 'f1', 'precision', 'recall', 'accuracy']])

# The first row after sorting is the top-ranked model.
best_model_name = validation_results['model'].iloc[0]
print(f'Best model selected: {best_model_name}')


===== NB =====


Unnamed: 0,accuracy,precision,recall,f1,aucpr,tn,fp,fn,tp
train,0.984872,1.0,0.887189,0.940223,0.991423,3377,0,59,464
validation,0.9689,1.0,0.767857,0.868687,0.971811,724,0,26,86



===== LR =====


  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)
  norm2_w = weights @ weights if weights.ndim == 1 else squared_norm(weights)


Unnamed: 0,accuracy,precision,recall,f1,aucpr,tn,fp,fn,tp
train,0.969744,0.987952,0.783939,0.8742,0.990233,3372,5,113,410
validation,0.964115,1.0,0.732143,0.845361,0.97063,724,0,30,82



===== SVM =====


Unnamed: 0,accuracy,precision,recall,f1,aucpr,tn,fp,fn,tp
train,0.999744,1.0,0.998088,0.999043,0.999463,3377,0,1,522
validation,0.982057,0.989899,0.875,0.92891,0.97833,723,1,14,98


Validation ranking (model selection based on AUCPR):


Unnamed: 0,model,aucpr,f1,precision,recall,accuracy
0,svm,0.97833,0.92891,0.989899,0.875,0.982057
1,nb,0.971811,0.868687,1.0,0.767857,0.9689
2,lr,0.97063,0.845361,1.0,0.732143,0.964115


Best model selected: svm


## Test Benchmark Scores and Final Selection

In [5]:
# Score every fitted benchmark model on the test split.
test_rows = []
for model_name, bundle in all_bundles.items():
    test_metrics = score_model(bundle, test_df['message'], test_df['target'])
    test_rows.append(dict(model=model_name, **test_metrics))

# Same ordering rule as the validation ranking: AUCPR, then F1, descending.
test_results = (
    pd.DataFrame(test_rows)
    .sort_values(['aucpr', 'f1'], ascending=False)
    .reset_index(drop=True)
)
print('Test scores for 3 benchmark models:')
display(test_results[['model', 'aucpr', 'f1', 'precision', 'recall', 'accuracy']])

# Report the model chosen on validation; the choice is not revisited here.
best_bundle = all_bundles[best_model_name]
best_test_metrics = score_model(best_bundle, test_df['message'], test_df['target'])
print(f"\nSelected best model ({best_model_name}) on test set:")
print(best_test_metrics)

Test scores for 3 benchmark models:


Unnamed: 0,model,aucpr,f1,precision,recall,accuracy
0,svm,0.966318,0.948837,0.990291,0.910714,0.986842
1,nb,0.965782,0.907317,1.0,0.830357,0.977273
2,lr,0.964955,0.862944,1.0,0.758929,0.967703



Selected best model (svm) on test set:
{'accuracy': 0.9868421052631579, 'precision': 0.9902912621359223, 'recall': 0.9107142857142857, 'f1': 0.9488372093023256, 'aucpr': np.float64(0.966317806638291), 'tn': 723, 'fp': 1, 'fn': 10, 'tp': 102}
