In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV

In [2]:
def load_data():
    """Read the train, validation and test splits from CSV files.

    The files 'train.csv', 'validation.csv' and 'test.csv' are resolved
    relative to the current working directory.

    Returns:
        tuple: ``(train, validation, test)`` DataFrames, in that order.
    """
    split_files = ('train.csv', 'validation.csv', 'test.csv')
    # The generator is consumed left to right, so files are read in this order.
    train, validation, test = (pd.read_csv(path) for path in split_files)
    return train, validation, test

In [3]:
def handle_nan_data(data):
    """Return a copy of ``data`` whose 'message' column has no missing values.

    Missing entries in 'message' are replaced by the empty string; the
    frame passed in is left unmodified.

    Args:
        data: DataFrame with a 'message' column.

    Returns:
        A new DataFrame with the same columns and NaN-free 'message'.
    """
    # assign() builds a new frame, so the caller's DataFrame is never mutated.
    return data.assign(message=data['message'].fillna(''))

In [4]:
def fit_model(model, train_data):
    """Fit ``model`` on TF-IDF features built from the training messages.

    Args:
        model: Estimator exposing ``fit(X, y)``; it is fitted in place.
        train_data: DataFrame with 'message' and 'label' columns.

    Returns:
        tuple: ``(model, vectorizer)`` - the fitted estimator and the
        TfidfVectorizer that was fitted on the training messages.
    """
    # Missing messages become empty strings before vectorizing.
    cleaned = handle_nan_data(train_data)

    # Vocabulary is capped at 5000 terms and learned from training text only.
    tfidf = TfidfVectorizer(max_features=5000)
    features = tfidf.fit_transform(cleaned['message'])

    model.fit(features, cleaned['label'])
    return model, tfidf

In [5]:
def score_model(model, vectorizer, data):
    """Predict labels for the messages in ``data``.

    Args:
        model: Fitted estimator exposing ``predict(X)``.
        vectorizer: Fitted vectorizer exposing ``transform``.
        data: DataFrame with a 'message' column.

    Returns:
        Whatever ``model.predict`` returns for the vectorized messages.
    """
    # Replace missing messages with '' first, then reuse the fitted vocabulary.
    messages = handle_nan_data(data)['message']
    features = vectorizer.transform(messages)
    return model.predict(features)

In [6]:
def evaluate_predictions(y_true, y_pred, dataset_name=""):
    """Print accuracy, spam-class metrics and the confusion matrix.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        dataset_name: Name interpolated into the printed header.

    Returns:
        The F1-score stored under the '1' entry of the classification
        report, which this function reports as the spam class.
    """
    print(f"\nEvaluation on {dataset_name} dataset:")

    # Compute everything up front, before any metric is printed.
    accuracy = accuracy_score(y_true, y_pred)
    metrics_by_label = classification_report(y_true, y_pred, output_dict=True)
    matrix = confusion_matrix(y_true, y_pred)

    print(f"Accuracy: {accuracy:.4f}")

    # The report entry under '1' is what gets labelled "spam" in the output.
    spam = metrics_by_label['1']
    for title, key in (
        ("Precision", 'precision'),
        ("Recall", 'recall'),
        ("F1-score", 'f1-score'),
    ):
        print(f"{title} (spam): {spam[key]:.4f}")

    print("\nConfusion Matrix:")
    print(matrix)

    return spam['f1-score']

In [7]:
def fine_tune_model(model, param_grid, train_data, validation_data):
    """Tune hyperparameters by cross-validated grid search, then report validation metrics.

    The search runs 5-fold cross-validation on the training data only
    (scoring='f1'). The validation set does not influence which parameters
    are selected; it is used afterwards to print held-out metrics for the
    best estimator found by the search.

    Args:
        model: Unfitted estimator to tune.
        param_grid: Parameter grid passed to GridSearchCV.
        train_data: DataFrame with 'message' and 'label' columns.
        validation_data: DataFrame with 'message' and 'label' columns.

    Returns:
        tuple: ``(best_model, vectorizer)`` - the best estimator from the
        grid search and the TF-IDF vectorizer fitted on the training messages.
    """
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1', n_jobs=-1)

    # Reuse the shared NaN-handling + TF-IDF + fit path instead of duplicating
    # it here: fit_model only requires an object with .fit(X, y), which a
    # GridSearchCV instance provides.
    # NOTE(review): the vectorizer is fitted on all training messages before the
    # folds are split, so each CV fold's vocabulary/IDF statistics include its
    # held-out part. Wrapping vectorizer + model in a Pipeline would avoid that.
    grid_search, vectorizer = fit_model(grid_search, train_data)
    best_model = grid_search.best_estimator_

    # Held-out check of the selected configuration on the validation split.
    val_pred = score_model(best_model, vectorizer, validation_data)
    val_f1 = evaluate_predictions(validation_data['label'], val_pred, "Validation")

    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Validation F1-score: {val_f1:.4f}")

    return best_model, vectorizer

In [8]:
def benchmark_models(train_data, validation_data, test_data):
    """Tune three candidate classifiers and return the one with the best test F1.

    Each candidate (Naive Bayes, Linear SVM, Random Forest) is passed to
    ``fine_tune_model`` with its parameter grid, then scored on the test
    data. The candidate with the highest test F1-score is selected.

    Args:
        train_data: DataFrame passed to ``fine_tune_model`` as training data.
        validation_data: DataFrame passed to ``fine_tune_model`` as validation data.
        test_data: DataFrame with 'message' and 'label' columns used for scoring.

    Returns:
        tuple: ``(model, vectorizer)`` of the selected candidate.
    """
    # Candidate name -> (estimator, hyperparameter grid)
    candidates = {
        'Naive Bayes': (MultinomialNB(), {'alpha': [0.1, 0.5, 1.0]}),
        'Linear SVM': (LinearSVC(random_state=42), {'C': [0.01, 0.1, 1, 10]}),
        'Random Forest': (
            RandomForestClassifier(n_estimators=100, random_state=42),
            {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20]},
        ),
    }

    results = {}
    for name in candidates:
        estimator, grid = candidates[name]
        print(f"\nFine-tuning {name}...")

        tuned, tfidf = fine_tune_model(estimator, grid, train_data, validation_data)

        # Held-out test evaluation for this candidate.
        predictions = score_model(tuned, tfidf, test_data)
        f1 = evaluate_predictions(test_data['label'], predictions, f"Test ({name})")

        results[name] = dict(model=tuned, vectorizer=tfidf, f1_score=f1)
        print(f"{name} Test F1-score: {f1:.4f}")

    # NOTE(review): the winner is chosen by its test F1, so the reported test
    # score of the selected model is an optimistic estimate.
    winner = max(results, key=lambda key: results[key]['f1_score'])
    best = results[winner]
    print(f"\nBest model: {winner} (F1-score: {best['f1_score']:.4f})")

    return best['model'], best['vectorizer']

In [9]:
def main():
    """Load the data splits and run the model benchmark.

    The selected model and vectorizer are bound to local names only;
    nothing is returned.
    """
    train_df, validation_df, test_df = load_data()
    selected = benchmark_models(train_df, validation_df, test_df)
    chosen_model, chosen_vectorizer = selected

if __name__ == "__main__":
    main()


Fine-tuning Naive Bayes...

Evaluation on Validation dataset:
Accuracy: 0.9848
Precision (spam): 0.9854
Recall (spam): 0.9000
F1-score (spam): 0.9408

Confusion Matrix:
[[963   2]
 [ 15 135]]
Best parameters: {'alpha': 0.1}
Validation F1-score: 0.9408

Evaluation on Test (Naive Bayes) dataset:
Accuracy: 0.9785
Precision (spam): 0.9921
Recall (spam): 0.8456
F1-score (spam): 0.9130

Confusion Matrix:
[[965   1]
 [ 23 126]]
Naive Bayes Test F1-score: 0.9130

Fine-tuning Linear SVM...





Evaluation on Validation dataset:
Accuracy: 0.9758
Precision (spam): 0.9424
Recall (spam): 0.8733
F1-score (spam): 0.9066

Confusion Matrix:
[[957   8]
 [ 19 131]]
Best parameters: {'C': 10}
Validation F1-score: 0.9066

Evaluation on Test (Linear SVM) dataset:
Accuracy: 0.9830
Precision (spam): 0.9710
Recall (spam): 0.8993
F1-score (spam): 0.9338

Confusion Matrix:
[[962   4]
 [ 15 134]]
Linear SVM Test F1-score: 0.9338

Fine-tuning Random Forest...

Evaluation on Validation dataset:
Accuracy: 0.9695
Precision (spam): 1.0000
Recall (spam): 0.7733
F1-score (spam): 0.8722

Confusion Matrix:
[[965   0]
 [ 34 116]]
Best parameters: {'max_depth': None, 'n_estimators': 50}
Validation F1-score: 0.8722

Evaluation on Test (Random Forest) dataset:
Accuracy: 0.9668
Precision (spam): 1.0000
Recall (spam): 0.7517
F1-score (spam): 0.8582

Confusion Matrix:
[[966   0]
 [ 37 112]]
Random Forest Test F1-score: 0.8582

Best model: Linear SVM (F1-score: 0.9338)


It turns out that Linear SVM is the bes