In [17]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


In [18]:
def load_split_data(train_path='/content/train.csv', val_path='/content/validation.csv', test_path='/content/test.csv'):
    """Read the train, validation and test CSV files into DataFrames.

    Args:
        train_path: Path of the training-split CSV.
        val_path: Path of the validation-split CSV.
        test_path: Path of the test-split CSV.

    Returns:
        A ``(train_df, val_df, test_df)`` tuple of ``pandas.DataFrame``
        objects, one per split, in that order.
    """
    split_paths = (train_path, val_path, test_path)
    # Files are read in train -> validation -> test order.
    train_df, val_df, test_df = (pd.read_csv(csv_path) for csv_path in split_paths)
    return train_df, val_df, test_df

# Train model
def train_model(model, X_train, y_train):
    """Fit ``model`` on the training data and return that same object.

    Args:
        model: Any object exposing a ``fit(X, y)`` method.
        X_train: Training features, passed straight through to ``fit``.
        y_train: Training targets, passed straight through to ``fit``.

    Returns:
        The ``model`` argument itself, after ``fit`` has been called on it.
    """
    # The return value of ``fit`` is deliberately ignored; the caller gets
    # back the object it passed in.
    model.fit(X_train, y_train)
    return model

# Evaluate model
def evaluate_model(model, X, y):
    """Score a fitted model against a labelled dataset.

    Args:
        model: Object exposing ``predict(X)``.
        X: Features to predict on.
        y: True labels corresponding to ``X``.

    Returns:
        A ``(accuracy, report)`` tuple: the value returned by
        ``accuracy_score`` and the text returned by
        ``classification_report`` for the model's predictions.
    """
    predictions = model.predict(X)
    return accuracy_score(y, predictions), classification_report(y, predictions)


In [19]:
# Main training pipeline
def train_and_evaluate(random_state=42):
    """Train several text classifiers, select one on validation, report on test.

    The three splits are loaded with ``load_split_data()``, the ``message``
    column is turned into TF-IDF features, and Naive Bayes, SVM and Random
    Forest classifiers are fitted on the training split. Each model's
    validation accuracy and classification report are printed. The model
    with the highest validation accuracy is then evaluated on the test
    split and that result is printed as well.

    Args:
        random_state: Seed forwarded to ``RandomForestClassifier`` so that
            repeated runs produce the same forest and therefore the same
            model ranking. Pass ``None`` for an unseeded forest.

    Returns:
        None. All results are written to stdout.
    """
    train_df, val_df, test_df = load_split_data()

    # The vocabulary and IDF weights are learned from the training split only;
    # validation and test are merely transformed with them.
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(train_df['message'])
    X_val = vectorizer.transform(val_df['message'])
    X_test = vectorizer.transform(test_df['message'])

    y_train, y_val, y_test = train_df['label'], val_df['label'], test_df['label']

    models = {
        'Naive Bayes': MultinomialNB(),
        'SVM': SVC(),
        # Seeded so the comparison below is reproducible across runs.
        'Random Forest': RandomForestClassifier(n_estimators=100, random_state=random_state)
    }

    best_model = None
    # Start below any achievable accuracy so that a model is always selected,
    # even if every candidate scores 0.0 on the validation split.
    best_acc = float('-inf')

    for name, model in models.items():
        print('\n==============================')
        print(f'Training {name}...')
        print('==============================\n')

        model = train_model(model, X_train, y_train)
        acc, report = evaluate_model(model, X_val, y_val)

        print(f'\nValidation Results for {name}:')
        print('------------------------------')
        print(f'Accuracy: {acc:.4f}\n')
        print(report)
        print('==============================\n')

        # Strict comparison: on a tie the earlier model in ``models`` wins.
        if acc > best_acc:
            best_acc = acc
            best_model = model

    print(f'Evaluating best model i.e {best_model} on test set...')
    print('====================================\n')
    test_acc, test_report = evaluate_model(best_model, X_test, y_test)
    print(f'Test Accuracy: {test_acc:.4f}\n')
    print(test_report)
    print('====================================\n')

In [20]:
# Run the full pipeline: train each candidate model, print its validation
# report, then print the test-set report for the most accurate one.
train_and_evaluate()


Training Naive Bayes...


Validation Results for Naive Bayes:
------------------------------
Accuracy: 0.9564

              precision    recall  f1-score   support

           0       0.95      1.00      0.98      1013
           1       1.00      0.68      0.81       157

    accuracy                           0.96      1170
   macro avg       0.98      0.84      0.89      1170
weighted avg       0.96      0.96      0.95      1170



Training SVM...


Validation Results for SVM:
------------------------------
Accuracy: 0.9838

              precision    recall  f1-score   support

           0       0.98      1.00      0.99      1013
           1       0.99      0.89      0.94       157

    accuracy                           0.98      1170
   macro avg       0.99      0.94      0.96      1170
weighted avg       0.98      0.98      0.98      1170



Training Random Forest...


Validation Results for Random Forest:
------------------------------
Accuracy: 0.9735

              precis