In [1]:
# ==============================
# 📗 SMS Spam Detection - Combined Dataset
# ==============================

# Step 1: Import Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# Step 2: Load Dataset
# Read the combined corpus from its CSV file into a DataFrame.
df = pd.read_csv("/content/combined_data.csv")

# Step 3: Check and clean dataset
# Keep only the two columns used below and discard any row that is
# missing a value in either of them.
df = df.loc[:, ['text', 'label']].dropna()

# Step 5: Split into Train-Test
# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible from run to run.
messages, labels = df['text'], df['label']
X_train, X_test, y_train, y_test = train_test_split(
    messages,
    labels,
    test_size=0.2,
    random_state=42,
)

# Step 6: Text Vectorization
# The vectorizer is fitted on the training messages only and then merely
# applied to the test messages, so no statistics are learned from the
# held-out data.
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.9)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Step 7: Train Models
# Every classifier is fitted on the same TF-IDF training matrix. `fit`
# returns the estimator itself, so construction and training are chained.

# Model 1: Multinomial Naive Bayes
nb_model = MultinomialNB().fit(X_train_vec, y_train)

# Model 2: Logistic Regression (iteration cap raised to 1000)
lr_model = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)

# Model 3: Support Vector Machine (SVM) with a linear kernel
svm_model = SVC(kernel='linear', random_state=42).fit(X_train_vec, y_train)

# Model 4: Random Forest with 100 trees
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
).fit(X_train_vec, y_train)

# Model 5: Decision Tree
dt_model = DecisionTreeClassifier(random_state=42).fit(X_train_vec, y_train)

# Predictions of each fitted model on the held-out test matrix.
y_pred_nb = nb_model.predict(X_test_vec)
y_pred_lr = lr_model.predict(X_test_vec)
y_pred_svm = svm_model.predict(X_test_vec)
y_pred_rf = rf_model.predict(X_test_vec)
y_pred_dt = dt_model.predict(X_test_vec)

# Step 8: Evaluation Function
def evaluate_model(y_true, y_pred, model_name):
    """Print an evaluation report for one model's predictions.

    The report consists of a header naming the model, four scalar metrics
    (accuracy, precision, recall, F1) rounded to 4 decimal places, the
    confusion matrix, and scikit-learn's text classification report.

    Args:
        y_true: Ground-truth labels.
        y_pred: Labels predicted by the model, aligned with ``y_true``.
        model_name: Human-readable name shown in the report header.

    Returns:
        None. Everything is written to standard output.

    Note:
        Precision, recall and F1 are called with scikit-learn's default
        arguments; presumably the labels are binary with 1 as the
        positive class -- confirm against the dataset.
    """
    print(f"\n===== {model_name} Evaluation =====")

    # (label, scorer) pairs, listed in the order they are printed.
    scalar_metrics = (
        ("Accuracy:", accuracy_score),
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1 Score:", f1_score),
    )
    for label, scorer in scalar_metrics:
        print(label, round(scorer(y_true, y_pred), 4))

    matrix = confusion_matrix(y_true, y_pred)
    print("\nConfusion Matrix:\n", matrix)

    report = classification_report(y_true, y_pred)
    print("\nClassification Report:\n", report)

# Step 9: Evaluate All Models
# Print the full report for each model, in the order the models were trained.
for model_label, predictions in (
    ("Multinomial Naive Bayes", y_pred_nb),
    ("Logistic Regression", y_pred_lr),
    ("Support Vector Machine (SVM)", y_pred_svm),
    ("Random Forest", y_pred_rf),
    ("Decision Tree", y_pred_dt),
):
    evaluate_model(y_test, predictions, model_label)

# Step 10: Compare All Models
# Print a single table with one row per model so the headline test-set
# metrics can be compared side by side.
print("\n" + "="*60)
# The banner emoji was stored as mis-decoded UTF-8 bytes; the intended
# character is restored here so the heading prints correctly.
print("📊 MODEL COMPARISON SUMMARY")
print("="*60)

# Display name -> predictions of that model on the test set.
models = {
    "Naive Bayes": y_pred_nb,
    "Logistic Regression": y_pred_lr,
    "SVM": y_pred_svm,
    "Random Forest": y_pred_rf,
    "Decision Tree": y_pred_dt
}

# Column name -> scoring function; insertion order fixes the column order.
summary_metrics = {
    'Accuracy': accuracy_score,
    'Precision': precision_score,
    'Recall': recall_score,
    'F1 Score': f1_score,
}

# Build one record per model, each metric rounded to 4 decimal places.
results = []
for name, y_pred in models.items():
    row = {'Model': name}
    for metric_name, scorer in summary_metrics.items():
        row[metric_name] = round(scorer(y_test, y_pred), 4)
    results.append(row)

results_df = pd.DataFrame(results)
# index=False drops the DataFrame's integer row index from the printed table.
print(results_df.to_string(index=False))
print("="*60)


===== Multinomial Naive Bayes Evaluation =====
Accuracy: 0.9726
Precision: 0.9854
Recall: 0.9617
F1 Score: 0.9734

Confusion Matrix:
 [[1955   31]
 [  83 2086]]

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.98      0.97      1986
           1       0.99      0.96      0.97      2169

    accuracy                           0.97      4155
   macro avg       0.97      0.97      0.97      4155
weighted avg       0.97      0.97      0.97      4155


===== Logistic Regression Evaluation =====
Accuracy: 0.9779
Precision: 0.9667
Recall: 0.9917
F1 Score: 0.9791

Confusion Matrix:
 [[1912   74]
 [  18 2151]]

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.96      0.98      1986
           1       0.97      0.99      0.98      2169

    accuracy                           0.98      4155
   macro avg       0.98      0.98      0.98      4155
weighted avg       0.98      0.