In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Load the data.
# NOTE(review): latin-1 is presumably needed because the file is not valid
# UTF-8 -- confirm against the actual 'spam.csv'.
data = pd.read_csv('spam.csv', encoding='latin-1')

# Keep only the label column (v1) and the text column (v2), then give them
# readable names. The explicit .copy() makes `data` an independent frame so
# the column assignment further down never writes into a slice of the frame
# that was loaded (this is what triggers pandas' SettingWithCopyWarning).
data = data[['v1', 'v2']].copy()
data.columns = ['Class', 'Message']

# Encode the labels: ham -> 0, spam -> 1.
# Series.map() yields NaN for every value that is missing from the mapping,
# so check for that here and fail with a clear message instead of letting NaN
# labels reach the train/test split and the classifier.
label_codes = {'ham': 0, 'spam': 1}
encoded_labels = data['Class'].map(label_codes)
unmapped = encoded_labels.isna()
if unmapped.any():
    unknown_labels = sorted(data.loc[unmapped, 'Class'].astype(str).unique())
    raise ValueError(
        f"Unexpected class labels in 'spam.csv' (expected 'ham' or 'spam'): "
        f"{unknown_labels}"
    )
data['Class'] = encoded_labels.astype('int64')

# Split the data into training (80%) and test (20%) sets; random_state pins
# the shuffle so the split is reproducible.
# NOTE(review): the split is not stratified on the class label; consider
# stratify=data['Class'] if the class ratio must be preserved in both sets.
X_train, X_test, y_train, y_test = train_test_split(
    data['Message'], data['Class'], test_size=0.2, random_state=42
)

# TF-IDF vectorization: English stop words are removed and the vocabulary is
# capped at 5000 features.
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)

# Choose the classifier
classifier = MultinomialNB()  # Naive Bayes

# Create a pipeline so the vectorizer is fitted on the training messages only
# and the same transformation is applied at prediction time.
pipeline = Pipeline([
    ('tfidf', tfidf),
    ('classifier', classifier),
])

# Train the model
pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = pipeline.predict(X_test)

# Evaluate the model (per-class precision / recall / F1 on the test set)
print(classification_report(y_test, y_pred))

# Save predictions to a file: one row per test message with its true and
# predicted label (0 = ham, 1 = spam). The index is dropped on write.
test_results = pd.DataFrame({'Message': X_test, 'Actual': y_test, 'Predicted': y_pred})
test_results.to_csv('spam_predictions.csv', index=False)

print("Predictions saved to 'spam_predictions.csv'")


              precision    recall  f1-score   support

           0       0.97      1.00      0.98       965
           1       1.00      0.79      0.88       150

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.93      1115
weighted avg       0.97      0.97      0.97      1115

Predictions saved to 'spam_predictions.csv'
