In [20]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    make_scorer,
)

In [21]:
# Read the raw spam dataset; the CSV is decoded as ISO-8859-1.
raw_mail_data = pd.read_csv(
    '/content/spam.csv',
    encoding='ISO-8859-1',
)

In [22]:
# Replace every missing value with an empty string.
# fillna returns a new frame, so raw_mail_data itself is left untouched.
mail_data = raw_mail_data.fillna(value='')

In [23]:
# Encode the category labels as integers (spam: 0, ham: 1).
# The mapping reads from the untouched raw frame so this cell can be re-run
# safely: mapping the already-encoded column a second time would turn every
# label into NaN.
mail_data['v1'] = raw_mail_data['v1'].map({'spam': 0, 'ham': 1})

# Any label outside the mapping becomes NaN; fail here with a clear message
# instead of letting it surface later during training.
assert mail_data['v1'].notna().all(), "unexpected label found in column 'v1'"

In [24]:
# Separate the message text (column 'v2') from the encoded label (column 'v1').
X, Y = mail_data['v2'], mail_data['v1']

In [25]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=3,
)

In [26]:
# Turn the message text into TF-IDF features.
tfidf_options = {'min_df': 1, 'stop_words': 'english', 'lowercase': True}
tfidf_vectorizer = TfidfVectorizer(**tfidf_options)

# The vectorizer is fitted on the training messages only; the test messages
# are transformed with that already-fitted vectorizer.
X_train_features = tfidf_vectorizer.fit_transform(X_train)
X_test_features = tfidf_vectorizer.transform(X_test)

In [27]:
# Base estimator handed to the grid search; iteration cap is set to 1000.
logistic_regression = LogisticRegression(max_iter=1_000)

In [28]:
# Candidate values of the `C` hyperparameter explored by the grid search.
c_candidates = [0.01, 0.1, 1, 10, 100]
param_grid = {'C': c_candidates}

In [29]:
# Perform grid search with 5-fold cross-validation.
# Labels are encoded spam=0 / ham=1, and the plain 'f1' scorer treats label 1
# as the positive class, which would tune C for ham detection. Score the spam
# class explicitly so model selection targets the class this notebook is about.
spam_f1_scorer = make_scorer(f1_score, pos_label=0)
grid_search = GridSearchCV(
    logistic_regression,
    param_grid,
    cv=5,
    scoring=spam_f1_scorer,
    verbose=1,
    n_jobs=-1,
)
grid_search.fit(X_train_features, Y_train)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


In [30]:
# Keep a handle on the estimator that the grid search selected.
best_model = grid_search.best_estimator_

In [31]:
# No extra training step is needed here: `best_estimator_` only exists because
# GridSearchCV (refit=True by default) has already refit the winning
# configuration on the full training data. Calling fit again would just repeat
# that work, so show which setting was selected instead.
grid_search.best_params_

In [32]:
# Predict labels for both feature matrices with the selected model.
Y_train_pred, Y_test_pred = (
    best_model.predict(features)
    for features in (X_train_features, X_test_features)
)

In [33]:
# Compute every metric first, then print them in a fixed order.
train_accuracy = accuracy_score(Y_train, Y_train_pred)
test_accuracy = accuracy_score(Y_test, Y_test_pred)
test_report = classification_report(Y_test, Y_test_pred)
test_confusion = confusion_matrix(Y_test, Y_test_pred)

# Accuracy on the data the model was trained on versus the held-out split.
print('Training accuracy:', train_accuracy)
print('Testing accuracy:', test_accuracy)

# Per-class precision / recall / F1 on the held-out split.
print('Classification Report on Test Data:')
print(test_report)

# Counts of true labels against predicted labels on the held-out split.
print('Confusion Matrix:')
print(test_confusion)

Training accuracy: 0.9982417582417582
Testing accuracy: 0.9780316344463972
Classification Report on Test Data:
              precision    recall  f1-score   support

           0       0.96      0.89      0.92       169
           1       0.98      0.99      0.99       969

    accuracy                           0.98      1138
   macro avg       0.97      0.94      0.96      1138
weighted avg       0.98      0.98      0.98      1138

Confusion Matrix:
[[150  19]
 [  6 963]]


In [40]:
# Example prediction: classify one hand-written message.
input_mail = ["for free"]

# Reuse the already-fitted vectorizer so the features match what the model saw.
input_data_features = tfidf_vectorizer.transform(input_mail)
prediction = best_model.predict(input_data_features)

# Label 1 was assigned to ham; anything else is reported as spam.
print('Predicted: Ham mail' if prediction[0] == 1 else 'Predicted: Spam mail')

Predicted: Spam mail


In [39]:
from google.colab import files
import joblib

# Persist the fitted classifier and the TF-IDF vectorizer under fixed file names.
artifacts = {
    'spam_classifier_model.joblib': best_model,
    'tfidf_vectorizer.joblib': tfidf_vectorizer,
}
for filename, fitted_object in artifacts.items():
    joblib.dump(fitted_object, filename)

# Once both files are written, pass each one to the Colab download helper
# in the same order.
for filename in artifacts:
    files.download(filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>