In [10]:
# Step 1: Importing necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Step 2: Load the dataset and preprocess it
df = pd.read_csv('mail_data.csv')  # Load the dataset
data = df.where((pd.notnull(df)), '')  # Replace missing values with empty strings

# Replacing spam with 0 and ham with 1
data.loc[data['Category'] == 'spam', 'Category'] = 0
data.loc[data['Category'] == 'ham', 'Category'] = 1

# Separate the messages (X) and labels (Y)
X = data['Message']  # Input features (email messages)
Y = data['Category'].astype('int')  # Output labels (spam/ham as 0/1)

# Step 3: Feature extraction using TfidfVectorizer
feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)
X_features = feature_extraction.fit_transform(X)  # Transform all data

# Step 4: Applying 10-fold cross-validation
model = LogisticRegression()  # Initialize Logistic Regression model

# Perform 10-fold cross-validation
cv_scores = cross_val_score(model, X_features, Y, cv=10, scoring='accuracy')

# Step 5: Display the results
# Accuracy for each fold
print('Cross-validation scores for each fold:', cv_scores)

# Mean accuracy across all folds
print('Mean accuracy across all folds:', np.mean(cv_scores))

# Optional: Test with a custom email message
input_mail = ["XXXMobileMovieClub: To use your credit, click the WAP link in the next txt message or click here>> http://wap.xxxmobilemovieclub.com?n=QJKGIGHJJGCBL"]

# Convert the custom email to feature format using the same TfidfVectorizer
input_data_features = feature_extraction.transform(input_mail)

# Train the model on the entire data before making predictions on new data
model.fit(X_features, Y)

# Prediction on the new email input
prediction = model.predict(input_data_features)

# Output the result
print(prediction)

if prediction[0] == 1:
    print('Not Spam Mail')
else:
    print('Spam Mail')


Cross-validation scores for each fold: [0.96594982 0.95698925 0.9497307  0.96768402 0.96050269 0.95332136
 0.9443447  0.9551167  0.95152603 0.95332136]
Mean accuracy across all folds: 0.9558486644401973
[0]
Spam Mail
