## Importing necessary libraries

In [4]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import GridSearchCV
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/faizan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Loading Dataset

In [3]:
data = pd.read_csv('spam.csv',encoding='latin-1')

## Data Cleansing and Preprocessing

In [6]:
# Data Cleaning and Preprocessing
def preprocess_text(text):
    text = text.lower()
    text = re.sub(f'[{string.punctuation}]', '', text)
    text = re.sub('\w*\d\w*', '', text)
    tokens = text.split()
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    stemmer = SnowballStemmer("english")
    tokens = [stemmer.stem(word) for word in tokens]
    return ' '.join(tokens)

data['v2'] = data['v2'].apply(preprocess_text)

## Feature Extraction

In [7]:
# Feature Extraction
tfidf = TfidfVectorizer(max_features=3000)
X = tfidf.fit_transform(data['v2'])
y = data['v1']

## Splitting Dataset into test sets and train sets

In [8]:
# Splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Model Training

### Naive Bayes

In [9]:
# Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
nb_preds = nb_model.predict(X_test)

### SVM

In [11]:
# SVM
svm_model = SVC(kernel='linear', probability=True)
svm_model.fit(X_train, y_train)
svm_preds = svm_model.predict(X_test)

## Model Evaluation

### Naive Bayes

In [12]:
# Model Evaluation
print("Naive Bayes Classifier:")
print("Accuracy:", accuracy_score(y_test, nb_preds))
print(classification_report(y_test, nb_preds))
print(confusion_matrix(y_test, nb_preds))

Naive Bayes Classifier:
Accuracy: 0.9721973094170404
              precision    recall  f1-score   support

         ham       0.97      1.00      0.98       965
        spam       1.00      0.79      0.88       150

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.93      1115
weighted avg       0.97      0.97      0.97      1115

[[965   0]
 [ 31 119]]


### SVM

In [13]:
print("\nSupport Vector Machine:")
print("Accuracy:", accuracy_score(y_test, svm_preds))
print(classification_report(y_test, svm_preds))
print(confusion_matrix(y_test, svm_preds))


Support Vector Machine:
Accuracy: 0.9829596412556054
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       965
        spam       0.99      0.89      0.93       150

    accuracy                           0.98      1115
   macro avg       0.98      0.94      0.96      1115
weighted avg       0.98      0.98      0.98      1115

[[963   2]
 [ 17 133]]


## Hyper Parameter tunning 

### Naive Bayes

In [15]:
# Naive Bayes with Hyperparameter Tuning
nb_param_grid = {'alpha': [0.1, 0.5, 1, 5, 10]}
nb_model = GridSearchCV(MultinomialNB(), nb_param_grid, refit=True, verbose=2)
nb_model.fit(X_train, y_train)
nb_preds = nb_model.predict(X_test)

print("Best parameters for Naive Bayes found by GridSearch:", nb_model.best_params_)
print("Naive Bayes Classifier:")
print("Accuracy:", accuracy_score(y_test, nb_preds))
print(classification_report(y_test, nb_preds))
print(confusion_matrix(y_test, nb_preds))

Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.1; total time=   0.0s
[CV] END ..........................................alpha=0.5; total time=   0.0s
[CV] END ..........................................alpha=0.5; total time=   0.0s
[CV] END ..........................................alpha=0.5; total time=   0.0s
[CV] END ..........................................alpha=0.5; total time=   0.0s
[CV] END ..........................................alpha=0.5; total time=   0.0s
[CV] END ............................................alpha=1; total time=   0.0s
[CV] END ........................................

### SVM

In [16]:
# SVM with Hyperparameter Tuning
svm_param_grid = {'C': [0.1, 1, 10], 'gamma': [1, 0.1, 0.01], 'kernel': ['linear', 'rbf']}
svm_model = GridSearchCV(SVC(), svm_param_grid, refit=True, verbose=2)
svm_model.fit(X_train, y_train)
svm_preds = svm_model.predict(X_test)

print("Best parameters for SVM found by GridSearch:", svm_model.best_params_)
print("Support Vector Machine:")
print("Accuracy:", accuracy_score(y_test, svm_preds))
print(classification_report(y_test, svm_preds))
print(confusion_matrix(y_test, svm_preds))

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.4s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.4s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.4s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.4s
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time=   0.4s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.7s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.7s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.7s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.7s
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time=   0.7s
[CV] END ....................C=0.1, gamma=0.1, kernel=linear; total time=   0.4s
[CV] END ....................C=0.1, gamma=0.1, k