In [13]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.preprocessing import scale

In [14]:
df = pd.read_csv("emails.csv")

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5172 entries, 0 to 5171
Columns: 3002 entries, Email No. to Prediction
dtypes: int64(3001), object(1)
memory usage: 118.5+ MB


In [16]:
df.head()

Unnamed: 0,Email No.,the,to,ect,and,for,of,a,you,hou,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,Email 1,0,0,1,0,0,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Email 2,8,13,24,6,6,2,102,1,27,...,0,0,0,0,0,0,0,1,0,0
2,Email 3,0,0,1,0,0,0,8,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Email 4,0,5,22,0,5,1,51,2,10,...,0,0,0,0,0,0,0,0,0,0
4,Email 5,7,6,17,1,5,2,57,0,9,...,0,0,0,0,0,0,0,1,0,0


In [17]:
df.isnull().sum()

Email No.     0
the           0
to            0
ect           0
and           0
             ..
military      0
allowing      0
ff            0
dry           0
Prediction    0
Length: 3002, dtype: int64

In [9]:
df.columns

Index(['Email No.', 'the', 'to', 'ect', 'and', 'for', 'of', 'a', 'you', 'hou',
       ...
       'connevey', 'jay', 'valued', 'lay', 'infrastructure', 'military',
       'allowing', 'ff', 'dry', 'Prediction'],
      dtype='object', length=3002)

In [18]:
df.dropna(inplace=True)
df.drop(['Email No.'], axis=1, inplace=True)

In [19]:
df.head()

Unnamed: 0,the,to,ect,and,for,of,a,you,hou,in,...,connevey,jay,valued,lay,infrastructure,military,allowing,ff,dry,Prediction
0,0,0,1,0,0,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,8,13,24,6,6,2,102,1,27,18,...,0,0,0,0,0,0,0,1,0,0
2,0,0,1,0,0,0,8,0,0,4,...,0,0,0,0,0,0,0,0,0,0
3,0,5,22,0,5,1,51,2,10,1,...,0,0,0,0,0,0,0,0,0,0
4,7,6,17,1,5,2,57,0,9,3,...,0,0,0,0,0,0,0,1,0,0


In [20]:
X = df.drop(['Prediction'], axis=1)  # Features
y = df['Prediction']  # Target variable

In [23]:
print("\nFeatures (X) and Target (y) separated:")
print(X.head())
print(y.head())


Features (X) and Target (y) separated:
   the  to  ect  and  for  of    a  you  hou  in  ...  enhancements  connevey  \
0    0   0    1    0    0   0    2    0    0   0  ...             0         0   
1    8  13   24    6    6   2  102    1   27  18  ...             0         0   
2    0   0    1    0    0   0    8    0    0   4  ...             0         0   
3    0   5   22    0    5   1   51    2   10   1  ...             0         0   
4    7   6   17    1    5   2   57    0    9   3  ...             0         0   

   jay  valued  lay  infrastructure  military  allowing  ff  dry  
0    0       0    0               0         0         0   0    0  
1    0       0    0               0         0         0   1    0  
2    0       0    0               0         0         0   0    0  
3    0       0    0               0         0         0   0    0  
4    0       0    0               0         0         0   1    0  

[5 rows x 3000 columns]
0    0
1    0
2    0
3    0
4    0
Name: Predi

In [24]:
# Normalize the feature set
X = scale(X)

In [25]:
# Split the data into training and testing sets (70% training, 30% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [26]:
print("\nTraining and Testing data shapes:")
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)


Training and Testing data shapes:
X_train shape: (3620, 3000)
X_test shape: (1552, 3000)
y_train shape: (3620,)
y_test shape: (1552,)


In [27]:
# KNN Classifier
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)


In [28]:
# KNN Evaluation
print("\nKNN Classifier Results:")
print("Predictions:", y_pred_knn)  # 1 for spam, 0 for not spam
print("KNN Accuracy:", metrics.accuracy_score(y_test, y_pred_knn))
print("KNN Confusion Matrix:\n", metrics.confusion_matrix(y_test, y_pred_knn))


KNN Classifier Results:
Predictions: [0 0 1 ... 0 1 1]
KNN Accuracy: 0.8518041237113402
KNN Confusion Matrix:
 [[894 203]
 [ 27 428]]


In [29]:
# SVM Classifier
svm = SVC()  # Support Vector Machine model
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)

In [30]:
# SVM Evaluation
print("\nSVM Classifier Results:")
print("Predictions:", y_pred_svm)
print("SVM Accuracy:", metrics.accuracy_score(y_test, y_pred_svm))
print("SVM Confusion Matrix:\n", metrics.confusion_matrix(y_true=y_test, y_pred=y_pred_svm))


SVM Classifier Results:
Predictions: [0 0 1 ... 0 0 1]
SVM Accuracy: 0.9381443298969072
SVM Confusion Matrix:
 [[1091    6]
 [  90  365]]


In [31]:
print("\nKNN Classifier - Additional Metrics:")
print("Precision:", metrics.precision_score(y_test, y_pred_knn))
print("Recall:", metrics.recall_score(y_test, y_pred_knn))
print("F1-Score:", metrics.f1_score(y_test, y_pred_knn))


KNN Classifier - Additional Metrics:
Precision: 0.6782884310618067
Recall: 0.9406593406593406
F1-Score: 0.7882136279926335


In [32]:
print("\nSVM Classifier - Additional Metrics:")
print("Precision:", metrics.precision_score(y_test, y_pred_svm))
print("Recall:", metrics.recall_score(y_test, y_pred_svm))
print("F1-Score:", metrics.f1_score(y_test, y_pred_svm))


SVM Classifier - Additional Metrics:
Precision: 0.9838274932614556
Recall: 0.8021978021978022
F1-Score: 0.8837772397094431
