Goal: classificate an e-mail as spam or not

# Importing libraries

In [18]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Loading dataset

In [3]:
email = pd.read_csv('dataset/spam.csv')

In [4]:
email.head()

Unnamed: 0,Label,EmailText
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Exploratory data analysis

In [5]:
email.describe()

Unnamed: 0,Label,EmailText
count,5572,5572
unique,2,5169
top,ham,"Sorry, I'll call later"
freq,4825,30


# Preprocessing

## Enconding label

label map: <br>
ham = 0 <br>
spam = 1

In [6]:
label_mapping = {'ham': 0, 'spam': 1}

email['Label'] = email['Label'].map(label_mapping)

In [7]:
email.head(2)

Unnamed: 0,Label,EmailText
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...


## Enconding Email text

each word'll convert to a feature. The feature values will be a count how many times that word appear in the text.

In [27]:
cv = CountVectorizer()
features = cv.fit_transform(email['EmailText'])

## Spliting the dataframe

In [30]:
x = features.copy()

In [45]:
target = email['Label']
target.head()

0    0
1    0
2    1
3    0
4    0
Name: Label, dtype: int64

## Checking the dataset

In [38]:
x.shape[0]

5572

In [39]:
target.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   Label   5572 non-null   int64
dtypes: int64(1)
memory usage: 43.7 KB


# Modeling

In [40]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from sklearn.metrics import auc, roc_curve, precision_recall_curve

import numpy as np

## Cross validation kfold

In [41]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, cross_val_predict

In [42]:
k_fold = KFold(n_splits=100, shuffle=True, random_state=0)

## SVM

In [47]:
clf_svm = SVC(gamma='scale')
scoring = 'accuracy'
score = cross_val_score(clf_svm, x, target, cv=k_fold, n_jobs=1, scoring=scoring)

print(score)
print(np.mean(score)*100)

[0.98214286 0.92857143 1.         0.98214286 1.         0.96428571
 1.         0.98214286 0.96428571 0.98214286 1.         0.92857143
 0.98214286 1.         1.         0.96428571 0.98214286 0.98214286
 0.98214286 1.         0.98214286 1.         0.98214286 0.98214286
 0.96428571 0.96428571 0.98214286 0.96428571 1.         0.98214286
 0.94642857 1.         0.98214286 1.         0.98214286 0.94642857
 1.         0.98214286 0.96428571 1.         0.96428571 0.98214286
 0.98214286 0.98214286 1.         1.         0.96428571 0.98214286
 1.         0.96428571 0.96428571 0.98214286 1.         0.96428571
 0.98214286 0.94642857 0.98214286 1.         0.98214286 0.98214286
 0.98214286 1.         1.         0.94642857 1.         1.
 0.96428571 0.98214286 1.         0.94642857 0.98214286 1.
 0.98181818 1.         0.98181818 0.90909091 0.96363636 0.98181818
 0.96363636 0.94545455 0.98181818 0.98181818 0.96363636 0.98181818
 0.98181818 0.96363636 1.         0.98181818 0.98181818 0.98181818
 0.98181818