### Importing libraries

In [1]:
from PIL import Image
import pandas as pd, numpy as np, time, string, re, nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
import xgboost

from sklearn.preprocessing import LabelEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
import shap

### Loading the dataset

In [2]:
df = pd.read_csv("./../data/email-messages-analysis.csv", encoding= 'unicode_escape')

Filtering only the full description and email type columns for this analysis.

In [3]:
df = df[['Full_Text','Email_Type']].copy()

In [4]:
f"Missing values: {df.isna().sum().sum()}"

'Missing values: 0'

In [5]:
f"Dataframe shape: {df.shape}"

'Dataframe shape: (5574, 2)'

The description column will suffer some changes. So let's keep a copy of the original description for later analysis.

In [6]:
df['Full_Text_Orig'] = df['Full_Text']
df = df[['Full_Text_Orig','Full_Text','Email_Type']]

### Preprocessing the description field

The original description contains elements that are not relevant for the classification of the email content. So let's remove them.   

Convert all characters to lower case

In [None]:
column = 'Full_Text'
df[column] = df[column].str.lower()

Remove special characters

In [None]:
df[column] = df[column].str.normalize('NFKD')\
            .str.encode('ascii', errors='ignore')\
            .str.decode('utf-8')

In [None]:
Remove substrings containing only numbers, or a mixture of numbers and letters

In [None]:
# remove substrings containing letters and numbers
df[column] = df[column].str.replace(r'\b(\d+[A-Z]|[A-Z]+\d)[A-Z\d]*\b', '', regex=True)
# remove substrings containing only numbers
df[column] = df[column].str.replace(r'\b\d+\b','', regex=True)

Remove punctuation and sequences of repetitions of dots or white spaces 

In [None]:
# remove punctuation
df[column] = df[column].apply(lambda x: re.sub(r'[^\w\s]','',x))
# remove sequence of '.'
df[column] = df[column].str.replace(r'\.+', ' ', regex=True)
# remove sequence of white spaces
df[column] = df[column].str.replace(r'\s+', ' ', regex=True).str.strip()

Remove the stopwords (commonly used words that does not have a significant contribution to the understanding of a sentence)

In [7]:
english_stopwords = stopwords.words('english')
# remove stopwords
df[column] = df[column].apply(lambda x: [elem for elem in x.split() if elem not in english_stopwords])
df[column] = df[column].str.join(' ')

# remove substrings of length 1
#df[column] = df[column].apply(lambda x: [elem for elem in x.split() if len(elem)>1])
#df[column] = df[column].str.join(' ')

In [8]:
df.sample(10)

Unnamed: 0,Full_Text_Orig,Full_Text,Email_Type
2146,Erm Â ill pick you up at about 6.45pm. That'l...,erm ill pick 45pm thatll give enough time get ...,Email
2210,You are right. Meanwhile how's project twins c...,right meanwhile hows project twins comin,Email
2819,"Haha yeah I see that now, be there in a sec",haha yeah see sec,Email
3065,Alright took the morphine. Back in yo.,alright took morphine back yo,Email
4051,You have been specially selected to receive a ...,specially selected receive award call lines cl...,Spam Email
207,Tell my bad character which u Dnt lik in me. ...,tell bad character u dnt lik ill try change lt...,Email
1166,"Sorry, I'll call later",sorry ill call later,Email
3980,"Storming msg: Wen u lift d phne, u say ""HELLO""...",storming msg wen u lift phne u say hello u knw...,Email
1564,Oh god. I'm gonna Google nearby cliffs now.,oh god im gonna google nearby cliffs,Email
4883,"Mila, age23, blonde, new in UK. I look sex wit...",mila age23 blonde new uk look sex uk guys u li...,Spam Email


### Encode the categorical data

In [9]:
le = LabelEncoder()
df["Email_Type"] = le.fit_transform(df["Email_Type"])
print(f"Email type: {le.classes_}")

Email type: ['Email' 'Spam Email']


### Defining our target **Email_Type** and feature **Full_Text** (email description)

In [10]:
X = df.drop(['Email_Type','Full_Text_Orig'], axis=1)
y = df['Email_Type']

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y,test_size=0.2, stratify=y, random_state=42)

### Applying the vectorizer Tf-idf to the email description feature

In [11]:
tfidf = TfidfVectorizer()
Xtrain_vet = tfidf.fit_transform(Xtrain.Full_Text)
Xtest_vet = tfidf.transform(Xtest.Full_Text)

In [12]:
sgdc = SGDClassifier().fit(Xtrain_vet, ytrain)
ypred = sgdc.predict(Xtest_vet)

In [13]:
print(classification_report(ypred, ytest, target_names=le.classes_))

              precision    recall  f1-score   support

       Email       1.00      0.98      0.99       980
  Spam Email       0.89      0.99      0.94       135

    accuracy                           0.98      1115
   macro avg       0.95      0.98      0.96      1115
weighted avg       0.99      0.98      0.98      1115



In [14]:
svc = SVC().fit(Xtrain_vet, ytrain)
ypred = svc.predict(Xtest_vet)

print(classification_report(ypred, ytest, target_names=le.classes_))

              precision    recall  f1-score   support

       Email       1.00      0.97      0.98       996
  Spam Email       0.79      0.99      0.88       119

    accuracy                           0.97      1115
   macro avg       0.90      0.98      0.93      1115
weighted avg       0.98      0.97      0.97      1115



In [15]:
params = {
    'kernel': ('linear','rbf'),
    'C': [1,10,100]
}

clf = RandomizedSearchCV(svc, params, random_state=0)
search = clf.fit(Xtrain_vet, ytrain)
search.best_params_

The total space of parameters 6 is smaller than n_iter=10. Running 6 iterations. For exhaustive searches, use GridSearchCV.


{'kernel': 'linear', 'C': 10}

In [16]:
svc = SVC(kernel='linear', C=10).fit(Xtrain_vet, ytrain)
ypred = svc.predict(Xtest_vet)
print(classification_report(ypred, ytest, target_names=le.classes_))

              precision    recall  f1-score   support

       Email       1.00      0.98      0.99       983
  Spam Email       0.88      0.99      0.93       132

    accuracy                           0.98      1115
   macro avg       0.94      0.99      0.96      1115
weighted avg       0.98      0.98      0.98      1115

