## Email Spam-Ham detector

In [1]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string

# Load the labelled e-mail dataset (relative path, resolved against the kernel's working directory).
df = pd.read_csv('datasets/spam_ham_dataset.csv')

# Fetch NLTK's stop-word corpus, which stopwords.words('english') reads during pre-processing.
# This is the last expression of the cell, so its return value is what the cell displays.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\linh9\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Prepare data & Training

In [22]:
# Class balance check: label_num 0 = ham, 1 = spam (see the label column in df.head()).
df['label_num'].value_counts()

0    3672
1    1499
Name: label_num, dtype: int64

In [23]:
# Peek at the first rows to check the column layout (label, text, label_num).
df.head()

Unnamed: 0.1,Unnamed: 0,label,text,label_num
0,605,ham,Subject: enron methanol ; meter # : 988291\r\n...,0
1,2349,ham,"Subject: hpl nom for january 9 , 2001\r\n( see...",0
2,3624,ham,"Subject: neon retreat\r\nho ho ho , we ' re ar...",0
3,4685,spam,"Subject: photoshop , windows , office . cheap ...",1
4,2030,ham,Subject: re : indian springs\r\nthis deal is t...,0


In [24]:
def pre_process(orig_text):
    """Tokenise one e-mail for the bag-of-words model.

    Strips every character listed in ``string.punctuation``, splits the
    remainder on whitespace and drops English stop words (matched
    case-insensitively). Kept tokens retain their original casing.

    Parameters
    ----------
    orig_text : str
        Raw e-mail text.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    # Remove punctuation in a single pass over the text.
    text_w_no_punc = orig_text.translate(str.maketrans('', '', string.punctuation))

    # Fetch the stop-word list once per call and keep it in a set. The previous
    # version called stopwords.words('english') for every word and scanned the
    # returned list linearly each time.
    english_stopwords = set(stopwords.words('english'))

    # Remove stopwords
    clean_text = [word for word in text_w_no_punc.split()
                  if word.lower() not in english_stopwords]

    return clean_text

In [25]:
# Bag of words: turn every e-mail into a sparse vector of token counts.
from sklearn.feature_extraction.text import CountVectorizer

# pre_process is passed as the analyzer, so each document is tokenised by our own cleaning routine.
# NOTE(review): the vocabulary is fitted on every row of df['text'] here, i.e. before the
# train/test split, so test-set tokens also shape the feature space — consider fitting on
# the training texts only.
cv = CountVectorizer(analyzer=pre_process)
texts_bow = cv.fit_transform(df['text'])

In [27]:
# Hold out 20% of the vectorised e-mails for evaluation; the fixed seed keeps the split repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    texts_bow,
    df['label_num'],
    train_size=0.8,
    random_state=0,
)

In [39]:
# Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB

# Multinomial NB with its default hyper-parameters, fitted on the token-count features.
# fit() returns the estimator itself, so `classifier` is the fitted model.
classifier = MultinomialNB().fit(X_train, y_train)

In [40]:
# Inspect the hyper-parameters the model is using (the estimator was built with no arguments).
classifier.get_params()

{'alpha': 1.0, 'class_prior': None, 'fit_prior': True}

### Evaluate

In [29]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

def evaluate(y_pred, y_test):
    """Print the classification report, confusion matrix and accuracy.

    Note the argument order: predictions first, ground truth second.

    Parameters
    ----------
    y_pred : array-like
        Labels predicted by the model.
    y_test : array-like
        True labels for the same samples.
    """
    report = classification_report(y_test, y_pred)
    print(report)

    matrix = confusion_matrix(y_test, y_pred)
    print("Confusion matrix: \n", matrix)

    accuracy = accuracy_score(y_test, y_pred)
    print('Accuracy: ', accuracy)

In [30]:
# Predict on the held-out set and print the metrics for the current `classifier`.
y_pred = classifier.predict(X_test)

evaluate(y_pred, y_test)

              precision    recall  f1-score   support

           0       0.98      0.98      0.98       732
           1       0.95      0.96      0.96       303

    accuracy                           0.97      1035
   macro avg       0.97      0.97      0.97      1035
weighted avg       0.97      0.97      0.97      1035

Confusion matrix: 
 [[718  14]
 [ 13 290]]
Accuracy:  0.9739130434782609


##### Other algorithms

In [31]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

# Two alternative models to compare against the Naive Bayes baseline.
svc = SVC(kernel='sigmoid', gamma=1.0)
# Seed the tree: scikit-learn permutes the candidate features at each split, so
# without a fixed random_state repeated runs can produce different trees and scores.
dtc = DecisionTreeClassifier(max_depth=5, random_state=0)

# Display name -> unfitted estimator; iterated in insertion order by the comparison loop.
classifier_list = {'SVC': svc, "Decision Tree": dtc}

In [32]:
def train_classifier(classifier, X_train, y_train, X_test):
    """Fit ``classifier`` on the training data and predict the test data.

    The estimator is fitted in place; only the predictions are returned.

    Parameters
    ----------
    classifier : object
        Any estimator exposing ``fit(X, y)`` and ``predict(X)``.
    X_train, y_train
        Training features and labels, passed straight to ``fit``.
    X_test
        Features to predict, passed straight to ``predict``.

    Returns
    -------
    Whatever ``classifier.predict(X_test)`` returns.
    """
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    return predictions

In [33]:
# Fit and score every alternative model on the same split.
# The loop variable is deliberately NOT called `classifier`: that name holds the
# fitted Naive Bayes model, and rebinding it here would make every later cell that
# calls classifier.predict(...) silently use the last model in classifier_list.
for name, model in classifier_list.items():
    current_y_pred = train_classifier(model, X_train, y_train, X_test)

    print("Using ", name)
    evaluate(current_y_pred, y_test)
    print('=========\n')

Using  SVC
              precision    recall  f1-score   support

           0       0.86      0.87      0.87       732
           1       0.68      0.66      0.67       303

    accuracy                           0.81      1035
   macro avg       0.77      0.77      0.77      1035
weighted avg       0.81      0.81      0.81      1035

Confusion matrix: 
 [[640  92]
 [103 200]]
Accuracy:  0.8115942028985508

Using  Decision Tree
              precision    recall  f1-score   support

           0       0.99      0.78      0.87       732
           1       0.65      0.98      0.78       303

    accuracy                           0.84      1035
   macro avg       0.82      0.88      0.83      1035
weighted avg       0.89      0.84      0.85      1035

Confusion matrix: 
 [[573 159]
 [  6 297]]
Accuracy:  0.8405797101449275



#### Manual test

In [34]:
# Pick one labelled e-mail from the dataset (index 534) together with its true label.
i = 534
X_sample = df.text[i]
y_sample = df.label_num[i]

print(X_sample)
# Vectorise with the already-fitted CountVectorizer; transform takes an iterable of
# documents, hence the one-element list. X_sample is rebound from the raw text to its
# bag-of-words row here.
X_sample = cv.transform([X_sample])

# Compare the predicted label against the true one.
y_pred = classifier.predict(X_sample)
print(f'Pre= {y_pred}, Real= {y_sample}')

Subject: this has worked for me marrow enemy
i think there is a world market for maybe five computers . - thomas watson ( 1874 - 1956 ) ; chairman of ibm ; 1943
your highness ; i have no need of this hypothesis . - pierre laplace ( 1749 - 1827 ) ; to napoleon on why his works on celestial mechanics make no mention of god .
you got to be careful if you dont know where youre going ; because you might not get there . - yogi berra

Pre= [1], Real= 1


#### Custom input

In [37]:
# Alternative inputs to try (uncomment one and comment out the active sample):
# X_sample = 'daren do you have a problem if i copy the 2000 deal for pioneer and make a new one. they have flow in jan . this is giving me a meter allocation exception forjan . let me know thanks .'
# X_sample = 'Making test Data Mining 04/05/2022 in Room 304. Host: Ho Nhat Quang'

X_sample = 'The permanent fix to pen growth\nlimited offer :add at least 4 inches or get your money back !\n- - - - > visit us to learn more\nno more offers'
# transform takes an iterable of documents, hence the one-element list.
X_sample = cv.transform([X_sample])

y_pred = classifier.predict(X_sample)

# predict returns one label per input row. Read the single label explicitly instead of
# relying on the truthiness of an array comparison, which raises for more than one row.
if y_pred[0] == 1:
    print("An email has been moved to the spam folder")
else:
    print("You have new email")

An email has been moved to the spam folder
