# Term Frequency - Inverse Document Frequency

In [1]:
import pandas as pd

In [2]:
messages = pd.read_csv('./spam_or_not_spam.csv')

In [3]:
messages.head()

Unnamed: 0,email,label
0,date wed NUMBER aug NUMBER NUMBER NUMBER NUMB...,0
1,martin a posted tassos papadopoulos the greek ...,0
2,man threatens explosion in moscow thursday aug...,0
3,klez the virus that won t die already the most...,0
4,in adding cream to spaghetti carbonara which ...,0


In [4]:
messages['label'].value_counts()

label
0    2500
1     500
Name: count, dtype: int64

In [5]:
messages[messages['label'] == 1].head(10)

Unnamed: 0,email,label
2500,save up to NUMBER on life insurance why spend...,1
2501,NUMBER fight the risk of cancer URL NUMBER sli...,1
2502,NUMBER fight the risk of cancer URL NUMBER sli...,1
2503,adult club offers free membership instant acc...,1
2504,i thought you might like these NUMBER slim dow...,1
2505,a powerhouse gifting program you don t want to...,1
2506,help wanted we are a NUMBER year old fortune N...,1
2507,hyperlink life can change in an instant that ...,1
2508,tired of the bull out there want to stop losin...,1
2509,dear ricardoNUMBER cost effective direct email...,1


In [6]:
## Data Cleaning and Preprocessing
import re
import nltk

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/diegoagd10/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer

In [8]:
ps = PorterStemmer()
snowball_stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

In [9]:
stopwords_set = set(stopwords.words('english'))

In [10]:
def clean_text(text):
    new_text = text.lower()
    new_text = re.sub('[^a-zA-z]', ' ', new_text)
    new_text = new_text.split()
    new_text = [lemmatizer.lemmatize(word) for word in new_text if not word in stopwords_set]
    new_text = ' '.join(new_text)
    return new_text

In [11]:
corpus = []

for i in range(0, len(messages)):
    text = str(messages['email'][i])
    corpus.append(clean_text(text))

corpus

['date wed number aug number number number number number chris garrigues cwg dated number numberfanumberd deepeddy com message id number number tmda deepeddy vircio com reproduce error repeatable like every time without fail debug log pick happening number number number pick_it exec pick inbox list lbrace lbrace subject ftp rbrace rbrace number number sequence mercury number number number exec pick inbox list lbrace lbrace subject ftp rbrace rbrace number number sequence mercury number number number ftoc_pickmsgs number hit number number number marking number hit number number number tkerror syntax error expression int note run pick command hand delta pick inbox list lbrace lbrace subject ftp rbrace rbrace number number sequence mercury number hit number hit come obviously version nmh using delta pick version pick nmh number number number compiled url sun mar number number number number ict number relevant part mh_profile delta mhparam pick seq sel list since pick command work sequence

In [12]:
y = messages['label']

In [13]:
## Train Test Split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(corpus, y, test_size=0.20, random_state=0)

In [14]:
## Create the TF-IDF model
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf=TfidfVectorizer(max_features=2500, ngram_range=(1,2))

In [15]:
## Create the new dataset using the Bag of Words model
X_train = tfidf.fit_transform(X_train).toarray()
X_test = tfidf.transform(X_test).toarray()

In [16]:
((X_train.shape, y_train.shape), (X_test.shape, y_test.shape))

(((2400, 2500), (2400,)), ((600, 2500), (600,)))

In [17]:
tfidf.vocabulary_

{'url': 2324,
 'date': 476,
 'number': 1432,
 'numbertnumber': 1516,
 'find': 761,
 'missing': 1336,
 'last': 1137,
 'week': 2422,
 'url url': 2340,
 'url date': 2329,
 'date number': 478,
 'number number': 1473,
 'number numbertnumber': 1477,
 'numbertnumber number': 1517,
 'number url': 1499,
 'use': 2346,
 'perl': 1614,
 'daily': 469,
 'headline': 930,
 'mailer': 1246,
 'perlnumber': 1618,
 'august': 150,
 'numberst': 1514,
 'september': 1975,
 'posted': 1673,
 'monday': 1348,
 'summary': 2144,
 'review': 1877,
 'vnumber': 2390,
 'inumber': 1061,
 'news': 1408,
 'copyright': 428,
 'pudge': 1751,
 'right': 1882,
 'reserved': 1856,
 'received': 1797,
 'message': 1312,
 'subscribed': 2132,
 'stop': 2104,
 'receiving': 1800,
 'add': 27,
 'change': 310,
 'preference': 1687,
 'please': 1648,
 'go': 872,
 'user': 2355,
 'page': 1572,
 'log': 1216,
 'use perl': 2347,
 'perl daily': 1616,
 'september number': 1976,
 'monday september': 1349,
 'number news': 1472,
 'url copyright': 2328,
 'co

In [18]:
## Training model using Naive bayes classifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score

In [19]:
def print_metrics(train, predicted, title):
    accuracy = accuracy_score(train, predicted)
    precision = precision_score(train, predicted)
    recall = recall_score(train, predicted)
    f1 = f1_score(train, predicted)
    roc_auc = roc_auc_score(train, predicted)
    cm = confusion_matrix(train, predicted)

    print('************************************************\n')
    print(title)
    print('- Accuracy: {:.4f}'.format(accuracy))
    print('- Precision: {:4f}'.format(precision))
    print('- Recall: {:4f}'.format(recall))
    print('- F1 Score: {:.4f}'.format(f1))
    print('- ROC AUC Score: {:4f}'.format(roc_auc))

    print('\nConfusion Matrix:\n')
    print(cm)
    print('\n************************************************')

In [20]:
model = MultinomialNB().fit(X_train, y_train)

y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

print_metrics(y_train, y_train_pred, 'Model performance on Training set')
print_metrics(y_test, y_test_pred, 'Model performance on Test set')

************************************************

Model performance on Training set
- Accuracy: 0.9829
- Precision: 0.982278
- Recall: 0.919431
- F1 Score: 0.9498
- ROC AUC Score: 0.957946

Confusion Matrix:

[[1971    7]
 [  34  388]]

************************************************
************************************************

Model performance on Test set
- Accuracy: 0.9883
- Precision: 0.973333
- Recall: 0.935897
- F1 Score: 0.9542
- ROC AUC Score: 0.966033

Confusion Matrix:

[[520   2]
 [  5  73]]

************************************************


In [21]:
## Predicting the new emails
new_emails = [
    'You have won a lottery. Claim your prize now. A lot of money is waiting for you.',
    'I love to see the sky with you',
    'I have photos of you which could compromise you. Send me money or I will publish them.',
    'Hi I am Peter, I am a recruiter at XL company. I have a job offer for you. Please contact me.',
    'Dragon Ball new episode is out. Watch it now.',
    'Give me money or you are dead.',
    'Click to claim your prize now.',
]

In [22]:
## Cleaning the new emails
new_emails_cleaned = [clean_text(email) for email in new_emails]
new_emails_cleaned

['lottery claim prize lot money waiting',
 'love see sky',
 'photo could compromise send money publish',
 'hi peter recruiter xl company job offer please contact',
 'dragon ball new episode watch',
 'give money dead',
 'click claim prize']

In [23]:
## Creating the new email vectors
new_email_vectors = tfidf.transform(new_emails_cleaned).toarray()
new_email_vectors

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [24]:
## Make predictions
predictions = model.predict(new_email_vectors)
predictions

array([0, 0, 0, 1, 0, 0, 1])