# Ham or Spam?

In [1]:
# when installing nltk for the first time we need to also download a few built in libraries
!pip install nltk
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')



[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/badewaaderogba/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/badewaaderogba/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/badewaaderogba/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
import pandas as pd
import string
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import pos_tag, word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from sklearn.metrics import confusion_matrix

In [3]:
# Load the labelled e-mail dataset from a CSV file in the working directory.
df = pd.read_csv("emails.csv")

# Preview the first rows.
df.head()

Unnamed: 0,text,spam
0,Subject: naturally irresistible your corporate...,1
1,Subject: the stock trading gunslinger fanny i...,1
2,Subject: unbelievable new homes made easy im ...,1
3,Subject: 4 color printing special request add...,1
4,"Subject: do not have money , get software cds ...",1


The dataset is made up of emails that are classified as ham [0] or spam [1]. You need to clean the dataset before training a prediction model.

## Remove Punctuation

👇 Create a function to remove the punctuation. Apply it to the entire dataset and add the output as a new column in the dataframe called `clean_text`.

In [4]:
# Inspect the characters that `string.punctuation` treats as punctuation.

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [5]:
# Strip every punctuation character from a message.
def no_punctuation(text):
    """Return `text` with all characters found in `string.punctuation` removed."""
    strip_table = str.maketrans('', '', string.punctuation)
    return text.translate(strip_table)

# Strip punctuation from every message and keep the result in `clean_text`.
df['clean_text'] = df['text'].apply(no_punctuation)

df.head()

Unnamed: 0,text,spam,clean_text
0,Subject: naturally irresistible your corporate...,1,Subject naturally irresistible your corporate ...
1,Subject: the stock trading gunslinger fanny i...,1,Subject the stock trading gunslinger fanny is...
2,Subject: unbelievable new homes made easy im ...,1,Subject unbelievable new homes made easy im w...
3,Subject: 4 color printing special request add...,1,Subject 4 color printing special request addi...
4,"Subject: do not have money , get software cds ...",1,Subject do not have money get software cds fr...


## Lower Case

👇 Create a function to lowercase the text. Apply it to `clean_text`.

In [6]:

def lower_text(clean_text):
    """Return a lower-cased copy of `clean_text`."""
    return clean_text.lower()

# Lower-case every message in place in the `clean_text` column.
df['clean_text'] = df['clean_text'].apply(lower_text)
df.head()

Unnamed: 0,text,spam,clean_text
0,Subject: naturally irresistible your corporate...,1,subject naturally irresistible your corporate ...
1,Subject: the stock trading gunslinger fanny i...,1,subject the stock trading gunslinger fanny is...
2,Subject: unbelievable new homes made easy im ...,1,subject unbelievable new homes made easy im w...
3,Subject: 4 color printing special request add...,1,subject 4 color printing special request addi...
4,"Subject: do not have money , get software cds ...",1,subject do not have money get software cds fr...


## Remove Numbers

👇 Create a function to remove numbers from the text. Apply it to `clean_text`

In [7]:
# Uses the regex module (`re`).
def remove_num(clean_text):
    """Return `clean_text` with every run of digits deleted."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', clean_text)
# Drop digits from every message in the `clean_text` column.
df['clean_text'] = df['clean_text'].apply(remove_num)
df.head()

Unnamed: 0,text,spam,clean_text
0,Subject: naturally irresistible your corporate...,1,subject naturally irresistible your corporate ...
1,Subject: the stock trading gunslinger fanny i...,1,subject the stock trading gunslinger fanny is...
2,Subject: unbelievable new homes made easy im ...,1,subject unbelievable new homes made easy im w...
3,Subject: 4 color printing special request add...,1,subject color printing special request addit...
4,"Subject: do not have money , get software cds ...",1,subject do not have money get software cds fr...


## Remove StopWords

👇 Create a function to remove stopwords from the text. Apply it to `clean_text`.

In [8]:
from nltk.corpus import stopwords

In [9]:
# NLTK's English stop words, plus a few extra words we also want to ignore.
STOPWORD = nltk.corpus.stopwords.words('english')

STOPWORD += ['first', 'second', 'third', 'me', 'haha', 'lol', 'oof', 'cds']

In [10]:
# Number of entries in the stop-word list, custom additions included.
print(len(STOPWORD))

187


In [11]:
def no_stopwords(clean_text):
    """Remove stop words from every document in a pandas Series.

    Each document is split on whitespace, words found in the module-level
    ``STOPWORD`` list are dropped, and the remaining words are joined back
    together with single spaces.

    Parameters
    ----------
    clean_text : pandas.Series of str
        One document per element.

    Returns
    -------
    pandas.Series of str
        The filtered documents, in the same order and with the same index.
    """
    # Build the set once per call: membership tests against the list would
    # scan it for every single word of every document.
    stopword_set = set(STOPWORD)

    def drop_stopwords(doc):
        return ' '.join(word for word in doc.split() if word not in stopword_set)

    return clean_text.apply(drop_stopwords)

# Replace `clean_text` with its stop-word-free version.
df['clean_text'] = no_stopwords(df['clean_text'])

df.head()

Unnamed: 0,text,spam,clean_text
0,Subject: naturally irresistible your corporate...,1,subject naturally irresistible corporate ident...
1,Subject: the stock trading gunslinger fanny i...,1,subject stock trading gunslinger fanny merrill...
2,Subject: unbelievable new homes made easy im ...,1,subject unbelievable new homes made easy im wa...
3,Subject: 4 color printing special request add...,1,subject color printing special request additio...
4,"Subject: do not have money , get software cds ...",1,subject money get software software compatibil...


## Lemmatize

👇 Create a function to lemmatize the text. Make sure the output is a single string, not a list of words. Apply it to `clean_text`.

In [12]:
# Create the WordNet lemmatizer instance used by the lemmatizing function below.
lemmatizer = WordNetLemmatizer()

In [13]:
# Lemmatize
from nltk.tokenize import word_tokenize
def lemmatizing_text(clean_text):
    """Lemmatize every document in a pandas Series.

    Each document is tokenized with ``nltk.word_tokenize``; tokens found in
    the module-level ``STOPWORD`` list are dropped and the remaining tokens
    are passed through ``lemmatizer.lemmatize``. The lemmas are joined with
    single spaces, so every result is one string rather than a list of words.

    Parameters
    ----------
    clean_text : pandas.Series of str
        One document per element.

    Returns
    -------
    pandas.Series of str
        The lemmatized documents, in the same order and with the same index.
    """
    # Build the set once per call instead of scanning the list for every token.
    stopword_set = set(STOPWORD)

    def lemmatize_doc(doc):
        # NOTE(review): `lemmatize` is called without a part-of-speech argument,
        # so it uses its default POS for every token - confirm that is intended.
        return ' '.join(
            lemmatizer.lemmatize(token)
            for token in nltk.word_tokenize(doc)
            if token not in stopword_set
        )

    return clean_text.apply(lemmatize_doc)

# Replace `clean_text` with its lemmatized version.
df['clean_text'] = lemmatizing_text(df['clean_text'])

df.head()
  

Unnamed: 0,text,spam,clean_text
0,Subject: naturally irresistible your corporate...,1,subject naturally irresistible corporate ident...
1,Subject: the stock trading gunslinger fanny i...,1,subject stock trading gunslinger fanny merrill...
2,Subject: unbelievable new homes made easy im ...,1,subject unbelievable new home made easy im wan...
3,Subject: 4 color printing special request add...,1,subject color printing special request additio...
4,"Subject: do not have money , get software cds ...",1,subject money get software software compatibil...


## Bag-of-words Modelling

👇 Vectorize the `clean_text` to a Bag-of-Words representation with a default CountVectorizer. Save as `X_bow`.

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

# Create the CountVectorizer object.
# ngram_range=(1, 1) is the default range, meaning we only search for unigrams.
# (2, 2) would mean we only search for bigrams.
# (1, 2) would mean we search for both unigrams and bigrams.
vectorizer = CountVectorizer(ngram_range=(1,1))

In [16]:
# Generate matrix of word vectors
ngrams_bowmatrix = vectorizer.fit_transform(df['clean_text'])
# The exercise asks for the Bag-of-Words matrix to be saved as `X_bow`;
# expose it under that name too (same object, no copy).
X_bow = ngrams_bowmatrix

In [17]:
# Convert the bag-of-words matrix into a DataFrame.
# NOTE(review): .toarray() builds a dense array of the whole matrix, which can
# use a lot of memory when the vocabulary is large.
bow_df = pd.DataFrame(ngrams_bowmatrix.toarray())


In [18]:
# Map the column names to vocabulary.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. Fall back for older installs.
if hasattr(vectorizer, 'get_feature_names_out'):
    bow_df.columns = vectorizer.get_feature_names_out()
else:
    bow_df.columns = vectorizer.get_feature_names()

In [19]:
# Display bow_df (one row per e-mail, one column per vocabulary term).
bow_df


Unnamed: 0,aa,aaa,aaaenerfax,aadedeji,aagrawal,aal,aaldous,aaliyah,aall,aanalysis,...,zwzm,zxghlajf,zyban,zyc,zygoma,zymg,zzmacmac,zzn,zzncacst,zzzz
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5723,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5724,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5725,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5726,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# save list to file
def save_list(lines, filename):
    """Write the items of `lines` to `filename`, one item per line.

    Parameters
    ----------
    lines : iterable
        Items to write. Each item is converted with ``str()`` first, so
        non-string items (e.g. integer column labels) are accepted as well.
    filename : str or path-like
        Destination file; it is created, or overwritten if it already exists.

    Notes
    -----
    Items are separated by a newline and no trailing newline is written.
    """
    # convert lines to a single blob of text
    data = '\n'.join(str(line) for line in lines)
    # The context manager closes the file even if the write raises, and the
    # explicit encoding keeps non-ASCII tokens writable on every platform.
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(data)
 
# Save the vocabulary tokens (the column labels of bow_df) to a file.
save_list(bow_df.columns, 'X_bow.txt')

👇 Cross-validate a MultinomialNB model with the Bag-of-words. Score the model's accuracy.

In [23]:
# Take the cleaned messages as a NumPy array and fit/transform the vectorizer
# on them again; `counts` is the matrix used to train and score the classifier.
messg = df['clean_text'].values
counts = vectorizer.fit_transform(messg)

In [24]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial Naive Bayes classifier trained on the word counts.
classifier = MultinomialNB()
classes = df['spam'].values  # target labels from the `spam` column (1 = spam, 0 = ham)
classifier.fit(counts, classes)

MultinomialNB()

In [25]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation; no `scoring` argument is given, so the estimator's
# default score is used.
scores = cross_val_score(classifier, counts, classes, cv=5)

# Print the accuracy of each fold:
print(scores)

# Print the mean accuracy of all 5 folds
print(scores.mean())

[0.98691099 0.9895288  0.991274   0.98777293 0.99213974]
0.9895252901681946


In [27]:
# Peek at the first rows of the fully cleaned text.
df['clean_text'].head()

0    subject naturally irresistible corporate ident...
1    subject stock trading gunslinger fanny merrill...
2    subject unbelievable new home made easy im wan...
3    subject color printing special request additio...
4    subject money get software software compatibil...
Name: clean_text, dtype: object

Evaluation Metrics

⚠️ Please push the exercise once you are done 🙃

## 🏁 