In [3]:
import string

import numpy as np
import pandas as pd

import nltk 
from nltk.corpus import stopwords #this is a list of words that are commonly used in a language and do not carry significant meaning
from nltk.stem.porter import PorterStemmer #a class to reduce words to their root form
from sklearn.feature_extraction.text import CountVectorizer #takes the text and converts into a matrix of token counts
from sklearn.model_selection import train_test_split #splits the dataset into training and testing sets
from sklearn.ensemble import RandomForestClassifier #a machine learning model that uses multiple decision trees to make predictions

In [4]:
# Fetch NLTK's stop-word corpus; stopwords.words('english') is read from it
# later in the notebook.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saket\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Load the spam/ham dataset into a DataFrame. The path is relative, so the CSV
# must sit in the notebook's working directory.
df = pd.read_csv('spam_ham_dataset.csv')

In [6]:
# Replace every Windows-style line break ("\r\n") with a single space so each
# message reads as one line. Only the two-character sequence is replaced; a lone
# "\r" or "\n" is left untouched.
# The vectorised .str accessor performs the same literal replacement
# (regex=False) as the previous .apply(lambda ...), but lets missing values pass
# through as NaN instead of raising AttributeError on them.
df['text'] = df['text'].str.replace('\r\n', ' ', regex=False)

In [7]:
# Summarise the frame: the per-column non-null counts show whether any values
# are missing.
df.info()
# Show one message (row position 2) as a spot check of the text column.
df['text'].iloc[2]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  5171 non-null   int64 
 1   label       5171 non-null   object
 2   text        5171 non-null   object
 3   label_num   5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


"Subject: neon retreat ho ho ho , we ' re around to that most wonderful time of the year - - - neon leaders retreat time ! i know that this time of year is extremely hectic , and that it ' s tough to think about anything past the holidays , but life does go on past the week of december 25 through january 1 , and that ' s what i ' d like you to think about for a minute . on the calender that i handed out at the beginning of the fall semester , the retreat was scheduled for the weekend of january 5 - 6 . but because of a youth ministers conference that brad and dustin are connected with that week , we ' re going to change the date to the following weekend , january 12 - 13 . now comes the part you need to think about . i think we all agree that it ' s important for us to get together and have some time to recharge our batteries before we get to far into the spring semester , but it can be a lot of trouble and difficult for us to get away without kids , etc . so , brad came up with a pote

In [8]:
# Build the cleaned corpus: lower-case each message, strip punctuation, drop
# English stop words and stem whatever tokens remain.
stemmer = PorterStemmer()  # reduces words to their root form
stopwords_set = set(stopwords.words('english'))  # common words such as "the", "is", "in"

# The punctuation-deleting translation table is the same for every message, so
# it is built once here rather than on every loop iteration.
punctuation_table = str.maketrans('', '', string.punctuation)

# NOTE(review): stop words are matched after punctuation has been removed, so a
# contraction such as "you've" arrives here as "youve". Presumably the NLTK list
# stores such words with their apostrophe, in which case they are not filtered;
# confirm whether that matters for this dataset.
corpus = []  # one cleaned, space-joined string per row of df, in row order
for message in df['text']:
    tokens = message.lower().translate(punctuation_table).split()
    stems = [stemmer.stem(token) for token in tokens if token not in stopwords_set]
    corpus.append(' '.join(stems))

In [9]:
# Turn the cleaned corpus into a bag-of-words matrix: one row per message, one
# column per vocabulary token, each entry a token count.
vectorizer = CountVectorizer()

# Keep the SciPy sparse matrix that fit_transform returns. The earlier
# .toarray() call materialised a dense (messages x vocabulary) array that is
# almost entirely zeros; train_test_split and RandomForestClassifier both accept
# sparse input directly.
X = vectorizer.fit_transform(corpus)
y = df.label_num

# Hold out 20% of the rows for evaluation.
#   random_state: fixes the shuffle so the split (and the reported score) can be
#                 reproduced after a kernel restart.
#   stratify=y:   keeps the class proportions of y the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [10]:
# Random forest trained on the token counts.
#   n_jobs=-1:    use all available CPU cores.
#   random_state: pins the forest's internal randomness so that re-running this
#                 cell on the same training data produces the same model.
clf = RandomForestClassifier(n_jobs=-1, random_state=42)
clf.fit(X_train, y_train)  # fit the model to the training data

In [11]:
# Evaluate the fitted classifier on the held-out test split (for scikit-learn
# classifiers, score() reports mean accuracy).
clf.score(X_test, y_test)

0.9806763285024155

In [27]:
# Hand-written sample message used in the following cells to try the trained
# classifier on text that is not part of the dataset.
email_to_classify = "This is tell you that you've submitted your assignment late. Please make sure to submit it on time next time. Also your grade"

In [28]:
# Clean the sample message with the same steps that were applied to the training
# corpus: lower-case, strip punctuation, split into tokens, drop stop words,
# stem, and re-join with single spaces.
email_tokens = (
    email_to_classify
    .lower()
    .translate(str.maketrans('', '', string.punctuation))
    .split()
)
email_text = ' '.join(
    stemmer.stem(token) for token in email_tokens if token not in stopwords_set
)

# The vectorizer expects an iterable of documents, so wrap the single string in
# a list. transform() (not fit_transform) reuses the vocabulary learned from the
# training corpus.
email_corpus = [email_text]
X_email = vectorizer.transform(email_corpus)

In [29]:
# Predict the class of the sample e-mail. The classifier was fitted on
# df.label_num, so the result holds one label_num-style value per input row.
clf.predict(X_email)

array([0])

In [21]:
# Look up the numeric label stored for the dataset row at position 10.
# NOTE(review): this cell's execution count is out of sequence with the cells
# above it, so it was run out of order; re-run the notebook top to bottom to
# confirm the displayed value.
df.label_num.iloc[10]

np.int64(1)