# CODE TO DETECT SPAM E-MAILS USING NAIVE BAYES

# PROBLEM STATEMENT

- The SMS Spam Collection is a set of tagged SMS messages that have been collected for SMS spam research. It contains one set of 5,574 SMS messages in English, each tagged as either ham (legitimate) or spam.

- The files contain one message per line. Each line is composed of two columns: v1 contains the label (ham or spam) and v2 contains the raw text.


# STEP #0: LIBRARIES IMPORT


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from google.colab import files #library to upload files to colab notebook

%matplotlib inline

In [None]:
# Mount the drive
from google.colab import drive
drive.mount('/content/drive')

# STEP #1: IMPORT DATASET

In [None]:
# Load the e-mail dataset from the mounted Google Drive into a DataFrame.
# Later cells read its 'text' and 'spam' columns.
spam_df = pd.read_csv('/content/drive/My Drive/Colab Notebooks/emails.csv')

In [None]:
spam_df.head(10)

In [None]:
spam_df.tail()

In [None]:
spam_df.describe()

In [None]:
spam_df.info()

# STEP #2: VISUALIZE DATASET

In [None]:
# Let's see which message is the most popular ham/spam message
spam_df.groupby('spam').describe()

In [None]:
# Let's get the length of the messages
spam_df['length'] = spam_df['text'].apply(len)

In [None]:
spam_df

In [None]:
# Bar chart of how many rows fall under each value of the 'spam' column.
sns.countplot(x = spam_df['spam'], label = "Count") 

# STEP #3: CREATE TESTING AND TRAINING DATASET/DATA CLEANING

# STEP 3.1 EXERCISE: REMOVE PUNCTUATION

In [None]:
import string
string.punctuation

In [None]:
Test = 'Hello Mr. Future, I am so happy to be learning AI now!!'

In [None]:
Test_punc_removed = [char for char in Test if char not in string.punctuation]
Test_punc_removed

In [None]:
# Join the characters again to form the string.
Test_punc_removed_join = ''.join(Test_punc_removed)
Test_punc_removed_join

# STEP 3.2 EXERCISE: REMOVE STOPWORDS

In [None]:
# You have to download stopwords Package to execute this command
from nltk.corpus import stopwords
stopwords.words('english')

In [None]:
Test_punc_removed_join

In [None]:
# Build the stop-word set once: stopwords.words() reloads the word list on
# every call, and a set gives O(1) membership tests instead of a list scan.
english_stopwords = set(stopwords.words('english'))
# Stop words are matched case-insensitively; kept words retain their casing.
Test_punc_removed_join_clean = [word for word in Test_punc_removed_join.split() if word.lower() not in english_stopwords]

In [None]:
Test_punc_removed_join_clean # Only important (not so common) words are left

# STEP 3.3 EXERCISE: COUNT VECTORIZER EXAMPLE 

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
sample_data = ['This is the first document.','This document is the second document.','And this is the third one.','Is this the first document?']

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(sample_data)


In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. tolist() converts the returned
# NumPy array into a plain list of strings so the output prints as before.
print(vectorizer.get_feature_names_out().tolist())


In [None]:
print(X.toarray())  

# LET'S APPLY THE PREVIOUS THREE PROCESSES TO OUR SPAM/HAM EXAMPLE

In [None]:
# Let's define a pipeline to clean up all the messages 
# The pipeline performs the following: (1) remove punctuation, (2) remove stopwords

# Cache of stop-word sets keyed by language, filled on first use.
_STOPWORD_SETS = {}


def _stopword_set(language='english'):
    """Return the NLTK stop words for ``language`` as a frozenset, loaded once.

    ``stopwords.words()`` re-reads the corpus and returns a list on every call,
    so the result is cached and converted to a set for O(1) membership tests.
    """
    if language not in _STOPWORD_SETS:
        _STOPWORD_SETS[language] = frozenset(stopwords.words(language))
    return _STOPWORD_SETS[language]


def message_cleaning(message):
    """Remove punctuation and English stop words from ``message``.

    Args:
        message: The raw text of a single message.

    Returns:
        A list of the remaining whitespace-separated words, in their original
        order and with their original casing.
    """
    # Drop every character that appears in string.punctuation.
    no_punctuation = ''.join(char for char in message if char not in string.punctuation)
    english_stopwords = _stopword_set('english')
    # Stop words are matched case-insensitively; kept words are not lower-cased.
    return [word for word in no_punctuation.split() if word.lower() not in english_stopwords]

In [None]:
# Let's test the newly added function
spam_df_clean = spam_df['text'].apply(message_cleaning)

In [None]:
print(spam_df_clean[0])

In [None]:
print(spam_df['text'][0])

# LET'S APPLY COUNT VECTORIZER TO OUR MESSAGES LIST

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Use message_cleaning (defined earlier) as the analyzer, so every message is
# split into words with punctuation and stop words already removed.
# This rebinds `vectorizer`, replacing the one from the sample-data example.
vectorizer = CountVectorizer(analyzer = message_cleaning)
# Fit on the 'text' column and produce the message-by-word count matrix.
spamham_countvectorizer = vectorizer.fit_transform(spam_df['text'])


In [None]:
# NOTE(review): toarray() presumably expands a sparse count matrix into a full
# dense array; with a large vocabulary this may need a lot of memory — confirm
# before running on bigger datasets.
print(spamham_countvectorizer.toarray())  

In [None]:
spamham_countvectorizer.shape

# STEP#4: TRAINING THE MODEL ON THE ENTIRE DATASET

In [None]:
# Fit a multinomial Naive Bayes classifier on the count matrix of every
# message, using the values of the 'spam' column as the target labels.
NB_classifier = MultinomialNB()
label = spam_df['spam'].values
NB_classifier.fit(spamham_countvectorizer, label)

In [None]:
# Two hand-written messages used to sanity-check the trained classifier.
testing_sample = ['Free money!!!', "Hi Kim, Please let me know if you need any further information. Thanks"]
# Reuse the already-fitted vectorizer (transform, not fit_transform) so the
# samples are encoded with the same vocabulary the classifier was trained on.
testing_sample_countvectorizer = vectorizer.transform(testing_sample)


In [None]:
test_predict = NB_classifier.predict(testing_sample_countvectorizer)
test_predict

# STEP#4: DIVIDE THE DATA INTO TRAINING AND TESTING PRIOR TO TRAINING

In [None]:
X = spamham_countvectorizer
y = label

In [None]:
X.shape

In [None]:
y.shape

In [None]:
# Hold out 20% of the rows for testing. A fixed random_state makes the split,
# and therefore every metric computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
NB_classifier = MultinomialNB()
NB_classifier.fit(X_train, y_train)

# STEP#5: EVALUATING THE MODEL 

In [None]:
# Confusion matrix on the training data (rows: true labels, columns: predictions).
y_predict_train = NB_classifier.predict(X_train)
cm = confusion_matrix(y_train, y_predict_train)
# fmt='d' prints the cell counts as whole numbers; seaborn's default '.2g'
# format renders larger counts in scientific notation (e.g. 3.5e+03).
sns.heatmap(cm, annot=True, fmt='d')

In [None]:
# Predict on the held-out test set and plot its confusion matrix
# (rows: true labels, columns: predictions).
y_predict_test = NB_classifier.predict(X_test)
cm = confusion_matrix(y_test, y_predict_test)
# fmt='d' prints the cell counts as whole numbers instead of seaborn's
# default '.2g' format, which can switch to scientific notation.
sns.heatmap(cm, annot=True, fmt='d')

In [None]:
print(classification_report(y_test, y_predict_test))