In [None]:
# Name: Cynthia Nosiri
# Class: EEGR 565.M85 - Machine Learning Applications
# Build a spam classifier by two methods, first through unsupervised learning (K-Means Clustering) and 
# then by multinomial Naïve Bayes.  
# Google drive link: https://drive.google.com/drive/u/1/folders/1l9muVJQBVnnu4fbtM42EctmHOknhC0iE

In [None]:
# Import the libraries
import numpy as np
import pandas as pd

# Load the raw comma-separated dataset into a DataFrame
file = pd.read_csv('spam.csv', sep=',')

In [None]:
# Preview the first five rows of the raw data
first_five = file.head(5)
print(first_five)

In [None]:
# Keep only the label and message columns.
# Take an explicit copy: later cells assign new columns to `file`, and
# writing into a bare column slice raises pandas' SettingWithCopyWarning.
file = file[['label', 'message']].copy()
file.head()


In [None]:
# Remove English stop words and lowercase every remaining token
import nltk
from nltk.corpus import stopwords, names

# Download only the resources this notebook uses rather than every NLTK package
for resource in ("stopwords", "names", "wordnet", "omw-1.4"):
    nltk.download(resource, quiet=True)

# NOTE: this rebinds `stopwords` from the corpus reader to a plain list of words
stopwords = stopwords.words('english')
# Set membership is O(1); the NLTK stop-word list is all lowercase, so the
# token must be lowercased *before* the membership test or capitalised
# stop words ("The", "I", "You") slip through.
stopword_set = set(stopwords)
file['message'] = file['message'].apply(
    lambda words: ' '.join(
        word.lower() for word in words.split() if word.lower() not in stopword_set
    )
)
print(file.head())


In [None]:
# Helper used to remove non-alphabetic tokens

def alphabets(element):
    """Return True when `element` is non-empty and made up only of alphabetic characters.

    The check is delegated to the object's own ``isalpha()`` method.
    """
    is_alphabetic = element.isalpha()
    return is_alphabetic



In [None]:
# def lemmatized_text(doc):
#     new_message = []
#     for x in doc:
#         new_message.append("  ".join(lemmatizer.lemmatize(y.lower() for y in x.split() if alphabets(y) and y.lower() not in names.words())))                    
#     return new_message

In [None]:
# from nltk.stem import WordNetLemmatizer
# lemmatizer = WordNetLemmatizer()
# file['message'] = file['message'].apply(lambda x: '  '.join([lemmatizer.lemmatize(word.lower()) for word in x.split() if alphabets(word) and word not in names.words()]))


In [None]:
# print(file['message'])

In [None]:
# Lemmatize each message, keeping only alphabetic tokens that are not names
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Read the names corpus once, not once per token, and store it as a set for
# O(1) lookups. The names are lowercased because the messages were already
# lowercased during stop-word removal, so a comparison against the
# capitalised corpus entries could never match.
name_set = {name.lower() for name in names.words()}

new_message = []
for x in file['message']:
    kept_tokens = (
        lemmatizer.lemmatize(y.lower())
        for y in x.split()
        if alphabets(y) and y.lower() not in name_set
    )
    new_message.append(" ".join(kept_tokens))
    



In [None]:
# Show a small sample; the full list has one entry per message and would flood the output
new_message[:10]



In [None]:
# Replace the message column with the lemmatized, filtered text built above
# (new_message holds one cleaned string per row, in row order)
file['message'] = new_message
file.head()

In [None]:
# Encode the text label as an integer target: ham -> 0, spam -> 1
label_to_id = {'ham': 0, 'spam': 1}
file['label_id'] = file['label'].map(label_to_id)
file.head()

In [None]:
# Features (X) are the cleaned message text; targets (Y) are the numeric labels
X, Y = file['message'], file['label_id']
# print(X)
# print(Y)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, x_test, Y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=5,
)
# print(X_train.shape)
# print(x_test.shape)
# print(Y_train.shape)
# print(y_test.shape)

In [None]:
# print(X_train.head())


In [None]:
# text classification using tfidfvectorizer

from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF, keeping at most 5000 features
tfidf = TfidfVectorizer(max_features=5000, analyzer='word')

# Learn the vocabulary and IDF weights from the training text only,
# then encode the training messages
vec = tfidf.fit_transform(X_train)

# Encode the test messages with the already-fitted vocabulary
vec_test = tfidf.transform(x_test)
# print(vec)




In [None]:
from sklearn.cluster import KMeans

# Cluster the training vectors into two groups.
# n_init is pinned so results do not depend on the library version's default.
kmeans = KMeans(n_clusters = 2, n_init = 10, random_state = 5)
kmeans.fit(vec)
predictions = kmeans.predict(vec_test)

# K-Means cluster ids are arbitrary (cluster 0 is not necessarily "ham"), so
# comparing them directly with the 0/1 labels can report an inverted score.
# Map every cluster to the majority training label among its members first.
train_labels = np.asarray(Y_train)
cluster_to_label = {}
for cluster_id in range(kmeans.n_clusters):
    member_labels = train_labels[kmeans.labels_ == cluster_id]
    if member_labels.size == 0:
        # No training rows landed in this cluster; fall back to the raw id
        cluster_to_label[cluster_id] = cluster_id
        continue
    values, counts = np.unique(member_labels, return_counts=True)
    cluster_to_label[cluster_id] = values[np.argmax(counts)]

predicted_labels = np.array([cluster_to_label[c] for c in predictions])

print('Accuracy: {}'.format(np.mean(predicted_labels == np.asarray(y_test))))

In [None]:
# MultinomialNB Classifier model

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Fit a multinomial Naive Bayes model on the TF-IDF training vectors
mnb = MultinomialNB().fit(vec, Y_train)

# Predict labels for the held-out messages
predicted = mnb.predict(vec_test)

# Fraction of test messages classified correctly, reported as a percentage
score = accuracy_score(y_test, predicted)
percent_correct = 100*score
print('Accuracy Score: \n', percent_correct, '%')


In [None]:
# Display the top 25 tokens from both clusters

In [None]:
# Convert the training TF-IDF matrix `vec` to an array via its toarray()
# method so it can be wrapped in a DataFrame, then print it
new_array = vec.toarray()
print(new_array)

In [None]:
# Wrap the dense TF-IDF array in a DataFrame with one column per token.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back on older releases.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = list(tfidf.get_feature_names_out())
else:
    feature_names = tfidf.get_feature_names()
df = pd.DataFrame(new_array, columns = feature_names)
df.head()

In [None]:
# Attach the text label of each training row.
# `df` was built from the training vectors, so its rows follow X_train's
# shuffled order but carry a fresh 0..n-1 index. Assigning file['label']
# directly would align on index and pair rows with the wrong labels, so the
# labels are looked up by X_train's original index and assigned by position.
df['label'] = file.loc[X_train.index, 'label'].to_numpy()
df.head()

In [None]:
# Keep only the rows labelled 'ham'
df1 = df.loc[df['label'].eq('ham')]
df1

In [None]:
# Drop the label column so only the numeric token weights remain
df1 = df1.drop(columns=['label'])
df1.head()

In [None]:
# Highest TF-IDF weight of each token across all ham rows,
# returned as a two-column frame (token in 'index', weight in 0)
ham = df1.max(axis=0).reset_index()
ham

In [None]:
# Order tokens by weight (column 0), largest first, and show the top 25
ham.sort_values(by=0, ascending=False).iloc[:25]

In [None]:

# Keep only the rows labelled 'spam'
df2 = df.loc[df['label'].eq('spam')]
# df2

In [None]:
# Drop the label column so only the numeric token weights remain
df2 = df2.drop(columns=['label'])
# df2

In [None]:
# Highest TF-IDF weight of each token across all spam rows
spam = df2.max(axis=0).reset_index()

In [None]:
# Order tokens by weight (column 0), largest first, and show the top 25
spam.sort_values(by=0, ascending=False).iloc[:25]


In [None]:
# use the wordcloud library to produce word clouds of the two 
# clusters.

In [None]:
# Select the messages labelled 'spam'
is_spam = file['label'] == 'spam'
spm = file[is_spam]
spm

In [None]:
# Select the messages labelled 'ham'
# (this rebinds `ham`, which earlier held the per-token maximum weights)
is_ham = file['label'] == 'ham'
ham = file[is_ham]
ham

In [None]:
# Flatten every ham message into a single list of whitespace-separated tokens
new_string = [token for message in ham['message'] for token in message.split()]
# Preview only; the full list holds every token of every ham message
new_string[:25]

In [None]:
# Join all ham tokens into one space-separated string for the word cloud
new_ham = " ".join(new_string)
# Preview only; the full string contains the entire ham corpus
new_ham[:500]

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Build the word cloud from the joined ham text
wordcloud = WordCloud().generate(new_ham)

# Open the figure *before* drawing: calling plt.figure() after imshow()
# only creates an extra, empty figure.
plt.figure()
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # pixel axes carry no meaning for a word cloud
plt.title('Ham messages')
plt.show()

In [None]:
# Flatten every spam message into a single list of whitespace-separated tokens
new_string1 = [token for message in spm['message'] for token in message.split()]

In [None]:
# Join all spam tokens into one space-separated string for the word cloud
new_spam = " ".join(new_string1)

In [None]:
# Build the word cloud from the joined spam text
wordcloud = WordCloud().generate(new_spam)

# Open the figure *before* drawing: calling plt.figure() after imshow()
# only creates an extra, empty figure.
plt.figure()
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # pixel axes carry no meaning for a word cloud
plt.title('Spam messages')
plt.show()