## CSCI 183 Data Science
### Spam Filtering for Short Messages: Naive Bayes
#### Ryan Johnson, Grace Nguyen, and Raya Young




In [36]:
%matplotlib inline

import sklearn as sk
import nltk
import pandas as pd
import numpy as np
from itertools import chain
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer

#### Importing Training Data 1
Separate into two arrays: spam and ham. These arrays will be processed individually in order to generate word clouds.

In [37]:
# Load the tab-separated SMS collection. Column names are supplied via
# `names=`, so pandas treats the first line of the file as a data row.
data = pd.read_csv(
    "training-data/spamcollectiondata.tsv",
    sep="\t",
    names=["Category", "Message"],
)
data.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


#### Importing Training Data 2
The first training data had a small spam sample, but a large ham sample, so we need a sample with lots of spam to compensate.

In [38]:
# Parse the second dataset into two parallel lists: message text and label.
# Each usable line is expected to end with ",ham" or ",spam"; lines whose
# last field is anything else are skipped.
D2_SHUFFLE_SEED = 183  # fixed seed so the shuffle (and the later split) is reproducible

d2_messages = list()
d2_cats = list()
with open("training-data/english_big.csv", "r", errors='ignore') as d2_file:
    lines = d2_file.readlines()

# Shuffle with a local, seeded generator instead of the unseeded global one,
# so Restart & Run All yields the same ordering every time.
np.random.RandomState(D2_SHUFFLE_SEED).shuffle(lines)

for line in lines:
    # Split only at the LAST comma: commas inside the message body are kept
    # (splitting on every comma and re-joining silently removed them), and
    # stripping the line ending first means a final line without a trailing
    # newline is no longer dropped.
    message, _, label = line.rstrip("\r\n").rpartition(",")
    if label in ("ham", "spam"):
        d2_messages.append(message)
        d2_cats.append(label)

#### Convert to lowercase and combine the lists

In [39]:
# Lower-case every message from the first dataset.
message_data = []
for text in data['Message']:
    message_data.append(text.lower())

# Append the second dataset after the first. Note that d2_messages are added
# as-is here (they are not lower-cased in this cell).
message_data += d2_messages

# Labels are combined in the same order so message_data[k] pairs with category[k].
category = data['Category'].tolist() + d2_cats

#### Stopword removal and Stemming
Clean both sets of data by removing stopwords, so that the word cloud is not dominated by common stop words. Stemming is also important: it reduces the different forms of a word to a single stem, so that they are not counted as separate words.


In [40]:
stop = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')

# Build training_set as a list of [filtered_text, label] pairs.
# filtered_text is the message with English stop words dropped and every
# remaining space-separated token stemmed; each kept stem is prefixed with a
# single space, so a non-empty result always starts with " ".
training_set = []
for idx, message in enumerate(message_data):
    kept_stems = [
        stemmer.stem(token)
        for token in message.split(" ")
        if token.lower() not in stop
    ]
    filtered = "".join(" " + stem for stem in kept_stems)
    training_set.append([filtered, category[idx]])

# Show the first 10 entries to illustrate the format.
print(training_set[:10])

[[' go jurong point, crazy.. avail bugi n great world la e buffet... cine got amor wat...', 'ham'], [' ok lar... joke wif u oni...', 'ham'], [' free entri 2 wkli comp win fa cup final tkts 21st may 2005. text fa 87121 receiv entri question(std txt rate)t&c appli 08452810075over18', 'spam'], [' u dun say earli hor... u c alreadi say...', 'ham'], [" nah don't think goe usf, live around though", 'ham'], [" freemsg hey darl it 3 week word back! i'd like fun still? tb ok! xxx std chgs send, £1.50 rcv", 'spam'], [' even brother like speak me. treat like aid patent.', 'ham'], [' per request mell mell (oru minnaminungint nurungu vettam) set callertun callers. press *9 copi friend callertun', 'ham'], [' winner!! valu network custom select receivea £900 prize reward! claim call 09061701461. claim code kl341. valid 12 hour only.', 'spam'], [' mobil 11 month more? u r entitl updat latest colour mobil camera free! call mobil updat co free 08002986030', 'spam']]


In [41]:
def getset():
    """Return whatever the module-level ``training_set`` is bound to right now.

    The lookup happens at call time, and later cells rebind ``training_set``
    (the split cell keeps only the first half; the training cell replaces it
    with ``(features, label)`` tuples), so the shape of the returned value
    depends on which cells have been executed.
    """
    return training_set

Split the list in half. The first half will be the training set and the second half will be the test set.

In [43]:
# Midpoint of the combined list (kept as a float in `length`, truncated for slicing).
length = len(training_set) / 2
split_at = int(length)

# First half -> training data, second half -> held-out test data.
# training_set is rebound to its own first half, so re-running this cell
# halves it again; re-run the preprocessing cell first if you need to repeat it.
training_set, test_set = training_set[:split_at], training_set[split_at:]

#### Train Naive Bayes Classifier

In [None]:
# Tokenise every training message exactly once and keep the tokens as a set.
# The previous version called word_tokenize() again for every vocabulary word
# of every message, which made feature extraction quadratic for no benefit.
# The text is lower-cased before tokenising, matching how the evaluation cell
# tokenises test messages.
tokenized_training = [
    (set(word_tokenize(entry[0].lower())), entry[1])
    for entry in training_set
]

# Vocabulary = every distinct token seen in the training half.
vocab = set(chain.from_iterable(tokens for tokens, _ in tokenized_training))

# Each message becomes ({word: present?} over the whole vocabulary, label),
# which is the (featureset, label) format NaiveBayesClassifier.train consumes.
# NOTE: this rebinds training_set from [text, label] pairs to feature tuples,
# so the cell cannot be re-run without re-running the split cell first.
training_set = [
    ({word: (word in tokens) for word in vocab}, label)
    for tokens, label in tokenized_training
]

classifier = nltk.NaiveBayesClassifier.train(training_set)

#### Test on test half of dataset

In [None]:
# Classify every held-out message and report the percentage predicted correctly.
total = 0
correct = 0
for sentence in test_set:
    # test_set entries are [text, label] pairs whose text was already
    # stop-word-filtered and stemmed when training_set was built, so the text
    # is used directly. (The earlier call to clean() referenced a function
    # that is not defined in this notebook and raised NameError.)
    test_sentence = sentence[0]

    # Tokenise once per message instead of once per vocabulary word.
    tokens = set(word_tokenize(test_sentence.lower()))
    test_set_feat = {word: (word in tokens) for word in vocab}

    answer = classifier.classify(test_set_feat)
    if answer == sentence[1]:
        correct = correct + 1
    print("Answer: " + answer + ", Actual: " + sentence[1])
    total = total + 1

# Guard against an empty test set, which would otherwise divide by zero.
if total:
    print("Accuracy = ", float(correct)/float(total)*100)
else:
    print("Accuracy = ", "n/a (test_set is empty)")
