# Import packages and data

In [1]:
import random
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

In [2]:
spam = pd.read_csv('/Users/michelle/Desktop/final-project/SMSSpamCollection.txt', sep = "\t" ,names=["Label", "Message"])

In [3]:
print(spam.head())

  Label                                            Message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


# Convert the dataFrame into a List of tuples, each tuple(row) contains label or message

In [4]:
data_set = []
for index, row in spam.iterrows():
    data_set.append((row['Message'], row['Label']))

In [5]:
print(data_set[:5])

[('Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...', 'ham'), ('Ok lar... Joking wif u oni...', 'ham'), ("Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's", 'spam'), ('U dun say so early hor... U c already then say...', 'ham'), ("Nah I don't think he goes to usf, he lives around here though", 'ham')]


In [6]:
print(len(data_set))

5572


## Preprocessing
## remove stopwords/ tokenization/ stemmer or lemmatizer

In [7]:
stemmer = PorterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

In [8]:
def preprocess(document, stem=True):
    'changes document to lower case, remove stopwords and lemmatizes/stems '
    
    # changes sentence to lower case
    document = document.lower()
    
    # tokenize into words
    words = word_tokenize(document)
    
    # remove stop words
    words = [word for word in words if word not in stopwords.words("english")]
    
    if stem:
        words = [stemmer.stem(word) for word in words]
    else:
        words = [wordnet_lemmatizer.lemmatize(word, pos='v') for word in words]
        
    document = " ".join(words)
    return document

In [10]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/michelle/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [12]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/michelle/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [14]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/michelle/nltk_data...


True

## Perform preprocessing step on data_Set

In [15]:
messages_set = []
for (message, label) in data_set:
    words_filtered = [e.lower() for e in preprocess(message, stem=False).split() if len(e) >=3]
    messages_set.append((words_filtered, label))

In [16]:
print(messages_set[:5])

[(['jurong', 'point', 'crazy', 'available', 'bugis', 'great', 'world', 'buffet', '...', 'cine', 'get', 'amore', 'wat', '...'], 'ham'), (['lar', '...', 'joke', 'wif', 'oni', '...'], 'ham'), (['free', 'entry', 'wkly', 'comp', 'win', 'cup', 'final', 'tkts', '21st', 'may', '2005.', 'text', '87121', 'receive', 'entry', 'question', 'std', 'txt', 'rate', 'apply', '08452810075over18'], 'spam'), (['dun', 'say', 'early', 'hor', '...', 'already', 'say', '...'], 'ham'), (['nah', "n't", 'think', 'usf', 'live', 'around', 'though'], 'ham')]


## Preparing to create features

In [18]:
def get_words_in_messages(messages):
    all_words = []
    for (message, label) in messages:
        all_words.extend(message)
    return all_words

In [19]:
def get_word_features(wordlist):
    wordlist = nltk.FreqDist(wordlist)
    word_features = wordlist.keys()
    return word_features

In [21]:
word_features = get_word_features(get_words_in_messages(messages_set))
print(len(word_features))

7995


## Preparing to create train and test datasets

In [23]:
sliceIndex = int((len(messages_set) * .8))

In [24]:
random.shuffle(messages_set)

In [25]:
train_messages, test_messages = messages_set[:sliceIndex], messages_set[sliceIndex:]

In [26]:
len(train_messages)
len(test_messages)

1115

## Preparing to create feature maps to train and test data

In [27]:
def extract_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

In [28]:
training_set = nltk.classify.apply_features(extract_features, train_messages)
testing_Set = nltk.classify.apply_features(extract_features, test_messages)

In [29]:
print ( 'Training set size: ', len(training_set))
print('Test set size: ', len(testing_Set))

Training set size:  4457
Test set size:  1115


## Training

In [31]:
classifier = nltk.NaiveBayesClassifier.train(training_set)

## Evaluation

In [33]:
print(nltk.classify.accuracy(classifier, training_set))

0.9928202827013687


In [34]:
print(nltk.classify.accuracy(classifier, testing_Set))

0.9757847533632287


In [35]:
messages = 'Congratulations!! Youve been selected to provide feedback. As a recent walgreens music, we are offering you the opportunity to receive a $100 Gift Card for answereing'
print('Classification result: ', classifier.classify(extract_features(messages.split())))

Classification result:  ham


In [36]:
print(classifier.show_most_informative_features(50))

Most Informative Features
         contains(award) = True             spam : ham    =    182.1 : 1.0
        contains(urgent) = True             spam : ham    =    138.2 : 1.0
         contains(await) = True             spam : ham    =    103.1 : 1.0
       contains(service) = True             spam : ham    =     84.1 : 1.0
         contains(nokia) = True             spam : ham    =     72.4 : 1.0
      contains(landline) = True             spam : ham    =     67.1 : 1.0
           contains(100) = True             spam : ham    =     63.6 : 1.0
      contains(delivery) = True             spam : ham    =     63.6 : 1.0
           contains(txt) = True             spam : ham    =     61.9 : 1.0
        contains(camera) = True             spam : ham    =     59.2 : 1.0
        contains(expire) = True             spam : ham    =     59.2 : 1.0
       contains(private) = True             spam : ham    =     59.2 : 1.0
        contains(todays) = True             spam : ham    =     59.2 : 1.0

## code is flowing better took time to figure out what I was missing in certain areas of the code
## heading to label work
## week 5 work
## still preparing rest of work