# Exercise 4

In [1]:
import random
import string
from random import shuffle

import nltk
from nltk.metrics import ConfusionMatrix
from nltk.metrics.distance import jaccard_distance
from nltk.metrics.scores import accuracy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Read the whole corpus into memory as a single string.
# The encoding is given explicitly so decoding does not depend on the
# platform's default locale (assumes the file is UTF-8 -- TODO confirm).
with open('smsspamcollection/SMSSpamCollection', 'r', encoding='utf-8') as f:
    raw_text = f.read()

Now we are going to clean the data and set up the experiment. The experiment is performed under the following conditions:
* Single hold-out validation: 50% of the set is used for training and the other 50% for testing
* The examples are randomly shuffled before splitting
* Punctuation is removed
* All strings are converted to lowercase

In [3]:
# Fixed seed so the shuffle (and therefore the train/test split and every
# score computed from it) is the same on each run.
SEED = 42

# Translation table that deletes every ASCII punctuation character.
table_punct = str.maketrans('', '', string.punctuation)

# One record per line. Empty lines (such as the one produced by a trailing
# newline) are skipped rather than assuming exactly one is present.
raw_lines = [line.translate(table_punct) for line in raw_text.split('\n') if line]

# Lowercase each record and split it on the tab into its fields;
# downstream cells read field 0 as the label and field 1 as the message.
registers = [line.lower().split('\t') for line in raw_lines]

# Shuffle the records in place, reproducibly.
random.seed(SEED)
shuffle(registers)

# 50/50 hold-out split: the first half trains, the remainder tests.
length = len(registers) // 2
train = registers[:length]
test = registers[length:]



## Bag of words

Using the bag-of-words model, we convert the training and test sets into vectors of word occurrences. These vectors will be used as the input for a classifier.

In [4]:
# Tokenize every message in place, so train/test records become
# [label, list_of_tokens]. The isinstance guard makes this cell safe to
# re-run: already-tokenized records are left untouched instead of being
# passed to word_tokenize a second time as lists.
for example in train + test:
    if isinstance(example[1], str):
        example[1] = nltk.word_tokenize(example[1])

# CountVectorizer works on strings, so the tokens are re-joined with spaces.
train_docs = [' '.join(ex[1]) for ex in train]
test_docs = [' '.join(ex[1]) for ex in test]

cv = CountVectorizer()

# Learn the vocabulary from the training messages and count occurrences.
Xtrn = cv.fit_transform(train_docs)
# Count occurrences in the test messages using only the training vocabulary.
Xtst = cv.transform(test_docs)

# Labels (field 0 of each record) for the training and test sets.
Ytrn = [ex[0] for ex in train]
Ytst = [ex[0] for ex in test]

## k Nearest Neighbors

The baseline kNN classifier, using scikit-learn with a single neighbour, is:

In [5]:
# Baseline: 1-nearest-neighbour on the bag-of-words count vectors.
clf = KNeighborsClassifier(n_neighbors=1)
clf.fit(Xtrn, Ytrn)

# Predicted labels for the test set, converted to a plain Python list.
preds = clf.predict(Xtst).tolist()

# Fraction of test labels predicted correctly, shown to three decimals.
round(accuracy(Ytst, preds), 3)

0.93

In [6]:
# Confusion matrix of reference labels (Ytst) against the current predictions.
confusion = ConfusionMatrix(Ytst, preds)
print(confusion.pretty_format())

     |         s |
     |    h    p |
     |    a    a |
     |    m    m |
-----+-----------+
 ham |<2388>   1 |
spam |  194 <204>|
-----+-----------+
(row = reference; col = test)



However, we also want to implement this algorithm ourselves using the Jaccard distance.

In [7]:
def kNN(ex, d, examples=None):
    """Return the label of the example nearest to ``ex`` (1-nearest-neighbour).

    Parameters:
        ex: record whose element 1 holds the features to compare.
        d: callable taking two feature values and returning a comparable
            distance; smaller means closer.
        examples: labelled records to search, each with the label at index 0
            and the features at index 1. Defaults to the notebook's global
            ``train`` set when omitted.

    Returns:
        Element 0 (the label) of the closest record. On ties, the first
        closest record in ``examples`` wins.

    Raises:
        ValueError: if ``examples`` is empty (raised by ``min``).
    """
    if examples is None:
        examples = train
    return min(examples, key=lambda x: d(ex[1], x[1]))[0]

def jaccard(a, b):
    """Return the Jaccard distance between two collections of tokens.

    Both arguments are converted to sets first, so duplicates and ordering
    are ignored.

    Two empty collections are treated as identical and give 0.0. Without
    this guard the call fails for such a pair, because nltk's
    ``jaccard_distance`` divides by the size of the union, which is zero.
    A message made only of punctuation ends up empty after cleaning.
    """
    set_a, set_b = set(a), set(b)
    if not set_a and not set_b:
        return 0.0
    return jaccard_distance(set_a, set_b)



In [8]:
# Label every test record with the label of its nearest training record
# under the Jaccard distance.
preds = [kNN(example, jaccard) for example in test]

In [9]:
# Fraction of test labels the Jaccard-based kNN predicted correctly,
# shown to three decimals.
jaccard_accuracy = accuracy(Ytst, preds)
round(jaccard_accuracy, 3)

0.976

In [10]:
# Confusion matrix of reference labels (Ytst) against the current predictions.
confusion = ConfusionMatrix(Ytst, preds)
print(confusion.pretty_format())

     |         s |
     |    h    p |
     |    a    a |
     |    m    m |
-----+-----------+
 ham |<2381>   8 |
spam |   60 <338>|
-----+-----------+
(row = reference; col = test)

