In [1]:
# Connect to Google Drive in order to reach necessary .csv files.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from csv import reader
import csv

In [3]:
def vocabularyAndFeatureSet():
    vocab = []
    feature_set = []
    
    # Read the tokenized_corpus.csv to create vocabulary
    with open('/content/drive/My Drive/Colab Notebooks/tokenized_corpus.csv', 'r') as read_obj:
        csv_reader = reader(read_obj)
        for row in csv_reader:
            for word in row:
                if word not in vocab:
                    vocab.append(word)

    
    # Now create the feature set.
    with open('/content/drive/My Drive/Colab Notebooks/tokenized_corpus.csv', 'r') as read_obj:
        csv_reader = reader(read_obj)
        for row in csv_reader:
            # Row variable is a list that represents a row in tokenized_corpus.csv

            # Create an array of size vocab that contains all zeros
            sms = [0] * len(vocab)
            for word in row:
                index = vocab.index(word)
                sms[index] = sms[index] + 1
            
            feature_set.append(sms)

    with open('/content/drive/My Drive/Colab Notebooks/feature_set.csv', 'w', newline = '') as f:
        writer = csv.writer(f)
        writer.writerows(feature_set)

    return feature_set, vocab
feature_set, vocab = vocabularyAndFeatureSet()

In [4]:
# Divide trainset and testset
# Put first 4460 instances to train set and the other 1112 instances to test set.
def splitTrainTest(feature_set):
    feature_set_path = '/content/drive/My Drive/Colab Notebooks/feature_set.csv'

    train_set = []
    test_set = []
    counter = 0

    # # Read the feature set to create train and test sets
    # with open(feature_set_path, 'r') as read_obj:
    #     csv_reader = reader(read_obj)
    #     for row in csv_reader:
    #         if counter < 4460:
    #             train_set.append(row)
    #             counter = counter + 1
    #         else:
    #             test_set.append(row)
    #             counter = counter + 1

    # train_set_final = []
    # test_set_final = []

    
    # # convert spams and hams into integer lists
    # for x in train_set:
    #     train_set_final.append(list(map(int, x)))
    # for x in test_set:
    #     test_set_final.append(list(map(int, x)))

    for i in feature_set:
        if counter < 4460:
            train_set.append(i)
            counter = counter + 1
        else:
            test_set.append(i)
            counter = counter + 1
    
    return train_set, test_set
            
train_set, test_set = splitTrainTest(feature_set)

In [5]:
# Save labels into a list because it will be needed when we train the model.
def getLabels():
    labels = []

    labels_path = '/content/drive/My Drive/Colab Notebooks/labels.csv'
    with open(labels_path, 'r') as read_obj:
        csv_reader = reader(read_obj)
        for row in csv_reader:
            labels.append(row[0])
            
    return labels

labels = getLabels()

In [6]:
# Create the Multinomial Naive Bayes model and train it
def TrainMultinomialNaiveBayes(train_set, labels, alpha): # Alpha is for Laplacian Smoothing

    # Separate spam and ham sms
    spams = []
    hams = []

    counter = 0
    # Separate spams and hams to count
    for x in train_set:
        if int(labels[counter]) == 1:
            spams.append(x)
        else:
            hams.append(x)
        counter = counter + 1

    # T_j_spam is the number of occurences of word j in spam SMSs in the training set
    T_j_spam = [0] * len(spams[0])
    for x in spams:
        T_j_spam = np.add(T_j_spam, x)
    T_j_spam = np.add(T_j_spam, alpha)

    # T_j_ham is the number of occurences of word j in spam SMSs in the training set
    T_j_ham = [0] * len(spams[0])
    for x in hams:
        T_j_ham = np.add(T_j_ham, x)
    T_j_ham = np.add(T_j_ham, alpha)

    # N_spam is the number of spam SMSs in the training set
    N_spam = len(spams)

    # N is the total number of SMSs in the traning set
    N = len(train_set)

    # P_spam estimates the probability that a particular SMS will be spam
    P_spam = np.log(N_spam / N) + alpha

    # P_ham estimates the probability that a particular SMS will be ham
    P_ham = np.log((N - N_spam) / N) + alpha
    
    # V is the vocabulary size
    V = len(train_set[0])

    # P_j_spam estimates the probability that a particular word in a spam SMS will be the j'th word of vocab
    P_j_spam = T_j_spam / (np.sum(T_j_spam) + V * alpha)

    # P_j_ham estimates the probability that a particular word in a ham SMS will be the j'th word of vocab
    P_j_ham = T_j_ham / (np.sum(T_j_ham) + V * alpha)

    # Takes logs to avoid overflows
    P_j_spam = np.array(np.log(P_j_spam))
    P_j_ham = np.array(np.log(P_j_ham))

    return P_j_spam, P_j_ham, P_spam, P_ham

alpha = 1
P_j_spam, P_j_ham, P_spam, P_ham = TrainMultinomialNaiveBayes(train_set, labels, alpha)

In [7]:
def predict(test_set, P_j_spam, P_j_ham, P_spam, P_ham):

    probs = np.zeros((2, len(test_set)))

    # Put probs[1] to spams and probs[0] to hams
    np.ndarray.fill(probs[1], P_spam)
    np.ndarray.fill(probs[0], P_ham)

    # Calculate the multinomial naive bayes formula
    probs[0] = probs[0] + np.nansum(test_set * P_j_ham, axis=1)
    probs[1] = probs[1] + np.nansum(test_set * P_j_spam, axis=1)
    
    # Calculate argmax using numpy's argmax, axis = 0
    predictions = np.argmax(probs, 0)

    # If prob of spam = prob of ham, set the prediction 1 (spam)
    for x in range(len(test_set)):
        if probs[0][x] == probs[1][x]:
            predictions[x] = 1

    return predictions            


predictions = predict(test_set, P_j_spam, P_j_ham, P_spam, P_ham)

In [8]:
def get_accuracy(predictions, labels, alpha):

    corrects = 0
    for i in range(len(predictions)):
        if predictions[i] == int(labels[i + len(train_set)]):
            corrects = corrects + 1

    accuracy = corrects / float(len(predictions)) * 100.0

    # Save the accuracy value to a .csv file
    if alpha == 0:
        with open('/content/drive/My Drive/Colab Notebooks/test_accuracy.csv', 'w') as acc:
            acc.write('{0};'.format(accuracy))
    if alpha == 1:
        with open('/content/drive/My Drive/Colab Notebooks/test_accuracy_laplace.csv', 'w') as acc:
            acc.write('{0};'.format(accuracy))

    return accuracy

get_accuracy(predictions, labels, alpha)

98.65107913669064

In [9]:
def get_accuracy_r(predictions, labels):

    corrects = 0
    for i in range(len(predictions)):
        if int(predictions[i]) == int(labels[i + len(train_set)]):
            corrects = corrects + 1
            
    accuracy = corrects / float(len(predictions)) * 100.0
    return accuracy


In [11]:
# Question 3 Feature Selection

def feature_selection_preparation(feature_set, vocab):
    vocab_r_count = []
    vocab_r = []
    feature_set_r = []
    count_words = np.sum(feature_set, axis=0)
    count_words = count_words.tolist()

    for x in count_words:
        if x >= 10:
            vocab_r_count.append(x)
            vocab_r.append(vocab[count_words.index(x)])

    # Now create the new feature set.
    with open('/content/drive/My Drive/Colab Notebooks/tokenized_corpus.csv', 'r') as read_obj:
        csv_reader = reader(read_obj)
        for row in csv_reader:
            # Create an array of size vocab that contains all zeros
            sms = [0] * len(vocab_r)
            for word in row:
                if word in vocab_r:
                    index = vocab_r.index(word)
                    sms[index] = sms[index] + 1

            feature_set_r.append(sms)
            
    return feature_set_r, vocab_r, vocab_r_count

feature_set_r, vocab_r, vocab_r_count = feature_selection_preparation(feature_set, vocab)
train_set, test_set = splitTrainTest(feature_set_r)
labels = getLabels()

In [12]:
# I did a test with sklearn's Multinomial Naive Bayes
# from sklearn.naive_bayes import MultinomialNB
# from sklearn import metrics

def forward_selection(train_set, test_set, labels):
    Features = []
    # Set initial accuracy to 50 %
    accuracy = 50 

    indexes = []
    alpha = 1
    done = False

    train_set = np.array(train_set)
    test_set = np.array(test_set)

    # clf = MultinomialNB()

    while done == False:
        curr_acc = accuracy
        for i in range(len(train_set[0])):

            if i not in Features:
                indexes_temp = indexes                
                indexes_temp.append(i)
                
                P_j_spam, P_j_ham, P_spam, P_ham = TrainMultinomialNaiveBayes(train_set[:,indexes_temp], labels, alpha)
                predictions = predict(test_set[:,indexes_temp], P_j_spam, P_j_ham, P_spam, P_ham)
                acc = get_accuracy_r(predictions, labels)


                # clf.fit(train_set[:,indexes_temp], labels[0:len(train_set)])
                # predictions = clf.predict(test_set[:,indexes_temp])
                # acc = get_accuracy_r(predictions, labels)
                # print(acc)

                if acc > accuracy:
                    accuracy = acc
                    max_index = i

                indexes_temp.pop()
                
        if curr_acc == accuracy:
            done = True
        else:
            # print("MAX INDEX", max_index)
            # print("ACC", accuracy)
            Features.append(max_index)
            indexes.append(max_index)
        

    print(Features)
    wtr = csv.writer(open ('/content/drive/My Drive/Colab Notebooks/forward_selection.csv', 'w'), delimiter=',', lineterminator='\n')
    for feature in Features : wtr.writerow([feature])


forward_selection(train_set, test_set, labels)

[0, 112, 158, 75, 77, 32, 155, 154, 29, 83, 148, 25, 33, 98, 87, 108, 124, 146, 167, 266]


In [20]:
# Prepare for 3.2 Frequency Selection

feature_set_r, vocab_r, vocab_r_count = feature_selection_preparation(feature_set, vocab)
train_set, test_set = splitTrainTest(feature_set_r)
labels = getLabels()

vocab_r_count = np.sum(train_set, axis=0)

frequencyArr = []

for i in range(len(vocab_r)):
    frequencyArr.append([vocab_r_count[i], i])

frequencyArr.sort(reverse=True)

In [21]:
# Frequency Selection
def frequency_selection(train_set, test_set, labels, frequencyArr):
    accuracy = []
    indexes = []
    alpha = 1
    train_set = np.array(train_set)
    test_set = np.array(test_set)

    for i in range(len(train_set[0])):

        indexes.append(frequencyArr[i][1])

        P_j_spam, P_j_ham, P_spam, P_ham = TrainMultinomialNaiveBayes(train_set[:,indexes], labels, alpha)
        predictions = predict(test_set[:,indexes], P_j_spam, P_j_ham, P_spam, P_ham)
        acc = get_accuracy_r(predictions, labels)

        accuracy.append(acc)


    wtr = csv.writer(open ('/content/drive/My Drive/Colab Notebooks/frequency_selection.csv', 'w'), delimiter=',', lineterminator='\n')
    for acc in accuracy : wtr.writerow([acc])

    print(accuracy)
frequency_selection(train_set, test_set, labels, frequencyArr)

86.96043165467626
87.14028776978418
87.14028776978418
87.67985611510791
87.67985611510791
87.5
87.58992805755396
87.58992805755396
87.85971223021582
88.66906474820144
89.20863309352518
89.11870503597122
90.10791366906474
90.10791366906474
90.1978417266187
89.92805755395683
92.53597122302159
92.17625899280576
92.17625899280576
92.7158273381295
93.16546762589928
92.35611510791367
92.62589928057554
93.34532374100719
93.43525179856115
93.5251798561151
94.06474820143885
93.70503597122301
94.06474820143885
93.9748201438849
94.15467625899281
93.88489208633094
93.70503597122301
93.88489208633094
94.15467625899281
93.79496402877699
93.88489208633094
93.88489208633094
93.70503597122301
94.06474820143885
94.51438848920863
94.51438848920863
94.42446043165468
94.42446043165468
93.79496402877699
93.88489208633094
93.88489208633094
93.88489208633094
93.88489208633094
93.9748201438849
93.9748201438849
94.06474820143885
94.15467625899281
94.06474820143885
93.9748201438849
93.9748201438849
94.0647482014