# Assignment 2



In [3]:
# Collection of all used libs
import numpy as np
from collections import Counter
import math
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [160]:
def read_documents(doc_file):
    docs = []
    labels = []
    with open(doc_file, encoding='utf-8') as f:
        for line in f:
            words = line.strip().split()
            docs.append(words[3:])
            labels.append(words[1])
    return docs, labels

In [161]:
all_docs, all_labels = read_documents('data.txt')

split_point = int(0.80*len(all_docs))
train_docs = all_docs[:split_point]
train_labels = all_labels[:split_point]
val_docs = all_docs[split_point:]
val_labels = all_labels[split_point:]

### Estimating parameters for the Naive Bayes classifier

In [162]:
def train_nb(documents, labels):
    # method call: zipping and counting pos/neg words
    pos_words, neg_words,pos, neg = zip_on_value(documents, labels)

    # method call: get frequency
    frequency = get_frequency(documents)
    frequency_pos = get_frequency(pos)
    frequency_neg = get_frequency(neg)
    
    # method call: method call
    pos_val, neg_val = get_total_mean(pos_words, neg_words)

    # method call: scaler (removes )
    pos_comp = get_sacalar(frequency_pos, frequency)
    neg_comp = get_sacalar(frequency_neg, frequency)

    return pos_comp, neg_comp, pos_val, neg_val

In [163]:
def zip_on_value(doc, label):
    pos_words = 0
    neg_words = 0
    pos = []
    neg = []
    for d, l in zip(doc,label):
        if l == 'pos':
            pos_words += 1
            pos.append(d)
        elif l == 'neg':
            neg_words += 1
            neg.append(d)
        else:
            print('ERROR!')
    return pos_words, neg_words, pos, neg
    
def get_frequency(val):
    return Counter(word for doc in val for word in doc)

def get_total_mean(pos, neg):
    #getting eman for the words
    total = pos + neg
    pos_val = pos / total
    neg_val = neg / total
    return pos_val, neg_val

# remove 0 values
def get_sacalar(value, frequency):
    res = {x:float(value[x])/frequency[x] for x in value}
    return res


In [164]:
dict_pos, dict_neg, pos_prior, neg_prior = train_nb(train_docs, train_labels)

In [165]:
print(dict_neg)



In [166]:
print(dict_pos)



### Classifying new documents

In [167]:
"""
This function takes only 1 document at the time and score it
"""
def score_doc_label(document, label, dict_pos, dict_neg, pos_prior, neg_prior):
    p_pos=0
    n_pos=0
    pl_k = list(dict_pos.keys())
    nl_k = list(dict_neg.keys())
    
    # log likelyhood
    for word in document:
        if word in pl_k:
            p_pos += np.log((dict_pos[word]))
        if word in nl_k:
            n_pos += np.log((dict_neg[word]))
    
    pos_prior_log = np.log(pos_prior)
    neg_prior_log = np.log(neg_prior)
    
    # Logarithmic laws (multiplication becomes addition)
    p_pos += pos_prior_log
    n_pos += neg_prior_log
    
    return p_pos, n_pos

In [168]:
def procent_to_log(d):
    holder = []
    for i in range(len(d)):
        holder.append(np.log(d[i]))
    return holder

def log_to_exp(value):
    return np.exp(value)

def zip_log_vals(log_lst, name_lst):
    stripper_lst = []
    for key in name_lst.keys():
        stripper_lst.append(key)
        
    log_dict = dict(zip(stripper_lst, log_lst))
    return log_dict

In [169]:
plog, nlog = score_doc_label(train_docs, train_labels, dict_pos, dict_neg,pos_prior, neg_prior)

In [173]:
print(plog)
print(nlog)

-0.6761896870922498
-0.7103971982200179


#### Sanity check 1

In [214]:
san_1 = ['great']
san_lab = ['pos']

san_pos, san_neg = score_doc_label(san_1, san_lab, dict_pos, dict_neg,pos_prior, neg_prior)
print(np.exp(san_pos), np.exp(san_neg))

0.3605828023571017 0.14299221037269402


We have a frequency of the word great in positive documents equal to that of 0.3605828023571017 and a frequency of the word great in negative documents equal to 0.14299221037269402. This seems reasonable as the word great is generally something we associate with positivity and therefore it should be more prevalent in positive documents.

In [204]:
# positive 
san_pos = 0
count = 0
for i in range(len(macth_l)):
    for key, value in dict_neg.items():
        if key == macth_l[i]:
            san_pos += value
            count += 1

0.30197571665247863


In [215]:
san_2 = ['bad']
san_lab_2 = ['neg']

san_pos_1, san_neg_1 = score_doc_label(san_2, san_lab_2, dict_pos, dict_neg,pos_prior, neg_prior)
print(np.exp(san_pos_1), np.exp(san_neg_1))

0.13683693038560008 0.35921372147533764


We have a frequency of the word great in positive documents equal to that of 0.13683693038560008 and a frequency of the word great in negative documents equal to 0.35921372147533764. This seems reasonable as the word great is generally something we associate with positivity and therefore it should be more prevalent in positive reviews.
in negative documents.

#### Sanity check 2

In [221]:
sanity_check_2 = [['a', 'top-quality', 'performance'], ['a', 'top-quality', 'performance']]
sanity_check_2_lab = ['pos', 'neg']

san_pos_2, san_neg_2 = score_doc_label(sanity_check_2, sanity_check_2_lab, dict_pos, dict_neg,pos_prior, neg_prior)
print(san_pos_2, san_neg_2)

-0.6761896870922498 -0.7103971982200179


When we tried the sanity check 2, it did not crash

In [14]:
def classify_nb(document, label, dict_pos, dict_neg, pos_prior, neg_prior):
    result = ''
    title  = ''
    # score the document
    res_pos, res_neg = score_doc_label(document, label, dict_pos, dict_neg, pos_prior, neg_prior)
    
    # giving the document a label of positive or negative 
    if np.exp(res_pos) > np.exp(res_neg):
        title = 'pos'
        # converting from log to exp
        result = np.exp(res_pos)
    else:
        title = 'neg'
        # converting from log to exp
        result = np.exp(res_neg)
    return title, result

In [15]:
classify_nb(train_docs, train_labels, dict_pos, dict_neg, pos_prior, neg_prior)

('pos', 0.5085510439618088)

### Evaluating the classifier

In [70]:
def classify_documents(docs, label, dict_pos, dict_neg, pos_prior, neg_prior):
    collection_val = []
    collection_lab = []
    
    for d in range(len(docs)):
        # t1 is the classifyed label
        # t2 is the classifyed probability
        t1, t2 = classify_nb(docs[d], label, dict_pos, dict_neg, pos_prior, neg_prior)
        
        collection_lab.append(t1)
        collection_val.append(t2)
    
    # zipping label and probability to a collection
    collection = zip_calass(collection_lab, collection_val)
    return collection, collection_lab, collection_val


In [71]:
def zip_calass(lst1, lst2):
    dd = tuple(zip(lst1, lst2))
    return dd

In [72]:
# calling on classify_documents
cc, guess_lab, guess_val = classify_documents(train_docs, train_labels, dict_pos, dict_neg, pos_prior, neg_prior)

In [73]:
def accuracy(true_labels, guessed_labels):
    match = 0
    for i in range(len(true_labels)):
        if true_labels[i] == guessed_labels[i]:
            match += 1
    # getting score in procent
    score =  (match / float(len(true_labels))) * 100.00
    return score

In [74]:
accuracy(val_labels, guess_lab)

50.56651279899287

The accuracy when testinf the validated accuracy labels against the our guessed labels is: 50.56651279899287

In [180]:
accuracy(train_labels, guess_lab)

87.64033154968

The accuracy when testinf the training labels accuracy against the our guessed labels is: 87.64033154968

In [207]:
"""
Methods to get F1 score
"""

def precision(true_pos, false_pos):
    p = true_pos / (true_pos + false_pos)
    return p

def recall(true_pos, false_neg):
    r = true_pos / (true_pos + false_neg)
    return r

def f1_score(precision, recall):
    score = (precision * recall) / (precision + recall)
    return 2*score

In [208]:
# train_labels guess_lab 
labels_check = tuple(zip(train_labels, guess_lab))

c_false_pos = 0
c_false_neg = 0
c_match_pos = 0
c_match_neg = 0

for index, values in enumerate(labels_check):
    if values[0] != values[1]:
        if values[0] == 'neg' and values[1] == 'pos':
            c_false_pos += 1
        elif values[0] == 'pos' and values[1] == 'neg':
            c_false_neg += 1
    elif values[0] == 'pos' and values[1] == 'pos':
        c_match_pos += 1
    elif values[0] == 'neg' and values[1] == 'neg':
        c_match_neg += 1

print('False pos:',c_false_pos)
print('False neg:', c_false_neg)
print('True pos:',c_match_pos)
print('True neg:',c_match_neg)

False pos: 533
False neg: 645
True pos: 4202
True neg: 4151


In [209]:
pre = precision(c_match_pos, c_false_pos)
re = recall(c_match_pos, c_false_neg)
f1_score(pre, re)

0.8770611563347944

### Error analysis

We are finding miss classifications. After getting the classifications we are getting the len of the documents and sort them by lowest to highest. then we get the first 30 docyuments, we thought 30 documetns were sufficient to do an error analysis on.

Then we run the 30 documents throught the classify documetn method to classify them, then we merge the togheter with the correct answer. 

In [170]:
def find_miss_class(validated, predicted):
    miss = []
    
    for i in range(len(validated)):
        if validated[i] != predicted[i]:
            miss.append(predicted[i])
 

In [29]:
error_val = []
error_key = []
count = 0
for key, value in enumerate(cc):
    if count >= 3:
        break
    error_val.append(value)  
    error_key.append(key)
    count+=1  

In [174]:
# Getting the len och documents and returns it as a list
def get_docs_len(docs):
    docs_len = []
    d_doc = []
    for i in docs:
        docs_len.append(len(i))
        d_doc.append(i)
    return docs_len, d_doc

# sorting list fo get from lowest to higest in word count
def sort_docs(docs):
    docs.sort()
    top_head = docs[:30]
    return top_head

In [106]:
# for training data
d_len, dd = get_docs_len(train_docs)
d_sort = sort_docs(d_len)

In [134]:
# for validated data
d_len_val, dd_val = get_docs_len(val_docs)
d_sort_val = sort_docs(d_len_val)

In [163]:
def get_first_heads(lst_in, check_lst):
    match_lst = []
    for i in range(len(lst_in)):
        for s in check_lst:     
            if len(s) == lst_in[i]:
                match_lst.append(s)
    ret_lst = match_lst[:30]
    return ret_lst

In [167]:
# for training data
pred_lst = get_first_heads(d_sort, train_docs)
for i in pred_lst:
    print(i)

['terrib']
['2nd', 'time', '.']
['it', "'s", 'a', 'beu']
['you', 'all', 'like', 'misogynist', 'comics']
['an', 'excellent', 'book', 'for', 'anyone', 'that', 'barbecues']
['stick', 'to', 'singing', 'in', 'spanish', ',', 'shak']
['an', 'excellent', 'book', 'for', 'anyone', 'that', 'barbecues']
['stick', 'to', 'singing', 'in', 'spanish', ',', 'shak']
['excellent', 'choice', 'to', 'make', 'a', 'great', 'job', '.']
['imposible', 'to', 'do', 'so', 'with', 'no', 'item', 'received']
['works', 'great', ',', 'no', 'problems', 'after', '4', 'months']
['excellent', 'choice', 'to', 'make', 'a', 'great', 'job', '.']
['imposible', 'to', 'do', 'so', 'with', 'no', 'item', 'received']
['works', 'great', ',', 'no', 'problems', 'after', '4', 'months']
['excellent', 'choice', 'to', 'make', 'a', 'great', 'job', '.']
['imposible', 'to', 'do', 'so', 'with', 'no', 'item', 'received']
['works', 'great', ',', 'no', 'problems', 'after', '4', 'months']
['arrived', 'on', 'time', 'and', 'in', 'terrific', 'shape', '!

#### Getting predictions from test and val data

In [137]:
few_docs, guess_lab_1, guess_val_1 = classify_documents(matchy_filter, train_labels, dict_pos, dict_neg, pos_prior, neg_prior)

In [138]:
few_docs

(('pos', 0.5085510439618088),
 ('neg', 0.07627889409428805),
 ('pos', 0.0680680222872823),
 ('pos', 0.04809066958791675),
 ('pos', 0.014588213232861063),
 ('pos', 0.009211357665387493),
 ('pos', 0.014588213232861063),
 ('pos', 0.009211357665387493),
 ('pos', 0.007555551802689143),
 ('neg', 0.004806405797049306),
 ('neg', 0.0017551051890381056),
 ('pos', 0.007555551802689143),
 ('neg', 0.004806405797049306),
 ('neg', 0.0017551051890381056),
 ('pos', 0.007555551802689143),
 ('neg', 0.004806405797049306),
 ('neg', 0.0017551051890381056),
 ('pos', 0.0022295833789030286),
 ('neg', 0.0012133502247477644),
 ('pos', 0.006277623112094543),
 ('pos', 0.0022626108110035447),
 ('neg', 0.0027396677695053846),
 ('neg', 0.001897468623456149),
 ('pos', 0.0022295833789030286),
 ('neg', 0.0012133502247477644),
 ('pos', 0.006277623112094543),
 ('pos', 0.0022626108110035447),
 ('neg', 0.0027396677695053846),
 ('neg', 0.001897468623456149),
 ('pos', 0.0022295833789030286))

#### Zipping the two sets to a singel collection

In [172]:
merged_acc = tuple(zip(train_labels, few_docs))

In [173]:
"""
We decided to look closer at the first two
"""
merged_acc

(('neg', ('pos', 0.5085510439618088)),
 ('neg', ('neg', 0.07627889409428805)),
 ('neg', ('pos', 0.0680680222872823)),
 ('pos', ('pos', 0.04809066958791675)),
 ('pos', ('pos', 0.014588213232861063)),
 ('neg', ('pos', 0.009211357665387493)),
 ('neg', ('pos', 0.014588213232861063)),
 ('pos', ('pos', 0.009211357665387493)),
 ('neg', ('pos', 0.007555551802689143)),
 ('pos', ('neg', 0.004806405797049306)),
 ('pos', ('neg', 0.0017551051890381056)),
 ('neg', ('pos', 0.007555551802689143)),
 ('pos', ('neg', 0.004806405797049306)),
 ('neg', ('neg', 0.0017551051890381056)),
 ('neg', ('pos', 0.007555551802689143)),
 ('neg', ('neg', 0.004806405797049306)),
 ('pos', ('neg', 0.0017551051890381056)),
 ('neg', ('pos', 0.0022295833789030286)),
 ('pos', ('neg', 0.0012133502247477644)),
 ('pos', ('pos', 0.006277623112094543)),
 ('pos', ('pos', 0.0022626108110035447)),
 ('neg', ('neg', 0.0027396677695053846)),
 ('pos', ('neg', 0.001897468623456149)),
 ('neg', ('pos', 0.0022295833789030286)),
 ('neg', ('neg

In [179]:
print(pred_lst[0])
print(pred_lst[2])

['terrib']
['it', "'s", 'a', 'beu']


We think that the first review was hard to classify as "terrib" could have easily been misconstrued as terrific rather than terrible.

The second one was hard to classify as it's hard to even understand what it's trying to say. That said the classification should not have been as "confident" as it was.


### Cross validation

Here are we splitting the data into 10 groupes, then we train each group and get the accuracy, then we take the mean of all the accuracies from all 10 groups to get a prediction value. 

In [492]:
N = 10
for fold_nbr in range(N):
    split_point_1 = int(float(fold_nbr)/N*len(all_docs))
    split_point_2 = int(float(fold_nbr+1)/N*len(all_docs))
    train_docs_fold = all_docs[:split_point_1] + all_docs[split_point_2:]
    train_labels_fold = all_labels[:split_point_1] + all_labels[split_point_2:]
    val_docs_fold = all_docs[split_point_1:split_point_2]


In [483]:
from random import randrange
 
# Split a dataset into k folds
def cross_validation_split(all_docs, folds=10):
    data_spit = list()
    data_cp = list(all_docs)
    fold_size = int(len(all_docs) / folds)
    for i in range(folds):
        fold = list()
        while len(fold) < fold_size:
            index = randrange(len(data_cp))
            fold.append(data_cp.pop(index))
        data_spit.append(fold)
    return data_spit

In [489]:
splitted_data = cross_validation_split(all_docs, folds=10)
splitted_labels = cross_validation_split(all_labels, folds=10)

In [569]:
def train_intervall(docs, fold_iterations, labels):
    folds_dict = dict()
    pn_dict = dict()
    copy_docs = docs
    logger_pos = []
    logger_neg = []
    acc_lst = []
    
    for i in range(0, fold_iterations):
        dict_pos_split, dict_neg_split, pos_prior_split, neg_prior_spit = train_nb(docs[i], labels[i])
        folds_dict[i] = [pos_prior_split, neg_prior_spit]
        pn_dict[i] = dict_pos_split, dict_neg_split
        cc_train, guess_lab_train, guess_val_train = classify_documents(docs[i], labels[i], dict_pos_split, dict_neg_split, pos_prior_split, neg_prior_spit)
        acc_lst.append(accuracy(labels[i], guess_lab_train))
    
    return folds_dict, pn_dict, acc_lst

In [565]:
score_dict, po_no_dict, accuracy_list = train_intervall(splitted_data, 10, splitted_labels)

In [546]:
score_dict

{0: [0.5020990764063812, 0.4979009235936188],
 1: [0.5020990764063812, 0.4979009235936188],
 2: [0.48866498740554154, 0.5113350125944585],
 3: [0.5029387069689337, 0.49706129303106633],
 4: [0.4878253568429891, 0.5121746431570109],
 5: [0.5239294710327456, 0.4760705289672544],
 6: [0.5012594458438288, 0.4987405541561713],
 7: [0.5037783375314862, 0.49622166246851385],
 8: [0.5155331654072208, 0.48446683459277917],
 9: [0.5079764903442485, 0.4920235096557515]}

In [551]:
temp_holder = []
for key, value in score_dict.items():
    temp_holder.append(value[0])
np.mean(temp_holder)

0.5036104114189757

In [566]:
accuracy_list

[68.84970612930312,
 75.73467674223342,
 82.28379513014274,
 60.70528967254408,
 83.9630562552477,
 83.54324097397145,
 52.47691015952981,
 52.896725440806044,
 67.00251889168766,
 52.64483627204031]

In [568]:
np.mean(accuracy_list)

68.01007556675063

### Naive Bayes for numerical data

In this section we tried to implement the Naïve Bayes algorithm from scratch. After creating it we tried apply it to the dataset Iris.

We used pandas for some data manipulation (giv setosa, versicolor, virginica index name. setosa = 0, versicolor = 1, virginica = 2) then we run the mean and standard deviation on each flower. 

**NOTE** We tried to properly implement our train_nb model to the problem but we were not able and decided to solve it in another way.

In [4]:
iris = pd.read_csv('iris.csv')

In [5]:
df = pd.DataFrame(iris)

In [6]:
df.species[df.species == 'setosa'] = 0
df.species[df.species == 'versicolor'] = 1
df.species[df.species == 'virginica'] = 2

lst = df.values.tolist()
lst

[[5.1, 3.5, 1.4, 0.2, 0],
 [4.9, 3.0, 1.4, 0.2, 0],
 [4.7, 3.2, 1.3, 0.2, 0],
 [4.6, 3.1, 1.5, 0.2, 0],
 [5.0, 3.6, 1.4, 0.2, 0],
 [5.4, 3.9, 1.7, 0.4, 0],
 [4.6, 3.4, 1.4, 0.3, 0],
 [5.0, 3.4, 1.5, 0.2, 0],
 [4.4, 2.9, 1.4, 0.2, 0],
 [4.9, 3.1, 1.5, 0.1, 0],
 [5.4, 3.7, 1.5, 0.2, 0],
 [4.8, 3.4, 1.6, 0.2, 0],
 [4.8, 3.0, 1.4, 0.1, 0],
 [4.3, 3.0, 1.1, 0.1, 0],
 [5.8, 4.0, 1.2, 0.2, 0],
 [5.7, 4.4, 1.5, 0.4, 0],
 [5.4, 3.9, 1.3, 0.4, 0],
 [5.1, 3.5, 1.4, 0.3, 0],
 [5.7, 3.8, 1.7, 0.3, 0],
 [5.1, 3.8, 1.5, 0.3, 0],
 [5.4, 3.4, 1.7, 0.2, 0],
 [5.1, 3.7, 1.5, 0.4, 0],
 [4.6, 3.6, 1.0, 0.2, 0],
 [5.1, 3.3, 1.7, 0.5, 0],
 [4.8, 3.4, 1.9, 0.2, 0],
 [5.0, 3.0, 1.6, 0.2, 0],
 [5.0, 3.4, 1.6, 0.4, 0],
 [5.2, 3.5, 1.5, 0.2, 0],
 [5.2, 3.4, 1.4, 0.2, 0],
 [4.7, 3.2, 1.6, 0.2, 0],
 [4.8, 3.1, 1.6, 0.2, 0],
 [5.4, 3.4, 1.5, 0.4, 0],
 [5.2, 4.1, 1.5, 0.1, 0],
 [5.5, 4.2, 1.4, 0.2, 0],
 [4.9, 3.1, 1.5, 0.1, 0],
 [5.0, 3.2, 1.2, 0.2, 0],
 [5.5, 3.5, 1.3, 0.2, 0],
 [4.9, 3.1, 1.5, 0.1, 0],
 [4.4, 3.0, 

In [27]:
def mean(n):
    return sum(n)/float(len(n))
 
def stdev(n):
    avg = mean(n)
    v = sum([(x-avg)**2 for x in n]) / float(len(n)-1)
    return math.sqrt(v)

def sum_data(data):
    s = [(mean(column), stdev(column), len(column)) for column in zip(*data)]
    del(s[-1])
    return s

def sum_class(data):
    sep = sep_class(data)
    s = dict()
    for class_value, rows in sep.items():
        s[class_value] = sum_data(rows)
    return s

def calc_prob(x, mean, stdev):
    e = np.exp(-((x-mean)**2 / (2 * stdev**2 )))
    return (1 / (math.sqrt(2 * math.pi) * stdev)) * e

def calc_class_prob(s, row):
    tr = sum([s[label][0][2] for label in s])
    prop = dict()
    for cv, cs in s.items():
        prop[cv] = s[cv][0][2]/float(tr)
        for i in range(len(cs)):
            mean, stdev, count = cs[i]
            prop[cv] *= calc_prob(row[i], mean, stdev)
    return prop

def sep_class(data):
    sep = dict()
    for i in range(len(data)):
        vector = data[i]
        cv = vector[-1]
        if (cv not in sep):
            sep[cv] = list()
        sep[cv].append(vector)
    return sep

In [28]:
summary = sum_data(lst)

In [29]:
summary 

[(5.843333333333335, 0.8280661279778629, 150),
 (3.0540000000000007, 0.4335943113621737, 150),
 (3.7586666666666693, 1.7644204199522617, 150),
 (1.1986666666666672, 0.7631607417008414, 150)]

In [30]:
sum_1 = sum_class(lst)

In [33]:
# summary class, lst[0] (data)
calc_class_prob(sum_1, lst[125])

{0: 7.942603183173781e-203, 1: 6.518884204295377e-07, 2: 0.06856152672007118}