In [7]:
import numpy as np
import pandas as pd
from collections import defaultdict
from sklearn.model_selection import KFold
import re
from scipy.stats import norm

In [8]:
data = pd.read_csv('a1_data/a1_d3.txt',sep='\t',header=None)
X = data.iloc[:,0]
y = data.iloc[:,1]

In [9]:
kf= KFold(n_splits=5,shuffle=True,random_state=2)

In [10]:
def preprocess(s):
    s = re.sub(r'[^\w]', ' ', s)
    s = s.lower()
    return s

In [11]:
farr = []
acarr = []
for train_index,test_index in kf.split(X):
    X_train= list(X[train_index])
    X_test= list(X[test_index])
    y_train= list(y[train_index])
    y_test= list(y[test_index])

    posct,negct = [0,0]
    for i in range(len(y_train)):
        if(y_train[i] == 0):
            posct += 1
        else:
            negct += 1

    pos_freq = defaultdict(float)
    neg_freq = defaultdict(float)

    for i in range(len(X_train)):
        text = preprocess(X_train[i])
        sent = y_train[i]
        for word in list(set(text.split())):
#             print(word)
            if sent == 1:      
                pos_freq[word] += 1.0
            else:
                neg_freq[word] += 1.0
                
    prediction = []
    class_prob = (posct / negct)
    for i in range(len(X_test)):
        score = class_prob
        text = preprocess(X_test[i])
        for word in list(text.split()):
            if word in pos_freq:
                score *= pos_freq[word]
            if word in neg_freq:
                score /= neg_freq[word] 
#         print(score)
        hyp = 1 if score >= 1 else 0
        prediction.append(hyp)
        
    
    tp, fp, tn, fn = [0,0,0,0]


    for i in range(len(y_test)):
        if(y_test[i]==1):
            if(prediction[i]==1):
                tp+=1
            else:
                fn+=1
        else:
            if(prediction[i]==0):
                tn+=1
            else:
                fp+=1    

    accuracy = (tp+tn)/(tp+tn+fp+fn)
    recall = tp/(tp+fn)
    precision = tp/(tp+fp) 
    f1 = 2*precision*recall/(precision+recall)
    acarr.append(accuracy)
    farr.append(f1)
for i in range(len(acarr)):
    print('Accuracy and F-score for {}th fold: {} and {}'.format(i+1, acarr[i], np.round(farr[i],4)))

Accuracy and F-score for 1th fold: 0.765 and 0.7345
Accuracy and F-score for 2th fold: 0.815 and 0.823
Accuracy and F-score for 3th fold: 0.83 and 0.8283
Accuracy and F-score for 4th fold: 0.8 and 0.8
Accuracy and F-score for 5th fold: 0.855 and 0.8449


In [13]:
mu,sigma = norm.fit(acarr)
print('Accuracy: {} ± {}'.format(mu, sigma))

Accuracy: 0.813 ± 0.03009983388658481
