In [1]:
import sys
import math
import os
import re
from collections import Counter
sys.path.insert(0, '../Code')
from nbclassifier_cb import NBClassifier

In [2]:
# Corpus locations. "~" is expanded to the current user's home directory.
# Order of the literals matches the order of the names being assigned.
pos_path, neg_path, test_path, eval_path = map(
    os.path.expanduser,
    (
        "~/Dropbox/NLP Readings/hw 2/hotelPosT-rand_training.txt",
        "~/Dropbox/NLP Readings/hw 2/hotelNegT-rand_training.txt",
        "~/Dropbox/NLP Readings/hw 2/rand_test_neg_and_pos.txt",
        "~/Dropbox/NLP Readings/hw 2/eval_rand_neg_and_pos.txt",
    ),
)

In [3]:
def file_to_tuples(path, cat):
    """
    Create and normalize document tuples from path.

    Each non-blank line is expected to hold tab-separated fields; only the
    second field (the text) is used, any further fields are ignored. The text
    is lowercased, right-stripped, and every character that is not a
    lowercase ASCII letter, a digit, or whitespace is removed. Blank lines
    (for example a trailing empty line at end of file) are skipped.

    Keep this normalization in step with the one applied to test reviews
    before they are passed to the classifier's predict method.

    Args:
        path: Path of the tab-separated file to read.
        cat: Category label attached to every document read from the file.

    Returns:
        list of tuples: [(text, cat), ...], one per non-blank line, in file
        order.

    Raises:
        IndexError: if a non-blank line contains no tab character.
    """
    # Inside [...] the characters `|` and `^` (when not first) are literals,
    # so they must not be used as separators between the allowed ranges.
    disallowed = re.compile(r'[^a-z0-9\s]+')
    tuples = []
    with open(path) as f:
        for line in f:
            if not line.strip():
                continue  # nothing to parse on a blank line
            text = line.split('\t')[1]  # field 0 is skipped; field 1 is the text
            text = text.lower().rstrip()
            tuples.append((disallowed.sub('', text), cat))
    return tuples

In [4]:
# Read both training corpora, tagging each document with its category label.
pos_docs, neg_docs = (
    file_to_tuples(corpus_path, label)
    for corpus_path, label in ((pos_path, 'pos'), (neg_path, 'neg'))
)

In [5]:
# Build a classifier over the two review categories and feed it both labeled
# training sets, positive documents first and negative documents second.
# NOTE(review): assumes NBClassifier.train accumulates across calls rather than
# resetting state on each call — confirm in nbclassifier_cb.
m = NBClassifier(['pos', 'neg'])
m.train(pos_docs)
m.train(neg_docs)

In [6]:
# make predictions
# Each non-blank line of the test file is "<review_id>TAB<review text>".
# The text goes through the same steps file_to_tuples applies to training
# documents (lowercase, right-strip, remove punctuation) before prediction.
predictions = {}
with open(test_path) as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines such as a trailing empty line at EOF
        review_id, review = line.split("\t")
        review = review.lower().rstrip()
        # `|` and `^` are literal characters inside a character class, so they
        # are not used as separators here; only a-z, 0-9 and whitespace survive.
        review = re.sub(r'[^a-z0-9\s]+', '', review)
        cat = m.predict(review)
        predictions[review_id] = cat

In [7]:
# get gold categories
# Each non-blank line of the eval file is "<review_id>TAB<category>". The
# category is lowercased and right-stripped so it can be compared against the
# lowercase 'pos' label used by evaluate.
golden = {}
with open(eval_path) as f:
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines such as a trailing empty line at EOF
        review_id, cat = line.split("\t")
        golden[review_id] = cat.lower().rstrip()

In [8]:
def evaluate(gold_dict, pred_dict, beta=1):
    """
    Score binary predictions against gold labels, treating 'pos' as positive.

    Only the ids present in pred_dict are scored. Any label other than 'pos'
    (in either dict) counts as negative.

    Args:
        gold_dict: Mapping of review id -> gold category.
        pred_dict: Mapping of review id -> predicted category.
        beta: Weight of recall relative to precision in the F-score
            (beta=1 gives the usual F1).

    Returns:
        Tuple (f_beta, accuracy, precision, recall). Any ratio whose
        denominator is zero (no predictions, no predicted positives, no gold
        positives, ...) is reported as 0.0 instead of raising
        ZeroDivisionError.

    Raises:
        KeyError: if an id in pred_dict is missing from gold_dict.
    """
    def _ratio(numerator, denominator):
        # An undefined ratio (0/0) is scored as 0.0.
        return numerator / denominator if denominator else 0.0

    tp = tn = fp = fn = 0
    for review_id, cat in pred_dict.items():
        gold_is_pos = gold_dict[review_id] == 'pos'
        pred_is_pos = cat == 'pos'
        if gold_is_pos and pred_is_pos:
            tp += 1
        elif gold_is_pos:
            fn += 1
        elif pred_is_pos:
            fp += 1
        else:
            tn += 1

    precision = _ratio(tp, tp + fp)
    recall = _ratio(tp, tp + fn)
    accuracy = _ratio(tp + tn, tp + tn + fp + fn)
    beta_sq = beta ** 2
    f_beta = _ratio((beta_sq + 1) * precision * recall,
                    (beta_sq * precision) + recall)
    return (f_beta, accuracy, precision, recall)

In [9]:
# Displays the tuple (F-score, accuracy, precision, recall) for the test set.
evaluate(gold_dict=golden, pred_dict=predictions)

(0.9411764705882353,
 0.9473684210526315,
 0.9411764705882353,
 0.9411764705882353)