# In [8]:
"""
Author: Daniel Wu
Purpose: PS3 - classify test data
"""

import os
import sys
import math
import re
import string
import glob
import random

def pause():
    """Block until the user presses <ENTER>, then print a confirmation.

    Prompts on standard input, discards whatever was typed, and prints
    "Paused Program". Returns None.
    """
    # The typed text is irrelevant, so it is not bound to a name.
    input("Press the <ENTER> key to continue...")
    print("Paused Program")
    
# Read the Naive Bayes model parameters from ./nbmodel.txt.
#
# Each record is split on single spaces into: <tag> <key> <value1> <value2>.
# Records tagged 'pn' fill the positive/negative tables, records tagged 'td'
# fill the truthful/deceptive tables; any other tag is ignored.
with open("./nbmodel.txt") as model_file:
    # NOTE(review): the last two characters of every line are dropped. This
    # presumably removes a trailing separator plus the newline -- confirm
    # against the model file format; a line that ends in a bare newline
    # would lose the final character of its last value.
    lines = [raw_line[0:-2] for raw_line in model_file]

# One table per class: key -> parameter value.
positive_params, negative_params = {}, {}
true_params, deceptive_params = {}, {}

# Tag -> (table receiving the third field, table receiving the fourth field).
tables_by_tag = {
    'pn': (positive_params, negative_params),
    'td': (true_params, deceptive_params),
}

for line in lines:
    fields = line.split(' ')
    tables = tables_by_tag.get(fields[0])
    if tables is None:
        continue
    first_table, second_table = tables
    first_table[fields[1]] = float(fields[2])
    second_table[fields[1]] = float(fields[3])
        
# Tokens that are discarded before scoring a review. Kept as a list, in the
# original order (a few entries such as 'they'/'them' appear twice).
#
# Every entry is followed by a comma: a missing comma after 'very' used to
# fuse it with the next literal into the single entry "veryi'll", so neither
# 'very' nor "i'll" was actually treated as a stop word.
# NOTE(review): if the training script carries its own copy of this list,
# apply the same correction there so both sides tokenize alike -- TODO confirm.
stop_words = ['hotel', 'hotels', 'stay', 'stayed',
              'book', 'booked', 'reserve', 'reserved',
              'room', 'rooms',
              'reservation', 'here',
              'i', 'me', 'my', 'mine',
              'the', 'we', 'our', 'ours',
              'it', 'its', 'they', 'them',
              'he', 'she', 'him', 'her', 'his',
              'they', 'them', 'theirs', 'who', 'what', 'where',
              'when', 'am', 'are', 'about',
              'to', 'in', 'out', 'up', 'down',
              'a', 'an', 'how', 'if', 'as', 'on',
              'some', 'can', 'is', 'be', 'any',
              'through', 'of', 'off',
              'these', 'those', 'that',
              'one', 'ha', 'would', 'from', 'by', 'thing',
              'this', 'and', 'for', ' ', 'during', 'before',
              'after', 'very',
              "i'll", "we'll", "it's",
              "i'm",
             ]

# Punctuation characters handled as separate symbols: everything in
# string.punctuation except the apostrophe and the hyphen.
puncs1 = ''.join(ch for ch in string.punctuation if ch != "'")
puncs2 = ''.join(ch for ch in puncs1 if ch != "-")
puncs = [ch for ch in puncs2]

# Root of the review data set. Taking it from the command line
# (sys.argv[1]) is currently disabled, so this fixed path is always used.
root_dir = "/Users/user/Desktop/Fall_2020/CSCI_544/Coding_Assignments/PA3/dev_dataset"

# Collect every .txt file located three directory levels below the root.
_review_pattern = os.path.join(root_dir, '*/*/*/*.txt')
file_path = list(glob.iglob(_review_pattern))


# Natural log of each class prior, read from the 'PRIORS' entry of the
# corresponding parameter table.
positive_prior, negative_prior, true_prior, deceptive_prior = (
    math.log(class_params['PRIORS'])
    for class_params in (
        positive_params,
        negative_params,
        true_params,
        deceptive_params,
    )
)


# Classify every review in `file_path` and write one line per review,
# "<truthful|deceptive> <positive|negative> <path>", to ./nboutput.txt.

# Built once instead of once per review. Both patterns are applied to the
# raw text, i.e. before it is lower-cased.
_TIME_RE = re.compile(r"(?:[0-2]?[0-9])(?:(?:am|pm)|(?::[0-5][0-9]?)(?:am|pm)?)")
_AMOUNT_RE = re.compile(r"\$\d+(?:\.\d?\d)?")
# Surrounds every character in `puncs` with spaces so it becomes its own token.
_PUNC_TABLE = str.maketrans({punc: " {0} ".format(punc) for punc in puncs})
# Set copy of the stop-word list for constant-time membership tests.
_STOP_WORD_SET = set(stop_words)


def _tokenize(text):
    """Return the list of feature tokens for one raw review text.

    Time-like and dollar-amount substrings are replaced by the placeholders
    "timetok" and "amttok", punctuation is spaced out, the text is
    lower-cased and split on single spaces. Stop words are dropped, as is
    every token of one or two characters other than '?' and '!'.
    """
    text = _TIME_RE.sub("timetok", text)
    text = _AMOUNT_RE.sub("amttok", text)
    text = text.translate(_PUNC_TABLE).lower()
    # No separate "' word '" substring-removal pass is needed: any token such
    # a pass would delete is exactly a stop-word token, which the membership
    # filter below removes as well, so the resulting list is the same.
    return [
        tok for tok in text.split(' ')
        if tok not in _STOP_WORD_SET and (len(tok) > 2 or tok in ('?', '!'))
    ]


def _log_score(tokens, log_prior, params):
    """Return log_prior plus log(params[tok]) for every token found in params.

    Tokens missing from `params` contribute nothing. The additions are done
    one by one, in token order.
    """
    score = log_prior
    for tok in tokens:
        if tok in params:
            score = score + math.log(params[tok])
    return score


def _pick_label(first_score, second_score, first_label, second_label):
    """Return the label with the larger score; break ties with a coin flip."""
    if first_score > second_score:
        return first_label
    if second_score > first_score:
        return second_label
    # Neither score is larger: choose at random so a label is always returned.
    return first_label if random.uniform(0, 1) >= 0.5 else second_label


_records = []

for review in file_path:

    with open(review) as doc:
        tokens = _tokenize(doc.read())

    # positive/negative is decided first, then truthful/deceptive.
    pn_class = _pick_label(
        _log_score(tokens, positive_prior, positive_params),
        _log_score(tokens, negative_prior, negative_params),
        'positive',
        'negative',
    )
    td_class = _pick_label(
        _log_score(tokens, true_prior, true_params),
        _log_score(tokens, deceptive_prior, deceptive_params),
        'truthful',
        'deceptive',
    )

    _records.append(f'{td_class} {pn_class} {review}\n')

# Full report text; every record is newline-terminated.
outfile = ''.join(_records)

# The context manager closes the file even if the write fails. The final
# newline is left out of the written text.
with open("./nboutput.txt", "w") as out:
    out.write(outfile[:-1])
    