# In [15]:
"""
Author: Daniel Wu
Purpose: PS3 - Train a bernoulli naive bayes model
         on hotel reviews
"""

import os
import sys
import math
import re
import string
import glob

def pause():
    """Block until the user presses <ENTER>, then print a confirmation line."""
    # The text typed by the user is irrelevant, so it is not stored.
    input("Press the <ENTER> key to continue...")
    print("Paused Program")
    
# Root folder of the training corpus.
# Alternative: take it from the command line with sys.argv[1].
root_dir = "/Users/user/Desktop/Fall_2020/CSCI_544/Coding_Assignments/PA3/op_spam_training_data"

# Class code -> folder of that class, relative to root_dir.
# The codes combine negative/positive polarity with deceptive/truthful.
p = {
    'nd': "/negative_polarity/deceptive_from_MTurk",
    'nt': "/negative_polarity/truthful_from_Web",
    'pd': "/positive_polarity/deceptive_from_MTurk",
    'pt': "/positive_polarity/truthful_from_TripAdvisor",
}

def store_reviews(sub_dir, root=None):
    """Read every review file found below one class directory.

    The class directory itself (the first entry yielded by ``os.walk``) is
    skipped, so only files located in its sub-directories are read.

    Args:
        sub_dir: Class directory relative to the data root, written with a
            leading "/" (for example one of the values of ``p``).
        root: Data root that ``sub_dir`` is appended to.  Defaults to the
            module-level ``root_dir``.

    Returns:
        A list holding the full text of each file, one string per file.
        The order follows ``os.walk`` and is not guaranteed.  The list is
        empty when the class directory does not exist.
    """
    if root is None:
        root = root_dir

    # Plain concatenation on purpose: sub_dir starts with "/", and
    # os.path.join would throw `root` away in front of an absolute component.
    walker = os.walk(root + sub_dir)
    next(walker, None)  # skip the class directory itself

    review_list = []

    for dir_path, _dir_names, file_names in walker:
        for file_name in file_names:
            # Explicit encoding so decoding does not depend on the locale.
            with open(os.path.join(dir_path, file_name), encoding="utf-8") as doc:
                review_list.append(doc.read())

    return review_list

# Raw review texts per class code: {'nd': [...], 'nt': [...], 'pd': [...], 'pt': [...]}
reviews = {
    class_code: store_reviews(p[class_code])
    for class_code in ('nd', 'nt', 'pd', 'pt')
}

        
# Stop words that are removed from the vocabulary:
#   - hotel-context words
#   - first / second / third person pronouns
#   - common filler words
#
# Some entries were added retroactively because their joint
# probabilities turned out to be too high.
#
# Every entry is followed by a comma: adjacent string literals without a
# comma are silently concatenated by Python ('very' "i'll" -> "veryi'll").

stop_words = ['hotel', 'hotels', 'stay', 'stayed',
              'book', 'booked', 'reserve', 'reserved',
              'room', 'rooms',
              'reservation', 'here',
              'i', 'me', 'my', 'mine',
              'the', 'we', 'our', 'ours',
              'it', 'its', 'they', 'them',
              'he', 'she', 'him', 'her', 'his',
              'theirs', 'who', 'what', 'where',
              'when', 'am', 'are', 'about',
              'to', 'in', 'out', 'up', 'down',
              'a', 'an', 'how', 'if', 'as', 'on',
              'some', 'can', 'is', 'be', 'any',
              'through', 'of', 'off',
              'these', 'those', 'that',
              'one', 'ha', 'would', 'from', 'by', 'thing',
              'this', 'and', 'for', ' ', 'during', 'before',
              'after', 'very',
              "i'll", "we'll", "it's",
              "i'm",
             ]


# ASCII punctuation without the apostrophe (puncs1), then also without the
# hyphen (puncs2), and finally as a list of single characters (puncs).
puncs1 = ''.join(mark for mark in string.punctuation if mark != "'")
puncs2 = ''.join(mark for mark in puncs1 if mark != "-")
puncs = [*puncs2]

# Build the cleaned reviews and the per-class vocabulary:
#   - generalize times of day to "timetok"
#   - generalize dollar amounts to "amttok"
#   - separate punctuation from the neighbouring words
#   - lowercase, then split into tokens on whitespace

token_bag = {}
clean_reviews = {}

# Built once: these are identical for every review.
_time_re = re.compile(r"(?:[0-2]?[0-9])(?:(?:am|pm)|(?::[0-5][0-9]?)(?:am|pm)?)")
_amount_re = re.compile(r"\$\d+(?:\.\d?\d)?")
_pad_punctuation = str.maketrans({punc: " {0} ".format(punc) for punc in puncs})
_stop_word_set = set(stop_words)  # O(1) membership instead of a list scan

for cls in ['nd', 'nt', 'pd', 'pt']:

    vocabulary = set()
    clean_reviews[cls] = []

    for review in reviews[cls]:

        review = _time_re.sub("timetok", review)
        review = _amount_re.sub("amttok", review)
        # surround every punctuation mark with spaces so it becomes a token
        review = review.translate(_pad_punctuation).lower()

        clean_reviews[cls].append(review)

        # Tokenize review by review and on any whitespace.  Joining all
        # reviews into one string and splitting on ' ' only would fuse the
        # last word of a review with the first word of the next one and
        # leave newlines inside tokens.
        vocabulary.update(review.split())

    # remove stop words; also remove 1- and 2-character tokens
    # (which includes punctuation), but keep '?' and '!'
    token_bag[cls] = [tok for tok in vocabulary
                      if tok not in _stop_word_set
                      and (len(tok) > 2 or tok in ('?', '!'))]
    
    
# Vocabulary shared by all four classes
token_bag['total'] = set([*token_bag['nd'], *token_bag['nt'],
                          *token_bag['pd'], *token_bag['pt']])

# For each class: in how many of its cleaned reviews does each entry occur?
# NOTE(review): `tok in review` is a substring test on the review text, so an
# entry is also counted when it only appears inside a longer word - confirm
# that the classifier reading the model uses the same definition.
token_count = {}

for cls in ('nd', 'nt', 'pd', 'pt'):
    class_reviews = clean_reviews[cls]
    token_count[cls] = {
        tok: sum(1 for review in class_reviews if tok in review)
        for tok in token_bag['total']
    }
        
        
def _smoothed_presence_prob(tok, classes):
    """Return the add-one smoothed share of reviews in *classes* containing *tok*.

    Args:
        tok: A vocabulary entry (key of ``token_count[cls]``).
        classes: Class codes whose reviews are pooled, e.g. ``('pd', 'pt')``.

    Returns:
        ``(matching reviews + 1) / (pooled reviews + 2)``, always strictly
        between 0 and 1.
    """
    hits = sum(token_count[cls][tok] for cls in classes)
    docs = sum(len(clean_reviews[cls]) for cls in classes)
    # A token is either present or absent, so add-one smoothing adds 1 to the
    # count and 2 (one per outcome) to the total.  With only +1 in the
    # denominator a token found in every review would get probability 1.0.
    return (hits + 1) / (docs + 2)


# Per-token probabilities (kept under the historical "joint_prob" names;
# each value is the smoothed fraction of a group's reviews containing the token)
joint_prob_pn = {} #for pos/neg
joint_prob_td = {} #for true/deceptive

for tok in token_bag['total']:

    # first element is the probability for positive
    # second element is the probability for negative
    joint_prob_pn[tok] = [_smoothed_presence_prob(tok, ('pd', 'pt')),
                          _smoothed_presence_prob(tok, ('nd', 'nt'))]

    # first element is the probability for true
    # second element is the probability for deceptive
    joint_prob_td[tok] = [_smoothed_presence_prob(tok, ('pt', 'nt')),
                          _smoothed_presence_prob(tok, ('pd', 'nd'))]

    
def _by_total_probability(entry):
    """Sort key for a (token, [prob_1, prob_2]) pair: the sum of both probabilities."""
    _token, (first, second) = entry
    return first + second


# Keep only the highest-ranked tokens to avoid overfitting.  Both names are
# rebound from dicts to lists of (token, [prob_1, prob_2]) pairs, ordered by
# descending probability sum.
joint_prob_pn = sorted(joint_prob_pn.items(),
                       key=_by_total_probability, reverse=True)[:1000]

# A larger cut-off for true/deceptive, since true reviews may have a wider vocabulary.
joint_prob_td = sorted(joint_prob_td.items(),
                       key=_by_total_probability, reverse=True)[:3000]

# Class priors: share of the training reviews that belongs to each group.
_n_positive = len(clean_reviews['pd']) + len(clean_reviews['pt'])
_n_negative = len(clean_reviews['nd']) + len(clean_reviews['nt'])
_n_truthful = len(clean_reviews['pt']) + len(clean_reviews['nt'])
_n_deceptive = len(clean_reviews['pd']) + len(clean_reviews['nd'])

p_prior = _n_positive / (_n_positive + _n_negative)
n_prior = _n_negative / (_n_positive + _n_negative)
t_prior = _n_truthful / (_n_truthful + _n_deceptive)
d_prior = _n_deceptive / (_n_truthful + _n_deceptive)

# Model file layout, one record per line:
#   {class} PRIORS {prior_1} {prior_2}
#   {class} {word} {joint_prob_1} {joint_prob_2}

# Collect the records in a list and join once instead of growing a string.
_model_lines = [f"pn PRIORS {p_prior} {n_prior} \n"]
_model_lines.extend(f"pn {word} {probs[0]} {probs[1]} \n"
                    for word, probs in joint_prob_pn)

_model_lines.append(f"td PRIORS {t_prior} {d_prior} \n")
_model_lines.extend(f"td {word} {probs[0]} {probs[1]} \n"
                    for word, probs in joint_prob_td)

outfile = "".join(_model_lines)

# The context manager closes the file even if the write fails.
with open("./nbmodel.txt", "w") as file:
    # [:-2] drops the trailing " \n" of the last record
    file.write(outfile[:-2])