In [1]:
import numpy as np
import glob
import os
import re
import string
import pandas as pd
from collections import defaultdict
import json

In [2]:
def create_dataset(sources):
    """
    inputs a list of all filepaths
    outputs 4 lists:
    x1 - file content
    y1 - label corresponding to x1 (positive/negative)
    x2 - file content
    y2 - label corresponding to x2 (truthful/deceptive)
    """
    x1=[]
    y1=[]
    x2=[]
    y2=[]
    for src in sources:
        f = open(src, "r")
        file_content = f.read()[:-1]
        x1.append(file_content)
        x2.append(file_content)
        f.close()
        if src.split("/")[-4]=="negative_polarity":
            y2.append(0)
        else:
            y2.append(1)
        if src.split("/")[-3]=="deceptive_from_MTurk":
            y1.append(0)
        else:
            y1.append(1)
    return x1,y1,x2,y2

In [3]:
# creating training datasets
input_path = sys.argv[1] # 'D:/USC/Applied Natural Language Processing - 544/Naive_Bayes/op_spam_training_data/train'
train_base_path = input_path + '/**/*.txt'
train_reviews = glob.glob(train_base_path,recursive=True)
train_reviews = [review.replace("\\","/") for review in train_reviews]
x1_train,y1_train,x2_train,y2_train = create_dataset(train_reviews)

In [4]:

# preprocess string function (stemming lementization stopwords to be added)

def preprocess_string(s):
    
    # words that should be removed (no contribution to prediction, computation)
#     stop_words = [ 'are', 'around','as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before',
#              'beforehand', 'behind', 'being', 'below', 'beside', 'besides', 'between', 'beyond', 'bill', 'both',
#              'bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant', 'co', 'con', 'could', 'couldnt', 'cry', 'de',
#              'describe', 'detail', 'did', 'do', 'does', 'doing', 'don', 'done', 'down', 'due', 'during', 'each', 'eg',
#              'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever', 'every', 'everyone',
#              'everything', 'everywhere', 'except', 'few', 'fifteen', 'fify', 'fill', 'find', 'fire', 'first', 'five', 'for',
#              'former', 'formerly', 'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'get', 'give', 'go', 'had',
#              'has', 'hasnt', 'have', 'having', 'he', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon',
#              'hers', 'herself', 'him', 'himself', 'his', 'how', 'however', 'hundred', 'i', 'ie', 'if', 'in', 'inc', 'indeed',
#              'interest', 'into', 'is', 'it', 'its', 'itself', 'just', 'keep', 'last', 'latter', 'latterly', 'least', 'less',
#              'ltd', 'made', 'many', 'may', 'me', 'meanwhile', 'might', 'mill', 'mine', 'more', 'moreover', 'most', 'mostly',
#              'move', 'much', 'must', 'my', 'myself', 'name', 'namely', 'neither', 'never', 'nevertheless', 'next', 'nine',
#              'no', 'nobody', 'none', 'noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'of', 'off', 'often', 'on', 'once',
#              'one', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'our', 'ours', 'ourselves', 'out', 'over', 'own',
#              'part', 'per', 'perhaps', 'please', 'put', 'rather', 're', 's', 'same', 'see', 'seem', 'seemed', 'seeming',
#              'seems', 'serious', 'several', 'she', 'should', 'show', 'side', 'since', 'sincere', 'six', 'sixty', 'so', 
#              'some', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhere', 'still', 'such', 'system',
#              't', 'take', 'ten', 'than', 'that', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence', 'there',
#              'thereafter', 'thereby', 'therefore', 'therein', 'thereupon', 'these', 'they', 'thickv', 'thin', 'third', 'this',
#              'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', 'top', 'toward',
#              'towards', 'twelve', 'twenty', 'two', 'un', 'under', 'until', 'up', 'upon', 'us', 'very', 'via', 'was', 'we',
#              'well', 'were', 'what', 'whatever', 'when', 'whence', 'whenever', 'where', 'whereafter', 'whereas', 'whereby',
#              'wherein', 'whereupon', 'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whoever', 'whole', 'whom',
#              'whose', 'why', 'will', 'with', 'within', 'without', 'would', 'yet', 'you', 'your', 'yours', 'yourself',
#              'yourselves', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll",
#              "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers',
#              'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who',
#              'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
#              'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or',
#              'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
#              'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off',
#              'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all',
#              'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same',
#              'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd',
#              'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn',
#              "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't",
#              'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren',
#              "weren't", 'won', "won't", 'wouldn', "wouldn't"]

    
    s=s.translate(str.maketrans('', '', string.punctuation))
    s=re.sub('(\s+)',' ',s)
    s=s.lower()
    word_list = s.split(" ")
    new_list = []
    for word in word_list:
        if (len(word)>2):  
            new_list.append (word)
    s=" ".join(new_list)
    return s

In [5]:
print("len of x1_train: ",len(x1_train))
print(x1_train[0])

print("***")

print(preprocess_string(x1_train[0]))

len of x1_train:  960
Affinia Chicago is one of the worst hotels I have ever stayed at. Not in my life have I been treated so poorly as a guest. The front desk was very unaccommodating when I asked for a smoke free room when they had made an error in my reservation. There was no bellhop available for some strange reason so I had to move all my luggage to the elevator and down a long hallway to my room by myself. If it wasn't already a bad stay, I ordered room service and it took over an hour and a half to be delivered. If they didn't have air conditioning in the room, I would say just about everything about this stay was completely miserable. If you are traveling to Chicago for any kind of business, I hope you decide not to choose this hotel. I was quite surprised, I like Chicago as a city but this stay definitely made my trip quite a negative experience.
***
affinia chicago one the worst hotels have ever stayed not life have been treated poorly guest the front desk was very unaccommod

In [15]:
classes=None
bow_dicts=None

def addToBow(example,dict_index):
    global bow_dicts

    if isinstance(example,np.ndarray): example=example[0]
    for token_word in example.split():
        bow_dicts[dict_index][token_word]+=1

def train(dataset,labels):
    global bow_dicts, classes
    
    examples=dataset
    labels=labels
    if not isinstance(examples,np.ndarray): examples=np.array(examples)
    if not isinstance(labels,np.ndarray): labels=np.array(labels)
        
    classes = np.unique(labels)
    bow_dicts=np.array([defaultdict(lambda:0) for index in range(classes.shape[0])])

    for cat_index,cat in enumerate(classes):

        all_cat_examples=examples[labels==cat] 

        cleaned_examples=[preprocess_string(cat_example) for cat_example in all_cat_examples]
        cleaned_examples=pd.DataFrame(data=cleaned_examples)

        np.apply_along_axis(addToBow,1,cleaned_examples,cat_index)


    prob_classes=np.empty(classes.shape[0])
    all_words=[]
    cat_word_counts=np.empty(classes.shape[0])
    for cat_index,cat in enumerate(classes):

        prob_classes[cat_index]=np.sum(labels==cat)/float(labels.shape[0]) 

        count=list(bow_dicts[cat_index].values())
        cat_word_counts[cat_index]=np.sum(np.array(list(bow_dicts[cat_index].values())))+1

        all_words+=bow_dicts[cat_index].keys()

    vocab=np.unique(np.array(all_words))
    vocab_length=vocab.shape[0]

    denoms=np.array([cat_word_counts[cat_index]+vocab_length+1 for cat_index,cat in enumerate(classes)])                                                                          

    cats_info=[(bow_dicts[cat_index],prob_classes[cat_index],denoms[cat_index]) for cat_index,cat in enumerate(classes)]                               
    cats_info=np.array(cats_info) 
    
    return cats_info

model_weights = {}

tru_dec_info = train(x1_train,y1_train)
model_weights["tru_dec_info_classes"] = classes.tolist()
model_weights["tru_dec_info"] = tru_dec_info.tolist()

classes=None
bow_dicts=None
pos_neg_info = train(x2_train,y2_train)
model_weights["pos_neg_info_classes"] = classes.tolist()
model_weights["pos_neg_info"] = pos_neg_info.tolist()

print(model_weights)
with open('nbmodel.txt', 'w') as outfile:
    json.dump(model_weights, outfile)



[defaultdict(<function train.<locals>.<listcomp>.<lambda> at 0x0000024B309C4488>, {'after': 72, 'recent': 7, 'week': 19, 'stay': 400, 'the': 3987, 'affinia': 28, 'hotels': 91, 'can': 71, 'definitely': 110, 'say': 55, 'will': 150, 'coming': 12, 'back': 99, 'they': 232, 'offer': 18, 'many': 55, 'room': 576, 'amenities': 30, 'and': 2259, 'services': 10, 'just': 153, 'very': 452, 'comfortable': 161, 'relaxed': 8, 'place': 107, 'most': 61, 'enjoyable': 9, 'experience': 85, 'hotel': 930, 'was': 1472, 'amazing': 65, 'customization': 1, 'offered': 14, 'would': 229, 'recommend': 119, 'anyone': 37, 'looking': 40, 'for': 648, 'nice': 156, 'although': 19, 'much': 59, 'too': 55, 'overpriced': 3, 'opinion': 6, 'spotless': 12, 'staff': 285, 'courteous': 18, 'spa': 20, 'service': 177, 'god': 1, 'send': 1, 'relatively': 2, 'flexible': 1, 'location': 187, 'traveling': 14, 'sight': 4, 'seeing': 9, 'didnt': 61, 'spend': 9, 'major': 5, 'bucks': 3, 'trying': 6, 'get': 104, 'around': 42, 'city': 119, 'love':