In [1]:
import numpy as np
import glob
import os
import re
import string

In [2]:
def create_dataset(sources):
    """
    Read every review file and derive its two labels from its path.

    inputs a list of all filepaths; the labels are taken from the directory
    names counted from the end of each path:
      - 4th component from the end == "negative_polarity"    -> y1 = 0, else 1
      - 3rd component from the end == "deceptive_from_MTurk" -> y2 = 0, else 1
    outputs 4 lists:
    x1 - [file content, filepath] pairs
    y1 - label corresponding to x1 (0 = negative, 1 = positive)
    x2 - [file content, filepath] pairs (independent copies of the x1 pairs)
    y2 - label corresponding to x2 (0 = deceptive, 1 = truthful)

    Raises IndexError when a path has fewer than four components.
    """
    x1=[]
    y1=[]
    x2=[]
    y2=[]
    for src in sources:
        # the context manager closes the file even if reading fails
        with open(src, "r") as f:
            file_content = f.read()
        # drop a single trailing newline only; a blind [:-1] would cut the last
        # real character of a file that does not end with a newline
        if file_content.endswith("\n"):
            file_content = file_content[:-1]
        x1.append([file_content, src])
        x2.append([file_content, src])
        # accept Windows-style separators too, so labels do not silently default to 1
        parts = src.replace("\\", "/").split("/")
        y1.append(0 if parts[-4]=="negative_polarity" else 1)
        y2.append(0 if parts[-3]=="deceptive_from_MTurk" else 1)
    return x1,y1,x2,y2

In [3]:
# creating training datasets
# The data location defaults to the original local path; set the OP_SPAM_DATA_DIR
# environment variable to run against a copy of op_spam_training_data stored elsewhere.
data_root = os.environ.get(
    "OP_SPAM_DATA_DIR",
    'D:/USC/Applied Natural Language Processing - 544/Naive_Bayes/op_spam_training_data',
).replace("\\","/").rstrip("/")
train_base_path = data_root + '/train/**/*.txt'
# create_dataset reads the labels from "/"-separated path components, so normalise
# Windows separators; glob returns files in arbitrary order, so sort for reproducible runs.
train_reviews = sorted(review.replace("\\","/") for review in glob.glob(train_base_path,recursive=True))
x1_train,y1_train,x2_train,y2_train = create_dataset(train_reviews)

In [4]:
# print("len of x1_train: ",len(x1_train))
# print(x1_train[0])
# print("len of y1_train: ",len(y1_train))
# print(y1_train[0])
# print("len of x2_train: ",len(x2_train))
# print(x2_train[0])
# print("len of y2_train: ",len(y2_train))
# print(y2_train[0])

In [5]:
# creating validation datasets
# The data location defaults to the original local path; set the OP_SPAM_DATA_DIR
# environment variable to run against a copy of op_spam_training_data stored elsewhere.
data_root = os.environ.get(
    "OP_SPAM_DATA_DIR",
    'D:/USC/Applied Natural Language Processing - 544/Naive_Bayes/op_spam_training_data',
).replace("\\","/").rstrip("/")
valid_base_path = data_root + '/validation/**/*.txt'
# create_dataset reads the labels from "/"-separated path components, so normalise
# Windows separators; glob returns files in arbitrary order, so sort for reproducible runs.
valid_reviews = sorted(review.replace("\\","/") for review in glob.glob(valid_base_path,recursive=True))
x1_valid,y1_valid,x2_valid,y2_valid = create_dataset(valid_reviews)

In [11]:
# Words removed during preprocessing: they carry little signal for prediction and
# only add computation. Many entries appear twice (the second half of the list
# repeats words from the first); duplicates are harmless for membership tests.
# Kept as a list so existing code that indexes or iterates it keeps working.
stop_words = [ 'are', 'around','as', 'at', 'back', 'be', 'became', 'because', 'become', 'becomes', 'becoming', 'been', 'before',
             'beforehand', 'behind', 'being', 'below', 'beside', 'besides', 'between', 'beyond', 'bill', 'both',
             'bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant', 'co', 'con', 'could', 'couldnt', 'cry', 'de',
             'describe', 'detail', 'did', 'do', 'does', 'doing', 'don', 'done', 'down', 'due', 'during', 'each', 'eg',
             'eight', 'either', 'eleven', 'else', 'elsewhere', 'empty', 'enough', 'etc', 'even', 'ever', 'every', 'everyone',
             'everything', 'everywhere', 'except', 'few', 'fifteen', 'fifty', 'fill', 'find', 'fire', 'first', 'five', 'for',
             'former', 'formerly', 'forty', 'found', 'four', 'from', 'front', 'full', 'further', 'get', 'give', 'go', 'had',
             'has', 'hasnt', 'have', 'having', 'he', 'hence', 'her', 'here', 'hereafter', 'hereby', 'herein', 'hereupon',
             'hers', 'herself', 'him', 'himself', 'his', 'how', 'however', 'hundred', 'i', 'ie', 'if', 'in', 'inc', 'indeed',
             'interest', 'into', 'is', 'it', 'its', 'itself', 'just', 'keep', 'last', 'latter', 'latterly', 'least', 'less',
             'ltd', 'made', 'many', 'may', 'me', 'meanwhile', 'might', 'mill', 'mine', 'more', 'moreover', 'most', 'mostly',
             'move', 'much', 'must', 'my', 'myself', 'name', 'namely', 'neither', 'never', 'nevertheless', 'next', 'nine',
             'no', 'nobody', 'none', 'noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'of', 'off', 'often', 'on', 'once',
             'one', 'only', 'onto', 'or', 'other', 'others', 'otherwise', 'our', 'ours', 'ourselves', 'out', 'over', 'own',
             'part', 'per', 'perhaps', 'please', 'put', 'rather', 're', 's', 'same', 'see', 'seem', 'seemed', 'seeming',
             'seems', 'serious', 'several', 'she', 'should', 'show', 'side', 'since', 'sincere', 'six', 'sixty', 'so',
             'some', 'somehow', 'someone', 'something', 'sometime', 'sometimes', 'somewhere', 'still', 'such', 'system',
             't', 'take', 'ten', 'than', 'that', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'thence', 'there',
             'thereafter', 'thereby', 'therefore', 'therein', 'thereupon', 'these', 'they', 'thick', 'thin', 'third', 'this',
             'those', 'though', 'three', 'through', 'throughout', 'thru', 'thus', 'to', 'together', 'too', 'top', 'toward',
             'towards', 'twelve', 'twenty', 'two', 'un', 'under', 'until', 'up', 'upon', 'us', 'very', 'via', 'was', 'we',
             'well', 'were', 'what', 'whatever', 'when', 'whence', 'whenever', 'where', 'whereafter', 'whereas', 'whereby',
             'wherein', 'whereupon', 'wherever', 'whether', 'which', 'while', 'whither', 'who', 'whoever', 'whole', 'whom',
             'whose', 'why', 'will', 'with', 'within', 'without', 'would', 'yet', 'you', 'your', 'yours', 'yourself',
             'yourselves', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll",
             "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers',
             'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who',
             'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
             'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or',
             'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
             'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off',
             'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all',
             'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same',
             'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd',
             'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn',
             "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't",
             'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren',
             "weren't", 'won', "won't", 'wouldn', "wouldn't"]


# preprocess string function (stemming/lemmatization to be added)

def preprocess_string(s,stop_words):
    """
    Normalise a piece of text for bag-of-words features.

    Steps: remove ASCII punctuation, collapse every run of whitespace to a
    single space, lower-case, then drop tokens of length <= 2 and stop words.

    s          - text to clean
    stop_words - iterable of words to drop

    Returns the surviving tokens joined by single spaces ("" if none survive).
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    # Punctuation is removed from the text before the stop-word check, so an
    # entry such as "wasn't" could never match the token "wasnt". Match against
    # both the given words and their normalised (punctuation-free, lower-case)
    # forms. A set also makes each membership test O(1).
    stop_set = set(stop_words)
    stop_set.update([word.translate(strip_punctuation).lower() for word in stop_set])

    s=s.translate(strip_punctuation)
    # raw string: '\s' in a plain literal is an invalid escape sequence
    s=re.sub(r'\s+',' ',s)
    s=s.lower()
    kept = [word for word in s.split(" ") if len(word)>2 and word not in stop_set]
    return " ".join(kept)

In [None]:
# # all words list
# vocab_positive_negative = {}
# for i in range(len(x1_train)):
#     for word in x1_train[i][0].split(" "):
#         word  = word.strip(string.punctuation).lower()
#         if (len(word)>2)  and (word not in stop_words):  
#             if word in vocab_positive_negative:
#                 vocab_positive_negative[word]+=1
#             else:
#                 vocab_positive_negative[word]=1       

In [None]:
# all words list
# Build the training vocabulary: word -> number of occurrences over all training reviews.
# Each review is cleaned with preprocess_string instead of a hand-copied variant of it,
# so newlines and tabs also act as word separators (splitting on " " alone fuses words
# that are only separated by a line break). As before, the cleaned text is written back
# into x1_train[i][0].
vocab_positive_negative = {}
for i in range(len(x1_train)):
    x1_train[i][0] = preprocess_string(x1_train[i][0], stop_words)
    # split() without arguments yields no tokens for an empty review
    for word in x1_train[i][0].split():
        vocab_positive_negative[word] = vocab_positive_negative.get(word, 0) + 1

In [9]:
# Sanity check: dataset size and the first [content, path] entry ...
print("len of x1_train: ", len(x1_train))
print(x1_train[0])

# ... then a separator followed by the same review after preprocessing.
print("***")

print(preprocess_string(s=x1_train[0][0], stop_words=stop_words))

len of x1_train:  960
["Affinia Chicago is one of the worst hotels I have ever stayed at. Not in my life have I been treated so poorly as a guest. The front desk was very unaccommodating when I asked for a smoke free room when they had made an error in my reservation. There was no bellhop available for some strange reason so I had to move all my luggage to the elevator and down a long hallway to my room by myself. If it wasn't already a bad stay, I ordered room service and it took over an hour and a half to be delivered. If they didn't have air conditioning in the room, I would say just about everything about this stay was completely miserable. If you are traveling to Chicago for any kind of business, I hope you decide not to choose this hotel. I was quite surprised, I like Chicago as a city but this stay definitely made my trip quite a negative experience.", 'D:/USC/Applied Natural Language Processing - 544/Naive_Bayes/op_spam_training_data/train/negative_polarity/deceptive_from_MTurk

In [None]:
# number of distinct words collected in the training vocabulary
print(len(vocab_positive_negative))

In [None]:
# import heapq
# most_freq = heapq.nlargest(5000, vocab_positive_negative, key=vocab_positive_negative.get)
# most_freq

In [None]:
# average number of occurrences per vocabulary word
np.mean(list(vocab_positive_negative.values()))

In [None]:
# Keep as features only the vocabulary words seen at least threshold_freq times
# (dictionary order is preserved).
threshold_freq=2
features = [
    word
    for word, count in vocab_positive_negative.items()
    if count >= threshold_freq
]
print(len(features))

In [None]:
# Bag-of-words count matrix for the training reviews: one row per review, one column per feature.
# Every count starts at 1 rather than 0 — presumably add-one smoothing; TODO confirm.
x1_train_dataset = [([1]*len(features)) for i in range(len(x1_train))]

# column of each feature word, built once so lookups are O(1) instead of features.index() per word
feature_index = {word: col for col, word in enumerate(features)}

for i in range(len(x1_train)):
    # element [0] is the review text; element [1] is the file path and must not be tokenised.
    # preprocess_string leaves already-cleaned text unchanged, so this is safe either way.
    word_list = preprocess_string(x1_train[i][0], stop_words).split()
    for word in word_list:
        col = feature_index.get(word)
        if col is not None:
            x1_train_dataset[i][col] += 1

In [None]:
# Bag-of-words count matrix for the validation reviews: one row per review, one column per feature.
# Every count starts at 1 rather than 0 — presumably add-one smoothing; TODO confirm.
x1_valid_dataset = [([1]*len(features)) for i in range(len(x1_valid))]

# column of each feature word, built once so lookups are O(1) instead of features.index() per word
feature_index = {word: col for col, word in enumerate(features)}

for i in range(len(x1_valid)):
    # element [0] is the review text; element [1] is the file path and must not be tokenised.
    # The validation text is still raw here, so clean it the same way as the training text,
    # otherwise its tokens cannot match the lower-cased, punctuation-free feature words.
    word_list = preprocess_string(x1_valid[i][0], stop_words).split()
    for word in word_list:
        col = feature_index.get(word)
        if col is not None:
            x1_valid_dataset[i][col] += 1

In [None]:
# class NaiveBayes:
#     def fit(self, X, y):
#         n_samples, n_features = X.shape
#         self._classes = np.unique(y)
#         n_classes = len(self._classes)

#         # calculate mean, var, and prior for each class
#         self._mean = np.zeros((n_classes, n_features), dtype=np.float64)
#         self._var = np.zeros((n_classes, n_features), dtype=np.float64)
#         self._priors = np.zeros(n_classes, dtype=np.float64)

#         for idx, c in enumerate(self._classes):
#             X_c = X[y == c]
#             self._mean[idx, :] = X_c.mean(axis=0)
#             self._var[idx, :] = X_c.var(axis=0)
#             self._priors[idx] = X_c.shape[0] / float(n_samples)

#     def predict(self, X):
#         y_pred = [self._predict(x) for x in X]
#         return np.array(y_pred)

#     def _predict(self, x):
#         posteriors = []

#         # calculate posterior probability for each class
#         for idx, c in enumerate(self._classes):
#             prior = np.log(self._priors[idx])
#             posterior = np.sum(np.log(self._pdf(idx, x)))
#             posterior = prior + posterior
#             posteriors.append(posterior)

#         # return class with highest posterior probability
#         return self._classes[np.argmax(posteriors)]

#     def _pdf(self, class_idx, x):
#         mean = self._mean[class_idx]
#         var = self._var[class_idx]
#         numerator = np.exp(-((x - mean) ** 2) / (2 * var))
#         denominator = np.sqrt(2 * np.pi * var)
#         return numerator / denominator

In [None]:
# def accuracy(y_true, y_pred):
#         accuracy = np.sum(y_true == y_pred) / len(y_true)
#         return accuracy

In [None]:
# nb = NaiveBayes()
# nb.fit(np.asarray(x1_train_dataset), np.asarray(y1_train))
# predictions = nb.predict(np.asarray(x1_valid_dataset))

# print("Naive Bayes classification accuracy", accuracy(y1_valid, predictions))

In [None]:
# print(predictions)