## Perceptron Implementation with Bigram Data Representation




In [2]:
import scipy
import numpy as np
from sklearn.preprocessing import scale
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from collections import Counter
import matplotlib.pyplot as plt
import pandas as pd

import time
import math
import random

In [3]:
# Load the training reviews; data_compression() below reads the
# 'text' and 'label' columns of this frame.
path = "hw2data_1/reviews_tr.csv"
df_train = pd.read_csv(path)

In [4]:
# Data Representation: Bigram
# Use hashmap to compress the data
def data_compression(df_train):
    """Convert every review into a sparse bigram-count dictionary.

    Each row of ``df_train`` becomes one dict that maps a bigram (two
    consecutive whitespace-separated tokens joined by a single space) to
    the number of times it occurs in the row's ``text``.  Two reserved
    keys are added to every dict:

    * ``'*label*'`` -- ``1`` when the row's ``label`` equals 1, otherwise ``-1``.
    * ``'*const*'`` -- always ``1`` (the lifted bias feature).

    The reserved keys contain no space, so they cannot collide with a
    bigram key.

    Args:
        df_train: DataFrame with a ``text`` and a ``label`` column.

    Returns:
        A list with one dict per row, in row order.
    """
    list_dict = []  # One sparse example per row

    # Walk the two needed columns directly instead of materialising a
    # Series per row with iterrows().
    for text, label in zip(df_train['text'], df_train['label']):
        # A missing text cell is not a string (pandas typically yields a
        # float NaN for it); such a row simply has no bigrams.
        words = text.split() if isinstance(text, str) else []

        new_dict = {}
        for first, second in zip(words, words[1:]):
            key = first + " " + second
            new_dict[key] = new_dict.get(key, 0) + 1

        new_dict['*label*'] = 1 if label == 1 else -1  # Attach the label
        new_dict['*const*'] = 1  # Lifting
        list_dict.append(new_dict)

    return list_dict

In [13]:
def train_bigram(list_dict):
    """Run a single perceptron pass over the sparse examples.

    ``list_dict`` is shuffled in place, then visited once.  For each
    example the score is the sparse dot product of the current weights
    with every feature except the reserved ``'*label*'`` entry.  When
    ``score * label <= 0`` (a mistake, or a score of exactly zero) each
    feature's weight is moved by ``label * feature_value``.

    Args:
        list_dict: list of dicts, each holding feature values plus a
            ``'*label*'`` entry.

    Returns:
        dict mapping feature key to its learned weight; only features
        seen in at least one updating example are present.
    """
    weights = {}

    random.shuffle(list_dict)
    for example in list_dict:
        target = example['*label*']
        score = 0
        for feature, amount in example.items():
            if feature == '*label*' or feature not in weights:
                continue
            score += weights[feature] * amount

        # Correctly classified with positive margin: leave weights alone.
        if score * target > 0:
            continue

        for feature, amount in example.items():
            if feature == '*label*':
                continue
            step = target * amount
            if feature in weights:
                weights[feature] += step
            else:
                weights[feature] = step
    return weights

In [5]:
# Build the sparse bigram examples for the training set and report the
# elapsed time.  perf_counter() is used because it is a monotonic clock
# intended for measuring intervals, unlike the wall clock time.time().
start_time = time.perf_counter()
list_dict = data_compression(df_train)
print("--- %s seconds ---" % (time.perf_counter() - start_time))

--- 153.72502398490906 seconds ---


In [14]:
# Train the perceptron on the compressed examples and report the elapsed
# time.  perf_counter() is used because it is a monotonic clock intended
# for measuring intervals, unlike the wall clock time.time().
start_time = time.perf_counter()
w = train_bigram(list_dict)
print("--- %s seconds ---" % (time.perf_counter() - start_time))

--- 113.894287109375 seconds ---


In [19]:
# The 20 bigrams with the most NEGATIVE weights: an ascending sort puts the
# lowest weights first, i.e. the features that push the score toward label -1.
print(sorted(w.items(), key=lambda item: item[1])[:20])

[('three stars', -79), ('a ok', -74), ('two stars', -66), ('2 stars', -57), ('3 stars', -57), ('very disappointed', -45), ('the worst', -44), ('very bland', -43), ('not impressed', -43), ('not worth', -40), ('more stars', -40), ('so salty', -40), ('just average', -40), ('meh i', -40), ('very disappointing', -39), ('mediocre food', -39), ('food poisoning', -38), ('over priced', -38), ('a joke', -38), ('was awful', -37)]


In [20]:
# The 20 bigrams with the most POSITIVE weights: a descending sort puts the
# highest weights first, i.e. the features that push the score toward label +1.
print(sorted(w.items(), key=lambda item: item[1], reverse=True)[:20])

[('five stars', 67), ('four stars', 52), ('4 because', 45), ('not disappointed', 44), ('round up', 44), ('fifth star', 40), ('only negative', 38), ('fries diet', 38), ('5th star', 38), ('coke double', 38), ('not disappoint', 37), ('rounded up', 37), ('my new', 36), ('star off', 36), ('t disappointed', 35), ('onions fries', 35), ('5 but', 32), ('to 4', 32), ('was phenomenal', 31), ('very pleased', 31)]


In [21]:
# Load the held-out test reviews; test_bigram() passes this frame to
# data_compression(), which reads its 'text' and 'label' columns.
path2 = "hw2data_1/reviews_te.csv"
df_test = pd.read_csv(path2)

In [23]:
def test_bigram(df_test, w):
    """Return the fraction of test reviews the weight vector classifies correctly.

    The test frame is converted with ``data_compression``.  Each example is
    scored with the sparse dot product of ``w`` and its features (the
    reserved ``'*label*'`` entry is skipped, as are features absent from
    ``w``).  An example counts as wrong when ``score * label <= 0``, so a
    score of exactly zero is always a miss.

    Args:
        df_test: DataFrame accepted by ``data_compression``.
        w: dict mapping feature key to weight.

    Returns:
        ``(total - wrong) / total`` as a float.

    Raises:
        ZeroDivisionError: if ``df_test`` yields no examples.
    """
    examples = data_compression(df_test)
    mistakes = 0
    for example in examples:
        target = example['*label*']
        score = 0
        for feature, amount in example.items():
            if feature != '*label*' and feature in w:
                score += w[feature] * amount
        if score * target <= 0:
            mistakes += 1
    total = len(examples)
    return (total - mistakes) / total

In [24]:
# Print the test-set accuracy and the elapsed time (which includes
# compressing the test frame inside test_bigram).  perf_counter() is used
# because it is a monotonic clock intended for measuring intervals, unlike
# the wall clock time.time().
start_time = time.perf_counter()
print(test_bigram(df_test, w))
print("--- %s seconds ---" % (time.perf_counter() - start_time))

0.901897
--- 279.6965320110321 seconds ---
