## Perceptron Implementation with Bigram Data Representation




In [1]:
import scipy
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import time
import math
import random

In [2]:
# Load training data
path = "hw2data_1/reviews_tr.csv"
df_train = pd.read_csv(path)

In [3]:
# Load testing data
path2 = "hw2data_1/reviews_te.csv"
df_test = pd.read_csv(path2)

In [4]:
# Data Representation: Bigram
# Use hashmap to compress the data
def data_compression(df_train):
    list_dict = []  # Contains training examples compressed with hashmap
    for index, row in df_train.iterrows():
        new_dict = {}
        words = row['text'] + ""
        words = words.split()
        
        for i in range(len(words) - 1):
            key = words[i] + " " + words[i + 1] # Use two words as key
            if key in new_dict:
                new_dict[key] += 1
            else:
                new_dict[key] = 1

        if(row['label'] == 1):  # Attach the label
            new_dict['*label*'] = 1
        else: new_dict['*label*'] = -1

        new_dict['*const*'] = 1   # Lifting
        list_dict.append(new_dict)

    
    return list_dict

In [5]:
# Online-batch perceptron implementation
def train_bigram(list_dict):
    w = {} 
    
    random.shuffle(list_dict)
    for x in list_dict:
        dot_product = 0
        label = x['*label*']
        for key, value in x.items():
            if key != '*label*':
                if key in w:
                    dot_product += w[key] * x[key]
        
        if dot_product * label <= 0:           
            for key, value in x.items():
                if key != '*label*':
                    if key in w:
                        w[key] += label * x[key]
                    else: w[key] = label * x[key]
                        
    random.shuffle(list_dict)
    w_ret = dict(w) # Initilize w_ret
    for x in list_dict:
        dot_product = 0
        label = x['*label*']
        for key, value in x.items():
            if key != '*label*':
                if key in w:
                    dot_product += w[key] * x[key]
           
        if dot_product * label <= 0:
            for key, value in x.items():
                if key != '*label*':
                    if key in w:
                        w[key] += label * x[key]
                    else: w[key] = label * x[key]
        
        # Update w_ret
        for key, value in w.items():
            if key in w_ret:
                w_ret[key] += value
            else:
                w_ret[key] = value
                
                
    # Calculate weighted weight vector
    length = len(list_dict) + 1
    for key, value in w_ret.items():
        w_ret[key] /= length            
    return w_ret

In [2]:
# Test classifier on a give testing set
def test_bigram(dict_list_test, w):
    count = 0
    wrong = 0
    for dictionary in dict_list_test:
        count += 1
        dot_product = 0
        label = dictionary['*label*']
        for key, value in dictionary.items():
            if key in w and key != '*label*':
                dot_product += w[key] * dictionary[key]
        if dot_product * label <= 0: wrong += 1 
    return (count - wrong) / count

In [7]:
start_time = time.time()
list_dict = data_compression(df_train)
print("--- %s seconds ---" % (time.time() - start_time))

--- 143.66006231307983 seconds ---


In [8]:
# Evaluating the performance of the classifier with different training sizes: 10%, 20%, ... ,100%
def test(df_test, list_dict):
    dict_list_test = data_compression(df_test)
    w_list = []
    accuracy_list = []
    for i in range(10):
        list_dict_training = list_dict[:int(len(list_dict) * 0.1 * (i + 1))]
        w = train_bigram(list_dict_training)
        w_list.append(w)
        print('Training size: ', (i + 1) * 10, '%', end='')
        
        accuracy = test_bigram(dict_list_test, w)
        accuracy_list.append(accuracy)
        print('  Accuracy: ', accuracy)
        print()
        
    return w_list, accuracy_list

In [9]:
w_list = []
accuracy_list = []
w_list, accuracy_list = test(df_test, list_dict)

Training size:  10 %  Accuracy:  0.8495511086398311

Training size:  20 %  Accuracy:  0.855533202966369

Training size:  30 %  Accuracy:  0.8538807079800826

Training size:  40 %  Accuracy:  0.8655700014369522

Training size:  50 %  Accuracy:  0.8577948407169766

Training size:  60 %  Accuracy:  0.8667726679203553

Training size:  70 %  Accuracy:  0.8680815439113838

Training size:  80 %  Accuracy:  0.8751600952136998

Training size:  90 %  Accuracy:  0.8706961720843929

Training size:  100 %  Accuracy:  0.8700807817019761



In [10]:
# Top20 positive phrases when training size is 70% (highest accuracy)
sorted(w_list[7].items(), key=lambda x: x[1], reverse = False)[:20]

[('a ok', -71),
 ('two stars', -68),
 ('three stars', -66),
 ('3 stars', -62),
 ('very disappointed', -51),
 ('2 stars', -46),
 ('so disappointed', -46),
 ('food poisoning', -46),
 ('not worth', -45),
 ('the worst', -45),
 ('never again', -44),
 ('3 star', -43),
 ('very disappointing', -43),
 ('mediocre food', -42),
 ('an ok', -38),
 ('2 5', -37),
 ('more stars', -37),
 ('not impressed', -36),
 ('meh i', -36),
 ('an okay', -35)]

In [12]:
# Top20 positive phrases when training size is 70% (highest accuracy)
sorted(w_list[7].items(), key=lambda x: x[1], reverse = True)[:20]

[('five stars', 49),
 ('not disappointed', 47),
 ('four stars', 44),
 ('rib crab', 43),
 ('4 because', 39),
 ('round up', 39),
 ('just perfect', 38),
 ('never disappointed', 38),
 ('legs prime', 38),
 ('this good', 35),
 ('all delicious', 35),
 ('only negative', 34),
 ('was impeccable', 34),
 ('4 stars', 33),
 ('5 stars', 33),
 ('definitely coming', 32),
 ('you won', 32),
 ('very pleased', 32),
 ('new favorite', 31),
 ('t disappointed', 31)]

## Test Result

    Training size:  10 %  Accuracy:  0.8495511086398311
    Training size:  20 %  Accuracy:  0.855533202966369
    Training size:  30 %  Accuracy:  0.8538807079800826
    Training size:  40 %  Accuracy:  0.8655700014369522
    Training size:  50 %  Accuracy:  0.8577948407169766
    Training size:  60 %  Accuracy:  0.8667726679203553
    Training size:  70 %  Accuracy:  0.8680815439113838
    Training size:  80 %  Accuracy:  0.8751600952136998
    Training size:  90 %  Accuracy:  0.8706961720843929
    Training size:  100 %  Accuracy:  0.8700807817019761


In [14]:
start_time = time.time()
w = train_bigram(list_dict)
print("--- %s seconds ---" % (time.time() - start_time))

--- 113.894287109375 seconds ---


In [19]:
# Top20 positive words
sorted(w.items(), key=lambda x: x[1], reverse = False)[:20]

[('three stars', -79), ('a ok', -74), ('two stars', -66), ('2 stars', -57), ('3 stars', -57), ('very disappointed', -45), ('the worst', -44), ('very bland', -43), ('not impressed', -43), ('not worth', -40), ('more stars', -40), ('so salty', -40), ('just average', -40), ('meh i', -40), ('very disappointing', -39), ('mediocre food', -39), ('food poisoning', -38), ('over priced', -38), ('a joke', -38), ('was awful', -37)]


In [20]:
# Top20 negative words
sorted(w.items(), key=lambda x: x[1], reverse = True)[:20]

[('five stars', 67), ('four stars', 52), ('4 because', 45), ('not disappointed', 44), ('round up', 44), ('fifth star', 40), ('only negative', 38), ('fries diet', 38), ('5th star', 38), ('coke double', 38), ('not disappoint', 37), ('rounded up', 37), ('my new', 36), ('star off', 36), ('t disappointed', 35), ('onions fries', 35), ('5 but', 32), ('to 4', 32), ('was phenomenal', 31), ('very pleased', 31)]


In [24]:
dict_list_test = data_compression(df_test)
start_time = time.time()
print(test_bigram(dict_list_test, w))
print("--- %s seconds ---" % (time.time() - start_time))

0.901897
--- 279.6965320110321 seconds ---
