## Perceptron Implementation with Unigram Data Representation


In [1]:
import scipy
import numpy as np
from sklearn.preprocessing import scale
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from collections import Counter
import matplotlib.pyplot as plt
import pandas as pd

import time
import math
import random

In [2]:
path = "hw2data_1/reviews_tr.csv"
df_train = pd.read_csv(path)

In [5]:
print(len(df_train))
print(df_train.head(10))

1000000
   label                                               text
0      1  first time here food tastes great good environ...
1      0  i have been craving burgers lately so i decide...
2      1  i love having a place like this in the neighbo...
3      1  i had the morning monte which was delicious i ...
4      1  i have this app on my phone that lists the pla...
5      1  in n out is great i love their burgers and fri...
6      1  a local favourite byob beer wine the restauran...
7      1  i had zee most elegant evening here with my bf...
8      0  outdated decor slow and unfriendly service pai...
9      1  what can i say other than wow we so enjoy our ...


In [11]:
vocabulary_set = set()
for text in df_train['text']:
    words = text.split()
    for word in words:
        if word not in vocabulary_set: vocabulary_set.add(word)
print('The number of unique words: ', len(vocabulary_set))

The number of unique words:  207429


In [3]:
# Data Representation: Unigram
# Use hashmap to compress the data
def data_compression(df_train):
    list_dict = []  # Contains training examples compressed with hashmap
#     vocabulary_dict = {} # Contains the position to reconstruct the feature vector
    for index, row in df_train.iterrows():
        new_dict = {}
        words = row['text'] + ""
        words = words.split()
        for word in words:
#             if word not in vocabulary_dict:
#                 vocabulary_dict[word] = 0

            if word in new_dict:
                new_dict[word] += 1
            else:
                new_dict[word] = 1

        if(row['label'] == 1):  # Attach the label
            new_dict['*label*'] = 1
        else: new_dict['*label*'] = -1

        new_dict['*const*'] = 1   # Lifting
        list_dict.append(new_dict)

#     count = 0
#     for word in vocabulary_dict:
#         vocabulary_dict[word] = count
#         count += 1
    return list_dict

In [None]:
start_time = time.time()
list_dict = data_compression(df_train)
print("--- %s seconds ---" % (time.time() - start_time))

In [27]:
# Incomplete online-batch perceptron implementation: need to complete

def train_unigram(list_dict):
    w = {} # Store tmp weights
    w_ret = {} # Store the average weigths
    
    # First pass
    random.shuffle(list_dict)
    for x in list_dict:
        dot_product = 0
        label = x['*label*']
        for key, value in x.items():
            if key != '*label*':
                if key in w:
                    dot_product += w[key] * x[key]
        
        if dot_product * label <= 0:
            
            for key, value in x.items():
                if key != '*label*':
                    if key in w:
                        w[key] += label * x[key]
                    else: w[key] = label * x[key]
    
    
    # Second pass
    random.shuffle(list_dict)
    w_ret = dict(w) # Initilize w_ret
    for x in list_dict:
        dot_product = 0
        label = x['*label*']
        for key, value in x.items():
            if key != '*label*':
                if key in w:
                    dot_product += w[key] * x[key]
           
        if dot_product * label <= 0:
            for key, value in x.items():
                if key != '*label*':
                    if key in w:
                        w[key] += label * x[key]
                    else: w[key] = label * x[key]
        
        # Update w_ret
        for key, value in w.items():
            if key in w_ret:
                w_ret[key] += value
            else:
                w_ret[key] = value

                
    length = len(list_dict) + 1
    for key, value in w_ret.items():
        w_ret[key] /= length
    return w_ret
    
#     # Transfrom back to a vector
#     w_vector = [0] * (len(vocabulary_dict) + 1)
#     for key, value in w.items():
#         if key in vocabulary_dict:
#             w_vector[vocabulary_dict[key]] = value
#     w_vector[-1] = w['*const*']

In [23]:
def train_unigram(list_dict):
    w = {} 
    
    # First pass
    random.shuffle(list_dict)
    for x in list_dict:
        dot_product = 0
        label = x['*label*']
        for key, value in x.items():
            if key != '*label*':
                if key in w:
                    dot_product += w[key] * x[key]
        
        if dot_product * label <= 0:           
            for key, value in x.items():
                if key != '*label*':
                    if key in w:
                        w[key] += label * x[key]
                    else: w[key] = label * x[key]
    return w

In [21]:
int(len(list_dict)*0.2)

200000

In [28]:
list_dict_tmp = list_dict[:int(len(list_dict)*0.1)]


In [None]:
start_time = time.time()
w = train_unigram(list_dict_tmp) # Test on a smaller subset
print("--- %s seconds ---" % (time.time() - start_time))

In [16]:
path2 = "hw2data_1/reviews_te.csv"
df_test = pd.read_csv(path2)

In [22]:
def test_unigram(df_test, w):
    dict_list_test = data_compression(df_test)
    count = 0
    wrong = 0
    for dictionary in dict_list_test:
        count += 1
        dot_product = 0
        label = dictionary['*label*']
        for key, value in dictionary.items():
            if key in w and key != '*label*':
                dot_product += w[key] * dictionary[key]
        if dot_product * label <= 0: wrong += 1 
    return (count - wrong) / count

In [23]:
start_time = time.time()
print(test_unigram(df_test, w))
print("--- %s seconds ---" % (time.time() - start_time))

0.879384
--- 145.81845998764038 seconds ---


In [25]:
# TOP15 negative words
sorted(w.items(), key=lambda x: x[1], reverse = False)[:15]

[('worst', -190),
 ('mediocre', -169),
 ('bland', -135),
 ('terrible', -130),
 ('overpriced', -130),
 ('horrible', -129),
 ('disappointing', -128),
 ('ok', -127),
 ('awful', -119),
 ('poor', -118),
 ('dry', -112),
 ('disappointment', -109),
 ('excited', -103),
 ('meh', -102),
 ('rude', -100)]

In [26]:
# TOP15 Positive words
sorted(w.items(), key=lambda x: x[1], reverse = True)[:15]

[('perfect', 136),
 ('amazing', 128),
 ('perfectly', 124),
 ('fantastic', 119),
 ('perfection', 107),
 ('delicious', 105),
 ('wonderful', 102),
 ('incredible', 98),
 ('excellent', 97),
 ('glad', 96),
 ('awesome', 91),
 ('pleased', 83),
 ('outstanding', 83),
 ('disappoint', 83),
 ('satisfied', 77)]

In [330]:
# TF-IDF Representation
frequency = {}
for x in list_dict:
    for key, value in x.items():
        if key in frequency:
            frequency[key] += 1
        else:
            frequency[key] = 1

In [381]:
# import copy
list_dict1 = list(list_dict) # A shallow copy of list_dict
# list_dict1 = copy.deepcopy(list_dict)  #This is a deep copy but takes up a lot of memory

In [382]:
for x in list_dict1:
    for key, value in x.items():
        if key != '*label*' and key != '*const*':
            x[key] = x[key] * math.log((len(list_dict1) / frequency[key]), 10)

In [383]:
start_time = time.time()
w_tfidf = train_unigram(list_dict1, vocabulary_dict)
print("--- %s seconds ---" % (time.time() - start_time))

{}
1000000   151064


In [408]:
sorted(w_tfidf.items(), key=lambda x: x[1], reverse = False)[:10]

[('harman', -89.99999999999999),
 ('mediocre', -84.22474040199054),
 ('worst', -80.10731281700849),
 ('roum6b8yd4ykkugqcxtoug', -77.99999999999999),
 ('underwhelmed', -77.59042463214872),
 ('hopes', -71.74741110380765),
 ('disappointing', -70.60585455721528),
 ('not', -68.8592132041259),
 ('bland', -64.50013572257728),
 ('downhill', -63.45078877872636)]

In [409]:
sorted(w_tfidf.items(), key=lambda x: x[1], reverse = True)[:10]

[('n00b', 92.83507630617008),
 ('great', 91.8086860288631),
 ('delicious', 84.33372264813823),
 ('ftr', 81.41181741504607),
 ('jmc', 80.96910013008055),
 ('and', 78.32414188144158),
 ('ohfh6alqqq35niebd1exuw', 77.99999999999999),
 ('perfection', 77.70343425406247),
 ('amazing', 74.67204708243527),
 ('zfpcpbzssimrybsg9jxndw', 71.99999999999999)]

In [410]:
frequency['n00b']

13

In [411]:
def test_tfidf(df_test, w_tfidf, frequency):
    dict_list_test, dict_vocabulary_test = data_compression(df_test)
    count = 0
    wrong = 0
    for x in dict_list_test:
        for key, value in x.items():
            if key != '*label*' and key != '*const*':
                if key in frequency: x[key] = x[key] * math.log((len(dict_list_test) / frequency[key]), 10)
        
        count += 1
        dot_product = 0
        label = x['*label*']
        for key, value in x.items():
            if key in w and key != '*label*':
                dot_product += w[key] * x[key]
        if dot_product * label <= 0: wrong += 1 
    return (count - wrong) / count

In [412]:
start_time = time.time()
print(test_tfidf(df_test, w_tfidf, frequency))
print("--- %s seconds ---" % (time.time() - start_time))

0.872884
--- 283.20725202560425 seconds ---


In [None]:
# shallow copy and deep copy

In [372]:
list1 = []
dict1 = {'a': 1, 'b': 2}
dict2 = {'f':1, 'e':2}
list1.append(dict1)
list1.append(dict2)

In [366]:
list2 = list(list1)
list2[0]['a'] = 3
print(list1)
print(list2)

[{'a': 3, 'b': 2}, {'f': 1, 'e': 2}]
[{'a': 3, 'b': 2}, {'f': 1, 'e': 2}]


In [374]:
import copy
list2 = copy.deepcopy(list1)
list2[0]['a'] = 3
print(list1)
print(list2)

[{'a': 1, 'b': 2}, {'f': 1, 'e': 2}]
[{'a': 3, 'b': 2}, {'f': 1, 'e': 2}]


In [42]:
# 

def train():
    
    X = [] # Training set feature vectors after lifting
    Y = [] # Training set labels
    w = [0] * len(X[0]) #weight vector
    w = np.array(w)

    # Need to shuffle the data
    
    # First pass
    for i in range(len(X)):
        if X[i] @ w * Y[i] <= 0: 
            w = w + Y[i] * X[i]

    w_final = np.array(w)

    # Need to shuffle the data

    
    # Second pass
    for i in range(len(X)):
        if X[i] @ w * Y[i] <= 0: 
            w = w + Y[i] * X[i]
        w_final += w

    w_final /= len(X) + 1
    
    return w_final

def test(x, w):
    if x @ w > 0: return 1
    if x @ w < 0: return 0
    else return -1  #Error

IndexError: list index out of range

In [48]:
from sklearn.feature_extraction.text import CountVectorizer
# list of text documents
text = ["The quick brown fox jumped over the lazy dog."]
# create the transform
vectorizer = CountVectorizer()
# tokenize and build vocab
vectorizer.fit(text)
# summarize
print(vectorizer.vocabulary_)
# encode document
vector = vectorizer.transform(text)
# summarize encoded vector
print(vector.shape)
print(type(vector))
print(vector.toarray())


{'the': 7, 'quick': 6, 'brown': 0, 'fox': 2, 'jumped': 3, 'over': 5, 'lazy': 4, 'dog': 1}
(1, 8)
<class 'scipy.sparse.csr.csr_matrix'>
[[1 1 1 1 1 1 1 2]]


In [74]:
# Problem lies in dot product: it takes so much time for two vectors with 110000+ values
list1 = [1] * 110000
list2 = [i for i in range(110000)]  
np.array(list1) @ np.array(list2)

In [314]:
# != compares the value, whereas is not compares if they are the same object
dict_test = {'*label*': 1}
for key, value in dict_test.items():
    if key != '*label*': print("haha") 