## Perceptron Implementation with Tf-idf Data Representation


In [2]:
import scipy
import numpy as np
from sklearn.preprocessing import scale
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from collections import Counter
import matplotlib.pyplot as plt
import pandas as pd

import time
import math
import random

In [3]:
path = "hw2data_1/reviews_tr.csv"
df_train = pd.read_csv(path)

In [60]:
import nltk
stopwords = set(nltk.corpus.stopwords.words('english'))

In [88]:
# Data Representation: tf-idf
# Use hashmap to compress the data
def data_compression(df_train):
    list_dict = []  # Contains training examples compressed with hashmap

    for index, row in df_train.iterrows():
        new_dict = {}
        words = row['text'] + ""
        words = words.split()
        for word in words:
            if word not in stopwords and len(word) < 18: # Stopwords & filter some invalid strings
                if word in new_dict:
                    new_dict[word] += 1
                else:
                    new_dict[word] = 1

        if(row['label'] == 1):  # Attach the label
            new_dict['*label*'] = 1
        else: new_dict['*label*'] = -1

        new_dict['*const*'] = 1   # Lifting
        list_dict.append(new_dict)

    return list_dict

In [89]:
def train_tfidf(list_dict):
    w = {} 
    
    # First pass
    random.shuffle(list_dict)
    for x in list_dict:
        dot_product = 0
        label = x['*label*']
        for key, value in x.items():
            if key != '*label*':
                if key in w:
                    dot_product += w[key] * x[key]
        
        if dot_product * label <= 0:           
            for key, value in x.items():
                if key != '*label*':
                    if key in w:
                        w[key] += label * x[key]
                    else: w[key] = label * x[key]
    
    return w

In [90]:
start_time = time.time()
list_dict = data_compression(df_train)
print("--- %s seconds ---" % (time.time() - start_time))

--- 145.77557611465454 seconds ---


In [91]:
frequency = {}
for x in list_dict:
    for key, value in x.items():
        if key in frequency:
            frequency[key] += 1
        else:
            frequency[key] = 1

In [92]:
# Modify feature vectors' values based on tf-idf
for x in list_dict:
    for key, value in x.items():
        if key != '*label*' and key != '*const*':
            x[key] = x[key] * math.log((len(list_dict) / frequency[key]), 10)

In [93]:
start_time = time.time()
w = train_tfidf(list_dict)
print("--- %s seconds ---" % (time.time() - start_time))

--- 21.992014169692993 seconds ---


In [105]:
count = 0
for dict_ in list_dict:
    if 'o_c' in dict_: 
        count +=1
print(count)

1


In [103]:
# TOP15 negative words
sorted(w.items(), key=lambda x: x[1], reverse = False)[:15]

[('doo', -148.99111265019025),
 ('harman', -89.99999999999999),
 ('keri', -83.65391881719037),
 ('mediocre', -82.43272464875668),
 ('worst', -81.81172372800869),
 ('meh', -72.94439239998083),
 ('ok', -72.12625708293135),
 ('overcooked', -70.13803863392015),
 ('bland', -64.50013572257738),
 ('salty', -64.12420336933748),
 ('potential', -64.0265266313933),
 ('underwhelmed', -63.73499166212217),
 ('inedible', -63.6136507805343),
 ('o_c', -59.99999999999999),
 ('2kzmhgnq3zdl_oag6q', -59.99999999999999)]

In [101]:
# TOP15 Positive words
sorted(w.items(), key=lambda x: x[1], reverse = True)[:15]

[('bom', 92.04119982655924),
 ('zimm', 83.99999999999999),
 ('great', 82.99876161194948),
 ('delicious', 81.0586266229672),
 ('excellent', 75.59014627000612),
 ('stu', 75.53994238475472),
 ('amazing', 71.80004527157249),
 ('gem', 68.87245038457067),
 ('ftr', 67.84318117920506),
 ('best', 67.67807103838325),
 ('exceeded', 67.16646901630916),
 ('perfect', 66.69899477118987),
 ('perfection', 66.33219997298013),
 ('awesome', 62.26478012792374),
 ('love', 62.239691057049306)]

In [78]:
len('pd2xo1wqnic9zxltvdilga')

22

In [16]:
path2 = "hw2data_1/reviews_te.csv"
df_test = pd.read_csv(path2)

In [17]:
def test_tfidf(df_test, w_tfidf, frequency):
    dict_list_test = data_compression(df_test)
    count = 0
    wrong = 0
    for x in dict_list_test:
        for key, value in x.items():
            if key != '*label*' and key != '*const*':
                if key in frequency: x[key] = x[key] * math.log((len(dict_list_test) / frequency[key]), 10)
        
        count += 1
        dot_product = 0
        label = x['*label*']
        for key, value in x.items():
            if key in w and key != '*label*':
                dot_product += w[key] * x[key]
        if dot_product * label <= 0: wrong += 1 
    return (count - wrong) / count

In [106]:
start_time = time.time()
print(test_tfidf(df_test, w, frequency))
print("--- %s seconds ---" % (time.time() - start_time))

0.8193251322933132
--- 68.89674496650696 seconds ---
