## Perceptron Implementation with Tf-idf Data Representation


In [4]:
import scipy
import numpy as np
from sklearn.preprocessing import scale
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from collections import Counter
import matplotlib.pyplot as plt
import pandas as pd

import time
import math
import random

In [5]:
path = "hw2data_1/reviews_tr.csv"
df_train = pd.read_csv(path)

In [6]:
import nltk
stopwords = set(nltk.corpus.stopwords.words('english'))

In [29]:
# Data Representation: tf-idf
# Use hashmap to compress the data
def data_compression(df_train):
    list_dict = []  # Contains training examples compressed with hashmap

    for index, row in df_train.iterrows():
        new_dict = {}
        words = row['text'] + ""
        words = words.split()
        for word in words:
            if word not in stopwords and len(word) < 17: # Stopwords & filter some invalid strings
                if word in new_dict:
                    new_dict[word] += 1
                else:
                    new_dict[word] = 1

        if(row['label'] == 1):  # Attach the label
            new_dict['*label*'] = 1
        else: new_dict['*label*'] = -1

        new_dict['*const*'] = 1   # Lifting
        list_dict.append(new_dict)

    return list_dict

In [9]:
def train_tfidf(list_dict):
    w = {} 
    
    # First pass
    random.shuffle(list_dict)
    for x in list_dict:
        dot_product = 0
        label = x['*label*']
        for key, value in x.items():
            if key != '*label*':
                if key in w:
                    dot_product += w[key] * x[key]
        
        if dot_product * label <= 0:           
            for key, value in x.items():
                if key != '*label*':
                    if key in w:
                        w[key] += label * x[key]
                    else: w[key] = label * x[key]
     
    
    # Second pass
    random.shuffle(list_dict)
    w_ret = dict(w) # Initilize w_ret
    for x in list_dict:
        dot_product = 0
        label = x['*label*']
        for key, value in x.items():
            if key != '*label*':
                if key in w:
                    dot_product += w[key] * x[key]
           
        if dot_product * label <= 0:
            for key, value in x.items():
                if key != '*label*':
                    if key in w:
                        w[key] += label * x[key]
                    else: w[key] = label * x[key]
        
        # Update w_ret
        for key, value in w.items():
            if key in w_ret:
                w_ret[key] += value
            else:
                w_ret[key] = value
                
    length = len(list_dict) + 1
    for key, value in w_ret.items():
        w_ret[key] /= length
    return w_ret
    

In [11]:
start_time = time.time()
list_dict = data_compression(df_train)
print("--- %s seconds ---" % (time.time() - start_time))

--- 130.5123209953308 seconds ---


In [20]:
# Modify feature vectors' values based on tf-idf
frequency = {}
for x in list_dict:
    for key, value in x.items():
        if key in frequency:
            frequency[key] += 1
        else:
            frequency[key] = 1

for x in list_dict:
    for key, value in x.items():
        if key != '*label*' and key != '*const*':
            x[key] = x[key] * math.log((len(list_dict) / frequency[key]), 10)

In [37]:
list_dict_tmp = list_dict[:int(len(list_dict)*0.03)]
start_time = time.time()
w = train_tfidf(list_dict_tmp) # Test on a smaller subset
print("Training time --- %s seconds ---" % (time.time() - start_time))

Training time --- 167.93792510032654 seconds ---


In [22]:
sorted(w.items(), key=lambda x: x[1], reverse = False)[:25]

[('lwqb9h3jz9wtk24lr', -143.99999999999997),
 ('anzq', -143.99999999999997),
 ('hanoi', -122.79087609044699),
 ('dima', -122.00875854026675),
 ('no1dp', -103.91396672279238),
 ('yasha', -94.07507962915219),
 ('ok', -92.2389840300337),
 ('dh', -88.20511975092161),
 ('pettit', -87.4132690116552),
 ('cardos', -86.77210593646129),
 ('worst', -85.24122015310247),
 ('okay', -83.55269977629658),
 ('riata', -81.80311309161843),
 ('prostitute', -80.81470686957579),
 ('kakuni', -80.81470686957579),
 ('bland', -79.85562972565918),
 ('decent', -78.75404962115893),
 ('mediocre', -78.13549512404944),
 ('however', -75.44912920375026),
 ('zak', -73.76335950839965),
 ('promoters', -73.69981517247945),
 ('bad', -73.49335809793533),
 ('violeta', -72.64337162775044),
 ('vito', -72.25215525032407),
 ('mudvain', -71.99999999999999)]

In [23]:
sorted(w.items(), key=lambda x: x[1], reverse = True)[:25]

[('katz', 223.80468989661645),
 ('grp', 186.01109951944193),
 ('kg', 132.48191460990313),
 ('bola', 122.00875854026675),
 ('dazzo', 110.40159550823518),
 ('schwartz', 106.6740854552407),
 ('biz_photos', 103.44407432183588),
 ('bertolucci', 98.35114601123315),
 ('amazing', 97.83743638712194),
 ('lloll', 97.43477733095793),
 ('delicious', 95.20046441643834),
 ('definitely', 94.53852963721539),
 ('love', 92.59966980288678),
 ('best', 91.99830062649129),
 ('sugarbakers', 91.50656890520054),
 ('www', 91.17673139178257),
 ('bann', 90.04975600331132),
 ('http', 79.24328420481059),
 ('*const*', 78.54784521547845),
 ('bulgur', 78.15325221094119),
 ('ommegang', 77.9354750420972),
 ('pigglys', 77.9354750420972),
 ('always', 72.72662897648594),
 ('spagghetti', 71.99999999999999),
 ('davilan', 71.99999999999999)]

In [93]:
start_time = time.time()
w = train_tfidf(list_dict)
print("--- %s seconds ---" % (time.time() - start_time))

--- 21.992014169692993 seconds ---


In [105]:
count = 0
for dict_ in list_dict:
    if 'o_c' in dict_: 
        count +=1
print(count)

1


In [34]:
# TOP15 negative words
sorted(w.items(), key=lambda x: x[1], reverse = False)[:15]

[('docwe', -179.99999999999997),
 ('schweinhaxen', -179.99999999999997),
 ('ok', -153.25188439019598),
 ('lwqb9h3jz9wtk24lr', -143.99999999999997),
 ('anzq', -143.99999999999997),
 ('bland', -142.70265121062886),
 ('okay', -133.6185719765966),
 ('user_local_photos', -130.02503788885844),
 ('hanoi', -124.37021994944578),
 ('wai', -122.73859286707557),
 ('decent', -120.15699516554422),
 ('ellen', -119.12501808195492),
 ('sunup', -117.22987831642835),
 ('userid', -114.14296685874058),
 ('average', -110.47228516804344)]

In [27]:
len('2kzmhgnq3zdl_oag6q')

18

In [36]:
# TOP15 Positive words
sorted(w.items(), key=lambda x: x[1], reverse = True)[:15]

[('grp', 186.011099519543),
 ('delicious', 152.20091708580398),
 ('amazing', 134.802818386992),
 ('kg', 132.48191460985473),
 ('great', 129.46785997892266),
 ('love', 128.0101510409685),
 ('biz_photos', 122.17244859527894),
 ('kfh', 122.008758540266),
 ('best', 116.0262100660396),
 ('merante', 112.40367605973644),
 ('maui', 105.57547116392335),
 ('blimpies', 104.36218527723078),
 ('perfect', 103.9242866715116),
 ('smth', 103.91396672278941),
 ('awesome', 97.6481313194764)]

In [78]:
len('pd2xo1wqnic9zxltvdilga')

22

In [24]:
path2 = "hw2data_1/reviews_te.csv"
df_test = pd.read_csv(path2)

In [25]:
def test_tfidf(df_test, w_tfidf, frequency):
    dict_list_test = data_compression(df_test)
    count = 0
    wrong = 0
    for x in dict_list_test:
        for key, value in x.items():
            if key != '*label*' and key != '*const*':
                if key in frequency: x[key] = x[key] * math.log((len(dict_list_test) / frequency[key]), 10)
        
        count += 1
        dot_product = 0
        label = x['*label*']
        for key, value in x.items():
            if key in w and key != '*label*':
                dot_product += w[key] * x[key]
        if dot_product * label <= 0: wrong += 1 
    return (count - wrong) / count

In [38]:
start_time = time.time()
print(test_tfidf(df_test, w, frequency))
print("--- %s seconds ---" % (time.time() - start_time))

0.8486233373526343
--- 58.9832820892334 seconds ---
