In [1]:
# load packages
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from collections import OrderedDict

In [2]:
# load yelp data for sentiment analysis
reviews = list()
labels = list()
with open("yelp_labelled.txt", mode="r") as fp:
    content = fp.read()
    label_docs = content.split("\n") # separate reviews
    for doc in label_docs:
        tmp = doc.split("\t") # separate reviews and labels
        print(tmp) # reveiws and labels
        # special case where one of the review is missing
        if len(tmp) > 1:
            reviews.append(tmp[0])
            labels.append(tmp[1])
        else:
            continue

['Wow... Loved this place.', '1']
['Crust is not good.', '0']
['Not tasty and the texture was just nasty.', '0']
['Stopped by during the late May bank holiday off Rick Steve recommendation and loved it.', '1']
['The selection on the menu was great and so were the prices.', '1']
['Now I am getting angry and I want my damn pho.', '0']
["Honeslty it didn't taste THAT fresh.)", '0']
['The potatoes were like rubber and you could tell they had been made up ahead of time being kept under a warmer.', '0']
['The fries were great too.', '1']
['A great touch.', '1']
['Service was very prompt.', '1']
['Would not go back.', '0']
['The cashier had no care what so ever on what I had to say it still ended up being wayyy overpriced.', '0']
['I tried the Cape Cod ravoli, chicken,with cranberry...mmmm!', '1']
['I was disgusted because I was pretty sure that was human hair.', '0']
['I was shocked because no signs indicate cash only.', '0']
['Highly recommended.', '1']
['Waitress was a little slow in servi

In [3]:
print("number of reviews = ", len(reviews), "and number of labels = ", len(labels))

number of reviews =  1000 and number of labels =  1000


In [4]:
reviews

['Wow... Loved this place.',
 'Crust is not good.',
 'Not tasty and the texture was just nasty.',
 'Stopped by during the late May bank holiday off Rick Steve recommendation and loved it.',
 'The selection on the menu was great and so were the prices.',
 'Now I am getting angry and I want my damn pho.',
 "Honeslty it didn't taste THAT fresh.)",
 'The potatoes were like rubber and you could tell they had been made up ahead of time being kept under a warmer.',
 'The fries were great too.',
 'A great touch.',
 'Service was very prompt.',
 'Would not go back.',
 'The cashier had no care what so ever on what I had to say it still ended up being wayyy overpriced.',
 'I tried the Cape Cod ravoli, chicken,with cranberry...mmmm!',
 'I was disgusted because I was pretty sure that was human hair.',
 'I was shocked because no signs indicate cash only.',
 'Highly recommended.',
 'Waitress was a little slow in service.',
 'This place is not worth your time, let alone Vegas.',
 'did not like at all.'

In [5]:
# make all word to lower case
reviews = [rev.lower() for rev in reviews]

In [6]:
# remove all except strings
reviews = [re.sub("[^a-z ]", "", str(r)) for r in reviews]
reviews

['wow loved this place',
 'crust is not good',
 'not tasty and the texture was just nasty',
 'stopped by during the late may bank holiday off rick steve recommendation and loved it',
 'the selection on the menu was great and so were the prices',
 'now i am getting angry and i want my damn pho',
 'honeslty it didnt taste that fresh',
 'the potatoes were like rubber and you could tell they had been made up ahead of time being kept under a warmer',
 'the fries were great too',
 'a great touch',
 'service was very prompt',
 'would not go back',
 'the cashier had no care what so ever on what i had to say it still ended up being wayyy overpriced',
 'i tried the cape cod ravoli chickenwith cranberrymmmm',
 'i was disgusted because i was pretty sure that was human hair',
 'i was shocked because no signs indicate cash only',
 'highly recommended',
 'waitress was a little slow in service',
 'this place is not worth your time let alone vegas',
 'did not like at all',
 'the burrittos blah',
 'the 

In [7]:
# neglect small word since mostly they are not important
stop_words = set(stopwords.words("english"))

transaction = list()
for rev in reviews:
    tmp = list()
    for w in rev.split():
        if len(w) > 2 and w not in stop_words:
            tmp.append(w)
    transaction.append(tmp)
transaction

[['wow', 'loved', 'place'],
 ['crust', 'good'],
 ['tasty', 'texture', 'nasty'],
 ['stopped',
  'late',
  'may',
  'bank',
  'holiday',
  'rick',
  'steve',
  'recommendation',
  'loved'],
 ['selection', 'menu', 'great', 'prices'],
 ['getting', 'angry', 'want', 'damn', 'pho'],
 ['honeslty', 'didnt', 'taste', 'fresh'],
 ['potatoes',
  'like',
  'rubber',
  'could',
  'tell',
  'made',
  'ahead',
  'time',
  'kept',
  'warmer'],
 ['fries', 'great'],
 ['great', 'touch'],
 ['service', 'prompt'],
 ['would', 'back'],
 ['cashier', 'care', 'ever', 'say', 'still', 'ended', 'wayyy', 'overpriced'],
 ['tried', 'cape', 'cod', 'ravoli', 'chickenwith', 'cranberrymmmm'],
 ['disgusted', 'pretty', 'sure', 'human', 'hair'],
 ['shocked', 'signs', 'indicate', 'cash'],
 ['highly', 'recommended'],
 ['waitress', 'little', 'slow', 'service'],
 ['place', 'worth', 'time', 'let', 'alone', 'vegas'],
 ['like'],
 ['burrittos', 'blah'],
 ['food', 'amazing'],
 ['service', 'also', 'cute'],
 ['could', 'care', 'less', 'in

In [8]:
len(transaction)

1000

In [9]:
# now let us find frequency of each word
frequeny = dict()
for tra in transaction:
    for t in tra:
        if t in frequeny:
            frequeny[t] += 1
        else:
            frequeny[t] = 1
frequeny

{'despite': 1,
 'nan': 1,
 'none': 3,
 'wow': 3,
 'summer': 2,
 'madhouse': 1,
 'smoke': 1,
 'violinists': 1,
 'potato': 6,
 'decision': 1,
 'downside': 1,
 'omg': 2,
 'pumpkin': 1,
 'finish': 2,
 'smelled': 1,
 'tip': 3,
 'soooooo': 1,
 'water': 3,
 'rancheros': 1,
 'gluten': 1,
 'flatlined': 1,
 'mayowell': 1,
 'head': 1,
 'rudely': 1,
 'crowds': 1,
 'eyes': 1,
 'salmon': 4,
 'selection': 10,
 'likes': 1,
 'soi': 1,
 'hopefully': 1,
 'christmas': 1,
 'apart': 1,
 'sample': 1,
 'tiramisu': 1,
 'deal': 7,
 'doubt': 2,
 'lover': 1,
 'wrapped': 1,
 'crpe': 1,
 'downtown': 2,
 'going': 18,
 'bbq': 1,
 'weak': 1,
 'costcos': 1,
 'pack': 1,
 'give': 9,
 'thirty': 1,
 'palate': 1,
 'pucks': 1,
 'instead': 1,
 'par': 2,
 'due': 1,
 'excellent': 10,
 'creamy': 2,
 'salty': 1,
 'kept': 5,
 'middle': 1,
 'rolled': 1,
 'burger': 12,
 'itll': 1,
 'vegasthere': 1,
 'jalapeno': 1,
 'palm': 1,
 'unwelcome': 1,
 'valley': 2,
 'drenched': 1,
 'cramming': 1,
 'conclusion': 1,
 'flavor': 13,
 'pros': 1,


In [10]:
# get the support for each word
support = dict()
for tra in transaction:
    for t in tra:
        support[t] = frequeny[t] / len(transaction)
support

{'despite': 0.001,
 'nan': 0.001,
 'none': 0.003,
 'wow': 0.003,
 'summer': 0.002,
 'madhouse': 0.001,
 'smoke': 0.001,
 'violinists': 0.001,
 'potato': 0.006,
 'decision': 0.001,
 'downside': 0.001,
 'omg': 0.002,
 'pumpkin': 0.001,
 'finish': 0.002,
 'smelled': 0.001,
 'tip': 0.003,
 'soooooo': 0.001,
 'water': 0.003,
 'rancheros': 0.001,
 'gluten': 0.001,
 'flatlined': 0.001,
 'mayowell': 0.001,
 'head': 0.001,
 'rudely': 0.001,
 'crowds': 0.001,
 'eyes': 0.001,
 'salmon': 0.004,
 'selection': 0.01,
 'likes': 0.001,
 'soi': 0.001,
 'hopefully': 0.001,
 'christmas': 0.001,
 'apart': 0.001,
 'sample': 0.001,
 'tiramisu': 0.001,
 'deal': 0.007,
 'doubt': 0.002,
 'lover': 0.001,
 'wrapped': 0.001,
 'crpe': 0.001,
 'downtown': 0.002,
 'going': 0.018,
 'bbq': 0.001,
 'weak': 0.001,
 'costcos': 0.001,
 'pack': 0.001,
 'give': 0.009,
 'thirty': 0.001,
 'palate': 0.001,
 'pucks': 0.001,
 'instead': 0.001,
 'par': 0.002,
 'due': 0.001,
 'excellent': 0.01,
 'creamy': 0.002,
 'salty': 0.001,
 '

In [11]:
len(support)

1922

In [12]:
# sort the dictionary
sorted_support = sorted(support.items(), key=lambda x: x[1])
sorted_support

[('despite', 0.001),
 ('nan', 0.001),
 ('madhouse', 0.001),
 ('smoke', 0.001),
 ('violinists', 0.001),
 ('decision', 0.001),
 ('downside', 0.001),
 ('pumpkin', 0.001),
 ('smelled', 0.001),
 ('soooooo', 0.001),
 ('rancheros', 0.001),
 ('gluten', 0.001),
 ('flatlined', 0.001),
 ('mayowell', 0.001),
 ('head', 0.001),
 ('rudely', 0.001),
 ('crowds', 0.001),
 ('eyes', 0.001),
 ('likes', 0.001),
 ('soi', 0.001),
 ('hopefully', 0.001),
 ('christmas', 0.001),
 ('apart', 0.001),
 ('sample', 0.001),
 ('tiramisu', 0.001),
 ('lover', 0.001),
 ('wrapped', 0.001),
 ('crpe', 0.001),
 ('bbq', 0.001),
 ('weak', 0.001),
 ('costcos', 0.001),
 ('pack', 0.001),
 ('thirty', 0.001),
 ('palate', 0.001),
 ('pucks', 0.001),
 ('instead', 0.001),
 ('due', 0.001),
 ('salty', 0.001),
 ('middle', 0.001),
 ('rolled', 0.001),
 ('itll', 0.001),
 ('vegasthere', 0.001),
 ('jalapeno', 0.001),
 ('palm', 0.001),
 ('unwelcome', 0.001),
 ('drenched', 0.001),
 ('cramming', 0.001),
 ('conclusion', 0.001),
 ('pros', 0.001),
 ('r

In [13]:
# select top keywords and encode them
keyword_map = dict()
count = 1
for tag in sorted_support:
    keyword_map[tag[0]] = count
    count += 1
keyword_map

{'despite': 1,
 'running': 1655,
 'together': 902,
 'nan': 2,
 'apart': 23,
 'wow': 1461,
 'weak': 30,
 'madhouse': 3,
 'smoke': 4,
 'awesome': 1852,
 'potato': 1727,
 'decision': 6,
 'downside': 7,
 'omg': 1167,
 'corporation': 368,
 'pumpkin': 8,
 'smelled': 9,
 'served': 1752,
 'tip': 1462,
 'corn': 1047,
 'soooooo': 10,
 'water': 1463,
 'rancheros': 11,
 'gluten': 12,
 'flatlined': 13,
 'mayowell': 14,
 'completely': 1415,
 'carbs': 353,
 'crowds': 17,
 'eyes': 18,
 'salmon': 1606,
 'expected': 1570,
 'likes': 19,
 'cranberrymmmm': 257,
 'soi': 20,
 'hopefully': 21,
 'christmas': 22,
 'none': 1460,
 'sample': 24,
 'delicious': 1902,
 'tiramisu': 25,
 'deal': 1754,
 'doubt': 1169,
 'round': 1123,
 'wrapped': 27,
 'crpe': 28,
 'bellagio': 481,
 'friend': 1417,
 'bbq': 29,
 'summer': 1166,
 'downright': 1079,
 'pack': 32,
 'give': 1806,
 'thirty': 33,
 'palate': 34,
 'pucks': 35,
 'instead': 36,
 'par': 1171,
 'due': 37,
 'watered': 650,
 'salty': 38,
 'kept': 1673,
 'middle': 39,
 'r

Since the dataset is very small we are forced to take all words but ideally we would have filtered some words using the support and only use a subset of words as our vocab.

In [14]:
len(keyword_map)

1922

We will use these 1922 words as our vocabulary. We can use this by taking a vector of size 1922 for all reviews but is an inefficient way. We will use a vector that represents presence of the word only and not the absence - embeddings.

In [15]:
# now we encode our reviews using the mapping generated
encoded_reveiws = list()
for r in reviews:
    tmp = list()
    words = r.split()
    for w in words:
        if w in keyword_map:
            tmp.append(keyword_map[w])
        else:
            continue
    encoded_reveiws.append(tmp)
encoded_reveiws

[[1461, 1833, 1921],
 [1423, 1920],
 [1837, 1473, 1489],
 [1209, 1196, 1714, 553, 805, 50, 286, 1484, 1833],
 [1822, 1873, 1918, 1834],
 [1794, 385, 1857, 1671, 1736],
 [980, 1863, 1862, 1868],
 [1580, 1916, 663, 1880, 1750, 1881, 602, 1915, 1673, 577],
 [1818, 1918],
 [1918, 1213],
 [1919, 750],
 [1910, 1917],
 [1436, 1483, 1912, 1877, 1841, 1295, 838, 1792],
 [1761, 122, 682, 944, 631, 257],
 [276, 1897, 1778, 1197, 1281],
 [1013, 231, 1071, 641],
 [1634, 1562],
 [1793, 1835, 1838, 1919],
 [1921, 1825, 1915, 1370, 57, 1901],
 [1916],
 [741, 587],
 [1922, 1903],
 [1919, 1909, 1387],
 [1880, 1483, 476, 1153, 1485],
 [384],
 [1384, 83, 1174, 686, 1077, 1357, 1920],
 [1908, 1595, 1872, 1708],
 [901, 1464, 1918, 1231, 521, 1623, 1906, 1891],
 [1799,
  1735,
  1892,
  1922,
  1720,
  1905,
  1922,
  672,
  1751,
  1126,
  1655,
  1786,
  1916,
  1517,
  1299],
 [1866, 1606, 1504],
 [1909, 911, 1916, 1850, 1818, 1796, 1385, 1754],
 [1916, 1046, 1084],
 [1777, 1921, 530, 1880, 136],
 [1324, 

In [16]:
len(encoded_reveiws)

1000

We can always get our transaction using the reverse of the mapping.

In [17]:
# let use explore labels now
labels

['1',
 '0',
 '0',
 '1',
 '1',
 '0',
 '0',
 '0',
 '1',
 '1',
 '1',
 '0',
 '0',
 '1',
 '0',
 '0',
 '1',
 '0',
 '0',
 '0',
 '0',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '0',
 '0',
 '1',
 '0',
 '1',
 '0',
 '1',
 '1',
 '1',
 '0',
 '1',
 '0',
 '1',
 '0',
 '0',
 '1',
 '0',
 '1',
 '0',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '0',
 '0',
 '1',
 '0',
 '0',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '0',
 '0',
 '0',
 '0',
 '0',
 '1',
 '1',
 '0',
 '0',
 '0',
 '0',
 '1',
 '0',
 '1',
 '0',
 '1',
 '1',
 '1',
 '0',
 '1',
 '0',
 '1',
 '0',
 '0',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '0',
 '1',
 '1',
 '1',
 '1',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '1',
 '1',
 '1',
 '0',
 '0',
 '1',
 '0',
 '1',
 '0',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '0',
 '0',
 '0',
 '0',
 '1',
 '1',
 '0',
 '0',
 '0',
 '0',
 '1',
 '1',
 '0',
 '0',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '0',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1'

In [18]:
# convert the sting to categorical int
labels = [int(w) for w in labels]
labels

[1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,


Now we are ready to train and model on the prepared data.

In [19]:
print(len(encoded_reveiws), len(labels))

1000 1000
