In [1]:
# load packages
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from collections import OrderedDict

In [3]:
# load yelp data for sentiment analysis
reviews = list()
labels = list()
for fname in ["amazon_labelled.txt", "yelp_labelled.txt", "imdb_labelled.txt"]:
    with open("data/" + fname, mode="r") as fp:
        content = fp.read()
        label_docs = content.split("\n") # separate reviews
        for doc in label_docs:
            tmp = doc.split("\t") # separate reviews and labels
            #print(tmp)
            # if something is missing
            if len(tmp) > 1:
                reviews.append(tmp[0])
                labels.append(tmp[1])
            else:
                continue

In [4]:
print("number of reviews = ", len(reviews), "and number of labels = ", len(labels))

number of reviews =  3000 and number of labels =  3000


In [6]:
reviews[:10], labels[:10]

(['So there is no way for me to plug it in here in the US unless I go by a converter.',
  'Good case, Excellent value.',
  'Great for the jawbone.',
  'Tied to charger for conversations lasting more than 45 minutes.MAJOR PROBLEMS!!',
  'The mic is great.',
  'I have to jiggle the plug to get it to line up right to get decent volume.',
  'If you have several dozen or several hundred contacts, then imagine the fun of sending each of them one by one.',
  'If you are Razr owner...you must have this!',
  'Needless to say, I wasted my money.',
  'What a waste of money and time!.'],
 ['0', '1', '1', '0', '1', '0', '0', '1', '0', '0'])

In [23]:
# remove all except strings
reviews = [re.sub("[^a-z.!?' ]", "", str(r).lower()) for r in reviews]
reviews = [re.sub("[.?!]+", " ", r) for r in reviews]
reviews[:10]

['so there is no way for me to plug it in here in the us unless i go by a converter ',
 'good case excellent value ',
 'great for the jawbone ',
 'tied to charger for conversations lasting more than  minutes major problems ',
 'the mic is great ',
 'i have to jiggle the plug to get it to line up right to get decent volume ',
 'if you have several dozen or several hundred contacts then imagine the fun of sending each of them one by one ',
 'if you are razr owner you must have this ',
 'needless to say i wasted my money ',
 'what a waste of money and time  ']

In [24]:
# filter out english stopwords
stop_words_en = set(stopwords.words("english"))
review_tokens = list()
for rev in reviews:
    tmp = list()
    for w in rev.split():
        if len(w) > 2 and w not in stop_words_en:
            tmp.append(w)
    review_tokens.append(tmp)
# test
review_tokens[:10]

[['way', 'plug', 'unless', 'converter'],
 ['good', 'case', 'excellent', 'value'],
 ['great', 'jawbone'],
 ['tied',
  'charger',
  'conversations',
  'lasting',
  'minutes',
  'major',
  'problems'],
 ['mic', 'great'],
 ['jiggle', 'plug', 'get', 'line', 'right', 'get', 'decent', 'volume'],
 ['several',
  'dozen',
  'several',
  'hundred',
  'contacts',
  'imagine',
  'fun',
  'sending',
  'one',
  'one'],
 ['razr', 'owner', 'must'],
 ['needless', 'say', 'wasted', 'money'],
 ['waste', 'money', 'time']]

In [26]:
len(review_tokens)

3000

In [32]:
# now let us find frequency of each word
token_frequency = dict()
for rev in review_tokens:
    for t in rev:
        if t in token_frequency:
            token_frequency[t] += 1
        else:
            token_frequency[t] = 1
# test
list(token_frequency.keys())[:10]

['way',
 'plug',
 'unless',
 'converter',
 'good',
 'case',
 'excellent',
 'value',
 'great',
 'jawbone']

In [43]:
# get the support for each word
token_support = dict()
for rev in review_tokens:
    for t in rev:
        token_support[t] = token_frequency[t]
# test
token_support

{'way': 45,
 'plug': 12,
 'unless': 7,
 'converter': 1,
 'good': 228,
 'case': 32,
 'excellent': 53,
 'value': 8,
 'great': 209,
 'jawbone': 3,
 'tied': 1,
 'charger': 19,
 'conversations': 3,
 'lasting': 3,
 'minutes': 33,
 'major': 2,
 'problems': 15,
 'mic': 4,
 'jiggle': 1,
 'get': 55,
 'line': 16,
 'right': 34,
 'decent': 9,
 'volume': 12,
 'several': 12,
 'dozen': 2,
 'hundred': 1,
 'contacts': 3,
 'imagine': 3,
 'fun': 10,
 'sending': 4,
 'one': 144,
 'razr': 5,
 'owner': 3,
 'must': 16,
 'needless': 3,
 'say': 36,
 'wasted': 10,
 'money': 31,
 'waste': 33,
 'time': 111,
 'sound': 48,
 'quality': 65,
 'impressed': 19,
 'going': 33,
 'original': 10,
 'battery': 46,
 'extended': 2,
 'two': 35,
 'seperated': 1,
 'mere': 1,
 'started': 12,
 'notice': 2,
 'excessive': 1,
 'static': 3,
 'garbled': 1,
 'headset': 47,
 'though': 18,
 'design': 12,
 'odd': 3,
 'ear': 35,
 'clip': 4,
 'comfortable': 20,
 'highly': 23,
 'recommend': 50,
 'blue': 5,
 'tooth': 2,
 'phone': 165,
 'advise': 4,

In [46]:
# create vocab based on the data
vocab = [tok for tok in token_support.keys() if token_support[tok] > 5 and token_support[tok] < 1000]
len(vocab)

603

In [47]:
vocab[:10]

['way',
 'plug',
 'unless',
 'good',
 'case',
 'excellent',
 'value',
 'great',
 'charger',
 'minutes']

In [13]:
# select top keywords and encode them
keyword_map = dict()
count = 1
for tag in sorted_support:
    keyword_map[tag[0]] = count
    count += 1
keyword_map

{'despite': 1,
 'running': 1655,
 'together': 902,
 'nan': 2,
 'apart': 23,
 'wow': 1461,
 'weak': 30,
 'madhouse': 3,
 'smoke': 4,
 'awesome': 1852,
 'potato': 1727,
 'decision': 6,
 'downside': 7,
 'omg': 1167,
 'corporation': 368,
 'pumpkin': 8,
 'smelled': 9,
 'served': 1752,
 'tip': 1462,
 'corn': 1047,
 'soooooo': 10,
 'water': 1463,
 'rancheros': 11,
 'gluten': 12,
 'flatlined': 13,
 'mayowell': 14,
 'completely': 1415,
 'carbs': 353,
 'crowds': 17,
 'eyes': 18,
 'salmon': 1606,
 'expected': 1570,
 'likes': 19,
 'cranberrymmmm': 257,
 'soi': 20,
 'hopefully': 21,
 'christmas': 22,
 'none': 1460,
 'sample': 24,
 'delicious': 1902,
 'tiramisu': 25,
 'deal': 1754,
 'doubt': 1169,
 'round': 1123,
 'wrapped': 27,
 'crpe': 28,
 'bellagio': 481,
 'friend': 1417,
 'bbq': 29,
 'summer': 1166,
 'downright': 1079,
 'pack': 32,
 'give': 1806,
 'thirty': 33,
 'palate': 34,
 'pucks': 35,
 'instead': 36,
 'par': 1171,
 'due': 37,
 'watered': 650,
 'salty': 38,
 'kept': 1673,
 'middle': 39,
 'r

Since the dataset is very small we are forced to take all words but ideally we would have filtered some words using the support and only use a subset of words as our vocab.

In [14]:
len(keyword_map)

1922

We will use these 1922 words as our vocabulary. We can use this by taking a vector of size 1922 for all reviews but is an inefficient way. We will use a vector that represents presence of the word only and not the absence - embeddings.

In [15]:
# now we encode our reviews using the mapping generated
encoded_reveiws = list()
for r in reviews:
    tmp = list()
    words = r.split()
    for w in words:
        if w in keyword_map:
            tmp.append(keyword_map[w])
        else:
            continue
    encoded_reveiws.append(tmp)
encoded_reveiws

[[1461, 1833, 1921],
 [1423, 1920],
 [1837, 1473, 1489],
 [1209, 1196, 1714, 553, 805, 50, 286, 1484, 1833],
 [1822, 1873, 1918, 1834],
 [1794, 385, 1857, 1671, 1736],
 [980, 1863, 1862, 1868],
 [1580, 1916, 663, 1880, 1750, 1881, 602, 1915, 1673, 577],
 [1818, 1918],
 [1918, 1213],
 [1919, 750],
 [1910, 1917],
 [1436, 1483, 1912, 1877, 1841, 1295, 838, 1792],
 [1761, 122, 682, 944, 631, 257],
 [276, 1897, 1778, 1197, 1281],
 [1013, 231, 1071, 641],
 [1634, 1562],
 [1793, 1835, 1838, 1919],
 [1921, 1825, 1915, 1370, 57, 1901],
 [1916],
 [741, 587],
 [1922, 1903],
 [1919, 1909, 1387],
 [1880, 1483, 476, 1153, 1485],
 [384],
 [1384, 83, 1174, 686, 1077, 1357, 1920],
 [1908, 1595, 1872, 1708],
 [901, 1464, 1918, 1231, 521, 1623, 1906, 1891],
 [1799,
  1735,
  1892,
  1922,
  1720,
  1905,
  1922,
  672,
  1751,
  1126,
  1655,
  1786,
  1916,
  1517,
  1299],
 [1866, 1606, 1504],
 [1909, 911, 1916, 1850, 1818, 1796, 1385, 1754],
 [1916, 1046, 1084],
 [1777, 1921, 530, 1880, 136],
 [1324, 

In [16]:
len(encoded_reveiws)

1000

We can always get our transaction using the reverse of the mapping.

In [17]:
# let use explore labels now
labels

['1',
 '0',
 '0',
 '1',
 '1',
 '0',
 '0',
 '0',
 '1',
 '1',
 '1',
 '0',
 '0',
 '1',
 '0',
 '0',
 '1',
 '0',
 '0',
 '0',
 '0',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '0',
 '0',
 '1',
 '0',
 '1',
 '0',
 '1',
 '1',
 '1',
 '0',
 '1',
 '0',
 '1',
 '0',
 '0',
 '1',
 '0',
 '1',
 '0',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '0',
 '0',
 '1',
 '0',
 '0',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '0',
 '0',
 '0',
 '0',
 '0',
 '1',
 '1',
 '0',
 '0',
 '0',
 '0',
 '1',
 '0',
 '1',
 '0',
 '1',
 '1',
 '1',
 '0',
 '1',
 '0',
 '1',
 '0',
 '0',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '0',
 '1',
 '1',
 '1',
 '1',
 '0',
 '0',
 '0',
 '0',
 '0',
 '0',
 '1',
 '1',
 '1',
 '0',
 '0',
 '1',
 '0',
 '1',
 '0',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1',
 '1',
 '0',
 '1',
 '0',
 '0',
 '0',
 '0',
 '1',
 '1',
 '0',
 '0',
 '0',
 '0',
 '1',
 '1',
 '0',
 '0',
 '1',
 '1',
 '1',
 '1',
 '1',
 '0',
 '0',
 '1',
 '1',
 '0',
 '1',
 '1',
 '1'

In [18]:
# convert the sting to categorical int
labels = [int(w) for w in labels]
labels

[1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,


Now we are ready to train and model on the prepared data.

In [19]:
print(len(encoded_reveiws), len(labels))

1000 1000
