In [1]:
import numpy
import urllib
import scipy.optimize
import random
from collections import defaultdict
import nltk
import string
from nltk.stem.porter import *
from sklearn import linear_model

In [4]:
def parseData(fname):
  for l in urllib.request.urlopen(fname):
    yield eval(l)
    
print ("Reading data...")
data = list(parseData("http://jmcauley.ucsd.edu/cse190/data/beer/beer_50000.json"))[:5000]
print ("done")

Reading data...
done


In [47]:
#Number of unique words
from collections import Counter

wordCount = defaultdict(int)
for d in data:
  for w in d['review/text'].split():
    wordCount[w] += 1
    
print("Unique unigram count: " +str(len(wordCount)))

wordCount = defaultdict(lambda : defaultdict(int))
uniqueBigramCount = 0
prev = ''
for d in data:
    for w in d['review/text'].split():
        if prev not in wordCount:
            uniqueBigramCount += 1
        elif prev in wordCount and w not in wordCount[prev]:
            uniqueBigramCount += 1
        wordCount[prev][w] += 1
        prev = w
    prev = ''
    
print("Unique bigram count: " +str(uniqueBigramCount))

uwordCount = defaultdict(int)
punctuation = set(string.punctuation)
for d in data:
    r = ''.join([c for c in d['review/text'].lower() if c not in punctuation])
    for w in r.split():
        uwordCount[w]+=1

print("Unique unigram count without punc/capitalization: "+str(len(uwordCount)))

wordCount = defaultdict(lambda : defaultdict(int))
bwordCount = defaultdict(lambda: 0)
uniqueBigramCount = 0
prev = ''
for d in data:
    r = ''.join([c for c in d['review/text'].lower() if c not in punctuation])
    for w in r.split():
        if prev not in wordCount:
            uniqueBigramCount += 1
        elif prev in wordCount and w not in wordCount[prev]:
            uniqueBigramCount += 1
        wordCount[prev][w] += 1
        bwordCount[prev+','+w] += 1
        prev = w
    prev = ''
    
print("Unique bigram count without punc/capitalization: " +str(uniqueBigramCount))
print("Unique bigram count without punc/capitalization: " +str(len(bwordCount)))

Unique unigram count: 36225
Unique bigram count: 237209
Unique unigram count without punc/capitalization: 19426
Unique bigram count without punc/capitalization: 182902
Unique bigram count without punc/capitalization: 182902


In [43]:
counts = [(bwordCount[w], w) for w in bwordCount]
counts.sort()
counts.reverse()

words = [[x[1], x[0]] for x in counts[:1000]]

#most frequent bigrams
for i in range(5):
    print(words[i])
    
words = [x[1] for x in counts[:1000]]

['with,a', 4587]
['in,the', 2595]
['of,the', 2245]
['is,a', 2056]
['on,the', 2033]


In [44]:
wordId = dict(zip(words, range(len(words))))
wordSet = set(words)

def feature(datum):
    feat = [0]*len(words)
    r = ''.join([c for c in datum['review/text'].lower() if not c in punctuation])
    prev = ''
    for w in r.split():
        bigram = prev + ',' + w
        if bigram in words:
            feat[wordId[bigram]] += 1
        prev = w
    feat.append(1) #offset
    return feat

In [45]:
X = [feature(d) for d in data]
y = [d['review/overall'] for d in data]

#With regularization
clf = linear_model.Ridge(1.0, fit_intercept=False)
clf.fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)

In [46]:
from sklearn.metrics import mean_squared_error
print(mean_squared_error(y, predictions)) #0.343976376659

0.343976376659


In [56]:
ucounts = [(uwordCount[w], w) for w in uwordCount]
ucounts.sort()
ucounts.reverse()

words = [x[1] for x in ucounts[:1000]]

#most frequent unigrams
for i in range(5):
    print(words[i], ucounts[i][0])

a 30695
the 27569
and 19512
of 15935
is 12623


In [63]:
combinedCount = ucounts + counts
combinedCount.sort()
combinedCount.reverse()

words = [x[1] for x in combinedCount[:1000]]

#most frequent combined unigrams + bigrams
for i in range(100):
    print(words[i])

a
the
and
of
is
with
to
this
i
it
in
but
beer
that
very
head
with,a
not
as
for
on
some
was
taste
nice
good
hops
light
malt
like
one
from
in,the
its
at
carbonation
dark
bit
more
sweet
flavor
of,the
an
little
my
aroma
is,a
on,the
well
hop
there
a,bit
chocolate
glass
be
color
finish
lacing
just
pours
this,is
smell
mouthfeel
up
no
into
this,beer
really
and,a
much
out
body
have
all
a,little
you
are
bitter
caramel
than
has
had
bitterness
bottle
alcohol
poured
medium
notes
a,nice
nose
would
me
citrus
too
can
s
malts
t
smooth
great


In [61]:
wordId = dict(zip(words, range(len(words))))
wordSet = set(words)

def feature1(datum):
    feat = [0]*len(words)
    r = ''.join([c for c in datum['review/text'].lower() if not c in punctuation])
    prev = ''
    for w in r.split():
        bigram = prev + ',' + w
        if bigram in words:
            feat[wordId[bigram]] += 1
        if w in words:
            feat[wordId[w]] += 1
        prev = w
    feat.append(1) #offset
    return feat

In [62]:
X = [feature(d) for d in data]
y = [d['review/overall'] for d in data]

#With regularization
clf = linear_model.Ridge(1.0, fit_intercept=False)
clf.fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)

#combined unigram+bigram model
print(mean_squared_error(y, predictions))

0.424951133062


In [66]:
leastWeightGrams = [words[i] for i in theta.argsort()[:5]]
print(leastWeightGrams)

['at,a', 'tastes,like', 'a,pale', 'no,lacing', 'the,bitterness']


In [70]:
mostWeightGrams = [words[i] for i in theta.argsort()[-6:] if i != len(words)]
print(mostWeightGrams)

['up,a', 'the,best', 'easy,to', 'i,love', 'very,drinkable']
