In [83]:
import ast
import itertools
import random
import string
import urllib
import urllib.request
from collections import defaultdict

import nltk
import numpy
import scipy.optimize
from nltk.stem.porter import *
from sklearn import linear_model

In [84]:
def parseData(fname):
  """Lazily yield one parsed record per non-blank line of the resource at `fname`.

  fname: a URL accepted by urllib.request.urlopen.
  Yields: the Python literal written on each line (whatever
  ast.literal_eval returns for it).
  Raises: ValueError / SyntaxError if a line is not a plain Python literal.
  """
  # The context manager closes the response once the generator is exhausted or closed.
  with urllib.request.urlopen(fname) as response:
    for l in response:
      line = l.decode("utf-8").strip()
      if not line:
        continue
      # SECURITY: this used to be eval(l). The text comes from a remote server, so
      # it is parsed with literal_eval, which only accepts literals and cannot
      # execute arbitrary expressions.
      yield ast.literal_eval(line)
    
# Only the first 5000 records are used. islice stops pulling from parseData once it
# has them, instead of parsing the whole feed into a list and then slicing it.
print ("Reading data...")
data = list(itertools.islice(parseData("http://jmcauley.ucsd.edu/cse190/data/beer/beer_50000.json"), 5000))
print ("done")

Reading data...
done


In [85]:
#Number of unique words
from collections import Counter

# 1) Raw unigrams: whitespace-separated tokens, case and punctuation untouched.
wordCount = defaultdict(int)
for review in data:
  for token in review['review/text'].split():
    wordCount[token] += 1

print("Unique unigram count: " +str(len(wordCount)))

# 2) Raw bigrams. wordCount is rebuilt as: previous token -> {next token -> count}.
# A pair is new exactly when the second token is absent from the first token's map.
# NOTE: prev is '' at the start of every review, so the first word of each review
# is counted as the pair ('', word).
wordCount = defaultdict(lambda : defaultdict(int))
uniqueBigramCount = 0
prev = ''
for review in data:
    for token in review['review/text'].split():
        followers = wordCount[prev]
        if token not in followers:
            uniqueBigramCount += 1
        followers[token] += 1
        prev = token
    prev = ''

print("Unique bigram count: " +str(uniqueBigramCount))

# 3) Normalised unigrams: lower-cased, ASCII punctuation characters removed.
uwordCount = defaultdict(int)
punctuation = set(string.punctuation)
for review in data:
    cleaned = ''.join(ch for ch in review['review/text'].lower() if ch not in punctuation)
    for token in cleaned.split():
        uwordCount[token] += 1

print("Unique unigram count without punc/capitalization: "+str(len(uwordCount)))

# 4) Normalised bigrams, counted two ways: the nested map as in step 2, and a flat
# map keyed by "prev,word". The two printed totals should agree.
wordCount = defaultdict(lambda : defaultdict(int))
bwordCount = defaultdict(int)
uniqueBigramCount = 0
prev = ''
for review in data:
    cleaned = ''.join(ch for ch in review['review/text'].lower() if ch not in punctuation)
    for token in cleaned.split():
        followers = wordCount[prev]
        if token not in followers:
            uniqueBigramCount += 1
        followers[token] += 1
        bwordCount[','.join((prev, token))] += 1
        prev = token
    prev = ''

print("Unique bigram count without punc/capitalization: " +str(uniqueBigramCount))
print("Unique bigram count without punc/capitalization: " +str(len(bwordCount)))

Unique unigram count: 36225
Unique bigram count: 237209
Unique unigram count without punc/capitalization: 19426
Unique bigram count without punc/capitalization: 182902
Unique bigram count without punc/capitalization: 182902


In [86]:
# (count, bigram) pairs from most to least frequent. Tuples compare element-wise,
# so equal counts are ordered by the bigram string, descending.
counts = sorted(((n, gram) for gram, n in bwordCount.items()), reverse=True)

# Temporarily hold [bigram, count] pairs, purely for display.
words = [[gram, n] for n, gram in counts[:1000]]

#most frequent bigrams
for rank in range(5):
    print(words[rank])

# From here on `words` is the list of the 1000 most frequent bigram keys.
words = [gram for n, gram in counts[:1000]]

['with,a', 4587]
['in,the', 2595]
['of,the', 2245]
['is,a', 2056]
['on,the', 2033]


In [87]:
# Map each entry of `words` to its position (its feature column), and keep the
# same entries as a set.
wordId = {gram: col for col, gram in enumerate(words)}
wordSet = set(words)

def feature(datum):
    """Return the bigram-count feature vector for one review.

    datum: mapping with a 'review/text' string.
    Returns: a list of len(words) + 1 numbers. Position wordId[b] holds how often
    bigram b ("prev,word") occurs in the lower-cased, punctuation-stripped text;
    the final element is the constant 1 used as the offset term.

    Reads the module-level `words`, `wordId` and `punctuation` at call time.
    The first word of the text forms the bigram ",word" because prev starts as ''.
    """
    feat = [0]*len(words)
    r = ''.join([c for c in datum['review/text'].lower() if not c in punctuation])
    prev = ''
    for w in r.split():
        # One dict lookup gives both membership and column. The previous
        # `bigram in words` scanned the whole list for every token.
        idx = wordId.get(prev + ',' + w)
        if idx is not None:
            feat[idx] += 1
        prev = w
    feat.append(1) #offset
    return feat

In [88]:
# One feature row per review, and the overall rating as the regression target.
X = list(map(feature, data))
y = [review['review/overall'] for review in data]

#With regularization
# fit_intercept=False: the constant 1 appended by feature() acts as the offset column.
clf = linear_model.Ridge(alpha=1.0, fit_intercept=False).fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)

In [89]:
from sklearn.metrics import mean_squared_error
# MSE of the fitted model on the same reviews it was trained on.
print(mean_squared_error(y_true=y, y_pred=predictions)) #0.343976376659

0.343976376659


In [90]:
# (count, unigram) pairs from most to least frequent.
ucounts = sorted(((n, term) for term, n in uwordCount.items()), reverse=True)

# Rebinds `words` to the 1000 most frequent unigrams; wordId / wordSet are not
# rebuilt in this cell.
words = [term for n, term in ucounts[:1000]]

#most frequent unigrams
for rank in range(5):
    print(words[rank], ucounts[rank][0])

a 30695
the 27569
and 19512
of 15935
is 12623


In [91]:
# Unigram and bigram (count, key) pairs ranked together, most frequent first.
combinedCount = sorted(ucounts + counts, reverse=True)

# `words` becomes the 1000 most frequent n-grams of either kind.
words = [gram for n, gram in combinedCount[:1000]]

#most frequent combined unigrams + bigrams
for rank in range(5):
    print(words[rank])

a
the
and
of
is


In [92]:
# Rebuild the n-gram -> column index map (and the matching set) from the current `words`.
wordId = {gram: col for col, gram in enumerate(words)}
wordSet = set(words)

def feature1(datum):
    """Return the combined unigram + bigram count vector for one review.

    datum: mapping with a 'review/text' string.
    Returns: a list of len(words) + 1 numbers. For every token of the lower-cased,
    punctuation-stripped text, both the bigram "prev,token" and the token itself
    are counted if they have a column in wordId; the final element is the
    constant 1 used as the offset term.

    Reads the module-level `words`, `wordId` and `punctuation` at call time.
    """
    feat = [0]*len(words)
    r = ''.join([c for c in datum['review/text'].lower() if not c in punctuation])
    prev = ''
    for w in r.split():
        # Bigram first, then unigram. Each is one dict lookup instead of the
        # previous linear scan of the `words` list.
        for gram in (prev + ',' + w, w):
            idx = wordId.get(gram)
            if idx is not None:
                feat[idx] += 1
        prev = w
    feat.append(1) #offset
    return feat

In [93]:
# Feature rows from feature1 (unigram + bigram counts) and the overall ratings.
X = list(map(feature1, data))
y = [review['review/overall'] for review in data]

#With regularization
# fit_intercept=False: the constant 1 appended by feature1() acts as the offset column.
clf = linear_model.Ridge(alpha=1.0, fit_intercept=False).fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)

#combined unigram+bigram model
print(mean_squared_error(y_true=y, y_pred=predictions)) #0.288822120305

0.288822120305


In [94]:
# The five n-grams with the smallest coefficients, in ascending order of weight.
# When theta comes from rows built by feature1 it carries one extra trailing
# coefficient for the constant offset, which has no entry in `words`. That index
# is dropped before slicing so that a very negative offset cannot raise IndexError.
leastWeightGrams = [words[i] for i in theta.argsort() if i < len(words)][:5]
print(leastWeightGrams)

['sort,of', 'water', 'corn', 'the,background', 'straw']


In [95]:
# The five n-grams with the largest coefficients, in ascending order of weight.
# The offset coefficient (index len(words)) is removed *before* taking the last five.
# Slicing six and filtering afterwards returned five or six items depending on
# whether the offset happened to rank in the top six.
mostWeightGrams = [words[i] for i in theta.argsort() if i < len(words)][-5:]
print(mostWeightGrams)

['not,bad', 'the,best', 'of,these', 'a,bad', 'sort']


In [96]:
# tf is only initialised here; nothing in this cell reads it.
tf = defaultdict(lambda: defaultdict(int))
idf = defaultdict(int)

# Pass 1: idf[term] temporarily holds the number of reviews that contain term.
for review in data:
    cleaned = ''.join(ch for ch in review['review/text'].lower() if ch not in punctuation)
    # dict.fromkeys de-duplicates the tokens of this review, keeping first-seen order.
    for term in dict.fromkeys(cleaned.split()):
        idf[term] += 1

totalDocCount = len(data)
import math
# Pass 2: overwrite each document frequency with log(N / df), natural logarithm.
for term, docFreq in idf.items():
    idf[term] = math.log(totalDocCount/docFreq)

In [97]:
# Print the idf of a few hand-picked terms, one per line.
for label, term in (
    ("Foam idf: ", 'foam'),
    ("Banana idf: ", 'banana'),
    ("Smell idf: ", 'smell'),
    ("Lactic idf: ", 'lactic'),
    ("Tart idf: ", 'tart'),
    ("A idf: ", 'a'),
):
    print(label + str(idf[term]))

Foam idf: 2.6200393237794968
Banana idf: 3.8632328412587142
Smell idf: 1.238564249095555
Lactic idf: 6.725433722188183
Tart idf: 4.160484364726646
A idf: 0.014098924379501675


In [98]:
# Normalised text of the first review: lower-cased, punctuation characters removed.
review1 = ''.join(ch for ch in data[0]['review/text'].lower() if ch not in punctuation)
print(review1)

# Term frequencies within that single review (rebinds `tf`).
tf = defaultdict(int)
for token in review1.split():
    tf[token] += 1

# tf-idf = idf of the term times its count in this review.
for label, term in (
    ("Foam tf-idf: ", 'foam'),
    ("Banana tf-idf: ", 'banana'),
    ("Smell tf-idf: ", 'smell'),
    ("Lactic tf-idf: ", 'lactic'),
    ("Tart tf-idf: ", 'tart'),
    ("A tf-idf: ", 'a'),
):
    print(label + str(idf[term]*tf[term]))

a lot of foam but a lot	in the smell some banana and then lactic and tart not a good start	quite dark orange in color with a lively carbonation now visible under the foam	again tending to lactic sourness	same for the taste with some yeast and banana
Foam tf-idf: 5.2400786475589936
Banana tf-idf: 7.7264656825174285
Smell tf-idf: 1.238564249095555
Lactic tf-idf: 13.450867444376366
Tart tf-idf: 4.160484364726646
A tf-idf: 0.0563956975180067


In [99]:
# Back to a unigram-only vocabulary: the 1000 most frequent terms and their columns.
words = [term for n, term in ucounts[:1000]]
wordId = {term: col for col, term in enumerate(words)}

#sanity check
for rank in range(5):
    print(words[rank])
    
def feature2(datum):
    """Return the tf-idf feature vector for one review.

    datum: mapping with a 'review/text' string.
    Returns: a list of len(words) + 1 numbers. Position wordId[w] holds
    (count of w in the lower-cased, punctuation-stripped text) * idf[w] for every
    vocabulary word that occurs; other positions stay 0. The final element is the
    constant 1 used as the offset term.

    Reads the module-level `words`, `wordId`, `idf` and `punctuation` at call time.
    """
    feat = [0]*len(words)
    r = ''.join([c for c in datum['review/text'].lower() if not c in punctuation])
    tf = defaultdict(int)
    for w in r.split():
        # Dict membership instead of scanning the `words` list for every token.
        if w in wordId:
            tf[w] += 1
    for w in tf:
        feat[wordId[w]] = tf[w]*idf[w]
    feat.append(1) #offset
    return feat

a
the
and
of
is


In [100]:
# Feature rows from feature2 (tf-idf weights) and the overall ratings.
X = list(map(feature2, data))
y = [review['review/overall'] for review in data]

#With regularization
# fit_intercept=False: the constant 1 appended by feature2() acts as the offset column.
clf = linear_model.Ridge(alpha=1.0, fit_intercept=False).fit(X, y)
theta = clf.coef_
predictions = clf.predict(X)

#Tf-IDF model
print(mean_squared_error(y_true=y, y_pred=predictions))

0.278742490057
