In [43]:
import numpy as np
from urllib.request import urlopen
import scipy.optimize
import random
from math import exp
from math import log

In [45]:
def parseData(fname):
  for l in urlopen(fname):
    yield eval(l)

print("Reading data...")
data = list(parseData("http://jmcauley.ucsd.edu/cse190/data/beer/beer_50000.json"))
print("done")

def feature(datum):
  feat = [1, datum['review/taste'], datum['review/appearance'], datum['review/aroma'], datum['review/palate'], datum['review/overall']]
  return feat

X = [feature(d) for d in data]
y = [d['beer/ABV'] >= 6.5 for d in data]

def inner(x,y):
  return sum([x[i]*y[i] for i in range(len(x))])

def sigmoid(x):
  return 1.0 / (1 + exp(-x))

Reading data...
done


In [47]:
##################################################
# Logistic regression by gradient ascent         #
##################################################

# NEGATIVE Log-likelihood
def f(theta, X, y, lam):
  loglikelihood = 0
  for i in range(len(X)):
    logit = inner(X[i], theta)
    loglikelihood -= log(1 + exp(-logit))
    if not y[i]:
      loglikelihood -= logit
  for k in range(len(theta)):
    loglikelihood -= lam * theta[k]*theta[k]
  # for debugging
  # print("ll =" + str(loglikelihood))
  return -loglikelihood

# NEGATIVE Derivative of log-likelihood
def fprime(theta, X, y, lam):
  dl = [0]*len(theta)
  for i in range(len(X)):
    logit = inner(X[i], theta)
    for k in range(len(theta)):
      dl[k] += X[i][k] * (1 - sigmoid(logit))
      if not y[i]:
        dl[k] -= X[i][k]
  for k in range(len(theta)):
    dl[k] -= lam*2*theta[k]
  return numpy.array([-x for x in dl])


In [51]:
X_train = X
y_train = y

In [83]:

# split into 1/3 train, 1/3 validation, 1/3 test

Z = list(zip(X, y))

random.shuffle(Z)

x_shuffled, y_shuffled = zip(*Z)

print("X shuffled: ", np.shape(x_shuffled))
print("y shuffled: ", np.shape(y_shuffled))

samples = len(x_shuffled)

X_train = x_shuffled[0:round(samples/3)];
y_train = y_shuffled[0:round(samples/3)];

X_validation = x_shuffled[round(samples/3) + 1: 2 * round(samples/3)]
y_validation = y_shuffled[round(samples/3) + 1: 2 * round(samples/3)]

X_test = x_shuffled[2 * round(samples/3) + 1:samples]
y_test = y_shuffled[2 * round(samples/3) + 1:samples]

print("x train: ", np.shape(X_train), "x validate: ", np.shape(X_validate), "x test: ", np.shape(X_test))
print("y train: ", np.shape(y_train), "y validate: ", np.shape(y_validate), "y test: ", np.shape(y_test))



X shuffled:  (50000, 6)
y shuffled:  (50000,)
x train:  (16667, 6) x validate:  (16666, 6) x test:  (16665, 6)
y train:  (16667,) y validate:  (16666,) y test:  (16665,)


In [49]:
##################################################
# Train                                          #
##################################################

def train(lam):
  theta,_,_ = scipy.optimize.fmin_l_bfgs_b(f, [0]*len(X[0]), fprime, pgtol = 10, args = (X_train, y_train, lam))
  return theta


In [95]:
##################################################
# Predict                                        #
##################################################

def performance(theta, X, y):
  scores = [inner(theta,x) for x in X]
  predictions = [s > 0 for s in scores]
    
  positives = sum(predictions)
  negatives = len(predictions) - sum(predictions)
    
  correct = [(a==b) for (a,b) in zip(predictions, y)]
    
  truePositives = sum(correct)
  trueNegatives = len(correct) - sum(correct)

  falsePositives = sum([(a==1 and b==0) for (a,b) in zip(predictions,y)])
  falseNegatives = sum([(a==0 and b==1) for (a,b) in zip(predictions,y)])
 
  acc = sum(correct) * 1.0 / len(correct)
  return acc, positives, negatives, truePositives, trueNegatives, falsePositives, falseNegatives


In [96]:
##################################################
# Validation pipeline                            #
##################################################
lam = 1.0
theta = train(lam)

In [98]:
labels  = ["Training Set", "Validation Set", "Testing Set"]
corpusX = [X_train, X_validation, X_test]
corpusY = [y_train, y_validation, y_test]

for (label, x, y) in zip(labels, corpusX, corpusY):
    print("==================================================")
    print(label, "\n")
    acc, positives, negatives, truePositives, trueNegatives, falsePositives, falseNegatives = performance(theta, x, y)
    
    print("Positives: ", positives)
    print("Negatives: ", negatives, "\n")

    print("True Positives: ", truePositives)
    print("True Positives: ", trueNegatives, "\n")
    
    print("False Positives: ", falsePositives)
    print("False Negatives: ", falseNegatives, "\n")
    
    print("lambda = " + str(lam) + ":\taccuracy=" + str(acc), "\n")


Training Set 

Samples:  16667 

Positives:  12568
Negatives:  4099 

True Positives:  11903
True Positives:  4764 

False Positives:  3449
False Negatives:  1315 

lambda = 1.0:	accuracy=0.7141657166856663 

Validation Set 

Samples:  16666 

Positives:  12513
Negatives:  4153 

True Positives:  11974
True Positives:  4692 

False Positives:  3396
False Negatives:  1296 

lambda = 1.0:	accuracy=0.71846873874955 

Testing Set 

Samples:  16665 

Positives:  12408
Negatives:  4257 

True Positives:  11949
True Positives:  4716 

False Positives:  3402
False Negatives:  1314 

lambda = 1.0:	accuracy=0.717011701170117 



In [99]:
lambdas = [0, 0.01, 0.1, 1, 100]
labels  = ["Training Set", "Validation Set", "Testing Set"]
corpusX = [X_train, X_validation, X_test]
corpusY = [y_train, y_validation, y_test]

acc = []
positives = []
negatives = []
truePositives = []
trueNegatives = []
falsePositives = []
falseNegatives = []

for lam in lambdas:
    theta = train(lam)
    for (x, y) in zip(corpusX, corpusY):
        _acc, _positives, _negatives, _truePositives, _trueNegatives, _falsePositives, _falseNegatives \
            = performance(theta, x, y)
            
        acc.append(_acc)
        positives.append(_positives)
        negatives.append(_negatives)
        truePositives(_truePositives)
        trueNegatives(_trueNegatives)
        falsePositives(_falsePositives)
        falseNegatives(_falseNegatives)
        

Samples:  16667 

Samples:  16666 

Samples:  16665 

Samples:  16667 

Samples:  16666 

Samples:  16665 

Samples:  16667 

Samples:  16666 

Samples:  16665 

Samples:  16667 

Samples:  16666 

Samples:  16665 

Samples:  16667 

Samples:  16666 

Samples:  16665 

