In [1]:
import numpy
import urllib
import scipy.optimize
import random
from sklearn import linear_model
import gzip
from collections import defaultdict

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
def assertFloat(x):
    """Check that `x` can be converted to a float.

    Raises whatever `float(x)` raises (ValueError / TypeError) when the
    conversion itself fails.
    """
    converted = float(x)
    assert isinstance(converted, float)

def assertFloatList(items, N):
    """Check that `items` has exactly `N` entries, each convertible to float."""
    assert len(items) == N
    for entry in items:
        assert type(float(entry)) == float

In [5]:
# Text-mode handle on the ARFF file; the parsing cell reads from `f`.
f = open("5year.arff", mode="r")

In [6]:
# Read and parse the data
# Skip the header up to and including the '@data' line. Iterating over the
# file (instead of calling readline() in a `while`) stops at EOF, so a file
# without the marker yields an empty dataset rather than looping forever on
# the '' that readline() keeps returning.
for header in f:
    if '@data' in header:
        break

dataset = []
for l in f:
    if not l.strip(): # Blank line (float('') would raise)
        continue
    if '?' in l: # Missing entry
        continue
    l = l.split(',')
    # Leading 1 is a constant column -- presumably an offset/bias feature.
    values = [1] + [float(x) for x in l]
    values[-1] = values[-1] > 0 # Convert to bool
    dataset.append(values)

f.close() # everything has been read; release the file handle

In [7]:
# Inspect the first parsed row: a leading 1, the numeric attributes, then the boolean label.
dataset[0]

[1,
 0.088238,
 0.55472,
 0.01134,
 1.0205,
 -66.52,
 0.34204,
 0.10949,
 0.57752,
 1.0881,
 0.32036,
 0.10949,
 0.1976,
 0.096885,
 0.10949,
 1475.2,
 0.24742,
 1.8027,
 0.10949,
 0.077287,
 50.199,
 1.1574,
 0.13523,
 0.062287,
 0.41949,
 0.32036,
 0.20912,
 1.0387,
 0.026093,
 6.1267,
 0.37788,
 0.077287,
 155.33,
 2.3498,
 0.24377,
 0.13523,
 1.4493,
 571.37,
 0.32101,
 0.095457,
 0.12879,
 0.11189,
 0.095457,
 127.3,
 77.096,
 0.45289,
 0.66883,
 54.621,
 0.10746,
 0.075859,
 1.0193,
 0.55407,
 0.42557,
 0.73717,
 0.73866,
 15182.0,
 0.080955,
 0.27543,
 0.91905,
 0.002024,
 7.2711,
 4.7343,
 142.76,
 2.5568,
 3.2597,
 False]

In [6]:
# Separate every row into its feature vector (all but the last column)
# and its label (the last column).
X, y = [], []
for row in dataset:
    X.append(row[:-1])
    y.append(row[-1])

In [7]:
# Collects the result of each question under keys 'Q1'..'Q8'; written to disk at the end.
answers = {} # Your answers

In [8]:
def accuracy(predictions, y):
    """Fraction of predictions whose truth value matches the label.

    Equivalent to (TP + TN) / (TP + FP + TN + FN).

    Parameters
    ----------
    predictions : iterable of truthy/falsy values (bools, 0/1, numpy bools)
    y : iterable of truthy/falsy ground-truth labels, paired positionally

    Returns
    -------
    float
        Always a plain Python float, even for numpy inputs, so the value
        prints cleanly when the answers dict is stringified.

    Raises
    ------
    ValueError
        If there are no prediction/label pairs to score.
    """
    matches = 0
    total = 0
    # Single pass; bool() normalises numpy bools and 0/1 ints alike.
    for p, l in zip(predictions, y):
        total += 1
        if bool(p) == bool(l):
            matches += 1

    if total == 0:
        raise ValueError("accuracy() needs at least one prediction/label pair")

    return matches / total

In [9]:
def BER(predictions, y):
    """Balanced error rate: 1 - (TPR + TNR) / 2.

    Parameters
    ----------
    predictions : iterable of truthy/falsy values (bools, 0/1, numpy bools)
    y : iterable of truthy/falsy ground-truth labels, paired positionally

    Returns
    -------
    float
        Always a plain Python float, even for numpy inputs.

    Raises
    ------
    ValueError
        If `y` contains no positive or no negative labels, because the
        corresponding rate is undefined.
    """
    TP = FP = TN = FN = 0
    # Single pass over the pairs; bool() normalises numpy bools and 0/1 ints.
    for p, l in zip(predictions, y):
        p, l = bool(p), bool(l)
        if p and l:
            TP += 1
        elif p:
            FP += 1
        elif l:
            FN += 1
        else:
            TN += 1

    if TP + FN == 0 or TN + FP == 0:
        raise ValueError("BER() needs at least one positive and one negative label")

    TPR = TP / (TP + FN)
    TNR = TN / (TN + FP)

    return 1 - 1/2 * (TPR + TNR)


In [10]:
### Question 1

In [11]:
# Q1 model: logistic regression with C=1, fit on the full dataset and
# scored on that same data.
mod = linear_model.LogisticRegression(C=1).fit(X, y)
pred = mod.predict(X)

In [12]:
# Agreement between predictions and labels; not used by the metrics below.
correct = (pred == y)

acc1 = accuracy(pred, y)
ber1 = BER(pred, y)

In [13]:
# Q1 result: logistic regression (C=1) evaluated on the data it was fit to.
answers['Q1'] = [acc1, ber1] # Accuracy and balanced error rate

In [14]:
# Q1 must hold exactly two values, each convertible to float.
assertFloatList(answers['Q1'], 2)

In [15]:
### Question 2

In [16]:
# Q2 model: same as Q1 but with class_weight='balanced'; fit on the full
# dataset and scored on that same data.
mod = linear_model.LogisticRegression(C=1, class_weight='balanced').fit(X, y)
pred = mod.predict(X)

In [17]:
# Agreement between predictions and labels; not used by the metrics below.
correct = (pred == y)

acc2 = accuracy(pred, y)
ber2 = BER(pred, y)

In [18]:
# Q2 result: [accuracy, balanced error rate] of the class-weighted model.
answers['Q2'] = [acc2, ber2]

In [19]:
# Q2 must hold exactly two values, each convertible to float.
assertFloatList(answers['Q2'], 2)

In [20]:
### Question 3

In [21]:
# Seed the global RNG so the shuffle (and the train/valid/test split built
# from it) is repeatable from a fresh kernel.
# NOTE(review): shuffle() reorders `dataset` in place, so re-running this
# cell without re-parsing the data gives a different ordering.
random.seed(3)
random.shuffle(dataset)

In [22]:
# Rebuild features and labels so they follow the shuffled row order.
X = list(map(lambda row: row[:-1], dataset))
y = list(map(lambda row: row[-1], dataset))

In [23]:
# 50% train / 25% validation / 25% test, cut at the same row indices for X and y.
n_rows = len(X)
valid_start, test_start = n_rows // 2, (3 * n_rows) // 4
Xtrain, Xvalid, Xtest = X[:valid_start], X[valid_start:test_start], X[test_start:]
ytrain, yvalid, ytest = y[:valid_start], y[valid_start:test_start], y[test_start:]

In [24]:
# Sizes of the train / validation / test partitions.
len(Xtrain), len(Xvalid), len(Xtest)

(1515, 758, 758)

In [25]:
# Fit the class-weighted model on the training split only, then measure the
# balanced error rate on each of the three splits.
mod = linear_model.LogisticRegression(C=1, class_weight='balanced')
mod.fit(Xtrain, ytrain)

predTrain, predValid, predTest = [mod.predict(part) for part in (Xtrain, Xvalid, Xtest)]

berTrain, berValid, berTest = [
    BER(predicted, truth)
    for predicted, truth in zip((predTrain, predValid, predTest), (ytrain, yvalid, ytest))
]


In [26]:
# Q3 result: BER of the class-weighted model on the train, validation and test splits.
answers['Q3'] = [berTrain, berValid, berTest]

In [27]:
# Q3 must hold exactly three values, each convertible to float.
assertFloatList(answers['Q3'], 3)

In [28]:
### Question 4

In [29]:
# Sweep the regularisation parameter C and record the validation BER of the
# class-weighted model trained at each value.
ls = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
errorTrain = [] # declared but never filled in this cell
errorValid = [] # declared but never filled in this cell
berList = []

for l in ls:
    model = linear_model.LogisticRegression(C=l, class_weight='balanced').fit(Xtrain, ytrain)
    predictValid = model.predict(Xvalid)
    ber = BER(predictValid, yvalid)
    print(ber)
    berList.append(ber)


0.3281320669380371
0.31931252826775225
0.3281320669380371
0.3179556761646314
0.3159203980099503
0.3111714156490276
0.2955030044582283
0.29618143050978873
0.29618143050978873


In [30]:
# Q4 result: validation BER for each C in `ls`, in the same order.
answers['Q4'] = berList

In [31]:
# Q4 must hold nine values (one per C tried), each convertible to float.
assertFloatList(answers['Q4'], 9)

In [32]:
### Question 5

In [33]:
# Model selection: choose the C whose class-weighted model has the lowest
# validation BER, and remember that model's test BER alongside it.
bestVal = None      # lowest validation BER seen so far
bestLamb = None     # C value that produced bestVal
bestTestBER = None  # test BER of the model selected on validation

ls = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]

for l in ls:
    model = linear_model.LogisticRegression(C=l, class_weight='balanced')
    model.fit(Xtrain, ytrain) #fitting the model

    predictValid = model.predict(Xvalid)
    ber = BER(predictValid,yvalid)

    predictTest = model.predict(Xtest)
    ber_test = BER(predictTest,ytest)

    # Strict '<' keeps the earliest (smallest) C when validation BERs tie.
    if bestVal is None or ber < bestVal:
        bestVal = ber
        bestLamb = l
        bestTestBER = ber_test

bestC = bestLamb
# NOTE(review): ber5 is the *validation* BER of the selected C. If the
# question asks for performance on the test set, report bestTestBER instead.
ber5 = bestVal

print(bestC)
print(ber5)

100
0.2955030044582283


In [34]:
# Q5 result: the selected C and the validation BER it achieved.
answers['Q5'] = [bestC, ber5]

In [35]:
# Q5 must hold exactly two values, each convertible to float.
assertFloatList(answers['Q5'], 2)

In [36]:
### Question 6

In [37]:
# Load the review records, one per line of the gzip file.
# NOTE(security): eval() executes whatever Python expression each line
# contains. Only run this on a trusted copy of the data file; if every line
# is a plain literal, ast.literal_eval is the safe replacement.
dataset = []
with gzip.open("young_adult_10000.json.gz") as f: # closed automatically, even on error
    for l in f:
        dataset.append(eval(l))

In [38]:
# First 9000 records are used for training; everything after that is held out for testing.
dataTrain, dataTest = dataset[:9000], dataset[9000:]

In [39]:
# Lookup tables built from the training records only

usersPerItem = defaultdict(set) # item -> set of users who rated it
itemsPerUser = defaultdict(set) # user -> set of items that user rated
reviewsPerUser = defaultdict(list) # user -> that user's review records
reviewsPerItem = defaultdict(list) # item -> review records for that item
ratingDict = {} # (user, item) -> rating; a repeated pair keeps the last rating seen

for review in dataTrain:
    uid = review['user_id']
    bid = review['book_id']
    usersPerItem[bid].add(uid)
    itemsPerUser[uid].add(bid)
    ratingDict[(uid, bid)] = review['rating']
    reviewsPerUser[uid].append(review)
    reviewsPerItem[bid].append(review)

In [40]:
def Jaccard(s1, s2):
    """Jaccard similarity |s1 & s2| / |s1 | s2| of two sets.

    Returns 0 when both sets are empty (the union has no elements).
    """
    union_size = len(s1.union(s2))
    return len(s1.intersection(s2)) / union_size if union_size != 0 else 0

In [57]:
def mostSimilar(i, N):
    """Return the N items most similar to item `i`.

    Similarity is the Jaccard index between the sets of users who rated each
    item. The result is a list of (similarity, item) tuples sorted in
    descending order, so ties on similarity are broken by item id, also
    descending. Item `i` itself is excluded.

    An `i` that is not in usersPerItem is treated as having no raters
    (every similarity is 0). It is looked up with .get() so the query does
    not insert an empty entry into the usersPerItem defaultdict.
    """
    users = usersPerItem.get(i, set())
    similarities = [
        (Jaccard(users, raters), j)
        for j, raters in usersPerItem.items()
        if j != i
    ]
    similarities.sort(reverse=True)
    return similarities[:N]

In [58]:
# Q6 result: the 10 (similarity, item) pairs most similar to item '2767052'.
answers['Q6'] = mostSimilar('2767052', 10)

<class 'set'>


In [569]:
# Q6 must contain exactly 10 entries whose first element (the similarity) converts to float.
assert len(answers['Q6']) == 10
assertFloatList([x[0] for x in answers['Q6']], 10)

In [570]:
### Question 7

In [571]:
# Global mean rating over the training records (fallback prediction).
ratingMean = sum(d['rating'] for d in dataTrain) / len(dataTrain)

# Mean training rating of each item, averaged over the users who rated it.
itemAverages = {}
for i, raters in usersPerItem.items():
    rs = [ratingDict[(u, i)] for u in raters]
    itemAverages[i] = sum(rs) / len(rs)


def predictRating(user,item):
    """Predict `user`'s rating of `item` from item-to-item similarity.

    For every other item j the user reviewed in training, the deviation of
    their rating from j's average is weighted by Jaccard(item, j), computed
    over the sets of users who rated each item. The weighted mean deviation
    is added to `item`'s own average.

    Returns the global mean `ratingMean` when no similarity is positive,
    e.g. the user has no other training reviews or the item is unseen.

    Lookups on the defaultdicts use .get() so that scoring unseen test users
    or items does not insert empty entries into reviewsPerUser or
    usersPerItem, which would break a later rebuild of itemAverages.
    """
    itemUsers = usersPerItem.get(item, set()) # loop-invariant, look up once
    ratings = []
    simscores = []
    for d in reviewsPerUser.get(user, []):
        j = d['book_id']
        if j == item: continue
        ratings.append(d['rating'] - itemAverages[j])
        simscores.append(Jaccard(itemUsers, usersPerItem[j]))
    totalSim = sum(simscores)
    if totalSim > 0:
        weightedRatings = [(x*y) for x,y in zip(ratings,simscores)]
        return itemAverages[item] + sum(weightedRatings) / totalSim
    else:
        return ratingMean


def MSE(predictions, labels):
    """Mean of the squared differences between paired predictions and labels."""
    squared_errors = []
    for predicted, actual in zip(predictions, labels):
        squared_errors.append((predicted - actual) ** 2)
    return sum(squared_errors) / len(squared_errors)

# Score the predictor on the held-out records.
labels = [rec['rating'] for rec in dataTest]
simPredictions = [predictRating(rec['user_id'], rec['book_id']) for rec in dataTest]

mse7 = MSE(simPredictions, labels)

In [572]:
# Q7 result: test-set MSE of the item-similarity rating predictor.
answers['Q7'] = mse7

In [573]:
# Q7 must be a single value convertible to float.
assertFloat(answers['Q7'])

In [574]:
### Question 8

In [575]:

ratingMean = sum([d['rating'] for d in dataTrain]) / len(dataTrain)

# Mean training rating given by each user (the user-side counterpart of
# itemAverages). Users with an empty item set are skipped so that a stray
# empty defaultdict entry cannot cause a division by zero.
userAverages = {}
for u in itemsPerUser:
    rs = [ratingDict[(u, i)] for i in itemsPerUser[u]]
    if rs:
        userAverages[u] = sum(rs) / len(rs)

def predictRating(user,item):
    """Predict `user`'s rating of `item` from user-to-user similarity.

    For every other user v who reviewed `item` in training, the deviation of
    v's rating from v's own average is weighted by Jaccard(user, v), computed
    over the sets of items each user rated. The weighted mean deviation is
    added to `user`'s own average rating.

    Returns the global mean `ratingMean` when no similarity is positive,
    e.g. the item has no other training reviews or the user is unseen.

    The previous version looked up itemsPerUser with book ids and read
    itemAverages[user], so every similarity was 0 and the function always
    returned ratingMean.
    """
    userItems = itemsPerUser.get(user, set()) # .get(): do not insert unseen users
    ratings = []
    simscores = []
    for d in reviewsPerItem.get(item, []):
        v = d['user_id']
        if v == user: continue
        ratings.append(d['rating'] - userAverages[v])
        simscores.append(Jaccard(userItems, itemsPerUser[v]))
    totalSim = sum(simscores)
    if totalSim > 0:
        weightedRatings = [(x*y) for x,y in zip(ratings,simscores)]
        # A positive similarity implies `user` has training items, hence an average.
        return userAverages[user] + sum(weightedRatings) / totalSim
    else:
        return ratingMean


def MSE(predictions, labels):
    """Mean squared error over paired predictions and labels."""
    squared_errors = list(map(lambda pair: (pair[0] - pair[1]) ** 2, zip(predictions, labels)))
    return sum(squared_errors) / len(squared_errors)

# Score the predictor on the held-out records.
labels = [rec['rating'] for rec in dataTest]
simPredictions = [predictRating(rec['user_id'], rec['book_id']) for rec in dataTest]

mse8 = MSE(simPredictions, labels)

In [576]:
# Q8 result: test-set MSE of the predictRating defined in the Question 8 cell.
answers['Q8'] = mse8

In [577]:
# Q8 must be a single value convertible to float.
assertFloat(answers['Q8'])

In [524]:
# Write the answers dict as a single line of text. The context manager
# closes (and flushes) the file even if the write raises.
with open("answers_hw2.txt", 'w') as f:
    f.write(str(answers) + '\n')