In [20]:
import json
import gzip
import math
from collections import defaultdict
import numpy as np
from sklearn import linear_model

In [2]:
# This will suppress any warnings, comment out if you'd like to preserve them
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Check formatting of submissions
def assertFloat(x):
    assert type(float(x)) == float

def assertFloatList(items, N):
    assert len(items) == N
    assert [type(float(x)) for x in items] == [float]*N

In [27]:
answers = {}

In [28]:
f = open("spoilers.json", 'r')

In [29]:
dataset = []
for l in f:
    d = eval(l)
    dataset.append(d)

In [30]:
f.close()

In [31]:
# A few utility data structures
reviewsPerUser = defaultdict(list)
reviewsPerItem = defaultdict(list)

for d in dataset:
    u,i = d['user_id'],d['book_id']
    reviewsPerUser[u].append(d)
    reviewsPerItem[i].append(d)

# Sort reviews per user by timestamp
for u in reviewsPerUser:
    reviewsPerUser[u].sort(key=lambda x: x['timestamp'])
    
# Same for reviews per item
for i in reviewsPerItem:
    reviewsPerItem[i].sort(key=lambda x: x['timestamp'])

In [60]:
# E.g. reviews for this user are sorted from earliest to most recent
[d['timestamp'] for d in reviewsPerUser['b0d7e561ca59e313b728dc30a5b1862e']]

['2012-03-13',
 '2013-05-06',
 '2013-09-03',
 '2015-04-05',
 '2016-02-10',
 '2016-05-29']

In [94]:
len(reviewsPerUser)

5052

In [221]:
reviewsPerUser[user_id]

[{'user_id': 'f51e2c1b6f20c9dbd2d448995e687388',
  'timestamp': '2015-08-29',
  'review_sentences': [[0,
    "While this is a solid entry in the series, it didn't quite have the magic of the previous couple of books."],
   [0,
    "I am hoping against hope that that's not because Kate and Curran have finally gotten their relationship together."],
   [0,
    "I found myself thinking of Jeaniene Frost's Night Huntress series when I was reading this one."],
   [0, 'And not in a good way.'],
   [0,
    'I loved the first few books of that series, but, now that Cat and Bones are together (as are Kate and Curran), the series has become a little episodic (as was this book).'],
   [0,
    "The writing is still good in both series, but I don't want to feel like I'm watching a TV series where each episode resolves a crime (or some such) with very little overarching plot."],
   [0, "It's early days for this series, so my fingers are crossed."]],
  'rating': 4,
  'has_spoiler': False,
  'book_id':

In [12]:
### 1a

In [13]:
y = []
y_pred = []
user_ids = list(reviewsPerUser.keys())
for user_id in user_ids:
    user_ratings = reviewsPerUser[user_id]
    if len(user_ratings) <2:
        continue
    actual_rating = reviewsPerUser[user_id][-1]['rating']
    previous_ratings = [d['rating'] for d in reviewsPerUser[user_id][:-1]]
    predicted_rating = sum(previous_ratings)/len(previous_ratings)
    
    y.append(actual_rating)
    y_pred.append(predicted_rating)
    
print("All User ratings predcited Successfuly!")

All ratings predcited Successfuly!


In [17]:
def MSE(y_true, y_prediction):
    MSE = np.square(np.subtract(y_true, y_prediction)).mean()
    print(MSE)
    return MSE

In [21]:
answers['Q1a'] = MSE(y,y_pred)

1.7686669082740192


In [22]:
assertFloat(answers['Q1a'])

In [None]:
### 1b

In [23]:
y = []
y_pred = []
item_ids = list(reviewsPerItem.keys())
for item_id in item_ids:
    item_ratings = reviewsPerItem[item_id]
    if len(item_ratings) <2:
        continue
    actual_rating = reviewsPerItem[item_id][-1]['rating']
    previous_ratings = [d['rating'] for d in reviewsPerItem[item_id][:-1]]
    predicted_rating = sum(previous_ratings)/len(previous_ratings)
    
    y.append(actual_rating)
    y_pred.append(predicted_rating)
    
print("All Items rating predcited Successfuly!")

All Items rating predcited Successfuly!


In [24]:
answers['Q1b'] = MSE(y,y_pred)

1.8881081920131038


In [25]:
assertFloat(answers['Q1b'])

In [45]:
### 2

In [49]:
answers['Q2'] = []

def predict_user_rating(N):
    y = []
    y_pred = []
    user_ids = list(reviewsPerUser.keys())
    for user_id in user_ids:
        user_ratings = reviewsPerUser[user_id]
        if len(user_ratings) < N+1:
            continue
        actual_rating = reviewsPerUser[user_id][-1]['rating']
        previous_ratings = [d['rating'] for d in reviewsPerUser[user_id][-(N+1):-1]]
        predicted_rating = sum(previous_ratings)/len(previous_ratings)

        y.append(actual_rating)
        y_pred.append(predicted_rating)

    return y, y_pred

for N in [1,2,3]:
    y, y_pred = predict_user_rating(N)
    answers['Q2'].append(MSE(y,y_pred))

2.666035950804163
2.015987909640471
1.81504303599374


In [50]:
assertFloatList(answers['Q2'], 3)

In [None]:
### 3a

In [70]:
def feature3(N, u): # For a user u and a window size of N
    y = []
    y_pred = []

    user_ratings = reviewsPerUser[u]
#     if len(user_ratings) < N+1:
#         continue
    actual_rating = reviewsPerUser[u][-1]['rating']
    feature_vec = [d['rating'] for d in reviewsPerUser[u][-(N+1):]]
    
    return feature_vec

In [71]:
answers['Q3a'] = [feature3(2,dataset[0]['user_id']), feature3(3,dataset[0]['user_id'])]

[4, 4, 3]
[4, 4, 4, 3]


In [72]:
assert len(answers['Q3a']) == 2
assert len(answers['Q3a'][0]) == 3
assert len(answers['Q3a'][1]) == 4

In [None]:
### 3b

In [92]:
answers['Q3b'] = []

def get_previous_ratings(N):
    X = []
    y = []
    user_ids = list(reviewsPerUser.keys())
    for user_id in user_ids:
        user_ratings = reviewsPerUser[user_id]
        if len(user_ratings) < N+1:
            continue
        
        next_rating = reviewsPerUser[user_id][-1]['rating']
        feature_vec = [d['rating'] for d in reviewsPerUser[user_id][-(N+1):-1]]
        
        X.append(feature_vec)
        y.append(next_rating)
        
    return X, y
    
for N in [1,2,3]:
    X, y = get_previous_ratings(N)
    
    reg = linear_model.LinearRegression()
    reg.fit(X, y)
    y_pred = reg.predict(X)
    
    mse = MSE(y, y_pred)
    answers['Q3b'].append(mse)

1.5608319121482543
1.5409512373315701
1.5396484853948416


In [93]:
assertFloatList(answers['Q3b'], 3)

In [None]:
### 4a

In [95]:
globalAverage = [d['rating'] for d in dataset]
globalAverage = sum(globalAverage) / len(globalAverage)

In [202]:
def featureMeanValue(N, u): # For a user u and a window size of N
    user_ratings = reviewsPerUser[u]
    if len(user_ratings) < 1:
        feature_vec = [globalAverage] * 11
    elif len(user_ratings) > 11:
        feature_vec = [d['rating'] for d in reviewsPerUser[u][-11:]]
    else:
        feature_vec = [d['rating'] for d in reviewsPerUser[u]]
        
        missing_index = 11 - len(feature_vec)
        avg_user_rating = sum(feature_vec)/len(feature_vec)
        missing_values = [avg_user_rating] * missing_index
        feature_vec = missing_values + feature_vec

    return feature_vec

In [215]:
def featureMissingValue(N, u):
    user_ratings = reviewsPerUser[u]
    if len(user_ratings) < 1:
        feature_vec = [1 ,0] * 10
    elif len(user_ratings) > 10:
        feature_vec = []
        for d in reviewsPerUser[u][-11:-1]:
            feature_vec.append(0)
            feature_vec.append(d['rating'])
        feature_vec.append(reviewsPerUser[u][-1]['rating'])
    else:
        feature_vec = []
        for d in reviewsPerUser[u][:-1]:
            feature_vec.append(0)
            feature_vec.append(d['rating'])

        feature_vec.append(reviewsPerUser[u][-1]['rating'])
        missing_index = 11 - len(user_ratings)
        missing_values = [1 ,0] * missing_index
        feature_vec = missing_values + feature_vec
        
    return feature_vec

In [216]:
answers['Q4a'] = [featureMeanValue(10, dataset[0]['user_id']), featureMissingValue(10, dataset[0]['user_id'])]

In [217]:
assert len(answers['Q4a']) == 2
assert len(answers['Q4a'][0]) == 11
assert len(answers['Q4a'][1]) == 21

In [218]:
### 4b

In [219]:
answers['Q4b'] = []

for featFunc in [featureMeanValue, featureMissingValue]:
    y = []
    y_pred = []
    user_ids = list(reviewsPerUser.keys())
    for user_id in user_ids:
        feature_vec = featFunc(10, user_id)
        y.append(feature_vec.pop())
#         print(feature_vec)
        prediction = sum(feature_vec)/len(feature_vec)
        y_pred.append(prediction)
    mse = MSE(y, y_pred)
    answers['Q4b'].append(mse)

0.9601498490565842
9.759098376880443


In [220]:
assertFloatList(answers["Q4b"], 2)

In [None]:
### 5

In [241]:
def feature5(sentence):
    char_len = len(sentence) - sentence.count(' ')
    exclamation_len =  sentence.count('!')
    uppen_char_len =  sum(1 for s in sentence if s.isupper())
    
    return [char_len, exclamation_len, uppen_char_len]

In [261]:
y = []
X = []

for d in dataset:
    for spoiler,sentence in d['review_sentences']:
        X.append(feature5(sentence))
        y.append(spoiler)
        
log_reg = linear_model.LogisticRegression(class_weight='balanced', C=1)
log_reg.fit(X, y)
y_pred = log_reg.predict(np.array(X))


TP = 0
FP = 0
TN = 0
FN = 0

for i in range(len(y_pred)): 
    if y[i]==y_pred[i]==1:
       TP += 1
    if y_pred[i]==1 and y[i]!=y_pred[i]:
       FP += 1
    if y[i]==y_pred[i]==0:
       TN += 1
    if y_pred[i]==0 and y[i]!=y_pred[i]:
       FN += 1
    
TPR = TP / (TP + FN)
TNR = TN / (TN + FP)

balanced_accuracy = (TPR + TNR) / 2
BER = 1 - balanced_accuracy

In [276]:
answers['Q5a'] = X[0]

In [277]:
answers['Q5b'] = [TP, TN, FP, FN, BER]

In [279]:
assert len(answers['Q5a']) == 3
assertFloatList(answers['Q5b'], 5)

In [None]:
### 6

In [None]:
def feature6(review):
    

In [None]:
y = []
X = []

for d in dataset:
    sentences = d['review_sentences']
    if len(sentences) < 6: continue
    X.append(feature6(d))
    y.append(sentences[5][0])

#etc.

In [None]:
answers['Q6a'] = X[0]

In [None]:
answers['Q6b'] = BER

In [None]:
assert len(answers['Q6a']) == 9
assertFloat(answers['Q6b'])

In [None]:
### 7

In [None]:
# 50/25/25% train/valid/test split
Xtrain, Xvalid, Xtest = X[:len(X)//2], X[len(X)//2:(3*len(X))//4], X[(3*len(X))//4:]
ytrain, yvalid, ytest = y[:len(X)//2], y[len(X)//2:(3*len(X))//4], y[(3*len(X))//4:]

In [None]:
for c in [0.01, 0.1, 1, 10, 100]:
    # etc.

In [None]:
answers['Q7'] = bers + [bestC] + [ber]

In [None]:
assertFloatList(answers['Q7'], 7)

In [None]:
### 8

In [None]:
def Jaccard(s1, s2):
    numer = len(s1.intersection(s2))
    denom = len(s1.union(s2))
    if denom == 0:
        return 0
    return numer / denom

In [None]:
# 75/25% train/test split
dataTrain = dataset[:15000]
dataTest = dataset[15000:]

In [None]:
# A few utilities

itemAverages = defaultdict(list)
ratingMean = []

for d in dataTrain:
    itemAverages[d['book_id']].append(d['rating'])
    ratingMean.append(d['rating'])

for i in itemAverages:
    itemAverages[i] = sum(itemAverages[i]) / len(itemAverages[i])

ratingMean = sum(ratingMean) / len(ratingMean)

In [None]:
reviewsPerUser = defaultdict(list)
usersPerItem = defaultdict(set)

for d in dataTrain:
    u,i = d['user_id'], d['book_id']
    reviewsPerUser[u].append(d)
    usersPerItem[i].add(u)

In [None]:
# From my HW2 solution, welcome to reuse
def predictRating(user,item):
    ratings = []
    similarities = []
    for d in reviewsPerUser[user]:
        i2 = d['book_id']
        if i2 == item: continue
        ratings.append(d['rating'] - itemAverages[i2])
        similarities.append(Jaccard(usersPerItem[item],usersPerItem[i2]))
    if (sum(similarities) > 0):
        weightedRatings = [(x*y) for x,y in zip(ratings,similarities)]
        return itemAverages[item] + sum(weightedRatings) / sum(similarities)
    else:
        # User hasn't rated any similar items
        if item in itemAverages:
            return itemAverages[item]
        else:
            return ratingMean

In [None]:
answers["Q8"] = MSE(predictions, labels)

In [None]:
assertFloat(answers["Q8"])

In [None]:
### 9

In [None]:
for d in dataTest:
    # etc.

In [None]:
answers["Q9"] = [mse0, mse1to5, mse5]

In [None]:
assertFloatList(answers["Q9"], 3)

In [None]:
### 10

In [None]:
answers["Q10"] = ("describe your solution", itsMSE)

In [None]:
assert type(answers["Q10"][0]) == str
assertFloat(answers["Q10"][1])

In [None]:
f = open("answers_midterm.txt", 'w')
f.write(str(answers) + '\n')
f.close()