In [17]:
import os
import json
import gzip
import pandas as pd
from collections import defaultdict
import math
import scipy.optimize
from sklearn import svm
import numpy
import string
import random
import string
from sklearn import linear_model
import warnings

In [18]:
### Load the Kindle Store review records.
# The gzip file holds one JSON object per line; each line is stripped and
# parsed into a dict, and the dicts are collected in file order.

with gzip.open('datasets/Kindle_Store_5.json.gz') as reviews_file:
    data = [json.loads(raw_line.strip()) for raw_line in reviews_file]

### Possible analyses for the Kindle Store dataset
- Number of users
- Number of items
- Average rating
- Anything common among high-/low-rated entries
- Relationship between review length and rating
- Whether any inference can be drawn from an entry being verified or unverified
- Temporal analysis
- Analysis of the review summaries

### Possible analyses and tasks for other datasets
- Check the other datasets for anything else worth analysing
- Try to use the images available in other datasets
- Some datasets contain votes

### Possible features for rating prediction (for Kindle Store)
- User–item interactions (latent factor model)
- Rating based on review and summary content
- Can `style` be used as a feature somehow?
- Review time as a feature or contributor in rating prediction
- A more targeted temporal analysis

In [19]:
# Shuffle with a fixed seed so the train/validation/test split (and every
# number derived from it) is reproducible under Restart Kernel -> Run All.
# A dedicated Random instance is used so the module-level random state is
# left untouched.
# NOTE: the shuffle is still in place, so re-running this cell without
# reloading `data` shuffles the already-shuffled list again.
SPLIT_SEED = 0
random.Random(SPLIT_SEED).shuffle(data)

# 50% train / 25% validation / 25% test.
# The 3/4 boundary multiplies before flooring: 3 * (n // 4) rounds down too
# early, which shrinks the validation slice and makes it empty for n = 6 or 7.
n_records = len(data)
train_end = n_records // 2
valid_end = (3 * n_records) // 4

train_data = data[:train_end]
valid_data = data[train_end:valid_end]
test_data = data[valid_end:]

# Show one training record as a sanity check of the schema.
train_data[0]

{'overall': 4.0,
 'verified': True,
 'reviewTime': '03 14, 2017',
 'reviewerID': 'A5DSWBIVCNWDE',
 'asin': 'B007HU7NXS',
 'style': {'Format:': ' Kindle Edition'},
 'reviewerName': 'Michelle Wexelblat',
 'reviewText': "I'm glad I bought these novellas. I think I like these two as a pairing better than Andrea and her Bouda.",
 'summary': 'More fun',
 'unixReviewTime': 1489449600}

In [20]:
def data_latent_form(entry):
    """Reduce a review record to the triple used by the latent factor model.

    Args:
        entry: mapping with at least the keys 'reviewerID', 'asin' and
            'overall'.

    Returns:
        A new list ``[reviewerID, asin, overall]``.

    Raises:
        KeyError: if any of the three keys is missing from ``entry``.
    """
    wanted_keys = ('reviewerID', 'asin', 'overall')
    return [entry[key] for key in wanted_keys]

In [21]:
# Project every split down to [reviewerID, asin, overall] triples, keeping
# the record order of each split.
train_data_latent = [data_latent_form(review) for review in train_data]

valid_data_latent = [data_latent_form(review) for review in valid_data]

test_data_latent = [data_latent_form(review) for review in test_data]

[['A5DSWBIVCNWDE', 'B007HU7NXS', 4.0],
 ['A2V6KMOJUVFY8A', 'B00C7S9KWW', 4.0],
 ['A3M5TOBKDNBL5A', 'B00K0U3GHE', 5.0],
 ['A1UQ6YSSVHT5D1', 'B00XQUGXGQ', 5.0],
 ['A284RH4QEFMAQZ', 'B01BDU0S98', 5.0],
 ['A1GB6PENQXL6KT', 'B005V1GITK', 4.0],
 ['A2QQEWWIE0PD9J', 'B0178DD5YE', 5.0],
 ['A3O7PESG3UCALP', 'B00LG0SQHC', 5.0],
 ['A30WI6TEBY9UXT', 'B00AA3P7M8', 5.0],
 ['A2VXSQHJWZAQGY', 'B00ISOM7K6', 3.0]]

In [30]:
userRatings = defaultdict(list)

# Ratings of the training triples, truncated to int; used for the mean below.
allRatings = [int(rating) for _, _, rating in train_data_latent]

# Mean training rating.
globalAverage = sum(allRatings) / len(allRatings)

# From here on allRatings holds the full [user, item, rating] training
# triples (a shallow copy of train_data_latent).
allRatings = list(train_data_latent)

# Model parameters: global offset, per-user / per-item biases, and
# per-user / per-item factor vectors.
alpha = 0
beta_u = defaultdict(int)
beta_i = defaultdict(int)
gamma_u = defaultdict(list)
gamma_i = defaultdict(list)

# Register every training user and item: biases start at 0 and each factor
# vector starts as its own fresh [0.5, 0.2] list.
for u, b, _ in allRatings:
    beta_u[u] = 0
    beta_i[b] = 0
    if u not in gamma_u:
        gamma_u[u] = [0.5, 0.2]
    if b not in gamma_i:
        gamma_i[b] = [0.5, 0.2]

# Per-user / per-item sets of counterparts seen so far in the pass below; their
# sizes serve as the observation counts in the update formulas.
# NOTE(review): because these are sets, a repeated (user, item) pair does not
# increase the count — confirm that is intended.
booksPerUser = defaultdict(set)
usersPerBook = defaultdict(set)
training_count = 0  # number of training triples processed so far
# Staging variables: new values are computed from the current parameters and
# only written back at the end of each iteration.
alpha_temp = 0
beta_u_temp = 0
beta_i_temp = 0
reg_param = 10  # constant added into the weights/denominators of the beta and gamma updates
 

# Single pass over the training triples; alpha, beta_u[u], beta_i[b] and both
# factor vectors are updated after every observation.
for u,b,r in allRatings:
    # Prediction under the current parameters: offset + biases + factor dot product.
    gamma_prod = 0
    for j in range(len(gamma_u[u])):
        gamma_prod += (gamma_u[u][j] * gamma_i[b][j])
    r_pred = alpha + beta_u[u] + beta_i[b] + gamma_prod
    # (r - r_pred + alpha) is r minus every non-alpha term of the prediction,
    # so alpha becomes the running mean of that quantity over the triples seen.
    alpha_temp = ((training_count * alpha) + (r - r_pred + alpha))
    training_count += 1
    alpha_temp /=  training_count
    # Weighted average: the old bias with weight (reg_param + count) and the
    # residual with the bias added back with weight 1.
    beta_u_temp = (((reg_param + len(booksPerUser[u])) * beta_u[u]) + (r - r_pred + beta_u[u]))
    beta_u_temp /= (reg_param + len(booksPerUser[u]) + 1)
    beta_i_temp = (((reg_param + len(usersPerBook[b])) * beta_i[b]) + (r - r_pred + beta_i[b]))
    beta_i_temp /= (reg_param + len(usersPerBook[b]) + 1)
    gamma_u_temp = []
    gamma_i_temp = []
    # Same pattern per factor j: the count is scaled by the squared partner
    # factor, and the residual (with this factor's own product added back) is
    # scaled by the partner factor. Both sides read the pre-update factors.
    for j in range(len(gamma_u[u])):
        
        temp_u = (((reg_param + (len(booksPerUser[u]) * (gamma_i[b][j] ** 2))) * gamma_u[u][j]) + 
                (gamma_i[b][j] * (r - r_pred + (gamma_u[u][j] * gamma_i[b][j]))))
        temp_i = (((reg_param + (len(usersPerBook[b]) * (gamma_u[u][j] ** 2))) * gamma_i[b][j]) + 
                (gamma_u[u][j] * (r - r_pred + (gamma_u[u][j] * gamma_i[b][j]))))
        temp_u /= (reg_param + ((len(booksPerUser[u]) + 1) * (gamma_i[b][j] ** 2)))
        temp_i /= (reg_param + ((len(usersPerBook[b]) + 1) * (gamma_u[u][j] ** 2)))
        
        gamma_u_temp.append(temp_u)
        gamma_i_temp.append(temp_i)
    
    # Record the interaction, then commit the staged values. Everything staged
    # above was computed from the parameters as they were before this triple.
    booksPerUser[u].add(b)
    usersPerBook[b].add(u)
    alpha = alpha_temp
    beta_u[u] = beta_u_temp
    beta_i[b] = beta_i_temp
    for j in range(len(gamma_u[u])):
        gamma_u[u][j] = gamma_u_temp[j]
        gamma_i[b][j] = gamma_i_temp[j]


In [38]:
# Evaluate the trained model on the held-out test triples.
#
# Amazon review `overall` ratings are on a 1-5 star scale, so predictions are
# clamped to [1, 5]. The previous lower bound of 0 allowed a rating that
# cannot occur. (Scale taken from the dataset's documentation — confirm if a
# different dataset is swapped in.)
MIN_RATING, MAX_RATING = 1, 5

y = []
ypred = []
for u, b, r in test_data_latent:
    y.append(r)
    known_user = u in beta_u
    known_item = b in beta_i
    if known_user and known_item:
        # Full model: offset + both biases + factor dot product.
        gamma_prod = sum(g_user * g_item for g_user, g_item in zip(gamma_u[u], gamma_i[b]))
        prediction = alpha + beta_u[u] + beta_i[b] + gamma_prod
    elif known_user:
        # Item never seen in training: fall back to offset + user bias.
        prediction = alpha + beta_u[u]
    elif known_item:
        # User never seen in training: fall back to offset + item bias.
        prediction = alpha + beta_i[b]
    else:
        # Neither seen: use the mean training rating.
        prediction = globalAverage
    # Clamp to the valid rating range, then snap to a whole-star rating.
    prediction = min(max(prediction, MIN_RATING), MAX_RATING)
    ypred.append(round(prediction))
y = numpy.array(y)
ypred = numpy.array(ypred)
# NOTE: despite its name, validMSE is measured on test_data_latent; the name
# is kept so any later cell that reads it keeps working.
validMSE = numpy.mean((y - ypred) ** 2)
print(validMSE)

0.6820123509216407


In [39]:
# Spot-check ten aligned entries: true ratings first, predictions second.
sample_window = slice(1000, 1010)
print(y[sample_window])
print(ypred[sample_window])

[4. 5. 5. 5. 5. 3. 5. 5. 5. 5.]
[4 5 4 5 5 4 4 5 4 4]
