In [15]:
import json
import pandas as pd

# create pandas dataframe from specified file containing
# a separate json object on each line for each observation
def read_file(filename):
    """Load a JSON-lines file into a pandas DataFrame.

    Parameters
    ----------
    filename : str or path-like
        Path of a text file holding one JSON object per line.

    Returns
    -------
    pandas.DataFrame
        One row per JSON object and one column per key. A key missing
        from some record shows up as a missing value in that row.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # `with` closes the handle even when a line fails to parse
    with open(filename, 'r', encoding='utf-8') as file:
        # iterate the handle lazily and ignore blank lines (e.g. a trailing newline)
        records = [json.loads(line) for line in file if line.strip()]
    # building from records keeps every row aligned even if keys differ between lines
    return pd.DataFrame(records)
    
# load the three splits (train, test, val — in that order) and show their shapes
train, test, val = (
    read_file(path) for path in ('train.json', 'test.json', 'val.json')
)

print(train.shape, test.shape, val.shape)
train.head()

(841020, 5) (8582, 5) (8582, 5)


Unnamed: 0,review_id,business_id,user_id,stars,date
0,xW294l3Lwh0cxlHU1jwRDA,na4Th5DrNauOv-c43QQFvA,nkN_do3fJ9xekchVC-v68A,5.0,2004-10-19 02:46:40
1,VgBxQqaDgOHq9YRxtMUm3A,6xgcHeLad-VaoTIQewK84A,G5LXEaxhQMF_BVjNHkVr7g,5.0,2005-04-27 23:38:05
2,lgpSS6UsKYIvnQaw8JwHlQ,--9e1ONYQuAa-CB_Rrw7Tw,G5LXEaxhQMF_BVjNHkVr7g,5.0,2005-04-27 23:38:49
3,xKf_Y6PmebgJtFLULpYCzA,Wxxvi3LZbHNIDwJ-ZimtnA,G5LXEaxhQMF_BVjNHkVr7g,5.0,2005-04-27 23:44:33
4,9P9aP_laBPpHWbdY9ZZYyA,BjH8Xepc10i6OhCDQdX6og,nzsv-p1O8gCfP3XijfQrIw,4.0,2005-04-28 04:42:38


In [16]:
import numpy as np

# Gather every distinct user / business id seen in any split (sorted, as
# np.union1d would return them) and give each one a consecutive integer index
user_ids = np.unique(
    np.concatenate([val['user_id'], test['user_id'], train['user_id']])
)
user_dict = {uid: idx for idx, uid in enumerate(user_ids)}
biz_ids = np.unique(
    np.concatenate([val['business_id'], test['business_id'], train['business_id']])
)
biz_dict = {bid: idx for idx, bid in enumerate(biz_ids)}

# Tasks
## Task 1
Estimate the global bias $b_g$, the user-specific bias $b_i$, and the item-specific bias $b_j$ on the training data. Report the global bias, plus the user-specific and item-specific bias estimates for one example user and one example business, respectively.

In [17]:
# estimate global, user, and item biases of reviews in dataframe
def get_bias(df):
    """Estimate rating biases from a frame of reviews.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'stars', 'user_id' and 'business_id'.

    Returns
    -------
    b_g : float
        Mean of 'stars' over all rows.
    b_i : dict
        user_id -> (that user's mean stars - b_g).
    b_j : dict
        business_id -> (that business's mean stars - b_g).
    """
    b_g = df['stars'].mean()
    # A bias is an offset that is ADDED to the global mean in a prediction
    # (b_g + b_i + b_j + ...), so it must be "group mean minus global mean":
    # positive for users/businesses rated above average.
    b_i = df.groupby('user_id')['stars'].mean() - b_g
    b_j = df.groupby('business_id')['stars'].mean() - b_g
    return b_g, b_i.to_dict(), b_j.to_dict()

# bias estimates are computed from the training split only
b_g, b_i, b_j = get_bias(train)

## Task 2
Train a Latent Factor Model without bias, with k=8 factors, for 10 epochs, with learning rate 0.01 and regularisation 0.3, reporting RMSE for each epoch.

In [18]:
from tqdm.notebook import trange, tqdm
from scipy import sparse

# translate each training review's ids into (row, column) matrix coordinates
b = list(map(biz_dict.__getitem__, train['business_id']))
u = list(map(user_dict.__getitem__, train['user_id']))
r = train['stars']
# dense user x business matrix of star ratings; cells without a review stay 0
R = sparse.coo_matrix(
    (r, (u, b)),
    shape=(len(user_ids), len(biz_ids)),
).toarray()
R

array([[4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [54]:
def train_LFM(r=R, k=8, n_epochs=10, lr=0.01, lmda=0.3, verbose=0):
    """Fit a bias-free latent factor model by full-batch gradient descent.

    Parameters
    ----------
    r : 2-D dense array
        Rating matrix with one row per user and one column per business.
        An entry equal to 0 is treated as "no rating" and is masked out.
    k : int
        Number of latent factors.
    n_epochs : int
        Number of gradient steps (one per epoch).
    lr : float
        Learning rate.
    lmda : float
        L2 regularisation strength applied to both factor matrices.
    verbose : int or bool
        When truthy, print the RMSE after every epoch.

    Returns
    -------
    dict
        'k' (as passed), 'RMSE' (array of length n_epochs, training RMSE
        measured before each step), 'Q' (rows x k user factors) and
        'P' (columns x k business factors).
    """
    # size the factors from the matrix itself so any rating matrix can be passed
    n_users, n_items = r.shape
    Q = np.random.standard_normal((n_users, k))
    P = np.random.standard_normal((n_items, k))
    RMSE = np.zeros(n_epochs)
    # the mask of observed ratings does not change between epochs
    relevant = r != 0

    # NOTE(review): each step sums the gradient over ALL ratings of a user /
    # business, so the effective step grows with the number of ratings; the
    # recorded run shows RMSE rising between epochs with lr=0.01. Consider
    # per-rating SGD or a smaller lr — confirm against the task requirements.
    for e in trange(n_epochs):
        # predictions only where a rating exists, so unobserved cells give err == 0
        rhat = Q.dot(P.transpose()) * relevant
        err = r - rhat
        dLq = -2*err.dot(P) + 2*lmda*Q
        dLp = -2*err.transpose().dot(Q) + 2*lmda*P
        Q -= lr * dLq
        P -= lr * dLp
        # average over every observed rating, including any whose error is exactly 0
        RMSE[e] = np.sqrt(np.mean(err[relevant]**2))
        if verbose:
            print(f'Epoch {e} RMSE: {RMSE[e]:.4f}')
    return {'k': k, 'RMSE': RMSE, 'Q': Q, 'P': P}

In [56]:
# k=8 model on the default rating matrix, printing the RMSE of every epoch
lfm8 = train_LFM(k=8, verbose=1)

  0%|          | 0/10 [00:00<?, ?it/s]

Epoch 0 RMSE: 4.9290
Epoch 1 RMSE: 8.5451


KeyboardInterrupt: 

## Task 3
Report the RMSE on the validation set for an LFM trained with each $k \in \{4, 8, 16\}$. Choose the model with the best validation RMSE and report its RMSE on the test set.

In [None]:
# remaining candidate factor counts, trained with the default settings (no per-epoch printing)
lfm4 = train_LFM(k=4)
lfm16 = train_LFM(k=16)

In [None]:
def get_predictions(lfm, dataset):
    """Predict a rating for every row of `dataset` with a trained model.

    Works for both model layouts produced in this notebook:

    * factor matrices (`lfm['Q']` / `lfm['P']` are arrays whose rows are
      addressed through `user_dict` / `biz_dict`), and
    * factor dicts keyed directly by user / business id, optionally with
      bias terms under 'b_i' and 'b_j'.

    Parameters
    ----------
    lfm : dict
        Trained model containing at least 'Q' and 'P'.
    dataset : pandas.DataFrame
        Must contain the columns 'user_id' and 'business_id'.

    Returns
    -------
    list
        One predicted rating per row, in row order.
    """
    Q = lfm['Q']
    P = lfm['P']
    # dict factors are keyed by the raw id, array factors by the integer index
    keyed_by_id = isinstance(Q, dict)
    has_bias = 'b_i' in lfm and 'b_j' in lfm
    if has_bias:
        # fall back to the notebook-level global mean when the model does not carry its own
        global_bias = lfm['b_g'] if 'b_g' in lfm else b_g
        user_bias = lfm['b_i']
        biz_bias = lfm['b_j']

    predictions = []
    # columns are read by name so the lookup does not depend on column order
    for user, business in zip(dataset['user_id'], dataset['business_id']):
        q_u = Q[user] if keyed_by_id else Q[user_dict[user]]
        p_b = P[business] if keyed_by_id else P[biz_dict[business]]
        rating = q_u.dot(p_b)
        if has_bias:
            # ids without an estimated bias contribute a neutral offset of 0
            rating = (global_bias + user_bias.get(user, 0.0)
                      + biz_bias.get(business, 0.0) + rating)
        predictions.append(rating)
    return predictions

def get_rmse(lfm, dataset):
    """Root-mean-square error of the model's predictions against the
    'stars' column of `dataset`."""
    predicted = get_predictions(lfm, dataset)
    residuals = dataset['stars'] - predicted
    mean_squared = np.mean(residuals**2)
    return np.sqrt(mean_squared)

In [None]:
# validation RMSE of each candidate number of factors
for lfm in (lfm4, lfm8, lfm16):
    val_rmse = get_rmse(lfm, val)
    print(f'LFM k={lfm["k"]} validation RMSE: {val_rmse}')

In [None]:
# report the model with the lowest validation RMSE instead of assuming k=16
best_lfm = min([lfm4, lfm8, lfm16], key=lambda model: get_rmse(model, val))
print(f'LFM k={best_lfm["k"]} test RMSE: {get_rmse(best_lfm, test)}')

## Task 4
Add bias terms to the LFM, initialising them with the estimated biases from Task 1. Train a model as in Task 2, reporting the RMSE of each epoch and the learned bias of a single user and a single business.

In [None]:
def LFM_bias(b_g, b_i, b_j, data=None, k=8, n_epochs=10, lr=0.01, lmda=0.3, verbose=0):
    """Train a latent factor model with bias terms by per-rating SGD.

    Parameters
    ----------
    b_g : float
        Global bias; used as-is and never updated.
    b_i, b_j : dict
        Initial user / business biases keyed by id. They are copied, so the
        caller's dicts are left untouched.
    data : sequence of rows, optional
        Each row is indexed positionally: elements 1, 2 and 3 are read as
        business id, user id and rating. Defaults to `train.to_numpy()`.
        NOTE(review): the previous default, `train_obs`, is not defined in
        the visible notebook; presumably it was `train.to_numpy()` — confirm.
    k : int
        Number of latent factors.
    n_epochs : int
        Number of passes over `data`.
    lr : float
        Learning rate.
    lmda : float
        L2 regularisation strength for factors and biases.
    verbose : int or bool
        When truthy, show a progress bar and print the RMSE of each epoch.

    Returns
    -------
    dict
        'k', 'RMSE' (per-epoch running training RMSE), 'Q' and 'P' (dicts
        id -> factor vector), 'b_g', and the learned 'b_i' / 'b_j' dicts.
    """
    if data is None:
        # resolved at call time so defining the function needs no extra global
        data = train.to_numpy()
    # work on private copies; training must not alter the caller's estimates
    b_i = dict(b_i)
    b_j = dict(b_j)
    Q = {u: np.random.rand(k) for u in user_ids}
    P = {b: np.random.rand(k) for b in biz_ids}
    RMSE = np.zeros(n_epochs)

    for e in range(n_epochs):
        SE = np.zeros(len(data))
        rng = trange(len(data), leave=False, desc=f'Ep. {e}') if verbose else range(len(data))
        for i in rng:
            b, u, r = data[i][1:4]
            # ids without an initial estimate start from a neutral bias of 0
            bias_u = b_i.get(u, 0.0)
            bias_b = b_j.get(b, 0.0)
            rhat = b_g + bias_u + bias_b + Q[u].dot(P[b])
            err = r - rhat
            SE[i] = err**2
            # compute both factor gradients from the CURRENT values before
            # touching either vector; Q[u] is updated in place, so computing
            # the P gradient afterwards would use the already-moved Q[u]
            grad_q = -2*err*P[b] + 2*lmda*Q[u]
            grad_p = -2*err*Q[u] + 2*lmda*P[b]
            Q[u] -= lr * grad_q
            P[b] -= lr * grad_p
            b_i[u] = bias_u - lr * (-2*err + 2*lmda*bias_u)
            b_j[b] = bias_b - lr * (-2*err + 2*lmda*bias_b)
        RMSE[e] = np.sqrt(np.mean(SE))
        if verbose:
            print(f'Epoch {e} RMSE: {RMSE[e]:.4f}')
    # b_g is included so the returned model is self-contained for prediction
    return {'k': k, 'RMSE': RMSE, 'Q': Q, 'P': P, 'b_g': b_g, 'b_i': b_i, 'b_j': b_j}

In [None]:
# k=8 bias model started from copies of the Task 1 bias estimates, printing the RMSE of every epoch
lfm8_b = LFM_bias(b_g, b_i.copy(), b_j.copy(), k=8, verbose=1)

In [None]:
usr = 'b4aIMeXOx4cn3bjtdIOo6Q'
biz = '7VQYoXk3Tc8EZeKuXeixeg'

# Look the values up before formatting: reusing the f-string's own quote
# character inside a replacement field is a SyntaxError before Python 3.12.
usr_bias = lfm8_b['b_i'][usr]
biz_bias = lfm8_b['b_j'][biz]

print(f'Bias of user "{usr}": {usr_bias:.2f}')
print(f'Bias of business "{biz}": {biz_bias:.2f}')

In [None]:
# bias models for the remaining factor counts, each started from its own copy of the Task 1 biases
lfm4_b = LFM_bias(b_g, b_i.copy(), b_j.copy(), k=4)
lfm16_b = LFM_bias(b_g, b_i.copy(), b_j.copy(), k=16)

In [None]:
# validation RMSE of each bias model
for lfm in (lfm4_b, lfm8_b, lfm16_b):
    val_rmse = get_rmse(lfm, val)
    print(f'LFM with bias k={lfm["k"]} validation RMSE: {val_rmse}')

In [None]:
# report the bias model with the lowest validation RMSE instead of assuming k=16
best_lfm_b = min([lfm4_b, lfm8_b, lfm16_b], key=lambda model: get_rmse(model, val))
print(f'LFM with bias k={best_lfm_b["k"]} test RMSE: {get_rmse(best_lfm_b, test)}')