In [3]:
import numpy as np
from scipy.sparse import csr_matrix as sparse_mat
import pickle
import sys
from sklearn.cross_validation import train_test_split

In [4]:
def non_zero(sparse_matrix):
    return np.transpose(sparse_matrix.nonzero())

In [5]:
ratings = pickle.load(open('pickles/ratings_matrix.p', 'rb'))

In [6]:
ratings.shape

(552339, 77445)

---
## Splitting the Dataset into Training, Validation and Testing Sets

In [8]:
training = []
validation = []
testing = []
for j, business in enumerate(ratings.T):
    indices = non_zero(business)
    indices = [[idx[1], j] for idx in indices]
    if len(indices) > 10:
        train, optimization = train_test_split(indices, test_size=0.3, random_state=42)
        training.extend(train)
        valid, test = train_test_split(optimization, test_size=0.33, random_state=42)
        validation.extend(valid)
        testing.extend(test)
    else:
        training.extend(indices)

#### Saving split data into pickles, so that we can evaluate all models on the same training, validation and testing sets

In [10]:
pickle.dump(training, open('pickles/training_indices.p', 'wb'))
pickle.dump(validation, open('pickles/validation_indices.p', 'wb'))
pickle.dump(testing, open('pickles/testing_indices.p', 'wb'))

In [11]:
training = pickle.load(open('pickles/training_indices.p', 'rb'))
validation = pickle.load(open('pickles/validation_indices.p', 'rb'))
testing = pickle.load(open('pickles/testing_indices.p', 'rb'))

## The Latent Factor Model, without considering reviews:

In [13]:
def train_latent_factors(R, training_indices, k=2, eta=1e-2, lamb=1e-2, threshold=1e-7, iterations=2500):
    m, n = R.shape
    U = np.random.uniform(-1, 1, (m, k))
    B = np.random.uniform(-1, 1, (n, k))
    i = 0
    prev = 0
    curr = sys.maxint
    while i < iterations and abs(curr - prev) > threshold:
        prev = curr
        cost = 0
        for idx in training_indices[:1000]:
            epsilon = R[idx[0], idx[1]] - np.dot(U[idx[0]], B[idx[1]].T)
            U[idx[0], :] = U[idx[0], :] + eta * (epsilon * B[idx[1]] - lamb * U[idx[0], :])
            B[idx[1], :] = B[idx[1], :] + eta * (epsilon * U[idx[0]] - lamb * B[idx[1], :])
            cost += (R[idx[0], idx[1]] - np.dot(U[idx[0]], B[idx[1]].T )) ** 2 + \
                                    lamb * (np.linalg.norm(U[idx[0]])**2 + np.linalg.norm(B[idx[1]])**2)

        cost = cost / len(non_zero(R))
        curr = cost
        print curr
        i += 1
        
    print "Total Iterations: ", i
    return U, B

In [16]:
user_map = pickle.load(open('pickles/users_map.p', 'rb'))
business_map = pickle.load(open('pickles/business_map.p', 'rb'))
reviews = pickle.load(open('pickles/reviews.p', 'rb'))

#### Global and Individual Baselines have to be adjusted before training

In [17]:
baselines = pickle.load(open('pickles/baselines.p', 'rb'))
alpha = baselines['alpha']
beta_users = baselines['beta_users']
beta_business = baselines['beta_business']

In [18]:
# removing baselines from ratings data
for idx in non_zero(ratings):
    ratings[idx[0], idx[1]] -= (alpha + beta_users[idx[0]] + beta_business[idx[1]])

In [19]:
U, B = train_latent_factors(ratings, training, k=10, iterations=50)

0.000850987737661
0.000715884893602
0.000621437228679
0.000551492144223
0.000497422119349
0.000454202038018
0.000418706776023
0.000388890672492
0.0003633590322
0.000341129454906
0.000321491573338
0.000303920778515
0.00028802302697
0.000273498307584
0.000260115728536
0.000247696077583
0.000236099327363
0.0002252154965
0.000214957840416
0.000205257693694
0.000196060506569
0.000187322761477
0.000179009550455
0.000171092657916
0.000163549036647
0.000156359594491
0.000149508229676
0.000142981066994
0.000136765857163
0.00013085150906
0.000125227730074
0.000119884754202
0.00011481314104
0.000110003631786
0.000105447050826
0.000101134243525
9.7056042538e-05
9.32032562931e-05
8.95666743826e-05
8.61370854656e-05
8.29053039624e-05
7.98622024088e-05
7.69987468342e-05
7.43060329834e-05
7.17753216215e-05
6.93980715571e-05
6.71659693821e-05
6.50709552622e-05
6.31052444006e-05
6.12613440465e-05
Total Iterations:  50


In [20]:
reviews.head(5)

Unnamed: 0,business_id,review_id,stars,text,user_id
0,5UmKMjUEUNdYWqANhGckJw,Ya85v4eqdd6k9Od8HbQjyA,4,"Mr Hoagie is an institution. Walking in, it do...",PUFPaY9KxDAcGqfsorJp3Q
1,5UmKMjUEUNdYWqANhGckJw,KPvLNJ21_4wbYNctrOwWdQ,5,Excellent food. Superb customer service. I mis...,Iu6AxdBYGR4A0wspR9BYHA
2,5UmKMjUEUNdYWqANhGckJw,fFSoGV46Yxuwbr3fHNuZig,5,Yes this place is a little out dated and not o...,auESFwWvW42h6alXgFxAXQ
3,UsFtqoBl7naz8AVUBZMjQQ,Di3exaUCFNw1V4kSNW5pgA,5,All the food is great here. But the best thing...,uK8tzraOp4M5u3uYrqIBXg
4,UsFtqoBl7naz8AVUBZMjQQ,0Lua2-PbqEQMjD9r89-asw,3,We checked this place out this past Monday for...,I_47G-R2_egp7ME5u_ltew


In [21]:
np.dot(U[user_map['W-VlJfTsCTBjknjIoTgUqw']], B[business_map['WuCcv_Dyd_1B_1-4jwK7sQ']].T) + alpha + \
        beta_users[user_map['W-VlJfTsCTBjknjIoTgUqw']] + beta_business[business_map['WuCcv_Dyd_1B_1-4jwK7sQ']]

1.9104009473961998

In [22]:
len(training)

1561427

In [23]:
len(validation)

385158

In [24]:
len(testing)

208836