In [1]:
import numpy as np
from scipy.sparse import csr_matrix as sparse_mat
import pickle
import sys

In [2]:
def non_zero(sparse_matrix, limit=1000):
    """Return the (row, col) coordinates of the non-zero entries of a matrix.

    Parameters
    ----------
    sparse_matrix : object with a ``nonzero()`` method
        Matrix to inspect (e.g. a ``scipy.sparse`` matrix or a 2-D
        ``numpy.ndarray``).
    limit : int or None, optional
        Maximum number of coordinate pairs to return. Defaults to 1000,
        which is the cap this helper has always applied. Pass ``None`` to
        get every non-zero coordinate.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(p, 2)``; each row is a ``[row, col]`` index pair.
    """
    # nonzero() yields (row_indices, col_indices); transposing turns that
    # pair of arrays into one [row, col] pair per entry.
    coords = np.transpose(sparse_matrix.nonzero())
    if limit is None:
        return coords
    return coords[:limit, :]

In [3]:
def mean_nonzero(sparse_matrix):
    """Return the mean of the non-zero entries selected by ``non_zero``.

    Only the coordinates returned by ``non_zero(sparse_matrix)`` are
    averaged, so any cap that helper applies also applies here.

    Parameters
    ----------
    sparse_matrix : matrix supporting ``nonzero()`` and ``[row, col]`` indexing

    Returns
    -------
    float
        The arithmetic mean of the selected entries, or ``nan`` when the
        matrix has no non-zero entries.
    """
    points = non_zero(sparse_matrix)
    if len(points) == 0:
        # Nothing to average; avoid a ZeroDivisionError.
        return float('nan')

    total = 0.0
    for row, col in points:
        # Read from the argument itself (the original read an undefined
        # global named ``c``).
        total += sparse_matrix[row, col]
    # float() guards against Python 2 floor division on integer data.
    return total / float(len(points))

In [34]:
# Load the pickled ratings matrix. The context manager closes the file handle
# as soon as unpickling finishes instead of leaving it to the garbage collector.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open('pickles/ratings_matrix.p', 'rb') as ratings_file:
    ratings = pickle.load(ratings_file)

In [5]:
# Show the dimensions of the ratings matrix; later cells index it as
# ratings[user_index, business_index].
ratings.shape

(552339, 77445)

In [27]:
def train(R, k=2, eta=1e-2, lamb=1e-2, threshold=1e-4, iterations=2500, seed=None):
    """Factorize the non-zero entries of ``R`` as ``U . B^T`` using SGD.

    Each epoch visits every coordinate returned by ``non_zero(R)`` once,
    takes a gradient step on the corresponding row of ``U`` and of ``B``,
    and accumulates the regularized squared error. The mean cost of the
    epoch is printed after each pass.

    Parameters
    ----------
    R : matrix supporting ``.shape``, ``nonzero()`` and ``[row, col]`` indexing
        Matrix to factorize; it is not modified.
    k : int
        Number of latent factors (columns of ``U`` and ``B``).
    eta : float
        Learning rate.
    lamb : float
        L2 regularization weight.
    threshold : float
        Training stops once the mean cost changes by no more than this
        between two consecutive epochs.
    iterations : int
        Maximum number of epochs.
    seed : int or None, optional
        Seed for the random initialization. ``None`` (default) draws from
        NumPy's global random state, as before.

    Returns
    -------
    (U, B) : tuple of numpy.ndarray
        ``U`` has shape ``(R.shape[0], k)`` and ``B`` has shape
        ``(R.shape[1], k)``.

    Raises
    ------
    ValueError
        If ``non_zero(R)`` yields no entries (previously this surfaced as
        a ZeroDivisionError when averaging the cost).
    """
    m, n = R.shape
    rng = np.random if seed is None else np.random.RandomState(seed)
    U = rng.uniform(-1, 1, (m, k))
    B = rng.uniform(-1, 1, (n, k))

    # R never changes during training, so look up the coordinates and their
    # values once instead of re-querying the matrix on every epoch.
    observed = [(row, col, R[row, col]) for row, col in non_zero(R)]
    if not observed:
        raise ValueError("R has no non-zero entries to train on")

    i = 0
    prev = 0
    # float('inf') replaces sys.maxint, which does not exist on Python 3; it
    # equally guarantees that the first two convergence checks pass.
    curr = float('inf')
    while i < iterations and abs(curr - prev) > threshold:
        prev = curr
        cost = 0
        for row, col, rating in observed:
            epsilon = rating - np.dot(U[row], B[col])
            # The B step below deliberately keeps the original ordering: it
            # uses the row of U that was just updated, with the pre-update error.
            U[row, :] = U[row, :] + eta * (epsilon * B[col] - lamb * U[row, :])
            B[col, :] = B[col, :] + eta * (epsilon * U[row] - lamb * B[col, :])
            cost += (rating - np.dot(U[row], B[col])) ** 2 + \
                lamb * (np.linalg.norm(U[row]) ** 2 + np.linalg.norm(B[col]) ** 2)

        curr = cost / len(observed)
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print(curr)
        i += 1

    # Two spaces reproduce the output of the old `print "...: ", i` statement.
    print("Total Iterations:  %d" % i)
    return U, B

In [16]:
# Load the id -> matrix-index lookups and the reviews table. Each file is
# opened in a context manager so its handle is closed right after unpickling.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open('pickles/users_map.p', 'rb') as users_file:
    user_map = pickle.load(users_file)
with open('pickles/business_map.p', 'rb') as business_file:
    business_map = pickle.load(business_file)
with open('pickles/reviews.p', 'rb') as reviews_file:
    reviews = pickle.load(reviews_file)

In [17]:
# Load the precomputed baseline terms. The context manager closes the file
# handle as soon as unpickling finishes.
# NOTE: pickle.load can execute arbitrary code -- only load trusted files.
with open('pickles/baselines.p', 'rb') as baselines_file:
    baselines = pickle.load(baselines_file)

In [18]:
# Pull the three baseline terms out of the loaded dict: the offset `alpha`
# plus the per-user and per-business terms that are subtracted from the
# ratings further down.
alpha, beta_users, beta_business = (
    baselines[term] for term in ('alpha', 'beta_users', 'beta_business')
)

In [35]:
# Turn ratings into residuals: for every coordinate returned by non_zero,
# subtract alpha + user term + business term in place, and echo the result.
# NOTE: this mutates `ratings`; re-running the cell subtracts the baselines
# again, so reload the matrix before re-executing.
for idx in non_zero(ratings):
    user_row, business_col = idx
    baseline = alpha + beta_users[user_row] + beta_business[business_col]
    ratings[user_row, business_col] -= baseline
    print(ratings[user_row, business_col])

0.616092113429
0.596092113429
-0.403907886571
0.596092113429
0.0960921134291
0.0960921134291
1.09609211343
1.59609211343
0.0960921134291
1.09609211343
0.0960921134291
-0.903907886571
-0.403907886571
-2.40390788657
0.0960921134291
-0.903907886571
0.596092113429
0.596092113429
1.09609211343
-0.403907886571
-0.403907886571
-0.903907886571
0.0960921134291
-1.40390788657
-0.903907886571
0.596092113429
0.0960921134291
0.0960921134291
0.596092113429
-0.343907886571
-0.843907886571
2.15609211343
-1.34390788657
1.15609211343
0.116092113429
0.116092113429
-0.0439078865709
-0.593907886571
-0.693907886571
0.876092113429
0.176092113429
-0.823907886571
1.17609211343
0.676092113429
-2.82390788657
0.676092113429
0.176092113429
0.176092113429
-1.32390788657
-2.32390788657
-0.323907886571
0.176092113429
1.17609211343
-0.323907886571
-1.82390788657
-1.32390788657
1.17609211343
1.17609211343
0.676092113429
0.676092113429
1.17609211343
-1.32390788657
-0.823907886571
1.67609211343
-1.32390788657
-2.32390788

In [36]:
# Fit the factorization with 10 latent factors for at most 50 epochs;
# train() prints the mean cost after every epoch.
U, B = train(ratings, k=10, iterations=50)

2.0023452458
1.71097732026
1.4940092052
1.32520633071
1.18955259087
1.07778728525
0.983866593658
0.903664737861
0.834259808504
0.773518272429
0.719841853407
0.672007220293
0.629060974077
0.590248772188
0.554966184978
0.52272376907
0.493121672194
0.465830774228
0.440578406897
0.417137347665
0.39531720328
0.374957573291
0.355922566388
0.338096364973
0.321379616308
0.305686485178
0.290942242126
0.277081288734
0.264045541358
0.25178310959
0.240247217393
0.229395324234
0.219188411202
0.209590403464
0.200567705539
0.192088830027
0.18412410367
0.176645437199
0.169626147403
0.163040821473
0.156865215015
0.151076176275
0.145651590226
0.140570337102
0.135812260945
0.131358144566
0.127189688123
0.123289489229
0.119641023094
0.116228621731
Total Iterations:  50


In [37]:
# Peek at the first five reviews to pick user/business ids for a spot check.
reviews.head(5)

Unnamed: 0,business_id,review_id,stars,text,user_id
0,5UmKMjUEUNdYWqANhGckJw,Ya85v4eqdd6k9Od8HbQjyA,4,"Mr Hoagie is an institution. Walking in, it do...",PUFPaY9KxDAcGqfsorJp3Q
1,5UmKMjUEUNdYWqANhGckJw,KPvLNJ21_4wbYNctrOwWdQ,5,Excellent food. Superb customer service. I mis...,Iu6AxdBYGR4A0wspR9BYHA
2,5UmKMjUEUNdYWqANhGckJw,fFSoGV46Yxuwbr3fHNuZig,5,Yes this place is a little out dated and not o...,auESFwWvW42h6alXgFxAXQ
3,UsFtqoBl7naz8AVUBZMjQQ,Di3exaUCFNw1V4kSNW5pgA,5,All the food is great here. But the best thing...,uK8tzraOp4M5u3uYrqIBXg
4,UsFtqoBl7naz8AVUBZMjQQ,0Lua2-PbqEQMjD9r89-asw,3,We checked this place out this past Monday for...,I_47G-R2_egp7ME5u_ltew


In [38]:
# Model output for one (user, business) pair: the inner product of the
# user's and the business's latent factor rows.
U[user_map['PUFPaY9KxDAcGqfsorJp3Q']].dot(B[business_map['5UmKMjUEUNdYWqANhGckJw']])

-0.52675369487902812

In [41]:
# Look up the stored rating for one (user, business) pair.
# NOTE(review): this user id differs from the one used in the prediction cell
# above, so the two values are not for the same review -- confirm the intent.
# NOTE(review): baselines were only subtracted at the coordinates returned by
# non_zero, so this entry may still hold the raw star rating -- verify.
ratings[user_map['auESFwWvW42h6alXgFxAXQ'], business_map['5UmKMjUEUNdYWqANhGckJw']]

5.0