In [1]:
import numpy as np
from scipy.sparse import csr_matrix as sparse_mat
import pickle
import sys

In [2]:
def non_zero(sparse_matrix):
    return np.transpose(sparse_matrix.nonzero())

In [3]:
def mean_nonzero(sparse_matrix):
    points = non_zero(sparse_matrix)
    sum = 0
    for point in points:
        sum += c[point[0], point[1]]
    return sum / len(points)

In [3]:
ratings = pickle.load(open('pickles/ratings_matrix.p', 'rb'))

In [4]:
ratings.shape

(552339, 77445)

In [16]:
def train(R, k=2, eta=1e-2, lamb=1e-2, threshold=1e-7, iterations=2500):
    m, n = R.shape
    U = np.random.uniform(-1, 1, (m, k))
    B = np.random.uniform(-1, 1, (n, k))
    i = 0
    prev = 0
    curr = sys.maxint
    while i < iterations and abs(curr - prev) > threshold:
        prev = curr
        cost = 0
        indices = non_zero(R)
        for idx in indices[:1000]:
            epsilon = R[idx[0], idx[1]] - np.dot(U[idx[0]], B[idx[1]].T)
            U[idx[0], :] = U[idx[0], :] + eta * (epsilon * B[idx[1]] - lamb * U[idx[0], :])
            B[idx[1], :] = B[idx[1], :] + eta * (epsilon * U[idx[0]] - lamb * B[idx[1], :])
            cost += (R[idx[0], idx[1]] - np.dot(U[idx[0]], B[idx[1]].T )) ** 2 + \
                                    lamb * (np.linalg.norm(U[idx[0]])**2 + np.linalg.norm(B[idx[1]])**2)

        cost = cost / len(non_zero(R))
        curr = cost
        print curr
        i += 1
        
    print "Total Iterations: ", i
    return U, B

In [6]:
user_map = pickle.load(open('pickles/users_map.p', 'rb'))
business_map = pickle.load(open('pickles/business_map.p', 'rb'))
reviews = pickle.load(open('pickles/reviews.p', 'rb'))

In [7]:
baselines = pickle.load(open('pickles/baselines.p', 'rb'))

In [8]:
alpha = baselines['alpha']
beta_users = baselines['beta_users']
beta_business = baselines['beta_business']

In [9]:
# removing baselines from ratings data
for idx in non_zero(ratings):
    ratings[idx[0], idx[1]] -= (alpha + beta_users[idx[0]] + beta_business[idx[1]])

In [17]:
U, B = train(ratings, k=10, iterations=50)

0.000972858238993
0.000827728432799
0.000719478781504
0.000635100261664
0.000567218875413
0.00051129480537
0.000464357430568
0.000424368774021
0.000389876693696
0.000359813964132
0.000333375953871
0.000309942992419
0.000289029309504
0.000270248389154
0.000253288807349
0.000237896965085
0.000223864477046
0.000211018777999
0.000199216000405
0.000188335486065
0.000178275494313
0.00016894980111
0.000160284972189
0.000152218154359
0.000144695271408
0.000137669540872
0.000131100249028
0.000124951736518
0.000119192557729
0.00011379478475
0.000108733432274
0.000103985983936
9.95320036443e-05
9.5352817919e-05
9.14312572649e-05
8.77514463102e-05
8.42986339616e-05
8.10590561637e-05
7.80198250406e-05
7.51688392447e-05
7.24947112413e-05
6.99867080296e-05
6.76347024492e-05
6.54291327557e-05
6.33609685975e-05
6.14216818847e-05
5.96032213477e-05
5.78979898297e-05
5.62988235665e-05
5.47989728802e-05
Total Iterations:  50


In [37]:
reviews.head(5)

Unnamed: 0,business_id,review_id,stars,text,user_id
0,5UmKMjUEUNdYWqANhGckJw,Ya85v4eqdd6k9Od8HbQjyA,4,"Mr Hoagie is an institution. Walking in, it do...",PUFPaY9KxDAcGqfsorJp3Q
1,5UmKMjUEUNdYWqANhGckJw,KPvLNJ21_4wbYNctrOwWdQ,5,Excellent food. Superb customer service. I mis...,Iu6AxdBYGR4A0wspR9BYHA
2,5UmKMjUEUNdYWqANhGckJw,fFSoGV46Yxuwbr3fHNuZig,5,Yes this place is a little out dated and not o...,auESFwWvW42h6alXgFxAXQ
3,UsFtqoBl7naz8AVUBZMjQQ,Di3exaUCFNw1V4kSNW5pgA,5,All the food is great here. But the best thing...,uK8tzraOp4M5u3uYrqIBXg
4,UsFtqoBl7naz8AVUBZMjQQ,0Lua2-PbqEQMjD9r89-asw,3,We checked this place out this past Monday for...,I_47G-R2_egp7ME5u_ltew


In [21]:
np.dot(U[user_map['W-VlJfTsCTBjknjIoTgUqw']], B[business_map['WuCcv_Dyd_1B_1-4jwK7sQ']].T) + alpha + \
        beta_users[user_map['W-VlJfTsCTBjknjIoTgUqw']] + beta_business[business_map['WuCcv_Dyd_1B_1-4jwK7sQ']]

3.7173705244021273

In [22]:
beta_users[user_map['W-VlJfTsCTBjknjIoTgUqw']]

-0.35609211342914149

In [23]:
beta_business[business_map['WuCcv_Dyd_1B_1-4jwK7sQ']]

-0.2560921134291414

In [20]:
ratings[user_map['W-VlJfTsCTBjknjIoTgUqw'], business_map['WuCcv_Dyd_1B_1-4jwK7sQ']]

0.0