In [2]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
import sklearn
from sklearn.decomposition import TruncatedSVD
import math

# Load the three training tables, keeping only the columns used below.
# `usecols` picks columns by position and pandas returns them in file order,
# so the names assigned afterwards are listed in ascending column position.
# on_bad_lines='skip' drops malformed rows; it replaces error_bad_lines=False,
# which was deprecated in pandas 1.3 and removed in pandas 2.0.
business = pd.read_csv('business.csv', sep=',', usecols=[5, 41, 42, 55, 58, 59], on_bad_lines='skip', encoding="utf-8")
business.columns = ['ambiance', 'business id' , 'categories', 'name', 'num reviews','stars']

user = pd.read_csv('users.csv', sep=',', usecols=[0, 17, 18, 20], on_bad_lines='skip', encoding="utf-8")
user.columns = ['average rating', 'name', 'review count', 'user id']

review = pd.read_csv('train_reviews.csv', sep=',', usecols=[0, 4, 5, 8], on_bad_lines='skip', encoding="utf-8")
review.columns = ['business id', 'review id', 'stars', 'user id']


In [3]:
# Preview the first rows to check the renamed business columns.
business.head()

Unnamed: 0,ambiance,business id,categories,name,num reviews,stars
0,"{'romantic': False, 'intimate': False, 'classy...",KuxDPl6UYNLxFChPm0_MNw,"Cajun/Creole, Southern, Restaurants",Southern Accent Restaurant,146,4.0
1,"{'romantic': False, 'intimate': False, 'classy...",6SAfQKe2oM5g_EtcYXyAMg,"Bars, Sports Bars, Dive Bars, Burgers, Nightli...",Original Hamburger Works,277,4.0
2,"{'romantic': False, 'intimate': False, 'classy...",upB0RQl-l529IVwgOpwOQQ,"Nightlife, Restaurants, Bars, Chicken Wings, A...",Chicken Lips,102,4.5
3,"{'romantic': False, 'intimate': False, 'classy...",TulmRC5V0--dnXYd_GOSvA,"Beer, Wine & Spirits, Italian, Food, American ...",Omelet House Summerlin,242,4.0
4,"{'romantic': False, 'intimate': False, 'classy...",yqYtY3-Po4OVPafA9Z-Xyw,"Event Planning & Services, Soup, Salad, Mexica...",Manuel's Mexican Restaurant & Cantina - Bell Rd,230,3.5


In [4]:
# Preview the first rows to check the renamed user columns.
user.head()

Unnamed: 0,average rating,name,review count,user id
0,2.83,James,6,UxfpKHGO2dfQCdS9xLLJow
1,3.0,Yvonne,4,Kr5NDQFPPB_01-5CDmSqVg
2,3.09,Etobicoke,10,wfoeMtriLwZsdRzcxNTaFA
3,4.0,Shirley,4,aXb0kCIsIbPEEUSGomrrmA
4,4.0,Brian,1,sLrX2KGu3lc_JczAnsg0_Q


In [5]:
# Preview the first rows to check the renamed review columns.
review.head()

Unnamed: 0,business id,review id,stars,user id
0,WPCgtEG-bJt0cZtnM-x7yw,6sferX8QOJq9g5PONKZH4w,4.0,VDh1vjzpNUJH6HfcjH8g7Q
1,LnnO7quTjjdTUkCshSJnkA,OybsTOFlklnoFl8ZwRa_AQ,5.0,HnnjIuLrdhLTsRRVrrFIjA
2,sKrlmbrZWCyLIgiMihCPqw,fE-Z39pw4Mb5yjr9GihtHQ,5.0,HnnjIuLrdhLTsRRVrrFIjA
3,Lh5qnT2m2b4lvyYiMGMDkg,DuC9JQLQ3yY0kPsonFvk-Q,4.0,HnnjIuLrdhLTsRRVrrFIjA
4,54LYVM1gCGQ2UVFK9QhgTw,cUFJE_U6s6q9_tjDw3nTmA,5.0,HnnjIuLrdhLTsRRVrrFIjA


In [6]:
#
# CREATE MAPPINGS OF IDs TO OBJECTS
#
# Columns are addressed by label instead of integer position: an integer key
# on a string-labelled Series (e.g. row[3]) is a deprecated positional lookup
# in recent pandas releases.

# user id -> full user row (a pandas Series)
userId = {row['user id']: row for _, row in user.iterrows()}

# business id -> full business row (a pandas Series)
businessId = {row['business id']: row for _, row in business.iterrows()}

# (business id, user id) -> stars; if a pair occurs more than once the last
# occurrence wins, exactly as with sequential dict assignment
ratings = dict(zip(zip(review['business id'], review['user id']), review['stars']))
        

bizid: f_eiOrEcMnkHB7GvQVOHkQ rating: 4.0


In [7]:
import collections

# user2reviews maps a user to a map of business : rating
# user2reviews[user id][business id] = stars
user2reviews = collections.defaultdict(dict)

# Walk the three columns in lockstep by label; this avoids both the per-row
# Series that iterrows() builds and deprecated positional lookups like row[3].
for biz_id, user_id, stars in zip(review['business id'], review['user id'], review['stars']):
    user2reviews[user_id][biz_id] = stars

In [8]:
# just for testing: spot-check one hard-coded (user id, business id) pair.
# The inner lookup is on a plain dict, so it raises KeyError when that
# business is not among the user's ratings.
print(user2reviews['v1zm5ES3dZtn0htQNMr4Gg']['t41_VSBs7akY2POWNtzqxw'])

# print(user2reviews['QGe-bLXLO497G7NfKOFKcA'])

5.0


In [9]:
#
# BUILD A DENSE 2D TABLE OF USER (ROWS) BY BUSINESS (COLUMNS) WITH A PIVOT
# EACH CELL HOLDS THE STARS VALUE; PAIRS WITHOUT A REVIEW ARE SET TO 0

user_biz_matrix = (
    review
    .pivot(index="user id", columns="business id", values="stars")
    .fillna(0)
)
user_biz_matrix.head()

business id,--FBCX-N37CMYDfs790Bnw,-050d_XIor1NpCuWkbIVaQ,-0tgMGl7D9B10YjSN2ujLA,-11PbySWhJQtK6USx4IP2A,-1UMR00eXtwaeh59pEiDjA,-1VaIJza42Hjev6ukacCNg,-1m9o3vGRA8IBPNvNqKLmA,-1xuC540Nycht_iWFeJ-dw,-2ToCaDFpTNmmg3QFzxcWg,-3oxnPPPU3YoxO9M1I2idg,...,zwNLJ2VglfEvGu7DDZjJ4g,zwmps5SXn30g-f5wqg_r9A,zwvshlu1bE2na9sXYrP0TQ,zxSfGIhK3hH3vVz_pS5eaA,zx_xDVaVQlE3eDoFFENkow,zyEQSfxX3FPIP6qyNsU0AA,zyPGYeXF4XKCqNN1pjFWhg,zzSYBWuv_fXGtSgsO-6_1g,zzUj3ej4vm_DtvRxNvWDEw,zzmIMvqiBJ_-wVKg_OnGpw
user id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
--B6JIfTRS2gBmqKhCaI5g,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--UNNdnHRhsyFUbDgumdtQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--XsxD0sMPKjWzApqy43XQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
--mo7orpQqNKLstiu6kapQ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-0VMIZo5H9ukdTfD2SPd2A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
#
# CONVERT THE ZERO-FILLED PIVOT TABLE TO A CSR (COMPRESSED SPARSE ROW) MATRIX
# NOTE(review): user_biz_csr is not referenced by any later cell visible in
# this notebook; the kNN model is fitted on the dense DataFrame instead.
user_biz_csr = csr_matrix(user_biz_matrix.values)

In [11]:
#
# MAP USERID TO USER_BIZ_MATRIX LOCATION (row position)
# Only the index labels are needed, so enumerate the index directly instead of
# materialising every wide row with iterrows().
userid2idx = {userid: idx for idx, userid in enumerate(user_biz_matrix.index)}

In [12]:
#
# LEARN THE MODEL
# Brute-force nearest-neighbour index over the rows of user_biz_matrix
# (one row per user), compared with cosine distance.
from sklearn.neighbors import NearestNeighbors
model_knn = NearestNeighbors(metric='cosine', algorithm = 'brute')
model_knn.fit(user_biz_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [13]:
#
# fetch the 6 closest rows (distances and row positions) for one query user.
# 6 rather than 5 because the display cell skips position 0 -- presumably the
# query user matching itself at distance 0; confirm when rows can tie.
# query_index = np.random.choice(user_biz_matrix.shape[0])
query_index = 0
distances, indices = model_knn.kneighbors(user_biz_matrix[query_index:query_index+1], n_neighbors = 6)
# print(query_index)


In [14]:
#
# display the K nearest neighbors
flat_indices = indices.flatten()
flat_distances = distances.flatten()
for rank in range(len(flat_distances)):
    if rank == 0:
        # position 0 only produces the header line naming the query user
        print('Recommendations for {0}:\n'.format(user_biz_matrix.index[query_index]))
        continue
    neighbor_label = user_biz_matrix.index[flat_indices[rank]]
    print('{0}: {1}, with distance of {2}:'.format(rank, neighbor_label, flat_distances[rank]))

Recommendations for --B6JIfTRS2gBmqKhCaI5g:

1: kbfpED-6FURIsNlsAXqc2g, with distance of 0.1425070742874559:
2: vbddiP0dB-Ul4UQfzVDatg, with distance of 0.1425070742874559:
3: PyTNdQF4CE4Avs9EMUAJgg, with distance of 0.48550424457247354:
4: N5c0CxxU7jlZQ8lTK7KxLg, with distance of 0.7990005579736766:
5: Lm6GbEZlYxX4q2qCG1Bb5w, with distance of 0.8067429881158505:


In [17]:
# Load the (user, business) pairs to predict. on_bad_lines='skip' drops
# malformed rows; it replaces error_bad_lines=False, which was deprecated in
# pandas 1.3 and removed in pandas 2.0.
test_data = pd.read_csv('test_queries.csv', sep=',', on_bad_lines='skip', encoding="utf-8")
test_data.columns = ['userid', 'bizid']
test_data.head()

Unnamed: 0,userid,bizid
0,XEDaNNCTVAqPpvyX2zY03g,e880MGw6C6PJL-n3EJoY_g
1,E9WotdbCBpoeID-DHSQh1A,ExLXL_oQvI7oPB7TrVpGhg
2,8erAzkOo9KHs1Njo5bMm_Q,XKQsxvDMe9rAWO0P0Uwx-Q
3,eHJTLVy1bm7zSGVQlNH6KA,awI4hHMfa7H0Xf0-ChU5hg
4,KF2L4CDnmPz6diaH6TrxbA,jKrlc4xQIvfzDZ8eNfqxwg


In [None]:
# Predict a star rating for every (userid, bizid) query in test_data.
#   * user present in the training matrix: average the ratings that the
#     user's nearest neighbours gave to the queried business;
#   * unknown user, or no neighbour rated the business: fall back to the
#     business's own 'stars' value from businessId.
# Loop variables are deliberately NOT named `business` / `user`, so the
# DataFrames of those names loaded at the top of the notebook stay intact and
# earlier cells can be re-run.

N_QUERIED = 5          # rows requested from kneighbors per user
N_USED = 4             # neighbours averaged once the query user itself is removed
PROGRESS_EVERY = 1000  # print progress once per this many queries

# initialize return DF (a dict of parallel lists; converted to a DataFrame in the next cell)
df = {'index': [], 'stars': []}

n_queries = len(test_data)
# row position of a user -> row positions of that user's neighbours.
# The brute-force kneighbors query is the slow step, so run it once per user.
neighbor_cache = {}

# iterate through data frame
for i, row in test_data.iterrows():
    user_id = row['userid']
    biz_id = row['bizid']

    neighbor_ratings = []
    if user_id in userid2idx:
        user_idx = userid2idx[user_id]
        if user_idx not in neighbor_cache:
            _, found = model_knn.kneighbors(user_biz_matrix[user_idx:user_idx + 1], n_neighbors=N_QUERIED)
            # Drop the query user by identity rather than assuming it sits at
            # position 0: with tied distances another user can come first.
            neighbor_cache[user_idx] = [j for j in found.flatten() if j != user_idx][:N_USED]

        for j in neighbor_cache[user_idx]:
            rated = user2reviews[user_biz_matrix.index[j]]
            if biz_id in rated:
                # a similar yelper has visited this specific restaurant
                neighbor_ratings.append(rated[biz_id])

    if neighbor_ratings:
        # guard on the number of ratings found, not on their sum
        prediction = sum(neighbor_ratings) / len(neighbor_ratings)
    else:
        prediction = businessId[biz_id]['stars']

    df['index'].append(i)
    df['stars'].append(prediction)

    # to track progression without flooding the cell output
    if i % PROGRESS_EVERY == 0:
        percent = '{:.1f}'.format(100.0 * (i + 1) / n_queries)
        print('iteration #', i, '...', percent, '% complete')

In [94]:
# Turn the accumulated {'index': [...], 'stars': [...]} lists into a table and
# write it out; to_csv also emits the DataFrame's own row index as the first column.
df = pd.DataFrame(df)
df.to_csv('submission.csv')