## This model uses the K-NN (K-Nearest Neighbours) machine learning algorithm to measure user similarity for collaborative filtering

# Import The Required Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

# Load the dataset

This dataset consists of reviews of fine foods from Amazon. The data span a period of more than 10 years, including all ~500,000 reviews up to October 2012. Reviews include product and user information, ratings, and a plain text review. It also includes reviews from all other Amazon categories.

Data includes:
<ul>
<li>Reviews from Oct 1999 - Oct 2012</li>
<li>568,454 reviews</li>
<li>256,059 users</li>
<li>74,258 products</li>
<li>260 users with > 50 reviews</li>
</ul>

The dataset includes 10 columns as shown below

In [2]:
# Read the reviews CSV into a dataframe.
reviews = pd.read_csv('data/reviews.csv')

# Preview the first five rows.
reviews.head(n=5)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


We will build a collaborative filtering recommender that uses K-NN (K-Nearest Neighbours) as the similarity measure.

We will remove all unnecessary columns from the dataframe.

In [3]:
# Keep only the product, user and score columns, and expose the score
# under the name 'Stars'.
reviews = (
    reviews
    .loc[:, ['ProductId', 'UserId', 'Score']]
    .rename(columns={'Score': 'Stars'})
)
reviews.head()

Unnamed: 0,ProductId,UserId,Stars
0,B001E4KFG0,A3SGXH7AUHU8GW,5
1,B00813GRG4,A1D87F6ZCVE5NK,1
2,B000LQOCH0,ABXLMWJIXXAIN,4
3,B000UA0QIQ,A395BORC6FGVXV,2
4,B006K2ZZ7K,A1UQRSCLF8GW1T,5


### Take a subset of the dataset to make it less sparse (denser). For example, keep only the users who have given 50 or more ratings.

In [4]:
# Ten users with the largest number of ratings (rows) in `reviews`.
ratings_per_user = reviews.groupby('UserId').size()
most_rated = ratings_per_user.sort_values(ascending=False).iloc[:10]
most_rated

UserId
A3OXHLG6DIBRW8    448
A1YUL9PCJR3JTY    421
AY12DBB0U420B     389
A281NPSIMI1C2R    365
A1Z54EM24Y40LL    256
A1TMAVN4CEM8U8    204
A2MUGFV2TDQ47K    201
A3TVZM3ZIXG8YW    199
A3PJZ8TU8FDQ1K    178
AQQLWCMRNDFGI     176
dtype: int64

### Prepare the data for the model by applying the minimum-number-of-ratings requirement


In [5]:
# Number of rows (ratings) per user.
counts = reviews['UserId'].value_counts()

# Keep only the rows whose user has at least 50 ratings.
frequent_users = counts.loc[counts >= 50].index
reviews_final = reviews.loc[reviews['UserId'].isin(frequent_users)]

In [6]:
# Preview the first five rows of the filtered ratings.
reviews_final.head(n=5)

Unnamed: 0,ProductId,UserId,Stars
14,B001GVISJM,A2MUGFV2TDQ47K,5
44,B001EO5QW8,A2G7B7FKP2O2PU,5
46,B001EO5QW8,AQLL2R1PPR46X,5
109,B001REEG6C,AY12DBB0U420B,5
141,B001GVISJW,A2YIO225BTKVPU,4


In [7]:
# Summary of the filtered data.
# len(reviews_final) is the number of rows (one per rating), not the number
# of users, so the first label describes ratings; the user count is the
# nunique() figure on the next line.
print('Number of ratings by users who have rated 50 or more items =', len(reviews_final))
print('Number of unique USERS in final data = ', reviews_final['UserId'].nunique())
print('Number of unique ITEMS in final data = ', reviews_final['ProductId'].nunique())


Number of users who have rated 50 or more items = 22941
Number of unique USERS in final data =  267
Number of unique ITEMS in final data =  11313


Transform the values (star ratings) of the user–product matrix dataframe into a SciPy sparse matrix for more efficient calculations.

In [8]:
# Build the user x product matrix: one row per UserId, one column per
# ProductId, cell = Stars (pivot_table's default aggregation applies when a
# user rated the same product more than once). Missing ratings become 0.
user_reviews = (
    reviews_final
    .pivot_table(index=['UserId'], columns='ProductId', values='Stars')
    .fillna(0)
)

# Replace the UserId labels with a 0..n-1 integer index named 'user_index'.
user_reviews.index = pd.Index(np.arange(user_reviews.shape[0]), name='user_index')

# Sparse copy of the same values, used to fit the nearest-neighbour model.
user_reviews_features_matrix = csr_matrix(user_reviews.values)
user_reviews.head()

ProductId,7310172001,7310172101,7800648702,B00004CI84,B00004CXX9,B00004RBDU,B00004RBDZ,B00004RYGX,B00004S1C6,B000052Y74,...,B009KAQZ9G,B009KAQZIM,B009KOHGEK,B009KP6HBM,B009LRLB6U,B009LT26BC,B009M2LUEW,B009PCDDO4,B009QEBGIQ,B009RB4GO4
user_index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Fitting the K-NN

In [9]:
# Brute-force nearest-neighbour search with cosine distance, fitted on the
# rows of the sparse user x product matrix.
knn_recomm = NearestNeighbors(algorithm='brute', metric='cosine')
knn_recomm.fit(user_reviews_features_matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

### Testing the model

In [10]:
# A fixed row is queried here; a random one could be drawn instead with
# np.random.choice(user_reviews.shape[0]).
randomChoice = 199

# Double brackets keep the selected row two-dimensional: (1, n_columns).
query_row = user_reviews.iloc[[randomChoice]].values
distances, indices = knn_recomm.kneighbors(query_row, n_neighbors=11)

print('Recommendations for User {0} on priority basis:\n'.format(user_reviews.index[randomChoice]))
# Position 0 is skipped (it is treated as the query row itself); the other
# neighbours are listed in the order kneighbors returned them.
for i, neighbour in enumerate(indices.flatten()[1:], start=1):
    print('{0}: {1}'.format(i, user_reviews.index[neighbour]))

Recommendations for User 199 on priority basis:

1: 184
2: 188
3: 230
4: 181
5: 70
6: 109
7: 108
8: 185
9: 124
10: 190


In [11]:
# Position 0 of the neighbour list is taken to be the query user, so
# position 1 is used as the closest other user.
result = user_reviews.loc[indices[0, 1]]
user = user_reviews.loc[randomChoice]

# Products the query user has no rating for (stored as 0) and that the
# neighbour rated above 4.
recommendations = result[(user == 0) & (result > 4)]
print(recommendations)

ProductId
B000168O3I    5.000000
B0001VVCV4    5.000000
B0001VVCVO    5.000000
B0006UIJE8    5.000000
B000E63L8S    5.000000
B000E671W0    5.000000
B000EMK4CS    5.000000
B000EMQF58    5.000000
B000EMQFY4    5.000000
B000EPUPSS    5.000000
B000ETVRQS    5.000000
B000F4DKAS    5.000000
B000F4F95C    5.000000
B000F4F95M    5.000000
B000F4J76E    5.000000
B000I0QDVC    5.000000
B000JVCBO8    5.000000
B000MPQ4Q2    5.000000
B000YCJRIU    5.000000
B0011DMP8K    5.000000
B0014X5O1C    5.000000
B00181Y1PU    5.000000
B0018SMUVA    5.000000
B001E6IUMY    5.000000
B001G7QG5O    5.000000
B001IMUHCU    5.000000
B001KUOGAO    5.000000
B001OCBT3U    5.000000
B001OCKIBY    5.000000
B001OHX9A6    5.000000
B001U2OMAQ    5.000000
B0027MIP9C    5.000000
B002L68KDK    5.000000
B002LMA8FC    5.000000
B002LMQ6OO    5.000000
B002LMQRA2    5.000000
B002LMXFCU    5.000000
B002LN566C    5.000000
B003MZ89GW    5.000000
B003MZH8PK    5.000000
B004158VLU    5.000000
B004867T24    5.000000
B004BKLHOS    5.000000
B