In [1]:
import numpy as np  
import pandas as pd

In [2]:
colnames = ['userId', 'productId', 'ratings', 'timestamp']
ratings_data = pd.read_csv("ratings_Electronics.csv", names= colnames)  
ratings_data.head()

Unnamed: 0,userId,productId,ratings,timestamp
0,AKM1MP6P0OYPR,132793040,5.0,1365811200
1,A2CX7LUOHB2NDG,321732944,5.0,1341100800
2,A2NWSAGRHCP8N5,439886341,1.0,1367193600
3,A2WNBOD3WNDNKT,439886341,3.0,1374451200
4,A1GI0U4ZRJA8WN,439886341,1.0,1334707200


In [3]:
ratings_data.shape

(7824482, 4)

In [4]:
ratings_data.dtypes

userId        object
productId     object
ratings      float64
timestamp      int64
dtype: object

In [5]:
# cutting down the record size based on user rated more than 70 times
users = ratings_data[ratings_data.groupby('userId')['userId'].transform('size') > 70]

users.shape

(75352, 4)

In [6]:
users.head()

Unnamed: 0,userId,productId,ratings,timestamp
118,AT09WGFUM934H,594481813,3.0,1377907200
177,A32HSNCNPRUMTR,970407998,1.0,1319673600
178,A17HMM1M7T9PJ1,970407998,4.0,1281744000
631,A3TAS1AG6FMBQW,972683275,5.0,1353456000
1322,A1A5KUIIIHFF4U,1400501466,1.0,1332547200


In [7]:
users.describe()

Unnamed: 0,ratings,timestamp
count,75352.0,75352.0
mean,4.258308,1317145000.0
std,1.047671,78714720.0
min,1.0,944006400.0
25%,4.0,1278115000.0
50%,5.0,1340928000.0
75%,5.0,1375315000.0
max,5.0,1406074000.0


In [30]:
#Popularity Recommender Model

In [8]:
users.groupby('productId')['ratings'].mean().sort_values(ascending=False).head()

productId
B00LKG1MC8    5.0
B002IPHA1W    5.0
B002IKKFIC    5.0
B002IKLJU0    5.0
B002IKLJUU    5.0
Name: ratings, dtype: float64

In [9]:
users.groupby('productId')['ratings'].count().sort_values(ascending=False).head() 

productId
B0088CJT4U    131
B003ES5ZUU    102
B002R5AM7C     98
B000N99BBC     98
B00829TIEK     95
Name: ratings, dtype: int64

In [10]:
ratings_mean_count = pd.DataFrame(users.groupby('productId')['ratings'].mean())

In [11]:
ratings_mean_count['rating_counts'] = pd.DataFrame(users.groupby('productId')['ratings'].count()) 

In [12]:
ratings_mean_count.head()

Unnamed: 0_level_0,ratings,rating_counts
productId,Unnamed: 1_level_1,Unnamed: 2_level_1
594481813,3.0,1
970407998,2.5,2
972683275,5.0,1
1400501466,2.0,2
1400501776,4.0,1


In [15]:
from surprise import Dataset,Reader

reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(users[['userId', 'productId', 'ratings']], reader)

In [16]:
data.df.head()

Unnamed: 0,userId,productId,ratings
118,AT09WGFUM934H,594481813,3.0
177,A32HSNCNPRUMTR,970407998,1.0
178,A17HMM1M7T9PJ1,970407998,4.0
631,A3TAS1AG6FMBQW,972683275,5.0
1322,A1A5KUIIIHFF4U,1400501466,1.0


In [17]:
# Split data to train and test
from surprise.model_selection import train_test_split
trainset, testset = train_test_split(data, test_size=.30,random_state=123)

In [18]:
from surprise import SVD
from surprise import accuracy

In [19]:
svd_model = SVD(n_factors=50,biased=False)
svd_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x116062160>

In [20]:
testset

[('ANRS196NKFVUU', 'B000O8I474', 5.0),
 ('A3R4794K2RVU1S', 'B0048IW030', 3.0),
 ('AXU3VKZE848IY', 'B001SEQN3U', 5.0),
 ('ARXU3FESTWMJJ', 'B0011000R6', 4.0),
 ('A2JMJVNTBL7K7E', 'B002J9HBIO', 4.0),
 ('A1L64KDYO5BOJA', 'B0019UGCLG', 4.0),
 ('A2IFKH3TJ10387', 'B009WE65EA', 5.0),
 ('A2NYK9KWFMJV4Y', 'B00HVMIL1U', 4.0),
 ('A1C5WS021EL3WO', 'B00BY3XN7E', 5.0),
 ('A2FRKEXDXDN1KI', 'B003VS9UP4', 4.0),
 ('A1BVE2ZIBKJ7YI', 'B003HGHR82', 5.0),
 ('A3977M5S0GIG5H', 'B003ZUIHY8', 5.0),
 ('A7QMQBGJ2TCQG', 'B000ZPJEDE', 2.0),
 ('A27ADCSD15F3GL', 'B00E20SSWQ', 2.0),
 ('A2AF0NOCM71J0B', 'B008DBI6Y0', 4.0),
 ('AS269J1OGZT1V', 'B006OSQALU', 3.0),
 ('AROQO2VDODT7', 'B003GBRXKU', 4.0),
 ('A30XZK10EZN9V4', 'B000G7PLCY', 5.0),
 ('ACJT8MUC0LRF0', 'B00DK2JQOQ', 5.0),
 ('A25HBO5V8S8SEA', 'B00023440W', 5.0),
 ('A1CSRR7FCKBL9M', 'B003IZFCFW', 4.0),
 ('A14JBDSWKPKTZA', 'B00006G2OJ', 5.0),
 ('AAA0TUKS5VBSA', 'B004CLYEFK', 4.0),
 ('ADLVFFE4VBT8', 'B0032ZZP82', 5.0),
 ('A39137LW12KK7B', 'B000VZCEUI', 4.0),
 ('AFICF7DK

In [21]:
test_pred = svd_model.test(testset)

In [22]:
# compute RMSE
accuracy.rmse(test_pred)

RMSE: 2.0845


2.0844725075510953

In [23]:
uid = "A3Q4TYJVAM4IRM"  # raw user id (as in the ratings file). They are **strings**!
iid = "B000RQHAUA"  # raw item id (as in the ratings file). They are **strings**!

In [24]:
# get a prediction for specific users and items.
pred = svd_model.predict(uid, iid, r_ui=0.0, verbose=True)

user: A3Q4TYJVAM4IRM item: B000RQHAUA r_ui = 0.00   est = 1.62   {'was_impossible': False}


In [25]:
pred = pd.DataFrame(test_pred)
pred[pred['uid'] == "A3Q4TYJVAM4IRM"][['iid', 'r_ui']].sort_values(by = 'r_ui',ascending = False).head(10)

Unnamed: 0,iid,r_ui
138,B003CFATT2,5.0
11287,B000AZ57M6,5.0
7438,B0007RBWSU,5.0
8009,B001DJ64C0,5.0
8528,B0002SQ0A4,5.0
199,B000W67G72,5.0
17736,B002I3OZB2,5.0
9906,B0000ANEX9,5.0
16584,B00JC31SGG,5.0
6286,B00834SJSK,5.0
