## Collected data has uploaded to S3 and made it publicly accessible. Now going to read the data to our working directory.

In [1]:
%mkdir ../data
!wget -O ../data/home_and_kitchen.csv - https://raw-data.s3.ap-south-1.amazonaws.com/home_and_kitchen.csv  

--2020-08-25 23:29:30--  http://-/
Resolving - (-)... failed: Name or service not known.
wget: unable to resolve host address ‘-’
--2020-08-25 23:29:30--  https://raw-data.s3.ap-south-1.amazonaws.com/home_and_kitchen.csv
Resolving raw-data.s3.ap-south-1.amazonaws.com (raw-data.s3.ap-south-1.amazonaws.com)... 52.219.62.111
Connecting to raw-data.s3.ap-south-1.amazonaws.com (raw-data.s3.ap-south-1.amazonaws.com)|52.219.62.111|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 915245469 (873M) [text/csv]
Saving to: ‘../data/home_and_kitchen.csv’


2020-08-25 23:30:30 (14.9 MB/s) - ‘../data/home_and_kitchen.csv’ saved [915245469/915245469]

FINISHED --2020-08-25 23:30:30--
Total wall clock time: 1m 0s
Downloaded: 1 files, 873M in 59s (14.9 MB/s)


## Defining directory and file paths

In [2]:
data_dir = '../data'
raw_data_file = '../data/home_and_kitchen.csv'
processed_data_file = '../data/home_and_kitchen_processed.csv'

## Importing necessary libraries for data processing 

In [3]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# read the data with coloumns by giving columns to the data frame
# data frame = home_kitchen_data 
home_kitchen_data = None
columns=['productId', 'reviewerId', 'rating','timestamp']
home_kitchen_data = pd.read_csv(raw_data_file,names=columns)
home_kitchen_data

Unnamed: 0,productId,reviewerId,rating,timestamp
0,0006564224,A3NSN9WOX8470M,5.0,1283212800
1,0006564224,A2AMX0AJ2BUDNV,5.0,1270166400
2,0560467893,A8LUWTIPU9CZB,5.0,1446681600
3,0560467893,AABKIIHAL0L66,4.0,1446076800
4,0560467893,A3DA0KIQ5OBK5C,3.0,1441756800
...,...,...,...,...
21928563,B01HJH7K3Q,A33Z5MOHLFIECI,3.0,1513814400
21928564,B01HJH7K3Q,A1BP93Y9L2HUL7,5.0,1493164800
21928565,B01HJH7K3Q,A3EYW1FBJ48SH1,5.0,1483401600
21928566,B01HJHTC6O,AYOGJY5CDIY49,5.0,1533254400


In [5]:
# Data Preprocessing 
# As the timestamp is not necessary for our model we can remove the time stamp column from data frame
home_kitchen_data.drop('timestamp',axis=1,inplace=True)
home_kitchen_data.head()
# write this processed file in to a seperate csv file for future use
home_kitchen_data.to_csv(processed_data_file,index = False)

In [6]:
# read the processed data file after removing unnecessary columns 
# data frame = home_kitchen_data 
home_kitchen_data = None
home_kitchen_data = pd.read_csv(processed_data_file)
home_kitchen_data

Unnamed: 0,productId,reviewerId,rating
0,0006564224,A3NSN9WOX8470M,5.0
1,0006564224,A2AMX0AJ2BUDNV,5.0
2,0560467893,A8LUWTIPU9CZB,5.0
3,0560467893,AABKIIHAL0L66,4.0
4,0560467893,A3DA0KIQ5OBK5C,3.0
...,...,...,...
21928563,B01HJH7K3Q,A33Z5MOHLFIECI,3.0
21928564,B01HJH7K3Q,A1BP93Y9L2HUL7,5.0
21928565,B01HJH7K3Q,A3EYW1FBJ48SH1,5.0
21928566,B01HJHTC6O,AYOGJY5CDIY49,5.0


## Installing  Surprise - A Python scikit for recommender systems

In [7]:
!pip install numpy

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip' command.[0m


In [9]:
!/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip

Collecting pip
  Using cached pip-20.2.2-py2.py3-none-any.whl (1.5 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 20.0.2
    Uninstalling pip-20.0.2:
      Successfully uninstalled pip-20.0.2
Successfully installed pip-20.2.2


In [10]:
!pip --version

pip 20.2.2 from /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/pip (python 3.6)


In [11]:
# installing scikit-surprise library for recommendation system
!pip install scikit-surprise

Collecting scikit-surprise
  Downloading scikit-surprise-1.1.1.tar.gz (11.8 MB)
[K     |████████████████████████████████| 11.8 MB 2.4 MB/s eta 0:00:01
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25ldone
[?25h  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp36-cp36m-linux_x86_64.whl size=1703178 sha256=ba42cc59552d2c702d649ab6a29c0d81b9548a107092b59783d92bb96e769a15
  Stored in directory: /home/ec2-user/.cache/pip/wheels/de/9a/41/6a57bf37eb7b50de7f8c7ca9d7053bebe0ea7c7c9bae9fa293
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.1


In [12]:
from surprise import Dataset
from surprise import Reader
from surprise import NormalPredictor
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split

## Defining the rating scale 

In [13]:
reader = Reader(rating_scale=(1, 5))

## Load Data Set 

In [14]:
data = Dataset.load_from_df(home_kitchen_data[['reviewerId', 'productId', 'rating']], reader)
data

<surprise.dataset.DatasetAutoFolds at 0x7f96daebd6d8>

## Use  NormalPredictor algorithm as the benchmark algorithm

In [15]:
algorithm = NormalPredictor()

## Cross Validate 

In [16]:

cross_validate(algorithm, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.6624  1.6623  1.6605  1.6607  1.6618  1.6615  0.0008  
MAE (testset)     1.2395  1.2394  1.2380  1.2384  1.2394  1.2390  0.0006  
Fit time          40.09   62.62   62.11   63.37   63.80   58.40   9.17    
Test time         57.07   49.40   49.43   56.71   48.84   52.29   3.76    


{'test_rmse': array([1.66240576, 1.6623137 , 1.66048985, 1.66069975, 1.66183907]),
 'test_mae': array([1.23953522, 1.23942078, 1.23798216, 1.2383935 , 1.23943878]),
 'fit_time': (40.09000039100647,
  62.61944031715393,
  62.11469030380249,
  63.36779522895813,
  63.797972202301025),
 'test_time': (57.06850457191467,
  49.39928150177002,
  49.43034791946411,
  56.70641112327576,
  48.84450006484985)}

## Train Test Split and Prediction 

In [17]:
#split data in to trainset(70%) testset(30%) randomly
trainset, testset = train_test_split(data, test_size=.30)

In [18]:
# view testset
testset

[('A1RDEL9QBXQEYS', 'B00SZEYMAW', 3.0),
 ('A4YRMPGKIBHJY', 'B013DROPRA', 5.0),
 ('A5JRXWSKJDAH6', 'B00902X68W', 3.0),
 ('A2ZO8KF4NUY3XO', 'B00IR77HOK', 3.0),
 ('AD0SVXYCQDCFK', 'B001AS94TY', 5.0),
 ('AEAB1POT7KF39', 'B018F28KR4', 4.0),
 ('A1NUW93IZDJYU0', 'B01C42V938', 5.0),
 ('AHOPF81PVG2WO', 'B00NT3OPRE', 3.0),
 ('A4L0D252PB0YA', 'B007TIN0GW', 3.0),
 ('A12IKGICRWZB8V', 'B000Q3K20O', 1.0),
 ('A3NX6WD3DPJUJ1', 'B00DE01NZ4', 5.0),
 ('A1NNWWH1ZONIQQ', 'B000KFXQ0Q', 2.0),
 ('AQMWH1BVPA705', 'B002WN1PZW', 2.0),
 ('AMP2UIY8ADCWA', 'B00EPGN1AQ', 5.0),
 ('ABEJ431MUH316', 'B00VIJWOTC', 5.0),
 ('A3A1FP8ZDBNJFK', 'B002CK2TPA', 5.0),
 ('A1ABXU3TFTBUZW', 'B00400OLAQ', 5.0),
 ('AP5UVQKNE6P6', 'B011MJGXJY', 5.0),
 ('A3GP3LND98D73V', 'B00DCCRFCO', 5.0),
 ('A14AMQX136I5IF', 'B00YW6Y8JW', 3.0),
 ('A2EGFNBQCDS1T3', 'B00HRS1D40', 5.0),
 ('AD0Q5TSOJ7AR1', 'B00183HPJ2', 4.0),
 ('ABF4P010DY9RR', 'B01DUYUQCE', 5.0),
 ('A3DAYVWF9YPW7T', 'B000KYX58U', 5.0),
 ('A29SJKAP9DF1FR', 'B01DG45HOU', 5.0),
 ('A1NF7UF6X9

In [20]:
# training the model
algorithm.fit(trainset)
# testing the model
predictions = algorithm.test(testset)
# get predictions
predictions

[Prediction(uid='A1RDEL9QBXQEYS', iid='B00SZEYMAW', r_ui=3.0, est=4.217313923250693, details={'was_impossible': False}),
 Prediction(uid='A4YRMPGKIBHJY', iid='B013DROPRA', r_ui=5.0, est=1.5838977645183996, details={'was_impossible': False}),
 Prediction(uid='A5JRXWSKJDAH6', iid='B00902X68W', r_ui=3.0, est=3.5276593944984387, details={'was_impossible': False}),
 Prediction(uid='A2ZO8KF4NUY3XO', iid='B00IR77HOK', r_ui=3.0, est=4.593740476187559, details={'was_impossible': False}),
 Prediction(uid='AD0SVXYCQDCFK', iid='B001AS94TY', r_ui=5.0, est=4.247323983629874, details={'was_impossible': False}),
 Prediction(uid='AEAB1POT7KF39', iid='B018F28KR4', r_ui=4.0, est=4.154587839026361, details={'was_impossible': False}),
 Prediction(uid='A1NUW93IZDJYU0', iid='B01C42V938', r_ui=5.0, est=4.924339131730233, details={'was_impossible': False}),
 Prediction(uid='AHOPF81PVG2WO', iid='B00NT3OPRE', r_ui=3.0, est=4.444290494304186, details={'was_impossible': False}),
 Prediction(uid='A4L0D252PB0YA', ii

In [21]:
# test the accuracy for benchmark model
from surprise import accuracy
accuracy.rmse(predictions)
accuracy.mae(predictions)

RMSE: 1.6621
MAE:  1.2394


1.2394447360279919

## Combining the Train & Test Data Set - Evaluate One User (A1RDEL9QBXQEYS) in testset

In [23]:
# find the row id in the dataframe of the interested user 
# combine the reviewerId and productId and search the particular row / index in the home_kitchen_data dataframe 
np.where((home_kitchen_data.reviewerId=='A1RDEL9QBXQEYS') & (home_kitchen_data.productId=='B00SZEYMAW'))

(array([12831245]),)

In [27]:
# run the prediction on selected reviewer & selected product
uid = str(12831245)  # raw reviewerId (as in the ratings file). 
iid = str(12831245)  # raw productId (as in the ratings file).
#r_ui= real rating of the user = 3
pred = algorithm.predict(uid, iid, r_ui=3, verbose=True)

user: 12831245   item: 12831245   r_ui = 3.00   est = 3.84   {'was_impossible': False}
