# Memory Based Collaborative Filtering Model for Recommendation

## Downloading Data to Directory 

In [1]:
%mkdir ../data
!wget -O ../data/home_and_kitchen.csv - https://raw-data.s3.ap-south-1.amazonaws.com/home_and_kitchen.csv  

mkdir: cannot create directory ‘../data’: File exists
--2021-01-27 01:52:52--  http://-/
Resolving - (-)... failed: Name or service not known.
wget: unable to resolve host address ‘-’
--2021-01-27 01:52:52--  https://raw-data.s3.ap-south-1.amazonaws.com/home_and_kitchen.csv
Resolving raw-data.s3.ap-south-1.amazonaws.com (raw-data.s3.ap-south-1.amazonaws.com)... 52.219.62.46
Connecting to raw-data.s3.ap-south-1.amazonaws.com (raw-data.s3.ap-south-1.amazonaws.com)|52.219.62.46|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 915245469 (873M) [text/csv]
Saving to: ‘../data/home_and_kitchen.csv’


2021-01-27 01:53:52 (14.8 MB/s) - ‘../data/home_and_kitchen.csv’ saved [915245469/915245469]

FINISHED --2021-01-27 01:53:52--
Total wall clock time: 1m 0s
Downloaded: 1 files, 873M in 59s (14.8 MB/s)


## Defining Directories and File Paths

In [1]:
# Data directory, plus the raw (downloaded) and processed (cleaned) CSV
# paths, both built from the directory so they stay in sync.
data_dir = '../data'
raw_data_file = f'{data_dir}/home_and_kitchen.csv'
processed_data_file = f'{data_dir}/home_and_kitchen_processed.csv'

## Importing Necessary Libraries For Data Processing

In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
# Read the raw ratings CSV into the data frame `home_kitchen_data`.
# Column names are supplied explicitly, so the first line of the file is
# read as data rather than as a header.
columns = ['productId', 'reviewerId', 'rating', 'timestamp']
# Pin both ID columns to str: some product IDs (e.g. 0006564224) consist only
# of digits, and leaving the dtype to inference risks them being parsed as
# integers, which would drop the leading zeros.
home_kitchen_data = pd.read_csv(
    raw_data_file,
    names=columns,
    dtype={'productId': str, 'reviewerId': str},
)
home_kitchen_data

Unnamed: 0,productId,reviewerId,rating,timestamp
0,0006564224,A3NSN9WOX8470M,5.0,1283212800
1,0006564224,A2AMX0AJ2BUDNV,5.0,1270166400
2,0560467893,A8LUWTIPU9CZB,5.0,1446681600
3,0560467893,AABKIIHAL0L66,4.0,1446076800
4,0560467893,A3DA0KIQ5OBK5C,3.0,1441756800
...,...,...,...,...
21928563,B01HJH7K3Q,A33Z5MOHLFIECI,3.0,1513814400
21928564,B01HJH7K3Q,A1BP93Y9L2HUL7,5.0,1493164800
21928565,B01HJH7K3Q,A3EYW1FBJ48SH1,5.0,1483401600
21928566,B01HJHTC6O,AYOGJY5CDIY49,5.0,1533254400


In [5]:
# Data preprocessing
# The timestamp is not needed by the model, so remove that column.
# Reassigning the result (instead of dropping in place) together with
# errors='ignore' keeps this cell safe to re-run: a second run no longer
# raises KeyError because the column is already gone.
home_kitchen_data = home_kitchen_data.drop(columns='timestamp', errors='ignore')
# Write the processed frame to a separate CSV file for future use.
home_kitchen_data.to_csv(processed_data_file, index=False)
# Preview the result; this has to be the cell's last expression to be shown.
home_kitchen_data.head()

In [6]:
# Read the processed data file (the raw data without the timestamp column)
# back into the data frame `home_kitchen_data`.
# Both ID columns are pinned to str so that digit-only product IDs are not
# re-inferred as integers when the CSV is parsed.
home_kitchen_data = pd.read_csv(
    processed_data_file,
    dtype={'productId': str, 'reviewerId': str},
)
home_kitchen_data

Unnamed: 0,productId,reviewerId,rating
0,0006564224,A3NSN9WOX8470M,5.0
1,0006564224,A2AMX0AJ2BUDNV,5.0
2,0560467893,A8LUWTIPU9CZB,5.0
3,0560467893,AABKIIHAL0L66,4.0
4,0560467893,A3DA0KIQ5OBK5C,3.0
...,...,...,...
21928563,B01HJH7K3Q,A33Z5MOHLFIECI,3.0
21928564,B01HJH7K3Q,A1BP93Y9L2HUL7,5.0
21928565,B01HJH7K3Q,A3EYW1FBJ48SH1,5.0
21928566,B01HJHTC6O,AYOGJY5CDIY49,5.0


## Installing Surprise - A Python scikit for recommender systems

In [7]:
!pip install numpy



In [8]:
!/home/ec2-user/anaconda3/envs/python3/bin/python -m pip install --upgrade pip

Collecting pip
  Using cached pip-21.0-py3-none-any.whl (1.5 MB)
  Using cached pip-20.3.4-py2.py3-none-any.whl (1.5 MB)


In [9]:
!pip --version

pip 21.0 from /home/ec2-user/anaconda3/envs/python3/lib/python3.6/site-packages/pip (python 3.6)


In [10]:
# installing scikit-surprise library for recommendation system
!pip install scikit-surprise



In [11]:
# import necessary libraries from scikit-surprise
from surprise import Dataset
from surprise import Reader
from surprise import KNNWithMeans
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split

## Define Rating Scale 

In [12]:
# Build the Reader with a rating scale running from 1 to 5.
rating_bounds = (1, 5)
reader = Reader(rating_scale=rating_bounds)

### Load Data Set

In [13]:
# Build a scikit-surprise dataset from the ratings frame.
# The columns are selected in reviewer, product, rating order.
rating_columns = ['reviewerId', 'productId', 'rating']
data = Dataset.load_from_df(home_kitchen_data[rating_columns], reader)
data

<surprise.dataset.DatasetAutoFolds at 0x7fc58bdb86a0>

### TrainTest Split

In [16]:
# Randomly split the data into a trainset (70%) and a testset (30%).
# A fixed random_state makes the split, and every number derived from it,
# reproducible across kernel restarts.
trainset, testset = train_test_split(data, test_size=.30, random_state=42)

In [18]:
# Report how many distinct users and items the trainset contains.
print(f'Number of users:  {trainset.n_users} \n')
print(f'Number of items:  {trainset.n_items} \n')

Number of users:  7822448 

Number of items:  1104318 



## Use *KNN With Means* Algorithm for Memory Based Collaborative Filtering Model 
### Item Based Approach
* The number of items is lower than the number of users, therefore I have used the Item Based Approach to create the model

### Define sim options for *KNNWithMeans* algorithm 

In [19]:
# Similarity settings passed to the KNN algorithm:
#   name        - similarity measure to use (Pearson correlation)
#   user_based  - False, so similarities are computed between items
#   min_support - minimum number of common users two items must share for
#                 their similarity to be non-zero (item-based case)
sim_options = dict(
    name='pearson',
    user_based=False,
    min_support=50,
)

In [20]:
# KNN-with-means model using up to 50 neighbours and the similarity
# settings defined in sim_options.
algorithm = KNNWithMeans(
    sim_options=sim_options,
    k=50,
)

In [22]:
# NOTE(review): the recorded run of this cell failed with MemoryError while
# allocating a (1104318, 1104318) int64 array, i.e. an item-by-item matrix
# over every item in the trainset. The trainset needs far fewer items (or a
# different algorithm is needed) before this cell can complete.

# Train the model.
algorithm.fit(trainset)
# Predict a rating for every entry of the held-out testset.
predictions = algorithm.test(testset)
# Show only the first few predictions; displaying the whole list would dump
# one entry per testset rating into the notebook output.
predictions[:5]

Computing the pearson similarity matrix...


MemoryError: Unable to allocate 8.87 TiB for an array with shape (1104318, 1104318) and data type int64

## Cross Validate

In [21]:
# 5-fold cross-validation of the model on the full dataset, reporting
# RMSE and MAE with verbose output.
error_measures = ['RMSE', 'MAE']
cross_validate(
    algorithm,
    data,
    measures=error_measures,
    cv=5,
    verbose=True,
)

AttributeError: 'Trainset' object has no attribute 'raw_ratings'

### TrainTest Split & Prediction

In [94]:
# Randomly split the data into a trainset (70%) and a testset (30%).
# A fixed random_state makes the split reproducible across kernel restarts.
trainset, testset = train_test_split(data, test_size=.30, random_state=42)

In [95]:
# View the trainset object (the bare expression displays its repr).
trainset

<surprise.trainset.Trainset at 0x7f6c1ae97c18>