# Amazon Review Data Recommendation System

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Read the data file

In [2]:
import json

data = []
with open('/Users/seungwooseo/Desktop/Python/Amazon_Review_Kaggle/Cell_Phones_and_Accessories_5.json', 'r') as f:
    for line in f:
        data.append(json.loads(line))

df = pd.DataFrame(data)


In [4]:
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,A30TL5EWN6DFXT,120401325X,christina,"[0, 0]",They look good and stick good! I just don't li...,4.0,Looks Good,1400630400,"05 21, 2014"
1,ASY55RVNIL0UD,120401325X,emily l.,"[0, 0]",These stickers work like the review says they ...,5.0,Really great product.,1389657600,"01 14, 2014"
2,A2TMXE2AFO7ONB,120401325X,Erica,"[0, 0]",These are awesome and make my phone look so st...,5.0,LOVE LOVE LOVE,1403740800,"06 26, 2014"
3,AWJ0WZQYMYFQ4,120401325X,JM,"[4, 4]",Item arrived in great time and was in perfect ...,4.0,Cute!,1382313600,"10 21, 2013"
4,ATX7CZYFXI1KW,120401325X,patrice m rogoza,"[2, 3]","awesome! stays on, and looks great. can be use...",5.0,leopard home button sticker for iphone 4s,1359849600,"02 3, 2013"


### Exploratory Data Analysis

In [5]:
# Info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 194439 entries, 0 to 194438
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   reviewerID      194439 non-null  object 
 1   asin            194439 non-null  object 
 2   reviewerName    190920 non-null  object 
 3   helpful         194439 non-null  object 
 4   reviewText      194439 non-null  object 
 5   overall         194439 non-null  float64
 6   summary         194439 non-null  object 
 7   unixReviewTime  194439 non-null  int64  
 8   reviewTime      194439 non-null  object 
dtypes: float64(1), int64(1), object(7)
memory usage: 13.4+ MB


In [6]:
# Describe
df.describe()

Unnamed: 0,overall,unixReviewTime
count,194439.0,194439.0
mean,4.129912,1368714000.0
std,1.222499,32300320.0
min,1.0,982800000.0
25%,4.0,1357603000.0
50%,5.0,1374538000.0
75%,5.0,1390262000.0
max,5.0,1406074000.0


In [7]:
df.columns

Index(['reviewerID', 'asin', 'reviewerName', 'helpful', 'reviewText',
       'overall', 'summary', 'unixReviewTime', 'reviewTime'],
      dtype='object')

In [8]:
# Average Number of Buying Per Person
round(df['reviewerID'].value_counts().mean())

7

In [9]:
# Average Number of Selling Per Product
round(df['asin'].value_counts().mean())

19

In [21]:
# Number of Selling by Product
asin_counts = df['asin'].value_counts()
print(asin_counts)

asin
B005SUHPO6    837
B0042FV2SI    694
B008OHNZI0    657
B009RXU59C    636
B000S5Q9CA    628
             ... 
B006W95VHU      5
B004T6S7C4      5
B009P5XSAU      5
B004TETBQ2      5
B0089ZUTWM      5
Name: count, Length: 10429, dtype: int64


In [24]:
asin_counts[asin_counts>100]

asin
B005SUHPO6    837
B0042FV2SI    694
B008OHNZI0    657
B009RXU59C    636
B000S5Q9CA    628
             ... 
B00AGJMLZC    104
B006JW3BYU    104
B0091XI1SU    103
B0030C4K8I    102
B00EV8Y684    101
Name: count, Length: 207, dtype: int64

### Feature selection based on the number of deal

In [25]:
# Retreive the values which are more than the average
asin_counts = asin_counts[asin_counts > 100]

In [27]:
asin_counts = pd.DataFrame(asin_counts.reset_index())
asin_counts

Unnamed: 0,asin,count
0,B005SUHPO6,837
1,B0042FV2SI,694
2,B008OHNZI0,657
3,B009RXU59C,636
4,B000S5Q9CA,628
...,...,...
202,B00AGJMLZC,104
203,B006JW3BYU,104
204,B0091XI1SU,103
205,B0030C4K8I,102


In [50]:
recalled_df = []
recalled_df = pd.DataFrame(columns=df.columns)

for asin in asin_counts['asin']:
    recalled_df = pd.concat([recalled_df, df[df['asin'] == asin]])

  recalled_df = pd.concat([recalled_df, df[df['asin'] == asin]])


In [54]:
recalled_df['overall'].value_counts().sum()

40648

## Collaborative Filtering

### Cosine Similarity

#### Labelling each ReviewerID and Asin

In [55]:
#import label encoder
from sklearn.preprocessing import LabelEncoder

In [73]:
#Labeling the reviewerID
le = LabelEncoder()
le.fit(recalled_df['reviewerID'])
origin_df = pd.DataFrame()
origin_df['user_id'] = le.transform(recalled_df['reviewerID'])

#Labeling the asin
le.fit(recalled_df['asin'])
origin_df['item_id'] = le.transform(recalled_df['asin'])

In [74]:
origin_df

Unnamed: 0,user_id,item_id
0,17402,63
1,7109,63
2,2001,63
3,16482,63
4,7183,63
...,...,...
40643,5930,198
40644,16514,198
40645,3700,198
40646,20049,198


In [80]:
overalls = pd.DataFrame(recalled_df['overall'].reset_index(drop=True))
overalls

Unnamed: 0,overall
0,5.0
1,4.0
2,5.0
3,5.0
4,5.0
...,...
40643,5.0
40644,4.0
40645,3.0
40646,5.0


In [81]:
#Mother Matrix for calculation of similarity
origin_df = pd.concat([origin_df,overalls], axis=1)

In [82]:
#Make a table for iterrow works
origin_df

Unnamed: 0,user_id,item_id,overall
0,17402,63,5.0
1,7109,63,4.0
2,2001,63,5.0
3,16482,63,5.0
4,7183,63,5.0
...,...,...,...
40643,5930,198,5.0
40644,16514,198,4.0
40645,3700,198,3.0
40646,20049,198,5.0


#### Divide them into train_split and test_split & Calculate the similarity

In [83]:
# Train_test_split
from sklearn.model_selection import train_test_split

In [84]:
# Split the database into train and test
train_data, test_data = train_test_split(origin_df, test_size=0.3, random_state=33)

In [85]:
train_data

Unnamed: 0,user_id,item_id,overall
16811,2278,107,5.0
16164,486,176,5.0
13848,5267,136,5.0
25761,19994,0,5.0
20577,18422,97,5.0
...,...,...,...
27282,7068,57,5.0
578,12369,63,5.0
38616,995,156,1.0
2439,19251,148,5.0


In [86]:
# Count the whole rows and columns to make a similarity table
n_users = origin_df['user_id'].nunique()
n_items = origin_df['item_id'].nunique()
print('Number of Users: ', n_users)
print('Number of Items: ', n_items)

Number of Users:  20142
Number of Items:  207


In [87]:
#Make a similarity table
train_data_matrix = np.zeros([n_users,n_items])
test_data_matrix = np.zeros([n_users,n_items])

In [88]:
# Composing a train_data_matrix
for train in train_data.itertuples():
    train_data_matrix[train[1], train[2]] = train[3] #train[0] is index

# Composing a test_data_matrix
for test in test_data.itertuples():
    test_data_matrix[[test[1]-1, test[2]-1]] = test[3] #test[0] is index

In [89]:
train_data_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

#### Calculate the similarity

In [90]:
from sklearn.metrics.pairwise import pairwise_distances
user_similarity = pairwise_distances(train_data_matrix, metric='cosine')
item_similarity = pairwise_distances(train_data_matrix.T, metric='cosine')

In [91]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis]) 
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])     
    return pred

In [92]:
item_prediction = predict(train_data_matrix, item_similarity, type='item')
user_prediction = predict(train_data_matrix, user_similarity, type='user')

In [93]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):#Filtering the comparable values
    prediction = prediction[ground_truth.nonzero()].flatten() 
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [94]:
print('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))

User-based CF RMSE: 4.355703215946293
Item-based CF RMSE: 4.355695221365662
