# Import necessary package


In [52]:
import numpy as np
import pandas as pd
import random
from sklearn.metrics.pairwise import cosine_similarity

random.seed(0)
np.random.seed(0)

# Load data

In [53]:
train_data = pd.read_csv("data/review.csv").fillna(0)
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52512 entries, 0 to 52511
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ReviewerID  52512 non-null  object 
 1   ProductID   52512 non-null  object 
 2   Text        52512 non-null  object 
 3   Summary     52512 non-null  object 
 4   Star        52512 non-null  float64
dtypes: float64(1), object(4)
memory usage: 2.0+ MB


In [54]:
len(train_data['ReviewerID'].unique())

2752

# Create the User/Item matrix

In [55]:
train_user_item_matrix = train_data.pivot_table(values='Star', index="ReviewerID", columns="ProductID")
train_item_user_matrix = train_user_item_matrix.T

# Handle the missing values with mean
train_user_item_matrix = train_user_item_matrix.apply(lambda x: x.fillna(x.mean()), axis=0)
train_item_user_matrix = train_item_user_matrix.apply(lambda x: x.fillna(x.mean()), axis=0)



# Similarity
We use cosine simlilarity

In [56]:
def similarity(matrix):
    return cosine_similarity(matrix)

# User CF

In [None]:
def predict_user_based(user_base_matrix, user_similarity):
    user_mean = user_base_matrix.mean(axis=1).values.reshape((-1,1))
    centered_user_base_matrix = user_base_matrix - user_mean
    sum_similarity = np.sum(np.abs(user_similarity), axis=1, keepdims=True)
    predict = user_mean + (np.dot(user_similarity, centered_user_base_matrix)/sum_similarity)
    return pd.DataFrame(predict,index=user_base_matrix.index, columns=user_base_matrix.columns)

In [59]:
user_similarity = similarity(train_user_item_matrix)
user_base_predict = predict_user_based(train_user_item_matrix,user_similarity)

In [60]:
user_base_predict

ProductID,B000FBFMHU,B000FC27TA,B000FCKPG2,B000GCFWXW,B000JMKRTI,B000V507D4,B000W4RFBQ,B000W916C0,B000W93A42,B000W966HU,...,B01HAQ70M2,B01HBTH0VO,B01HCB4LAO,B01HCB4LFE,B01HF6OU70,B01HFFPC2I,B01HFGNGYI,B01HFTVMXM,B01HFUF1GK,B01HHJZ3EO
ReviewerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A0020356UF96ZV361ST,4.999546,3.499546,1.999546,3.142403,4.666212,4.999546,2.999546,3.999546,4.999546,4.799546,...,4.937046,4.545000,4.999546,3.999546,4.932879,4.499546,4.332879,4.499546,4.999546,4.691853
A00463782V7TKAP9EMNL,5.000018,3.500018,2.000018,3.142876,4.666685,5.000018,3.000018,4.000018,5.000018,4.800018,...,4.937518,4.545473,5.000018,4.000018,4.933351,4.500018,4.333351,4.500018,5.000018,4.692326
A0099735VDZ3HDCAAYKL,5.002251,3.502251,2.002251,3.145109,4.668918,5.002251,3.002251,4.002251,5.002251,4.802251,...,4.939751,4.547705,5.002251,4.002251,4.935584,4.502251,4.335584,4.502251,5.002251,4.694559
A01631062UX24GI4LJKF,5.000222,3.500222,2.000222,3.143080,4.666889,5.000222,3.000222,4.000222,5.000222,4.800222,...,4.937722,4.545677,5.000222,4.000222,4.933555,4.500222,4.333555,4.500222,5.000222,4.692530
A0178408Z1TQAM7D75FY,5.000452,3.500452,2.000452,3.143309,4.667118,5.000452,3.000452,4.000452,5.000452,4.800452,...,4.937952,4.545906,5.000452,4.000452,4.933785,4.500452,4.333785,4.500452,5.000452,4.692759
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AZQSQSF2QI02F,4.999873,3.499873,1.999873,3.142731,4.666540,4.999873,2.999873,3.999873,4.999873,4.799873,...,4.937373,4.545328,4.999873,3.999873,4.933207,4.499873,4.333207,4.499873,4.999873,4.692181
AZRPAHQG1VHR0,4.997107,3.497107,1.997107,3.139964,4.663774,4.997107,2.997107,3.997107,4.997107,4.797107,...,4.934607,4.542561,4.997107,3.997107,4.930440,4.497107,4.330440,4.497107,4.997107,4.689415
AZULU4TOTOLEU,4.999698,3.499698,1.999698,3.142555,4.666364,4.999698,2.999698,3.999698,4.999698,4.799698,...,4.937198,4.545152,4.999698,3.999698,4.933031,4.499698,4.333031,4.499698,4.999698,4.692005
AZYERRDY2VW61,4.996290,3.496290,1.996290,3.139148,4.662957,4.996290,2.996290,3.996290,4.996290,4.796290,...,4.933790,4.541745,4.996290,3.996290,4.929623,4.496290,4.329623,4.496290,4.996290,4.688598


# Evaluation

In [71]:
prediction_data = pd.read_csv("data/prediction.csv")


In [73]:
prediction_data

Unnamed: 0,ReviewerID,ProductID,Star
0,A2MK1L1Y74WTWH,B01GT5XDFS,0
1,A19I68RW4PBT29,B00OME9OQQ,0
2,A1UPHTDW5GM12T,B01GSRNLOK,0
3,A1LFIFPYMOJ8RV,B01CUJYMR0,0
4,A10Y597K071WTQ,B004SI455Q,0
...,...,...,...
6628,A23Y4UGTFDMZOP,B00J5327X6,0
6629,A2PFNDDKHOOMZU,B01G0GIXJ2,0
6630,A1K4S4MWXI9E9M,B01FKDKB96,0
6631,AOLHNMI8G8R6K,B00NUDPR66,0


In [77]:
reviewerID = prediction_data['ReviewerID'].values
productID = prediction_data['ProductID'].values
star = []
for r,p in zip(reviewerID,productID):
    star.append(user_base_predict.at[r,p])
star

KeyError: 'B01GWMFSYM'