In [3]:
import pandas as pd
import numpy as np
import timeit
import time

# get the data from csv file to dataframe
DataTrain = pd.read_csv(
    'data/SentimentDataTrain.csv', 
    low_memory=False)
DataTest = pd.read_csv(
    'data/SentimentDataTest.csv', 
    low_memory=False)

DataTrain.drop("Unnamed: 0", 1, inplace = True)
DataTest.drop("Unnamed: 0", 1, inplace = True)


In [None]:
############################ THE RECOMMENDATION CLASS #########################

In [2]:
def matrixFactorization(R, K, steps=10, gamma=0.001, lamda=0.02):
    N=len(R.index)#number of users
    M=len(R.columns)#number of items
    P=pd.DataFrame(np.random.rand(N,K), index=R.index)
    Q=pd.DataFrame(np.random.rand(M,K), index=R.columns)
    for step in range(steps):
        for i in R.index:
            for j in R.columns:
                if R.loc[i,j]>0:
                    eij=R.loc[i,j]-np.dot(P.loc[i],Q.loc[j])
                    P.loc[i]=P.loc[i]+gamma*(eij*Q.loc[j]-lamda*P.loc[i])
                    Q.loc[j]=Q.loc[j]+gamma*(eij*P.loc[i]-lamda*Q.loc[j])
        e=0
        for i in R.index:
            for j in R.columns:
                if R.loc[i,j]>0:
                    e=e+pow(R.loc[i,j]-np.dot(P.loc[i], Q.loc[j]),2)+lamda*(pow(np.linalg.norm(P.loc[i]),2)+pow(np.linalg.norm(Q.loc[j]),2))
        if e<0.001:
            break
        if step % 100 ==0:
            print (step)
            print(e)
        
    return P,Q


In [None]:
############################ TRAINING #########################

In [None]:
DataTrain.head()

In [5]:
print("user count : " +str(len(set(DataTrain.userId))) )
print("item count : " +str(len(set(DataTrain.itemId))) )

user count : 1622
item count : 11


In [None]:
# Training Rating
TuserItemRatingMatrix=pd.pivot_table(DataTest, values='rating', index=['userId'], columns=['itemId'])
start = time.time()
(R,S)=matrixFactorization(TuserItemRatingMatrix.iloc[:1622,:11], K=2, gamma=0.001, lamda=0.02, steps=1000)
end = time.time()
print(end-start)

In [None]:
# Training Testimony Value
NuserItemLabelMatrix=pd.pivot_table(DataTest, values='testimony_value', index=['userId'], columns=['itemId'])

start = time.time()
(A,B)=matrixFactorization(NuserItemLabelMatrix.iloc[:1622,:11], K=2, gamma=0.001, lamda=0.02, steps=1000)
end = time.time()
print(end-start)


In [None]:
############################ TESTING #########################

In [15]:
DataTest.head()

Unnamed: 0,reviewId,rating,testimony,itemId,userId,testimony_value
0,2800,2.0,Excellent Reader and Product for the price. Ve...,8,3599,3.0
1,4019,2.0,I bought this tablet for my 13 yr. old God-chi...,20,2757,3.0
2,3778,2.0,I have an LG tablet but picked up this one for...,20,977,3.0
3,2368,2.0,"I bought 2 of these, 1 for each of my 2 younge...",7,1166,3.0
4,4912,1.0,I got this tablet so I wouldn't have to pack u...,19,302,3.0


In [14]:
print("user count : " +str(len(set(DataTest.userId))) )
print("item count : " +str(len(set(DataTest.itemId))) )

user count : 453
item count : 11


In [7]:
# Test Rating
NinguserItemRatingMatrix=pd.pivot_table(DataTest, values='rating', index=['userId'], columns=['itemId'])
start = time.time()
(P,Q)=matrixFactorization(NinguserItemRatingMatrix.iloc[:1622,:11], K=2, gamma=0.001, lamda=0.02, steps=1000)
end = time.time()
print(end-start)

0
707.8455422430453
100
202.30842221527791
200
112.79695716484304
300
77.32732006672768
400
62.305036540947256
500
55.52903581486288
600
52.20456411786395
700
50.37680408744928
800
49.22522451098049
900
48.39559907891138
1681.2775149345398


In [13]:
# Test Testimony Value
NuserItemLabelMatrix=pd.pivot_table(DataTest, values='testimony_value', index=['userId'], columns=['itemId'])

start = time.time()
(C,D)=matrixFactorization(NuserItemLabelMatrix.iloc[:453,:11], K=2, gamma=0.001, lamda=0.02, steps=1000)
end = time.time()
print(end-start)


0
2865.5571024435344
100
252.9415804537715
200
126.5670773443478
300
103.86100444181935
400
97.08969105370693
500
93.47509688400464
600
90.80501556336023
700
88.57565171480938
800
86.62833614482035
900
84.89288834699198
1423.7771713733673


In [16]:
# Rating Rank
activeUser=977
TestpredictItemRating=pd.DataFrame(np.dot(P.loc[activeUser],Q.T), index=Q.index, columns=['Ratings'])
topRecommendations_byrating=pd.DataFrame.sort_values(TestpredictItemRating,['Ratings'],ascending=[0])[:5]
#we found the ratings of all movie by the active user and then sorted them to top 3

topRecommendations_byrating


Unnamed: 0_level_0,Ratings
itemId,Unnamed: 1_level_1
13,1.95258
20,1.943186
7,1.900207
2,1.897495
4,1.8959


In [18]:
# Testimony Rank
activeUser=977
TestpredictItemTestimony=pd.DataFrame(np.dot(C.loc[activeUser],D.T), index=Q.index, columns=['Testimony'])
topRecommendations_bytestimony=pd.DataFrame.sort_values(TestpredictItemTestimony,['Testimony'],ascending=[0])[:5]
#we found the ratings of all movie by the active user and then sorted them to top 5

topRecommendations_bytestimony


Unnamed: 0_level_0,Testimony
itemId,Unnamed: 1_level_1
20,2.939995
2,2.881244
8,2.833527
7,2.768935
5,2.755539


In [19]:
# MERGE RANK
points = [5,4,3,2,1]
# give points
topRecommendations_byrating['points_rat']= points
topRecommendations_bytestimony['points_tes']= points

In [22]:
# merge dataframes
topRecommendations = pd.merge(
    topRecommendations_byrating, topRecommendations_bytestimony, how='outer',
    left_index=True, right_index=True)
topRecommendations

Unnamed: 0_level_0,Ratings,points_rat,Testimony,points_tes
itemId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,1.897495,2.0,2.881244,4.0
4,1.8959,1.0,,
5,,,2.755539,1.0
7,1.900207,3.0,2.768935,2.0
8,,,2.833527,3.0
13,1.95258,5.0,,
20,1.943186,4.0,2.939995,5.0


In [23]:
topRecommendations = topRecommendations.fillna(0)

In [26]:
# calculate the result
topRecommendations['Result'] = topRecommendations.apply(
    lambda row: (row['points_rat']+row['points_tes'])/2,
    axis=1
)
topRecommendations = topRecommendations.sort_values(by=['Result'],ascending=[0])
topRecommendations

Unnamed: 0_level_0,Ratings,points_rat,Testimony,points_tes,Result
itemId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
20,1.943186,4.0,2.939995,5.0,4.5
2,1.897495,2.0,2.881244,4.0,3.0
7,1.900207,3.0,2.768935,2.0,2.5
13,1.95258,5.0,0.0,0.0,2.5
8,0.0,0.0,2.833527,3.0,1.5
4,1.8959,1.0,0.0,0.0,0.5
5,0.0,0.0,2.755539,1.0,0.5


In [27]:
DataItem = pd.read_csv(
    'data/item.csv', 
    low_memory=False)
DataItem.head()

Unnamed: 0.1,Unnamed: 0,itemId,itemname,bef_subtotal_review,subtotal_review
0,0,1,"Fire HD 8 Tablet with Alexa, 8"" HD Display, 32...",53,53
1,1,2,"Fire Kids Edition Tablet, 7 Display, Wi-Fi, 16...",561,556
2,2,3,Amazon Tap - Alexa-Enabled Portable Bluetooth ...,225,225
3,3,4,"Fire HD 10 Tablet, 10.1 HD Display, Wi-Fi, 16 ...",106,96
4,4,5,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",51,45


In [29]:
topRecommendationsTitles=DataItem.loc[DataItem.itemId.isin(topRecommendations.index)]
print (list(topRecommendationsTitles.itemname))

['Fire Kids Edition Tablet, 7 Display, Wi-Fi, 16 GB, Blue Kid-Proof Case', 'Fire HD 10 Tablet, 10.1 HD Display, Wi-Fi, 16 GB - Includes Special Offers, Silver Aluminum', 'All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi, 16 GB - Includes Special Offers, Blue', 'Fire Kids Edition Tablet, 7 Display, Wi-Fi, 16 GB, Green Kid-Proof Case', 'Fire Tablet, 7 Display, Wi-Fi, 16 GB - Includes Special Offers, Black', 'All-New Fire HD 8 Tablet, 8" HD Display, Wi-Fi, 32 GB - Includes Special Offers, Magenta', 'All-New Fire HD 8 Tablet, 8" HD Display, Wi-Fi, 16 GB - Includes Special Offers, Magenta']
