In [3]:
import pandas as pd
import numpy as np
import timeit
import time

# get the data from csv file to dataframe
DataTrain = pd.read_csv(
    'data/SentimentDataTrain.csv', 
    low_memory=False)
DataTest = pd.read_csv(
    'data/SentimentDataTest.csv', 
    low_memory=False)

DataTrain.drop("Unnamed: 0", 1, inplace = True)
DataTest.drop("Unnamed: 0", 1, inplace = True)


In [None]:
############################ THE RECOMMENDATION CLASS #########################

In [2]:
def matrixFactorization(R, K, steps=10, gamma=0.001, lamda=0.02):
    N=len(R.index)#number of users
    M=len(R.columns)#number of items
    P=pd.DataFrame(np.random.rand(N,K), index=R.index)
    Q=pd.DataFrame(np.random.rand(M,K), index=R.columns)
    for step in range(steps):
        for i in R.index:
            for j in R.columns:
                if R.loc[i,j]>0:
                    eij=R.loc[i,j]-np.dot(P.loc[i],Q.loc[j])
                    P.loc[i]=P.loc[i]+gamma*(eij*Q.loc[j]-lamda*P.loc[i])
                    Q.loc[j]=Q.loc[j]+gamma*(eij*P.loc[i]-lamda*Q.loc[j])
        e=0
        for i in R.index:
            for j in R.columns:
                if R.loc[i,j]>0:
                    e=e+pow(R.loc[i,j]-np.dot(P.loc[i], Q.loc[j]),2)+lamda*(pow(np.linalg.norm(P.loc[i]),2)+pow(np.linalg.norm(Q.loc[j]),2))
        if e<0.001:
            break
        if step % 100 ==0:
            print (step)
            print(e)
        
    return P,Q


In [None]:
############################ TRAINING #########################

In [None]:
DataTrain.head()

In [5]:
print("user count : " +str(len(set(DataTrain.userId))) )
print("item count : " +str(len(set(DataTrain.itemId))) )

user count : 1622
item count : 11


In [None]:
NinguserItemRatingMatrix=pd.pivot_table(DataTest, values='rating', index=['userId'], columns=['itemId'])
start = time.time()
(P,Q)=matrixFactorization(NinguserItemRatingMatrix.iloc[:1622,:11], K=2, gamma=0.001, lamda=0.02, steps=1000)
end = time.time()
print(end-start)

0
707.8455422430453
100
202.30842221527791
200
112.79695716484304
300
77.32732006672768


In [None]:
############################ TESTING #########################

In [None]:
DataTest.head()

In [4]:
print("user count : " +str(len(set(DataTest.userId))) )
print("item count : " +str(len(set(DataTest.itemId))) )

user count : 453
item count : 11


In [None]:
# Test Rating
TuserItemRatingMatrix=pd.pivot_table(DataTest, values='rating', index=['userId'], columns=['itemId'])
start = time.time()
(A,B)=matrixFactorization(TuserItemRatingMatrix.iloc[:453,:11], K=2, gamma=0.001, lamda=0.02, steps=1000)
end = time.time()
print(end-start)

In [None]:
# Test Testimony Value
TuserItemLabelMatrix=pd.pivot_table(DataTest, values='testimony_value', index=['userId'], columns=['itemId'])

start = time.time()
(C,D)=matrixFactorization(userItemRatingMatrix.iloc[:901,:22], K=2, gamma=0.001, lamda=0.02, steps=1000)
end = time.time()
print(end-start)


In [8]:
activeUser=2484
TpredictItemRating=pd.DataFrame(np.dot(A.loc[activeUser],B.T), index=Q.index, columns=['Ratings'])
topRecommendations_byrating=pd.DataFrame.sort_values(TpredictItemRating,['Ratings'],ascending=[0])[:5]
#we found the ratings of all movie by the active user and then sorted them to top 3

topRecommendations_byrating


user count : 901
item count : 22


Unnamed: 0_level_0,Ratings
itemId,Unnamed: 1_level_1
16,3.410092
4,3.340633
19,3.316509
6,3.273265
17,3.26678


0
5266.383923048226
100
514.9110889622377
200
262.12714241061883
300
212.41158176226102
400
197.3622452277835
500
189.9486994847723
600
184.69468650652064
700
180.31999001450805
800
176.47087489021217
900
173.0137108168327
3050.0839619636536


In [16]:
activeUser=2484
predictItemLabel=pd.DataFrame(np.dot(R.loc[activeUser],S.T), index=S.index, columns=['Label'])
topRecommendations_bylabel=pd.DataFrame.sort_values(predictItemLabel,['Label'],ascending=[0])[:5]
#we found the ratings of all movie by the active user and then sorted them to top 3

topRecommendations_bylabel


Unnamed: 0_level_0,Label
itemId,Unnamed: 1_level_1
16,3.319173
4,3.273252
6,3.214883
17,3.18139
8,3.181126


In [17]:
# merge the recommendations by rating and label

topRecommendationsMerge = pd.merge(
    topRecommendations_byrating, topRecommendations_bylabel, how='outer',
    left_index=True, right_index=True)
topRecommendationsMerge = topRecommendationsMerge.fillna(0)
topRecommendationsMerge

Unnamed: 0_level_0,Ratings,Label
itemId,Unnamed: 1_level_1,Unnamed: 2_level_1
4,3.340633,3.273252
6,3.273265,3.214883
8,0.0,3.181126
16,3.410092,3.319173
17,3.26678,3.18139
19,3.316509,0.0


In [18]:
# calculate the result
topRecommendationsMerge['Result'] = topRecommendationsMerge.apply(
    lambda row: (row['Ratings']+row['Label'])/2,
    axis=1
)
topRecommendationsMerge = topRecommendationsMerge.sort_values(by=['Result'],ascending=[0])
topRecommendationsMerge

Unnamed: 0_level_0,Ratings,Label,Result
itemId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
16,3.410092,3.319173,3.364633
4,3.340633,3.273252,3.306942
6,3.273265,3.214883,3.244074
17,3.26678,3.18139,3.224085
19,3.316509,0.0,1.658255
8,0.0,3.181126,1.590563
