In [4]:
import pandas as pd
import numpy as np

# get the data from csv file to dataframe
data = pd.read_csv(
    'data/data_trainig.csv', 
    low_memory=False)

In [7]:
userItemRatingMatrix=pd.pivot_table(data, values='rating', index=['userId'], columns=['itemId'])

def matrixFactorization(R, K, steps=10, gamma=0.001, lamda=0.02):
    N=len(R.index)#number of users
    M=len(R.columns)#number of items
    P=pd.DataFrame(np.random.rand(N,K), index=R.index)
    Q=pd.DataFrame(np.random.rand(M,K), index=R.columns)
    for step in range(steps):
        for i in R.index:
            for j in R.columns:
                if R.loc[i,j]>0:
                    eij=R.loc[i,j]-np.dot(P.loc[i],Q.loc[j])
                    P.loc[i]=P.loc[i]+gamma*(eij*Q.loc[j]-lamda*P.loc[i])
                    Q.loc[j]=Q.loc[j]+gamma*(eij*P.loc[i]-lamda*Q.loc[j])
        e=0
        for i in R.index:
            for j in R.columns:
                if R.loc[i,j]>0:
                    e=e+pow(R.loc[i,j]-np.dot(P.loc[i], Q.loc[j]),2)+lamda*(pow(np.linalg.norm(P.loc[i]),2)+pow(np.linalg.norm(Q.loc[j]),2))
        if e<0.001:
            break
        if step % 100 ==0:
            print (step)
            print(e)
        
    return P,Q

(P,Q)=matrixFactorization(userItemRatingMatrix.iloc[:250,:23], K=2, gamma=0.001, lamda=0.02, steps=1000)


0
1422.642863292628
100
321.2317371966348
200
86.31067428921111
300
50.06719499377391
400
41.90225298406437
500
39.54150496266546
600
38.6472490011683
700
38.162312245694054
800
37.80491674473488
900
37.49533714898635


In [10]:
activeUser=17
predictItemRating=pd.DataFrame(np.dot(P.loc[activeUser],Q.T), index=Q.index, columns=['Ratings'])
topRecommendations_byrating=pd.DataFrame.sort_values(predictItemRating,['Ratings'],ascending=[0])[:5]
#we found the ratings of all movie by the active user and then sorted them to top 3

topRecommendations_byrating


Unnamed: 0_level_0,Ratings
itemId,Unnamed: 1_level_1
1,3.265591
23,3.179335
19,3.15668
8,3.105569
18,3.076627


In [11]:
userItemLabelMatrix=pd.pivot_table(data, values='label', index=['userId'], columns=['itemId'])
(R,S)=matrixFactorization(userItemRatingMatrix.iloc[:250,:23], K=2, gamma=0.001, lamda=0.02, steps=1000)


0
1348.7592561927202
100
306.2072543006806
200
82.95949451950688
300
49.3862223339131
400
42.00891193869359
500
39.990157464786016
600
39.205254871305506
700
38.743115406143026
800
38.381255806857666
900
38.05953826970683


In [20]:
activeUser=17
predictItemLabel=pd.DataFrame(np.dot(R.loc[activeUser],S.T), index=S.index, columns=['Label'])
topRecommendations_bylabel=pd.DataFrame.sort_values(predictItemLabel,['Label'],ascending=[0])[:5]
#we found the ratings of all movie by the active user and then sorted them to top 3

topRecommendations_bylabel


Unnamed: 0_level_0,Label
itemId,Unnamed: 1_level_1
8,2.984492
2,2.964963
19,2.950742
23,2.903387
1,2.898652


In [26]:
# merge the recommendations by rating and label

topRecommendationsMerge = pd.merge(
    topRecommendations_byrating, topRecommendations_bylabel, how='outer',
    left_index=True, right_index=True)
topRecommendationsMerge = topRecommendationsMerge.fillna(0)
topRecommendationsMerge

Unnamed: 0_level_0,Ratings,Label
itemId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,3.265591,2.898652
2,0.0,2.964963
8,3.105569,2.984492
18,3.076627,0.0
19,3.15668,2.950742
23,3.179335,2.903387


In [28]:
# calculate the result
topRecommendationsMerge['Result'] = topRecommendationsMerge.apply(
    lambda row: (row['Ratings']+row['Label'])/2,
    axis=1
)
topRecommendationsMerge = topRecommendationsMerge.sort_values(by=['Result'],ascending=[0])
topRecommendationsMerge

Unnamed: 0_level_0,Ratings,Label,Result
itemId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,3.265591,2.898652,3.082122
19,3.15668,2.950742,3.053711
8,3.105569,2.984492,3.04503
23,3.179335,2.903387,3.041361
18,3.076627,0.0,1.538313
2,0.0,2.964963,1.482482
