In [2]:
import numpy as np
import matplotlib.pyplot as plt

In [19]:
def loadMovieLens(path='recodata/',dataTrain = 'u1.train',dataTest='u1.test'):
    """ get triples : (user,movie,rating) ; (source,target,weight=1)"""
    ratings_train = []
    ratings_test = []
    users = set()
    movies = set()
    links = []
    for line in open(path+dataTrain):
        (user,movieid,rating,ts)=line.split('\t')
        ratings_train.append([user,movieid,float(rating)/5])
        users.add(user)
        movies.add(movieid)
        
    for line in open(path+dataTest):
        (user,movieid,rating,ts)=line.split('\t')
        if (user in users) and (movieid in movies):
            ratings_test.append([user,movieid,float(rating)/5])
            
    for line in open(path+'u.links'):
        l = line[:-1].split('\t')
        source = l[0]
        #check if source in triples ratings
        if source in users:
            for target in l[1:]:
                #check if target in triples ratings
                links.append([source,target,1])
                
    return ratings_train,ratings_test,links


In [20]:
class SOREC():
    '''Model SOREC
    '''
    def __init__(self, k, lambdaC=0.2, lambdaU=0.2, lambdaV=0.2, lambdaZ=0.2, eps=1e-5, maxIter=2000):
        self.k = k
        self.lambdaC = lambdaC
        self.lambdaU = lambdaU
        self.lambdaV = lambdaV
        self.lambdaZ = lambdaZ
        self.eps = eps
        self.maxIter = maxIter
        
    def fit(self, tripletsUsersItems, tripletsLinks):
        self.u = {}
        self.v = {}
        self.z = {}
        self.loss = []
        #Choix du paramètre a optimisé en cas d'optimisation alternée
        for i in xrange(self.maxIter):
            lossUV = 0
            lossUZ = 0
            lossReg = 0
            for j in xrange(len(tripletsUsersItems)):
                # Ratings --------------------------------------------------------------------------------------------
                r = np.random.randint(len(tripletsUsersItems)) 
                user =   tripletsUsersItems[r][0]
                item =   tripletsUsersItems[r][1]
                rating = tripletsUsersItems[r][2]
                if not user in self.u:
                    self.u[user] = np.random.rand(1,self.k)
                if not item in self.v:
                    self.v[item] = np.random.rand(self.k,1)
                expUV = np.exp(self.u[user].dot(self.v[item])[0][0])
                logistiqueUV = (1.0/(1 + expUV))
                tmp = logistiqueUV - rating
                self.u[user] = self.u[user] - self.eps * tmp * expUV * (logistiqueUV **2) * self.v[item].transpose()
                self.v[item] = self.v[item] - self.eps * tmp * expUV * (logistiqueUV **2) * self.u[user].transpose()
                lossUV = lossUV + tmp*tmp/2. 
                # Links ---------------------------------------------------------------------------------------------
                r = np.random.randint(len(tripletsLinks))
                userSource = tripletsLinks[r][0]
                userTarget = tripletsLinks[r][1]
                linkScore  = tripletsLinks[r][2]
                if not userSource in self.u:
                    self.u[userSource] = np.random.rand(1,self.k)
                if not userTarget in self.z:
                    self.z[userTarget] = np.random.rand(self.k,1)
                expUZ = np.exp(self.u[userSource].dot(self.z[userTarget])[0][0])
                logistiqueUZ = (1.0/(1 + expUZ))
                tmp = logistiqueUZ - linkScore
                self.u[userSource] = self.u[userSource] - self.eps * tmp * expUZ * (logistiqueUZ **2) * self.z[userTarget].transpose()
                self.z[userTarget] = self.z[userTarget] - self.eps * tmp * expUZ * (logistiqueUZ **2) * self.u[userSource].transpose()
                lossUZ = lossUZ + tmp*tmp/2. 
                # Regularize  --------------------------------------------------------------------------------------
                ru = np.random.choice(self.u.keys());
                rv = np.random.choice(self.v.keys());
                rz = np.random.choice(self.z.keys());
                self.u[ru] = self.u[ru] * (1 - self.lambdaU * self.eps)
                self.v[rv] = self.v[rv] * (1 - self.lambdaV * self.eps)
                self.z[rz] = self.z[rz] * (1 - self.lambdaZ * self.eps)
                lossReg = lossReg + np.sqrt((self.u[ru]**2).sum()) + np.sqrt((self.v[rv]**2).sum()) + np.sqrt((self.z[rz]**2).sum())
            self.loss.append([lossUV, lossUZ, lossReg])
            if (i % 1 == 0):
                print i, (lossUV + lossUZ + lossReg) / len(tripletsUsersItems)
                
    def predict(self, tripletsUsersItems):
        pred = np.zeros(len(tripletsUsersItems))
        for ind,c in enumerate(tripletsUsersItems):
            pred[ind] = self.u[c[0]].dot(self.v[c[1]])[0][0]
        return pred
    
    def score(self, tripletsUsersItems):
        pred = self.predict(tripletsUsersItems)
        return ((pred - np.array(np.array(ratings_test)[:,2], float)) ** 2).mean()



In [21]:
# Chargement
ratings_train, ratings_test, links = loadMovieLens()

In [23]:
sorec = SOREC(k=5,eps=0.01,maxIter = 50)
sorec.fit(ratings_train,links)

0 4.1125927467
1 3.93975525962
2 3.78564880906
3 3.64612358038
4 3.52297305965
5 3.40647364768
6 3.30481265737
7 3.21362869091
8 3.13162483844
9 3.05572619624
10 2.99534241978
11 2.93746218915
12 2.8800556211
13 2.8378457854
14 2.79588614063
15 2.7544171721
16 2.72513863651
17 2.68872243145
18 2.65855958722
19 2.63476933672
20 2.61076922955
21 2.5940757102
22 2.57132031567
23 2.55478120756
24 2.53498376478
25 2.52458657041
26 2.51025918026
27 2.49701960179
28 2.48365298642
29 2.47397874387
30 2.46805702493
31 2.45823711985
32 2.44141633583
33 2.44067042286
34 2.4367647309
35 2.4332969163
36 2.43002603806
37 2.41833363269
38 2.41777003041
39 2.40586629884
40 2.40505233993
41 2.40760566814
42 2.39603739054
43 2.39851691725
44 2.39649313509
45 2.39319730998
46 2.38898309405
47 2.38683139348
48 2.38554947024
49 2.38921592994


In [24]:
print 'test erreur Sorec : {}'.format(sorec.score(ratings_test))

test erreur Sorec : 0.601835569678
