In [68]:
import numpy as np
import csv
import time
from sklearn.model_selection import train_test_split
from sklearn.manifold import TSNE
from matplotlib import pyplot as plt

datapath = 'ml-100k/'
filename = "u.data"

def read_lines(filename):
    return list(row[0:3] for row in list(csv.reader( open(filename, 'rb'), delimiter='\t')) )
    
    
def split_data(data, train_prop):
    p = np.random.permutation(data)
    train_size = int(round(train_prop * len(p) ))
    return p[:train_size], p[train_size:] 

def plot_embedding_2D(X,y):
    fig = plt.figure()
    plt.scatter(X[:,0], X[:,1], c=y)
    plt.show()
    
def plot_embedding_3D(X,y):
    fig = plt.figure()
    ax=Axes3D(fig)
    ax.scatter(X[:,0], X[:,1],X[:,2],c=y)
    plt.show()


In [69]:
# Renvoi un dictionnaire de la forme 
# dict {user : dict { item_id : rating } }
def user_index(data):
    user_index = {}
    for [user_id, item_id, rating] in data:
        if user_id not in user_index:
            user_index[user_id] = {item_id: rating}
        else:
            user_index[user_id][item_id] = rating
    return user_index

# Renvoi un dictionnaire de la forme 
# dict {item_id : dict { user_id : rating } }
def item_index(data):
    item_index = {}
    for [user_id, item_id, rating] in data:
        if item_id not in item_index:
            item_index[item_id] = {user_id: rating}
        else:
            item_index[item_id][user_id] = rating
    return item_index

In [70]:
class BaselineItem():
    def fit(self, items_id):
        self.model = {}
        for item in items_id.keys():
            self.model[item] = 0
            for user in items_id[item].keys():
                self.model[item] += int(items_id[item][user])
            self.model[item] /= len(items_id[item]) 
            #print(self.model[item])
            
    def predict(self,lines):
        self.pred = [self.model[i] for [_, i, _] in lines]
        return self.pred
    
    def error(self,test):
        return ((np.array(self.pred)  - test )**2).mean()
    
class BaselineUser():
    def fit(self, users_id):
        self.model = {}
        for user in users_id.keys():
            self.model[user] = 0
            for item in users_id[user].keys():
                self.model[user] += int(users_id[user][item])
            self.model[user] /= len(users_id[user]) 
            #print(self.model[item])
            
    def predict(self,lines):
        self.pred = [self.model[u] for [u, _, _] in lines]
        return self.pred
    
    def error(self,test):
        return ((np.array(self.pred)  - test )**2).mean()
    

    

lines = read_lines(datapath+filename)
train_lines, test_lines = split_data(lines, 0.8)

#supprime les qui sont dans le set de test mais pas celui d'apprentissage
# on fait cela car avec ce modéle on ne peut traiter que les données sur lesquels on a deja des informations
users_del = set([line[0] for line in test_lines]) - set([line[0] for line in train_lines])
films_del = set([line[1] for line in test_lines]) - set([line[1] for line in train_lines])
print(len(test_lines))
test_lines = [t.tolist() for t in test_lines if t[0] not in users_del and t[1] not in films_del]
print(len(test_lines))

user_index_train = user_index(train_lines)
item_index_train = item_index(train_lines)


20000
19963


In [94]:
model_user = BaselineUser()
model_user.fit(user_index_train)
model_user.predict(test_lines)
print "Erreur en test pour la baseline User:", model_user.error(np.array(test_lines, float)[:,2])

model_item = BaselineItem()
model_item.fit(item_index_train)
model_item.predict(test_lines)
print "Erreur en test pour la baseline Item:",  model_item.error(np.array(test_lines, float)[:,2])

Erreur en test pour la baseline User: 1.41201222261
Erreur en test pour la baseline Item: 1.38175624906


In [5]:
class MatrixFactorisation():
    def __init__(self,f,iter = 100 , e = 0.001, lmbda = 0.2):
        self.f = f
        self.iter = iter
        self.e = e
        self.lmbda = lmbda
        
        self.p = {}
        self.q = {}
        self.bu = {}
        self.bi = {}
        
        self.loss_history = []
        self.pred = []

        
    def fit(self, data):
        t_total = 0
        for it in xrange(self.iter):
            loss = 0
            t = time.time()
            data_perm = np.random.permutation(data) # stochastique
            for j in xrange(len(data_perm)):
                
                u = data_perm[j][0]
                i = data_perm[j][1]
                r_ui = data_perm[j][2]
                
                self.mu = np.random.random()
                # init mu, p, bu ,q, bi
                if u not in self.p:
                    self.p[u] = np.random.rand(1,self.f)
                    self.bu[u] = np.random.random()
                if i not in self.q:
                    self.q[i] = np.random.rand(self.f,1)
                    self.bi[i] = np.random.random()
                
                # update
                phi_ui = float(r_ui) - (self.mu + self.bi[i] + self.bu[u] + np.dot(self.p[u], self.q[i])[0][0])
                self.p[u] = (1 - self.lmbda * self.e) * self.p[u] + self.e * self.q[i].T * phi_ui
                self.q[i] = (1 - self.lmbda * self.e) * self.q[i] + self.e * self.p[u].T * phi_ui
                self.bu[u] = (1 - self.lmbda * self.e) * self.bu[u] + self.e * phi_ui
                self.bi[i] = (1 - self.lmbda * self.e) * self.bi[i] + self.e * phi_ui
                self.mu = (1 - self.lmbda * self.e) * self.mu + self.e * phi_ui
                
                loss += phi_ui**2

            t_total += (time.time() - t)
            vitesse = (it+1) / t_total
            if (it % 50==0):
                print "%d: loss=%.2f, Temps restant %.2fs" % ( it, loss/len(data), vitesse * (self.iter - (it+1) ) )
            self.loss_history.append(loss)
                
    def predict(self, lines):         
            self.pred = [ self.mu + self.bu[u] + self.bi[i] + np.dot( self.p[u], self.q[i] )[0][0] for [u, i, _] in lines ]
            return self.pred

    def error(self, test):
            return (( np.array(self.pred) - test ) ** 2).mean()

In [6]:
nb_iter = 10
lambdas = [0.01, 0.05, 0.1, 0.5, 1]
ks = [1, 5, 20, 100]

t_total = 0
it = 0
for k in ks:
    for lmbda in lambdas:
        t = time.time() # tic
        print t
        it += 1
        
        model = MatrixFactorisation(k, iter=nb_iter, lmbda=lmbda, e=1e-2)
        model.fit(train_lines)
        model.predict(test_lines)
        error = model.error(np.array(test_lines, float)[:,2])
        
        print (time.time() - t)
        t_total += (time.time() - t) # toc        
        print "k: %d,lambda: %.2f, erreur²: %.2f | Temps restant ~ %.2fs" % (k, lmbda, error, (it / t_total) * ((len(lambdas)*len(ks))-it))

1508681953.2
0: loss=1.71, Temps restant 5.17s
17.3440001011
k: 1,lambda: 0.01, erreur²: 0.90 | Temps restant ~ 1.10s
1508681970.55
0: loss=1.70, Temps restant 5.24s
17.4229998589
k: 1,lambda: 0.05, erreur²: 0.98 | Temps restant ~ 1.04s
1508681987.97
0: loss=1.75, Temps restant 5.15s
17.5670001507
k: 1,lambda: 0.10, erreur²: 0.93 | Temps restant ~ 0.97s
1508682005.54
0: loss=1.99, Temps restant 5.08s
17.3599998951
k: 1,lambda: 0.50, erreur²: 1.29 | Temps restant ~ 0.92s
1508682022.9
0: loss=2.48, Temps restant 5.14s
17.7820000648
k: 1,lambda: 1.00, erreur²: 1.21 | Temps restant ~ 0.86s
1508682040.68
0: loss=1.25, Temps restant 4.92s
19.2039999962
k: 5,lambda: 0.01, erreur²: 1.05 | Temps restant ~ 0.79s
1508682059.89
0: loss=1.24, Temps restant 4.68s
18.7380001545
k: 5,lambda: 0.05, erreur²: 1.09 | Temps restant ~ 0.73s
1508682078.62
0: loss=1.24, Temps restant 4.87s
18.4379999638
k: 5,lambda: 0.10, erreur²: 0.91 | Temps restant ~ 0.67s
1508682097.06
0: loss=1.40, Temps restant 4.98s
18