In [1]:
import numpy as np
import csv
import time
import pickle as pkl
import scipy.stats

datapath = 'ml-100k/'
filename = "u.data"

def read_lines(filename):
    return list(row[0:3] for row in list(csv.reader( open(filename, 'rb'), delimiter='\t')) )

In [2]:
def user_indexed(data):
    user_index = {}
    for [user_id, item_id, rating] in data:
        if user_id not in user_index:
            user_index[user_id] = {item_id: rating}
        else:
            user_index[user_id][item_id] = rating
    return user_index


def item_indexed(data):
    item_index = {}
    for [user_id, item_id, rating] in data:
        if item_id not in item_index:
            item_index[item_id] = {user_id: rating}
        else:
            item_index[item_id][user_id] = rating
    return item_index

In [7]:
def split_data(data, train_prop):
    p = np.random.permutation(data)
    train_size = int(round(train_prop * len(p) ))
    return p[:train_size], p[train_size:] #train, test

In [8]:
class BaselineUser():
    def fit(self, user_index):
        self.model = {}
        for user in user_index.keys():
            self.model[user] = 0
            for item in user_index[user].keys():
                self.model[user] += int(user_index[user][item])
            self.model[user] /= len( user_index[user] )
    def predict(self, lines):
        self.pred = [self.model[u] for [u, _, _] in lines]
        return self.pred
    
    def error(self, test):
        return (( np.array(self.pred) - test ) ** 2).mean()

    
class BaselineItem():
    def fit(self,item_index):
        self.model = {}
        for item in item_index.keys():
            self.model[item] = 0
            for user in item_index[item].keys():
                self.model[item] += int(item_index[item][user])
            self.model[item] /= len( item_index[item] )
    def predict(self, lines):
        self.pred = [self.model[i] for [_, i, _] in lines]
        return self.pred
    
    def error(self, test):
        return (( np.array(self.pred) - test ) ** 2).mean()

In [9]:
class MatrixFactorization():
    def __init__(self,f, iter=100, e=0.001, lmbda=0.2):
        self.f = f
        self.iter = iter
        self.e = e
        self.lmbda = lmbda
        
        
        self.p = {}
        self.q = {}
        self.bu = {}
        self.bi = {}
        
        self.loss_history = []
        self.pred = []

    def fit(self, data):
        t_total = 0
        for it in xrange(self.iter):
            loss = 0
            t = time.time()
            data_perm = np.random.permutation(data) # stochastique
            for j in xrange(len(data_perm)):
                
                u = data_perm[j][0]
                i = data_perm[j][1]
                r_ui = data_perm[j][2]
                
                self.mu = np.random.random()
                # init mu, p, bu ,q, bi
                if u not in self.p:
                    self.p[u] = np.random.rand(1,self.f)
                    self.bu[u] = np.random.random()
                if i not in self.q:
                    self.q[i] = np.random.rand(self.f,1)
                    self.bi[i] = np.random.random()
                
                # update
                phi_ui = float(r_ui) - (self.mu + self.bi[i] + self.bu[u] + np.dot(self.p[u], self.q[i])[0][0])
                self.p[u] = (1 - self.lmbda * self.e) * self.p[u] + self.e * self.q[i].T * phi_ui
                self.q[i] = (1 - self.lmbda * self.e) * self.q[i] + self.e * self.p[u].T * phi_ui
                self.bu[u] = (1 - self.lmbda * self.e) * self.bu[u] + self.e * phi_ui
                self.bi[i] = (1 - self.lmbda * self.e) * self.bi[i] + self.e * phi_ui
                self.mu = (1 - self.lmbda * self.e) * self.mu + self.e * phi_ui
                
                loss += phi_ui**2

            t_total += (time.time() - t)
            vitesse = (it+1) / t_total
            if (it % 50==0):
                print "%d: loss=%.2f, Temps restant %.2fs" % ( it, loss/len(data), vitesse * (self.iter - (it+1) ) )
            self.loss_history.append(loss)
                
    def predict(self, lines):         
        self.pred = [ self.mu + self.bu[u] + self.bi[i] + np.dot( self.p[u], self.q[i] )[0][0] for [u, i, _] in lines ]
        return self.pred
    
    def error(self, test):
        return (( np.array(self.pred) - test ) ** 2).mean()

In [28]:
lines = read_lines(datapath+filename)
train_lines, test_lines = split_data(lines, 0.8)
dep = len( test_lines )
print "Données de test de départ:", dep
users_del = set([line[0] for line in test_lines]) - set([line[0] for line in train_lines])
films_del = set([line[1] for line in test_lines]) - set([line[1] for line in train_lines])
test_lines = [t.tolist() for t in test_lines if t[0] not in users_del and t[1] not in films_del]
print "Données de test supprimées:", dep-len( test_lines )

user_index_train = user_indexed(train_lines)
item_index_train = item_indexed(train_lines)

Données de test de départ: 20000
Données de test supprimées: 42


In [30]:
model_user = BaselineUser()
model_user.fit(user_index_train)
model_user.predict(test_lines)
print "Erreur en test pour la baseline User:", model_user.error(np.array(test_lines, float)[:,2])

model_item = BaselineItem()
model_item.fit(item_index_train)
model_item.predict(test_lines)
print "Erreur en test pour la baseline Item:",  model_item.error(np.array(test_lines, float)[:,2])

Erreur en test pour la baseline User: 1.41557270268
Erreur en test pour la baseline Item: 1.36291211544


In [31]:
def loadMovieLens(path='/data/movielens'):
    # Get movie titles
    movies={}
    for line in open(path+'/u.item'):
        (id,title)=line.split('|')[0:2]
        movies[id]=title
    # Load data
    prefs={}
    for line in open(path+'/u.data'):
        (user,movieid,rating,ts)=line.split('\t')
        prefs.setdefault(user,{})
        prefs[user][movies[movieid]]=float(rating)
    return prefs

In [34]:
# Charement du modèle entrainé

nb_iter = 1000
lmbda = 0.05
k = 5

model = MatrixFactorization(k, iter=nb_iter, lmbda=lmbda, e=1e-3)
model.fit(train_lines)
model.predict(test_lines)
error = model.error(np.array(test_lines, float)[:,2])
print "k: %d,lambda: %.2f, erreur²: %.2f" % (k, lmbda, error)
     
f = open('model.pkl', 'w')
pkl.dump(model, f)
f.close()

f = open("model.pkl")
model = pkl.load(f)
f.close()

movielens = user_indexed(read_lines(datapath+filename))

x_movies = []
x_users= []
y = []
for (movie_id, rep_movie), (user_id, rep_user) in zip( model.q.items(), model.p.items() ):
    y.append( int(round( model.predict( [ [user_id,movie_id, 0] ] )[0] )) )
    x_movies.append( rep_movie.squeeze() )
    x_users.append( rep_user.squeeze() )

# tSNE_movies = TSNE(learning_rate=1000,  n_iter=200).fit_transform(x_movies)
# tSNE_users = TSNE(learning_rate=100,  n_iter=200).fit_transform(x_users)


tSNE_movies = TSNE2(n_components=2, perplexity=30.0, learning_rate=1000.0, n_iter=200, alpha=0).fit_transform(np.array(x_movies))

0: loss=1.90, Temps restant 555.00s
50: loss=0.91, Temps restant 520.27s
100: loss=0.88, Temps restant 493.04s
150: loss=0.85, Temps restant 467.27s
200: loss=0.82, Temps restant 440.39s
250: loss=0.79, Temps restant 413.65s
300: loss=0.77, Temps restant 386.37s
350: loss=0.75, Temps restant 356.80s
400: loss=0.74, Temps restant 327.20s
450: loss=0.73, Temps restant 293.83s
500: loss=0.72, Temps restant 266.14s
550: loss=0.72, Temps restant 239.19s
600: loss=0.72, Temps restant 212.14s
650: loss=0.71, Temps restant 185.72s
700: loss=0.70, Temps restant 159.55s
750: loss=0.70, Temps restant 133.24s
800: loss=0.70, Temps restant 106.66s
850: loss=0.69, Temps restant 79.62s
900: loss=0.69, Temps restant 52.73s
950: loss=0.69, Temps restant 26.06s
k: 5,lambda: 0.05, erreur²: 1.15


NameError: name 'TSNE2' is not defined

In [None]:
f = open("model.pkl")
model = pkl.load(f)
f.close()

movielens = user_indexed(read_lines(datapath+filename))

x_movies = []
x_users= []
y = []
for (movie_id, rep_movie), (user_id, rep_user) in zip( model.q.items(), model.p.items() ):
    y.append( int(round( model.predict( [ [user_id,movie_id, 0] ] )[0] )) )
    x_movies.append( rep_movie.squeeze() )
    x_users.append( rep_user.squeeze() )

# tSNE_movies = TSNE(learning_rate=1000,  n_iter=200).fit_transform(x_movies)
# tSNE_users = TSNE(learning_rate=100,  n_iter=200).fit_transform(x_users)


tSNE_movies = TSNE(n_components=2, perplexity=30.0, learning_rate=1000.0, n_iter=200).fit_transform(np.array(x_movies))