In [1]:
import os
import re
import pandas as pd
from sklearn.decomposition import NMF
from scipy.sparse import dok_matrix
import numpy as np
import pandas as pd
from scipy.spatial.distance import sqeuclidean, cosine
import tensorflow as tf
import math
# Seed NumPy's global RNG so that the np.random.randn draws used further down
# to initialise the W and H factors are reproducible from one run to the next.
np.random.seed(0)

# NON-NEGATIVE MATRIX FACTORIZATION

Solution au problème des zéros (notes absentes) traités comme de vraies notes par la NMF : https://nipunbatra.github.io/blog/2017/nnmf-tensorflow.html

In [2]:
def MSE_err(truth,pred):
    """
    Compute the mean squared error between reference and predicted values.

    Parameters
    ----------
    truth : array-like
        Reference values (NumPy array, list, ...).
    pred : array-like
        Predicted values, same length as ``truth``.

    Returns
    -------
    The mean of the squared element-wise differences ``truth - pred``.
    """
    # Convert before subtracting: two plain Python lists do not support "-".
    diff = np.asarray(truth) - np.asarray(pred)
    return np.mean(diff ** 2)

def MAE_err(truth,pred):
    """
    Compute the mean absolute error between reference and predicted values.

    Parameters
    ----------
    truth : array-like
        Reference values (NumPy array, list, ...).
    pred : array-like
        Predicted values, same length as ``truth``.

    Returns
    -------
    The mean of the absolute element-wise differences ``truth - pred``.
    """
    # Convert each operand first; converting only the result of the subtraction
    # came too late to help when both inputs were plain Python lists.
    diff = np.asarray(truth) - np.asarray(pred)
    return np.mean(np.abs(diff))

### Récupération des données

In [3]:
# Directory holding the ratings files; the loading cell below lists it and
# reads one file per user (file name = username + ".txt").
path = "ratings_60"

In [4]:
d_user = dict()  # {username: {series title: rating}}

for user in sorted(os.listdir(path)):
    # Drop only a trailing ".txt". The former re.sub(".txt", "", user) used an
    # unescaped "." (regex wildcard) and no anchor, so it could also remove
    # text from the middle of a file name.
    username = user[:-len(".txt")] if user.endswith(".txt") else user
    d_user[username] = dict()
    with open(os.path.join(path, user)) as file:
        for ligne in file:
            if not ligne.strip():
                continue  # tolerate blank lines (e.g. an empty last line)
            # The rating is the last space-separated field; splitting from the
            # right keeps titles that themselves contain spaces in one piece.
            serie, rating = ligne.rstrip("\n").rsplit(" ", 1)
            d_user[username][serie] = float(rating)

# Every distinct series title seen across all users
liste_series = list({serie for d_s in d_user.values() for serie in d_s})

# Flatten the nested dict into (username, series title, rating) triples
data = [
    (username, serie, rating)
    for username, d_s in d_user.items()
    for serie, rating in d_s.items()
]

# Remap usernames and titles to contiguous integer ids, numbered in order of
# first appearance: ids lie in [0, num_user) and [0, num_item).
u_dic = {}  # {username: user id}
i_dic = {}  # {item title: item id}

all_data = []  # [(user id, item id, rating)]
for uid, iid, rating in data:
    uk = u_dic.setdefault(uid, len(u_dic))
    ik = i_dic.setdefault(iid, len(i_dic))
    all_data.append((uk, ik, float(rating)))

# Same content as u_dic / i_dic; both names are kept as independent copies.
d_username_id = dict(u_dic)
d_itemname_id = dict(i_dic)

num_user = len(u_dic)
num_item = len(i_dic)

print(str(num_user)+" users and "+str(num_item)+" items.")

60 users and 1357 items.


In [5]:
# (1) Sparse matrix holding every known rating
Full = dok_matrix((num_user, num_item), dtype=np.float32)
for user_idx, item_idx, score in all_data:
    Full[user_idx, item_idx] = float(score)

# (2) Hold out every 10th rating (positions 0, 10, 20, ...) as test data;
#     everything else goes to the training list and the training matrix.
test = [triple for pos, triple in enumerate(all_data) if pos % 10 == 0]
train = [triple for pos, triple in enumerate(all_data) if pos % 10 != 0]

train_mat = dok_matrix((num_user, num_item), dtype=np.float32)
for user_idx, item_idx, score in train:
    train_mat[user_idx, item_idx] = score

print("Number of train examples: ", train_mat.nnz)
print("Number of test examples: ", len(test))

# Reference ratings, in the iteration order of train_mat and of the test list
truth_tr = np.array(list(train_mat.values()))
truth_te = np.array([score for _, _, score in test])

Number of train examples:  3305
Number of test examples:  368


# NMF

In [6]:
# Dense copy of the sparse training matrix
A_orig = np.array(train_mat.todense())

# Zeros stand for missing ratings: flag them as NaN in a single vectorised
# assignment instead of a Python double loop over every cell.
# (np.nan is the canonical name; the np.NaN alias was removed in NumPy 2.0.)
A_orig[A_orig == 0.0] = np.nan

A_df = pd.DataFrame(A_orig)
A_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1347,1348,1349,1350,1351,1352,1353,1354,1355,1356
0,,10.0,10.0,10.0,7.0,10.0,10.0,8.0,10.0,10.0,...,,,,,,,,,,
1,,,,,,,,,8.0,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,


In [33]:
# Boolean mask over the ratings table: True where a rating is present,
# False where the cell is NaN (missing rating).
np_mask = A_df.notna()
np_mask.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1347,1348,1349,1350,1351,1352,1353,1354,1355,1356
0,False,True,True,True,True,True,True,True,True,True,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


#### Tensorflow setup

In [34]:
# Boolean mask used to compute the cost only on valid (not missing) entries.
# The mask is fixed data, so it is declared as a constant: as a tf.Variable it
# would be created trainable by default and would need to be initialised.
tf_mask = tf.constant(np_mask.values)

# Ratings matrix (NaN where missing) and its (num_user, num_item) shape
A = tf.constant(A_df.values)
shape = A_df.values.shape

# Number of latent dimensions of the factorisation
rank = 200

# Random initial factors: standard-normal draws scaled by their maximum value.
# NOTE(review): these draws include negative values, which the non-negativity
# clipping applied during training will set to zero - confirm this is intended.
temp_H = np.random.randn(rank, shape[1]).astype(np.float32)
temp_H = np.divide(temp_H, temp_H.max())

temp_W = np.random.randn(shape[0], rank).astype(np.float32)
temp_W = np.divide(temp_W, temp_W.max())

# Trainable factors and their product, the reconstruction WH of A
H =  tf.Variable(temp_H)
W = tf.Variable(temp_W)
WH = tf.matmul(W, H)

#### Cost function

In [35]:
# Squared Frobenius-norm cost, restricted to the entries selected by the mask:
# sum over observed ratings of (A - WH)^2.
observed_truth = tf.boolean_mask(A, tf_mask)
observed_pred = tf.boolean_mask(WH, tf_mask)
cost = tf.reduce_sum(tf.pow(observed_truth - observed_pred, 2))

#### Initialization

In [36]:
# Gradient-descent hyper-parameters
lr = 0.001     # learning rate
steps = 1000   # number of optimisation steps

# One plain gradient-descent update of the cost, plus the variable initialiser
optimizer = tf.train.GradientDescentOptimizer(lr)
train_step = optimizer.minimize(cost)
init = tf.global_variables_initializer()

#### Ensuring non-negativity (on veut uniquement des valeurs positives ou nulles dans W et H, telles que A ≈ W·H)

In [37]:
# Clipping step: overwrite each factor with max(0, factor) so that the learnt
# W and H stay non-negative. `clip` runs both assignments together.
non_negative_W = tf.maximum(tf.zeros_like(W), W)
clip_W = W.assign(non_negative_W)

non_negative_H = tf.maximum(tf.zeros_like(H), H)
clip_H = H.assign(non_negative_H)

clip = tf.group(clip_W, clip_H)

#### Ok let's go

In [38]:
steps = 1000
with tf.Session() as sess:
    sess.run(init)
    for step in range(steps):
        # One gradient step, then project W and H back to non-negative values
        sess.run(train_step)
        sess.run(clip)
        # Report the (post-clipping) cost every 100 steps
        if step % 100 == 0:
            current_cost = sess.run(cost)
            print("\nCost: %f" % current_cost)
            print("*"*40)
    # Pull the trained factors out of the session as NumPy arrays
    learnt_W, learnt_H = sess.run([W, H])


Cost: 120459.148438
****************************************

Cost: 211.744476
****************************************

Cost: 15.723871
****************************************

Cost: 2.687321
****************************************

Cost: 0.633908
****************************************

Cost: 0.172783
****************************************

Cost: 0.051176
****************************************

Cost: 0.015993
****************************************

Cost: 0.005192
****************************************

Cost: 0.001733
****************************************


In [12]:
def pred_func(W, H, u, i):
    """
    Predict the rating of user `u` for item `i` from the factor matrices.

    Returns the dot product of row `u` of `W` with column `i` of `H`
    (taken as row `i` of the transpose of `H`).
    """
    user_factors = W[u]
    item_factors = H.T[i]
    return np.dot(user_factors, item_factors)

def replace_sup10_inf1(l):
    """
    Clamp every value of `l` to the rating range [1, 10].

    Values above 10.0 become 10.0, values below 1.0 become 1.0, and all other
    values are kept unchanged. Returns a new list; `l` is not modified.
    """
    return [
        10.0 if value > 10.0 else (1.0 if value < 1.0 else value)
        for value in l
    ]

In [40]:
# Predicted ratings for the training entries (in train_mat iteration order,
# i.e. aligned with truth_tr) and for the held-out test triples.
prediction_tr = np.array([
    pred_func(learnt_W, learnt_H, row, col)
    for row, col in train_mat.keys()
])
prediction_te = np.array([
    pred_func(learnt_W, learnt_H, row, col)
    for row, col, _ in test
])

In [41]:
# Round the predicted ratings to the nearest whole number
prediction_tr, prediction_te = prediction_tr.round(), prediction_te.round()

In [42]:
# Clamp the predictions to the rating scale: above 10 becomes 10, below 1 becomes 1
prediction_tr, prediction_te = map(replace_sup10_inf1, (prediction_tr, prediction_te))

#### L'heure de vérité

In [43]:
# The metric helpers are declared as (truth, pred); pass the arguments in that
# order. Both metrics are symmetric, so the reported numbers do not change.
print("Training Error:")
print("MSE:",  MSE_err(truth_tr, prediction_tr))
print("MAE:",  MAE_err(truth_tr, prediction_tr))

print("Test Error:")
print("MSE:",  MSE_err(truth_te, prediction_te))
print("MAE:",  MAE_err(truth_te, prediction_te))

Training Error:
MSE: 0.0
MAE: 0.0
Test Error:
MSE: 5.315217391304348
MAE: 1.815217391304348


Tada !

<b>A voir:</b> Comment optimiser le nombre de dimensions latentes ?

# Prise en compte des biais utilisateur/item

In [20]:
# Mean of all ratings stored in the training matrix
train_mat_mean = np.array(list(train_mat.values())).mean()


def _mean_offsets(matrix, count, global_mean):
    """
    For each of the first `count` rows of `matrix`, return the mean of the
    ratings stored in that row minus `global_mean`.

    Rows without any stored rating get a bias of 0. They are detected
    explicitly, so no mean of an empty array (and no NumPy RuntimeWarning)
    is ever computed.
    """
    offsets = []
    for idx in range(count):
        stored = np.array(list(matrix[idx].values()))
        if stored.size == 0:
            offsets.append(0)
            continue
        bias = stored.mean() - global_mean
        # Keep the original safety net: an undefined bias counts as 0
        offsets.append(0 if np.isnan(bias) else bias)
    return offsets


# User bias: mean of the ratings given by the user - dataset mean
user_bias = _mean_offsets(train_mat, num_user, train_mat_mean)

# Item bias: mean of the ratings received by the item - dataset mean
# (computed on the transposed matrix so that each row is one item)
train_mat_T = train_mat.copy().transpose()
item_bias = _mean_offsets(train_mat_T, num_item, train_mat_mean)

  ret = ret.dtype.type(ret / rcount)


In [21]:
# Residual ratings: what remains of each training rating once the global mean,
# the user bias and the item bias have been subtracted (in that order).
train_bias = dok_matrix((num_user, num_item), dtype=np.float32)

for (row, col), observed in train_mat.items():
    residual = observed - train_mat_mean
    residual = residual - user_bias[row]
    residual = residual - item_bias[col]
    train_bias[row, col] = residual

In [22]:
def pred_func_bias(W, H, mean, user_bias, item_bias, u, i):
    """
    Predict the rating of user `u` for item `i` with bias terms added back.

    Returns dot(W[u], H.T[i]) + mean + user_bias[u] + item_bias[i], summed in
    that order.
    """
    interaction = np.dot(W[u], H.T[i])
    with_mean = interaction + mean
    with_user = with_mean + user_bias[u]
    return with_user + item_bias[i]

In [32]:
# Dense copy of the residual (bias-corrected) training matrix
A_orig = np.array(train_bias.todense())

# Observed entries are the ones stored in the training matrix. They must not be
# inferred from "residual != 0": a rating exactly equal to
# mean + user bias + item bias has a residual of 0 and would wrongly be treated
# as missing.
observed = np.zeros(A_orig.shape, dtype=bool)
for (u, i) in train_mat.keys():
    observed[u, i] = True

# Missing ratings become NaN (vectorised; np.NaN alias was removed in NumPy 2.0)
A_orig[~observed] = np.nan

A_df = pd.DataFrame(A_orig)

# Mask: True where a rating is present
np_mask = pd.DataFrame(observed)

# Boolean mask used to compute the cost only on valid (not missing) entries.
# Fixed data, hence a constant rather than a (trainable by default) tf.Variable.
tf_mask = tf.constant(np_mask.values)

A = tf.constant(A_df.values)
shape = A_df.values.shape

# Number of latent dimensions of the factorisation
rank = 200

# Random initial factors: standard-normal draws scaled by their maximum value
temp_H = np.random.randn(rank, shape[1]).astype(np.float32)
temp_H = np.divide(temp_H, temp_H.max())

temp_W = np.random.randn(shape[0], rank).astype(np.float32)
temp_W = np.divide(temp_W, temp_W.max())

H =  tf.Variable(temp_H)
W = tf.Variable(temp_W)
WH = tf.matmul(W, H)

# Squared Frobenius-norm cost, restricted to the observed entries
cost = tf.reduce_sum(tf.pow(tf.boolean_mask(A, tf_mask) - tf.boolean_mask(WH, tf_mask), 2))

# Gradient-descent hyper-parameters
lr = 0.001     # learning rate
steps = 1000   # number of optimisation steps
train_step = tf.train.GradientDescentOptimizer(lr).minimize(cost)
init = tf.global_variables_initializer()

# Clipping operation: keeps the learnt W and H non-negative.
# NOTE(review): with W >= 0 and H >= 0 the product WH is >= 0, whereas the
# residuals in A can be negative (rating below mean + biases). Such entries
# cannot be fitted, which may explain a cost that stops decreasing. Consider
# removing the clipping for this residual model - confirm the intent.
clip_W = W.assign(tf.maximum(tf.zeros_like(W), W))
clip_H = H.assign(tf.maximum(tf.zeros_like(H), H))
clip = tf.group(clip_W, clip_H)

with tf.Session() as sess:
    sess.run(init)
    for i in range(steps):
        # One gradient step, then project W and H back to non-negative values
        sess.run(train_step)
        sess.run(clip)
        if i%100==0:
            print("\nCost: %f" % sess.run(cost))
            print("*"*40)
    learnt_W = sess.run(W)
    learnt_H = sess.run(H)


Cost: 23262.197266
****************************************

Cost: 6144.735352
****************************************

Cost: 5411.951172
****************************************

Cost: 5197.051270
****************************************

Cost: 5119.221680
****************************************

Cost: 5085.755859
****************************************

Cost: 5071.203125
****************************************

Cost: 5064.835449
****************************************

Cost: 5061.595703
****************************************

Cost: 5059.794434
****************************************


In [33]:
# Bias-aware predictions for the training entries (in train_mat iteration
# order, i.e. aligned with truth_tr) and for the held-out test triples.
prediction_tr = np.array([
    pred_func_bias(learnt_W, learnt_H, train_mat_mean, user_bias, item_bias, row, col)
    for row, col in train_mat.keys()
])
prediction_te = np.array([
    pred_func_bias(learnt_W, learnt_H, train_mat_mean, user_bias, item_bias, row, col)
    for row, col, _ in test
])

# Round the predicted ratings to the nearest whole number
prediction_tr, prediction_te = prediction_tr.round(), prediction_te.round()

# Clamp to the rating scale: above 10 becomes 10, below 1 becomes 1
prediction_tr, prediction_te = map(replace_sup10_inf1, (prediction_tr, prediction_te))

In [34]:
# The metric helpers are declared as (truth, pred); pass the arguments in that
# order. Both metrics are symmetric, so the reported numbers do not change.
print("Training Error:")
print("MSE:",  MSE_err(truth_tr, prediction_tr))
print("MAE:",  MAE_err(truth_tr, prediction_tr))

print("Test Error:")
print("MSE:",  MSE_err(truth_te, prediction_te))
print("MAE:",  MAE_err(truth_te, prediction_te))

Training Error:
MSE: 1.3797276853252647
MAE: 0.5573373676248109
Test Error:
MSE: 6.274456521739131
MAE: 1.6983695652173914
