# Recommandation filtrage collaboratif

In [52]:
import os
import re
import pandas as pd
from sklearn.decomposition import NMF
from scipy.sparse import dok_matrix
import numpy as np

In [73]:
# take as input two lists of ratings

def MSE_err(truth,pred):
    """
    computes MSE from real-pred difference
    """
    return np.mean((truth-pred)**2)

def MAE_err(truth,pred):
    """
    computes MAE from real-pred difference
    """
    return np.mean(abs(np.array(truth-pred)))
        

In [23]:
path = '/Vrac/PLDAC_addic7ed/ratings/small_ratings'

In [44]:
d_user = dict()
cpt = 0
for user in os.listdir(path) :
    username = re.sub(".txt", "", user)
    d_user[username] = dict()
    with open(path+"/"+user) as file: 
        lignes = file.readlines()
        for ligne in lignes :
            serie, rating = ligne.split(" ")
            rating = rating.rstrip("\n")
            rating = float(rating)
            
            d_user[username][serie] = rating
    
    cpt += 1
    if cpt > 30 :
        break
    
d_user

{'=G=': {'alias': 7.0,
  'angels-in-america': 7.0,
  'band-of-brothers': 10.0,
  'brothers': 10.0,
  'cambridge-spies': 6.0,
  'csi-crime-scene-investigation': 8.0,
  'csi-miami': 7.0,
  'daniel-deronda': 7.0,
  'empire-falls': 7.0,
  'jack-the-ripper': 5.0,
  'profiler': 5.0,
  'the-closer': 5.0,
  'the-shield': 7.0,
  'the-sopranos': 9.0,
  'the-west-wing': 10.0,
  'the-wire': 10.0,
  'touching-evil': 7.0},
 'B24': {'any-human-heart': 8.0,
  'big-love': 9.0,
  'boardwalk-empire': 7.0,
  'bosch': 8.0,
  'copper': 6.0,
  'deutschland-83': 7.0,
  'downton-abbey': 7.0,
  'empire': 7.0,
  'grantchester': 6.0,
  'homeland': 4.0,
  'john-adams': 9.0,
  'outlander': 2.0,
  'please-like-me': 10.0,
  'slings-and-arrows': 8.0,
  'the-borgias': 7.0,
  'to-the-ends-of-the-earth': 10.0,
  'true-blood': 6.0,
  'turn': 5.0,
  'zen': 8.0},
 'Brave_TraveIor': {'bloodline': 9.0,
  'breaking-bad': 10.0,
  'californication': 10.0,
  'chosen': 10.0,
  'crash': 9.0,
  'desperate-housewives': 8.0,
  'fargo'

In [45]:
liste_series = set()
for username, d_s in d_user.items() :
    for serie, rating in d_s.items() :
        liste_series.add(serie)
liste_series = list(liste_series)
len(liste_series)

237

In [46]:
d_user['ikrani']['the-13-ghosts-of-scooby-doo']

5.0

In [61]:
data = []
for username, d_s in d_user.items() :
    for serie, rating in d_s.items() :
        data.append( (username, serie, rating) )

In [62]:
data[0]

('ikrani', 'cake-boss', 7.0)

In [95]:
# We first remap users and item to ids between (0,len(user)) and (0,len(item))
u_dic = {}
i_dic = {}
        
all_data = []
    
d_username_id = dict()
d_itemname_id = dict()
for uid,iid,rating in data:  # iterating on all data
    
    uk = u_dic.setdefault(uid,len(u_dic))
    ik = i_dic.setdefault(iid,len(i_dic))
    all_data.append((uk,ik,float(rating)))
    d_username_id[uid] = uk
    d_itemname_id[iid] = ik

num_user = len(u_dic)
num_item = len(i_dic)

print(str(num_user)+" users and "+str(num_item)+" items.")

16 users and 237 items.


In [70]:
# (1) Create sparse matrix from all ratings
Full = dok_matrix((num_user, num_item), dtype=np.float32)

for uid,iid,rating in all_data:
    Full[uid,iid] = float(rating)
    
    
# (2) Factorizing matrix

model = NMF(n_components=25, init='random', random_state=0, max_iter=350)
U = model.fit_transform(Full) #users
I = model.components_      #items

I = I.transpose()
I.shape


(237, 25)

In [75]:
Full.shape

(16, 237)

In [76]:
len(all_data)

277

In [72]:
# We take 10% of the train set as test data
train_mat = dok_matrix((num_user, num_item), dtype=np.float32)
test = []
train = []
    
for i,(uid,iid,rating) in enumerate(all_data):
    if i%10 == 0: #one out of 10 is for test
        test.append((uid,iid,rating))
    else:
        train.append((uid,iid,rating))
        train_mat[uid,iid] = rating
    
print("Number of train examples: ", train_mat.nnz)
print("Number of test examples: ", len(test))


Number of train examples:  249
Number of test examples:  28


In [80]:
train_mat.shape

(16, 237)

In [82]:
from sklearn.decomposition import NMF, TruncatedSVD


print("----------------------NMF---------------------------")



## NMF model
model = NMF(n_components=16, solver='cd' ,random_state=0, max_iter=100,alpha=5,l1_ratio=0.5)

#get submatrices
U_nmf = model.fit_transform(train_mat)
I_nmf = model.components_.transpose()

print("Shapes :")
print(U_nmf.shape) ## see the shapes of your submatrices
print(I_nmf.shape)

## to complete
def pred_func_nmf(uid,iid):
    
    Uu = U_nmf[uid]
    Ii = I_nmf[iid]
    
    return  np.dot(Uu, Ii)


## Getting the truth values
truth_tr = np.array([rating for (uid,iid),rating in train_mat.items()])
truth_te = np.array([rating for uid,iid,rating in test])

prediction_tr = np.array([pred_func_nmf(u, i) for (u,i),rating in train_mat.items()])
prediction_te = np.array([pred_func_nmf(u, i) for u,i,rating in test])


print("Training Error:")
print("MSE:",  MSE_err(prediction_tr,truth_tr))
print("MAE:",  MAE_err(prediction_tr,truth_tr))
    
print("Test Error:")
print("MSE:",  MSE_err(prediction_te,truth_te))
print("MAE:",  MAE_err(prediction_te,truth_te))

----------------------NMF---------------------------
Shapes :
(16, 16)
(237, 16)
Training Error:
MSE: 0.9487589144661032
MAE: 0.9432337597884808
Test Error:
MSE: 49.296604283096336
MAE: 6.533775280831252


In [83]:
print("----------------------SVD---------------------------")

## SVD Model

model = TruncatedSVD(n_components=150)

#get submatrices
U_svd = model.fit_transform(train_mat)
I_svd = model.components_.transpose()


def pred_func_svd(uid,iid):
    
    Uu = U_svd[uid]
    Ii = I_svd[iid]
    
    return np.dot(Uu, Ii)  

    
prediction_tr = np.array([pred_func_svd(u, i) for (u,i),rating in train_mat.items()])
prediction_te = np.array([pred_func_svd(u, i) for u,i,rating in test])


print("Training Error:")
print("MSE:",  MSE_err(prediction_tr,truth_tr))
print("MAE:",  MAE_err(prediction_tr,truth_tr))
    
print("Test Error:")
print("MSE:",  MSE_err(prediction_te,truth_te))
print("MAE:",  MAE_err(prediction_te,truth_te))

----------------------SVD---------------------------
Training Error:
MSE: 7.2488773e-12
MAE: 1.9341587e-06
Test Error:
MSE: 49.321429316912536
MAE: 6.535714407846865


In [84]:
print("----------------------MEAN ONLY---------------------------")


# compute mean training ratings (~3.5)
mean = np.mean([rating for (uid,iid),rating in train_mat.items()])


def pred_func_mean(uid,iid):
    
    
    return mean

print("mean rating is ", mean)


prediction_tr = np.array([pred_func_mean(u, i) for (u,i),rating in train_mat.items()])
prediction_te = np.array([pred_func_mean(u, i) for u,i,rating in test])

print("Training Error:")
print("MSE:",  MSE_err(prediction_tr,truth_tr))
print("MAE:",  MAE_err(prediction_tr,truth_tr))
    
print("Test Error:")
print("MSE:",  MSE_err(prediction_te,truth_te))
print("MAE:",  MAE_err(prediction_te,truth_te))

----------------------MEAN ONLY---------------------------
mean rating is  6.2208834
Training Error:
MSE: 8.115869
MAE: 2.395026
Test Error:
MSE: 6.704985852777229
MAE: 2.091365473611014


In [85]:
from sklearn.decomposition import NMF, TruncatedSVD


# (1) compute mean of training ratings
mean = mean




# (2) mean normalize training matrix
tmn = dok_matrix((num_user, num_item), dtype=np.float32)

for (uid,iid), rating in train_mat.items():
    tmn[uid,iid] = rating - mean

# (3) factorize matrix
model_norm = TruncatedSVD(n_components=150)

#get submatrices
U_msvd = model.fit_transform(tmn)
I_msvd = model.components_.transpose()

def pred_func_msvd(uid,iid): 
    
    Uu = U_msvd[uid]
    Ii = I_msvd[iid]
    
    return np.dot(Uu, Ii) + mean


prediction_tr = np.array([pred_func_msvd(u, i) for (u,i),rating in train_mat.items()])
prediction_te = np.array([pred_func_msvd(u, i) for u,i,rating in test])


print("Training Error:")
print("MSE:",  MSE_err(prediction_tr,truth_tr))
print("MAE:",  MAE_err(prediction_tr,truth_tr))
    
print("Test Error:")
print("MSE:",  MSE_err(prediction_te,truth_te))
print("MAE:",  MAE_err(prediction_te,truth_te))
    

Training Error:
MSE: 3.9292727e-12
MAE: 1.11262e-06
Test Error:
MSE: 6.70498565197497
MAE: 2.0913653884615218


In [86]:
from sklearn.decomposition import NMF, TruncatedSVD


def group_by_user(tuple_list):
    r_dic = {}
    for uid,iid,rating in tuple_list:
        list_rev = r_dic.get(uid,[])
        list_rev.append(rating)
    
        r_dic[uid] =list_rev
    return r_dic


def group_by_item(tuple_list):
    r_dic = {}
    for uid,iid,rating in tuple_list:
        list_rev = r_dic.get(iid,[])
        list_rev.append(rating)
    
        r_dic[iid] =list_rev
    return r_dic





# (1) compute means of training set
mean = mean

# user and item deviation to mean
u_means = {u:(np.mean(ratings - mean)) for u,ratings in group_by_user(train).items()}
i_means = {i:(np.mean(ratings) - mean) for i,ratings in group_by_item(train).items()}




# (2) normalize training matrix
tmn_k = dok_matrix((num_user, num_item), dtype=np.float32)

for (uid,iid), rating in train_mat.items():
    tmn_k[uid,iid] = rating - mean - u_means.get(uid,0) - i_means.get(iid,0)
    
# (3) factorize matrix
model_kor = TruncatedSVD(n_components=150)


U_ksvd = model.fit_transform(tmn_k)
I_ksvd = model.components_.transpose()


def pred_func_ksvd(uid,iid):
    Uu = U_ksvd[uid]
    Ii = I_ksvd[iid]
    Bu = u_means.get(uid,0)
    Bi = i_means.get(iid, 0)
    
    return np.dot(Uu, Ii) + mean + Bu + Bi


prediction_tr = np.array([pred_func_ksvd(u, i) for (u,i),rating in train_mat.items()])
prediction_te = np.array([pred_func_ksvd(u, i) for u,i,rating in test])


print("Training Error:")
print("MSE:",  MSE_err(prediction_tr,truth_tr))
print("MAE:",  MAE_err(prediction_tr,truth_tr))
    
print("Test Error:")
print("MSE:",  MSE_err(prediction_te,truth_te))
print("MAE:",  MAE_err(prediction_te,truth_te))

Training Error:
MSE: 1.2324639727784308e-12
MAE: 6.962742018929482e-07
Test Error:
MSE: 4.791732325396362
MAE: 1.777179756553365


In [87]:
# The dcg@k is the sum of the relevance, penalized gradually
def dcg_at_k(r, k):
    """Score is discounted cumulative gain (dcg)
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
        
    """
    r = np.asfarray(r)[:k]
    if r.size:
        return np.sum(r / np.log2(np.arange(2, r.size + 2)))
        
    return 0.

# test values
# r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
# dcg_at_k(r, 1) => 3.0
# dcg_at_k(r, 2) => 4.2618595071429155
    

# And it's normalized version
def ndcg_at_k(r, k):
    """
        r: Relevance scores (list or numpy) in rank order
            (first element is the first item)
        k: Number of results to consider
    """
    dcg_max =  dcg_at_k(sorted(r)[::-1], k)
    if not dcg_max:
        return 0.
    return dcg_at_k(r, k) / dcg_max

# test values
# r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]
# ndcg_at_k(r, 1) => 1.0
# ndcg_at_k(r, 4) => 0.794285
    
r = [3, 2, 3, 0, 0, 1, 2, 2, 3, 0]    
ndcg_at_k(r, 4)   

0.7942854176010882

In [88]:
from random import shuffle

#1) Group (uid,iid,rating) per uid
def group_by_user(tuple_list):
    r_dic = {}
    for uid,iid,rating in tuple_list:
        list_rev = r_dic.get(uid,[])
        list_rev.append((uid,iid,rating))
    
        r_dic[uid] =list_rev
    return r_dic #returns {uid:[(uid,iid,rating),...],...}



userg_train = group_by_user(train)  #returns {uid:[(uid,iid,rating),...],...}
userg_test = group_by_user(test)


# Function to compute a random shuffle ndcg
def random_ndcg(uid_group_tuples,k=10):
    mean_ndcg = 0
    num_users = 0
    
    #for each test set
    for _,list_rating in uid_group_tuples.items():

        #shuffle real ratings.
        real_ratings = [rating for uid,iid,rating in list_rating]
        shuffle(real_ratings)
        pred_objects = real_ratings

        mean_ndcg += ndcg_at_k(pred_objects,k)
        num_users += 1

    return  mean_ndcg/num_users



#Function to compute ndcg on test set
def mean_ndcg_UI(U,I,pred_function,uid_group_tuples,k=10):
    mean_ndcg = 0
    num_users = 0
    
    #for each test set
    for _,list_rating in uid_group_tuples.items():
        
        #2)compute predictions
        pred_ratings = [pred_function(uid, iid) for uid, iid, rating in list_rating ]
        
        #3)to sort real ratings
        real_ratings = [rating for uid,iid,rating in list_rating]
        pred_objects = [ real_ratings[rid] for rid in np.argsort(pred_ratings)[::-1]]
        
        #4)and compute ndcg
        mean_ndcg += ndcg_at_k(pred_objects,k)
        num_users += 1

    return  mean_ndcg/num_users
    
    

print("train") 
print("-"*10)
print("mean == random", random_ndcg(userg_train))
print("ndcg nmf", mean_ndcg_UI(U_nmf,I_nmf,pred_func_nmf,userg_train)) 
print("ndcg svd", mean_ndcg_UI(U_svd,I_svd,pred_func_svd,userg_train))
print("ndcg svd + mean", mean_ndcg_UI(U_msvd,I_msvd,pred_func_nmf,userg_train))
print("ndcg svd koren", mean_ndcg_UI(U_ksvd,I_ksvd,pred_func_ksvd,userg_train))

print(" ")

print("test")    
print("-"*10) 
print("mean == random", random_ndcg(userg_test))
print("ndcg nmf", mean_ndcg_UI(U_nmf,I_nmf,pred_func_nmf,userg_test)) 
print("ndcg svd", mean_ndcg_UI(U_svd,I_svd,pred_func_svd,userg_test))
print("ndcg svd + mean", mean_ndcg_UI(U_msvd,I_msvd,pred_func_nmf,userg_test))
print("ndcg svd koren", mean_ndcg_UI(U_ksvd,I_ksvd,pred_func_ksvd,userg_test))


train
----------
mean == random 0.7714964009460706
ndcg nmf 1.0
ndcg svd 1.0
ndcg svd + mean 1.0
ndcg svd koren 1.0
 
test
----------
mean == random 0.9667664184149902
ndcg nmf 0.9694001957209294
ndcg svd 0.9760932453788536
ndcg svd + mean 0.9694001957209294
ndcg svd koren 0.9686241782530138


In [89]:
from collections import Counter

counts = Counter(iid for _,iid,_ in train)

#the popularity predictor
def pop_pred(uid,iid):
    return counts[iid]

#Random ndcg, return a shuffled lists of all possible ratings
def random_ndcg_full(k=10,default=0):
    mean_ndcg = 0
    num_users = 0
    
    for _,list_rating in userg_test.items():

        #all possible ratings
        real_ratings = [rating for uid,iid,rating in list_rating] + [default]*(num_item - len(list_rating) - len(userg_train[uid]))
        shuffle(real_ratings)
        pred_objects = real_ratings
        
        mean_ndcg += ndcg_at_k(pred_objects,k)
        num_users += 1

    return  mean_ndcg/len(userg_test)


def mean_ndcg_UI_FULL(U,I,pred_function,k=10,default=0):
    mean_ndcg = 0
    
    test_users = set(uid for uid,_,_ in test)
    
    for user in test_users:
        u_train_set = set(iid for _,iid,_ in userg_train[uid])
        u_test_dic =  {iid:rating for uid,iid,rating in userg_test[uid]}
        
        pred_ratings = []
        real_ratings = []
        
        
        for item in range(num_item):
            
            if item in u_train_set:
                continue
            else:
                p_rating = pred_function(user,item)
            
            pred_ratings.append(p_rating)
            real_ratings.append(u_test_dic.get(item,default))
            
        #pred_objects = [] ##### complete here
        pred_objects = [ real_ratings[rid] for rid in np.argsort(pred_ratings)[::-1]]
        
        mean_ndcg += ndcg_at_k(pred_objects,k)

    return  mean_ndcg/len(test_users)

print("mean == random", random_ndcg_full())
print("ndcg pop", mean_ndcg_UI_FULL(U_nmf,I_nmf,pop_pred)) 
print("ndcg nmf", mean_ndcg_UI_FULL(U_nmf,I_nmf,pred_func_nmf)) 
print("ndcg svd", mean_ndcg_UI_FULL(U_svd,I_svd,pred_func_svd))
print("ndcg svd + mean", mean_ndcg_UI_FULL(U_msvd,I_msvd,pred_func_nmf))
print("ndcg svd koren", mean_ndcg_UI_FULL(U_msvd,I_msvd,pred_func_ksvd))

mean == random 0.0
ndcg pop 0.18792186028912342
ndcg nmf 0.011745116268070217
ndcg svd 0.011745116268070217
ndcg svd + mean 0.011745116268070217
ndcg svd koren 0.0


In [100]:
def create_sparse_mat(data) : 
    # We first remap users and item to ids between (0,len(user)) and (0,len(item))
    u_dic = {}
    i_dic = {}

    all_data = []

    d_username_id = dict()
    d_itemname_id = dict()
    for uid,iid,rating in data:  # iterating on all data

        uk = u_dic.setdefault(uid,len(u_dic))
        ik = i_dic.setdefault(iid,len(i_dic))
        all_data.append((uk,ik,float(rating)))
        d_username_id[uid] = uk
        d_itemname_id[iid] = ik

    num_user = len(u_dic)
    num_item = len(i_dic)
    
    # (1) Create sparse matrix from all ratings
    Full = dok_matrix((num_user, num_item), dtype=np.float32)

    for uid,iid,rating in all_data:
        Full[uid,iid] = float(rating)
        
    return d_username_id, d_itemname_id, Full

In [128]:
def plus_proche_voisin(username, d_username_id, Full) :
    user_id = d_username_id[username]
    f = Full[user_id].todense()
    d_max = 0
    i_max = 0
    for i in range(0, Full.shape[0]) :
        if i != user_id :
            fi = Full[i].todense()
            d = sqeuclidean(f, fi)
            if d > d_max :
                i_max = i
                d_max = d
    return i_max

In [130]:
ppv = plus_proche_voisin('ikrani', d_username_id, Full)

In [131]:
d_username_id

{'=G=': 15,
 'B24': 1,
 'Brave_TraveIor': 3,
 'Franklie': 9,
 'Miles-10': 6,
 'SeriousJest': 14,
 'aliases-53334': 13,
 'alindsay-al': 8,
 'backup-50362': 2,
 'bigverybadtom': 7,
 'claudio_carvalho': 4,
 'ikrani': 0,
 'mike-ryan455': 12,
 'pavanratnaker': 11,
 'tinyone2': 10,
 'willz187': 5}

In [132]:
d_user['Brave_TraveIor']

{'bloodline': 9.0,
 'breaking-bad': 10.0,
 'californication': 10.0,
 'chosen': 10.0,
 'crash': 9.0,
 'desperate-housewives': 8.0,
 'fargo': 9.0,
 'how-to-get-away-with-murder': 7.0,
 'kingdom': 9.0,
 'louie': 10.0,
 'mad-dogs': 10.0,
 'magic-city': 10.0,
 'power': 10.0,
 'southland': 10.0,
 'terriers': 10.0,
 'the-knick': 10.0,
 'the-last-kingdom': 9.0,
 'the-leftovers': 10.0,
 'wentworth': 7.0}

In [133]:
d_user['ikrani']

{'avengers-assemble': 2.0,
 'beware-the-batman': 7.0,
 'cake-boss': 7.0,
 'gotham': 5.0,
 'my-little-pony-friendship-is-magic': 8.0,
 'over-the-garden-wall': 10.0,
 'street-fighter-ii-v': 9.0,
 'teen-titans-go-': 1.0,
 'the-13-ghosts-of-scooby-doo': 5.0,
 'the-a-team': 10.0,
 'the-guild': 1.0,
 'the-tom-and-jerry-show': 3.0,
 'ultimate-spider-man': 6.0}

In [104]:
f0 = Full[0].todense()
f1 = Full[1].todense()

In [116]:
from scipy.spatial.distance import sqeuclidean

In [117]:
sqeuclidean(f0, f1)

1560.0