# Prédiction de notes par contenu

In [1]:
import os
import re
import pandas as pd
from sklearn.decomposition import NMF
from scipy.sparse import dok_matrix
import numpy as np
import pandas as pd
from scipy.spatial.distance import sqeuclidean, cosine

In [2]:
# take as input two lists of ratings

def MSE_err(truth,pred):
    """
    computes MSE from real-pred difference
    """
    return np.mean((truth-pred)**2)

def MAE_err(truth,pred):
    """
    computes MAE from real-pred difference
    """
    return np.mean(abs(np.array(truth-pred)))
        

In [35]:
path = '/Vrac/PLDAC_addic7ed/ratings/ratings_60'
#path = "/Users/constancescherer/Desktop/ratings/ratings_60"
#path = "ratings_60"

In [8]:
d_user = dict() #{username : {serie: note, serie : note}}

for user in sorted(os.listdir(path)):
    username = re.sub(".txt", "", user)
    d_user[username] = dict()
    with open(path+"/"+user) as file: 
        lignes = file.readlines()
        for ligne in lignes :
            serie, rating = ligne.split(" ")
            rating = rating.rstrip("\n")
            rating = float(rating)
            
            d_user[username][serie] = rating

In [9]:
liste_series = set()
for username, d_s in d_user.items() :
    for serie, rating in d_s.items() :
        liste_series.add(serie)
liste_series = list(liste_series)
len(liste_series)

1357

In [10]:
data = []
for username, d_s in d_user.items() :
    for serie, rating in d_s.items() :
        data.append( (username, serie, rating) )

In [11]:
data[0]

('BeneCumb', 'the-prisoner', 8.0)

In [12]:
# We first remap users and item to ids between (0,len(user)) and (0,len(item))
u_dic = {} #{username : user id}
i_dic = {} #{item title : item id}
        
all_data = [] #[(user id, item id, rating)]
    
d_username_id = dict()
d_itemname_id = dict()
for uid,iid,rating in data:  # iterating on all data
    
    uk = u_dic.setdefault(uid,len(u_dic))
    ik = i_dic.setdefault(iid,len(i_dic))
    all_data.append((uk,ik,float(rating)))
    d_username_id[uid] = uk
    d_itemname_id[iid] = ik

num_user = len(u_dic)
num_item = len(i_dic)

print(str(num_user)+" users and "+str(num_item)+" items.")

60 users and 1357 items.


In [13]:
# (1) Create sparse matrix from all ratings
Full = dok_matrix((num_user, num_item), dtype=np.float32)

for uid,iid,rating in all_data:
    Full[uid,iid] = float(rating)
    
    
# (2) Factorizing matrix

model = NMF(n_components=25, init='random', random_state=0, max_iter=350)
U = model.fit_transform(Full) #users
I = model.components_      #items

I = I.transpose()
I.shape


(1357, 25)

In [14]:
# We take 10% of the train set as test data
train_mat = dok_matrix((num_user, num_item), dtype=np.float32)
test = []
train = []
    
for i,(uid,iid,rating) in enumerate(all_data):
    if i%10 == 0: #one out of 10 is for test
        test.append((uid,iid,rating))
    else:
        train.append((uid,iid,rating))
        train_mat[uid,iid] = rating
    
print("Number of train examples: ", train_mat.nnz)
print("Number of test examples: ", len(test))


Number of train examples:  3305
Number of test examples:  368


In [90]:
import pickle
with open("/Vrac/PLDAC_addic7ed/sim.p", 'rb') as pickle_file:
    similarities = pickle.load(pickle_file)

with open("/Vrac/PLDAC_addic7ed/pickle_most_similar.p", 'rb') as pickle_file:
    most_similar = pickle.load(pickle_file)

In [21]:
len(similarities[0])

3279

In [37]:
import fnmatch

In [38]:
def getDicts2(path):
    res = dict() #  keys : show id     values: dict(key:id season, value: nb  ep season)
    res2 = dict() # keys : show id     values: show title
    j = 0
    filenames= sorted(os.listdir(path)) # get all files' and folders' names in the current directory
    for filename in filenames: # loop through all the files and folders
        if filename[0] == '.' :
            continue
        if os.path.isdir(os.path.join(os.path.abspath(path), filename)): # check whether the current object is a folder or not
            show_path = path+"/"+filename
            l = []
            nb_saisons = sum(os.path.isdir(os.path.join(show_path, i)) for i in sorted(os.listdir(show_path)))
            for season in sorted(os.listdir(show_path)):
                if season[0] == '.' :
                    continue
                season_path = show_path+"/"+season
                nb_eps_saison = len(fnmatch.filter(os.listdir(season_path), '*.txt'))
                l.append(nb_eps_saison)
            seasons_list = list(range(1, nb_saisons+1))
            dico_serie = dict(zip(seasons_list, l))
            res[j] = dico_serie
            res2[j] = filename
            j += 1
    
    return res, res2

In [79]:
path_series = '/Vrac/PLDAC_addic7ed/addic7ed_clean'
d_info, d_name = getDicts(path_series)

In [80]:
d_name[0]

'1000___Battlestar_Galactica__The_Face_of_the_Enemy'

In [81]:
d_ind = {v: k for k, v in d_name.items()}

In [82]:
d_ind['1000___Battlestar_Galactica__The_Face_of_the_Enemy']

0

In [83]:
d_titre_filename = dict()
with open("titles/title-filename.txt") as file :
	lignes = file.readlines()
	for ligne in lignes :
		l = ligne.split(" ")
		titre = l[0]
		filename = l[1].rstrip('\n')
		d_titre_filename[titre] = filename

In [84]:
d_filename_titre = {v: k for k, v in d_titre_filename.items()}

In [85]:
d_id_username = {v: k for k, v in u_dic.items()}

In [86]:
d_id_serie = {v: k for k, v in i_dic.items()}

In [47]:
from collections import OrderedDict

In [91]:
def pred_content(uid, iid, k=3):
    """
    prédire la note de l'utilisateur uid pour la serie iid 
    (moyenne sur les k ppv de iid chez uid)
    """
    
    # récupérer toutes les séries notées par uid
    u = d_id_username[uid]
    series_notes = d_user[u]
    series = series_notes.keys()
    #notes = series_notes.values()
    
    # récupérer le vecteur de similarité entre iid et toutes les autres séries
    i = d_id_serie[iid]
    f = d_titre_filename[i]
    n_iid = -1
    if f in d_ind.keys() :
        n_iid = d_ind[f]
    
    simil = similarities[n_iid]
    
    # on parcourt les séries que l'utilisateur a vu
    series_ind = []
    for s in series :
        f = d_titre_filename[s]
        if f in d_ind.keys() :
            series_ind.append(d_ind[f])
        
    
    d_simil = {}
    for ind in series_ind :
        d_simil[ind] = simil[ind]
    
    
    sorted_x = sorted(d_simil.items(), key=lambda kv: kv[1])
    sorted_x.reverse()
     
    sorted_dict = OrderedDict(sorted_x)
    
    series_plus_similaires = list(sorted_dict)
    
    kppv = series_plus_similaires[:k]
    
    notes = []
    
    for ind in kppv :
        f = d_name[ind]
        t = d_filename_titre[f]
        n = series_notes[t]
        notes.append(n)
        
    return np.mean(notes)
        

In [92]:
## Getting the truth values
truth_tr = np.array([rating for (uid,iid),rating in train_mat.items()])
truth_te = np.array([rating for uid,iid,rating in test])

prediction_tr = np.array([pred_content(u, i) for (u,i),rating in train_mat.items()])
prediction_te = np.array([pred_content(u, i) for u,i,rating in test])


print("Training Error:")
print("MSE:",  MSE_err(prediction_tr,truth_tr))
print("MAE:",  MAE_err(prediction_tr,truth_tr))
    
print("Test Error:")
print("MSE:",  MSE_err(prediction_te,truth_te))
print("MAE:",  MAE_err(prediction_te,truth_te))

Training Error:
MSE: 2.903378719112455
MAE: 1.2027231467473525
Test Error:
MSE: 2.612922705314009
MAE: 1.1539855072463767
