# Recommandation - séries les plus vues

Nous avons remarqué un problème dans nos recommandations : on renvoie souvent les mêmes séries (qui sont en plus des séries assez peu connues - on pense que ce problème vient de leur biais très élevé, car le peu de gens les ayant notées les ont très bien notées).  
Pour régler ce problème, nous recourons à l'heuristique suivante lorsque les notes maximales prédites pour un utilisateur sont toutes identiques : on recommande les séries les plus populaires (les plus vues / les mieux notées).

In [1]:
import os
import re
import pandas as pd
from sklearn.decomposition import NMF
from scipy.sparse import dok_matrix
import numpy as np
import pandas as pd
from scipy.spatial.distance import sqeuclidean, cosine
import pickle
from collections import OrderedDict
import matplotlib.pyplot as plt
from utils.collaborative import *
from utils.similarities import *
from utils.predictions_content import *
from utils.ndcg import *
import operator
import pickle
from collections import OrderedDict
from utils.predictions_notes import *
from utils.recommandation import *

In [2]:
# Alternative paths for a local machine, kept for reference.
#path_d_user = "/Users/constancescherer/Desktop/pickles/d_user.p"
#path_sim = "/Users/constancescherer/Desktop/pickles/sim.p"
#path_most_sim = "/Users/constancescherer/Desktop/pickles/most_sim.p"

path_d_user = "/Vrac/PLDAC_addic7ed/pickles/d_user.p"
path_sim = "/Vrac/PLDAC_addic7ed/pickles/sim.p"
path_most_sim = "/Vrac/PLDAC_addic7ed/pickles/most_sim.p"
path_d_pert_user = "/Vrac/PLDAC_addic7ed/pickles/d_pert_user_k3.p"

# NOTE: pickle.load can execute arbitrary code contained in the file;
# only load pickles that were produced by this project.

# d_user dictionary
# {username : {serie : rating}}
with open(path_d_user, 'rb') as pickle_file:
    d_user = pickle.load(pickle_file)

# cosine similarity matrix
with open(path_sim, 'rb') as pickle_file:
    sim = pickle.load(pickle_file)

# dictionary of the most similar series
with open(path_most_sim, 'rb') as pickle_file:
    most_similar = pickle.load(pickle_file)

# d_pert_user is only passed to ndcg() further down; its structure is not
# visible from this notebook.
with open(path_d_pert_user, 'rb') as pickle_file:
    d_pert_user = pickle.load(pickle_file)

In [3]:
# Build the rating data structures and the lookup dictionaries used by the
# recommendation cells below. All helpers called here come from the
# star-imported utils modules; their exact semantics are not visible here.

#path_ratings = "/Users/constancescherer/Desktop/ratings/ratings_imdb/users"
# NOTE(review): path_ratings is not used anywhere else in the visible notebook.
path_ratings = "/Vrac/PLDAC_addic7ed/ratings/ratings_imdb/users"

# Everything below is derived from d_user ({username : {serie : rating}}).
liste_series = get_liste_series(d_user)
data = get_data(d_user)
all_data, num_user, num_item = get_all_data(data)
# NOTE(review): if get_train_test splits randomly, the split (and everything
# computed from it) changes between runs — confirm in utils.
train, train_mat, test = get_train_test(num_user, num_item, all_data, test_size=10)
mean, u_means, i_means,U_ksvd, I_ksvd =  get_Uksvd_Iksvd(train, train_mat, num_user, num_item)
# presumably d_username_id / d_itemname_id map names to matrix indices — TODO confirm
d_username_id, d_itemname_id, Full = create_sparse_mat(data)


#path_series = "/Users/constancescherer/Desktop/addic7ed_good_encoding"
path_series = '/Vrac/PLDAC_addic7ed/addic7ed_final'

d_info, d_name = getDicts(path_series)
# The reverse_dict calls build the inverse lookups of the dictionaries above
# (assuming reverse_dict swaps keys and values — TODO confirm).
d_ind = reverse_dict(d_name)
d_titre_filename = get_d_titre_filename("titles/title-filename.txt")
d_filename_titre = reverse_dict(d_titre_filename)
d_id_username = reverse_dict(d_username_id)
d_id_serie = reverse_dict(d_itemname_id)

reversed_u_dic, reversed_i_dic = create_reversed_dic(d_username_id, d_itemname_id)

## Popularité

Pour calculer la popularité d'une série, nous comptons le nombre d'utilisateurs ayant mis une note supérieure ou égale à 7. 

In [4]:
#path_series_ratings = "/Users/constancescherer/Desktop/ratings/ratings_imdb/series"
path_series_ratings = "/Vrac/PLDAC_addic7ed/ratings/ratings_imdb/series"

# Ratings grouped by series: {serie : {username : rating}}.
# Each file of the directory holds the ratings of one series, one rating per
# line, with the username as first field and the rating as last field.
d_series = dict()

for serie in sorted(os.listdir(path_series_ratings)):
    # Skip hidden entries (names starting with a dot).
    if serie.startswith('.'):
        continue
    # Strip only a trailing ".txt". The previous re.sub(".txt", "", ...) used
    # an unescaped, unanchored pattern and also removed any "<char>txt"
    # occurring inside the series name.
    seriename = serie[:-len(".txt")] if serie.endswith(".txt") else serie
    dico = dict()
    with open(os.path.join(path_series_ratings, serie)) as file:
        for ligne in file:
            # A blank line carries no rating; float("") would raise.
            if not ligne.strip():
                continue
            champs = ligne.split(" ")
            username = champs[0]
            rating = float(champs[-1].rstrip("\n"))
            dico[username] = rating
    # Only keep series rated by more than one user.
    if len(dico) > 1:
        d_series[seriename] = dico

In [5]:
# Popularity of a series = number of users who gave it a rating of 7 or more.
d_pop = {}
for serie, d_notes in d_series.items():
    d_pop[serie] = sum(1 for note in d_notes.values() if note >= 7)

In [6]:
# Persist the popularity dictionary. The with-block closes the file once the
# dump is done, so the pickle is fully written to disk before the cell ends.
with open("/Vrac/PLDAC_addic7ed/pickles/d_pop.p", "wb") as pickle_file:
    pickle.dump(d_pop, pickle_file)

## Anciennes recommandations

In [7]:
# Load the corpus matrix and recompute the similarities from it.
# This rebinds `sim`, replacing the matrix unpickled from path_sim earlier.
path_mat_corpus = "/Vrac/PLDAC_addic7ed/pickles_new/pickles/mat_voc_50000_mindf_20_max_df_0.4.p"
with open(path_mat_corpus, 'rb') as pickle_file:
    mat_corpus = pickle.load(pickle_file)

sim = similarities_from_sparse_mat(mat_corpus)

# Content-based recommendations for one example user (baseline to compare
# against the popularity heuristic).
username = 'shannen-l-c'
top_reco = reco_content(
    username,
    d_username_id, d_itemname_id,
    d_name, d_user, d_ind,
    d_titre_filename, d_filename_titre,
    d_id_username, d_id_serie,
    sim,
)

In [8]:
# Score the baseline recommendations of `username` with the project's ndcg
# helper; the resulting value is the cell output shown below.
ndcg(top_reco, username, d_pert_user, d_user)

0.026598491194515538

## Nouvelles recommandations

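Dans le cas où il y a au moins autant de séries avec la note max que de recommandations demandées, on trie ces séries par popularité et on garde les premières.  
S'il y en a moins, on les renvoie toutes, complétées par les séries suivantes dans l'ordre des notes prédites. Il serait peut-être intéressant de les trier quand même par popularité au sein de chaque groupe de séries ayant la même note ?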

In [11]:
def tri_par_pop(liste_series, d_pop):
    """
    Sort series by decreasing popularity.

    liste_series : list of series (meant for series sharing the same
        predicted rating)
    d_pop : popularity dictionary {serie : popularity}

    Returns a new list holding the series of liste_series that have an
    entry in d_pop, most popular first. Series missing from d_pop are left
    out of the result. Among series with the same popularity, the one that
    appears later in liste_series comes first.
    """
    # Keep the position next to the popularity: it makes the tie-break
    # explicit and keeps duplicated series as separate entries.
    connues = [
        (d_pop[serie], position, serie)
        for position, serie in enumerate(liste_series)
        if serie in d_pop
    ]
    # Descending on (popularity, position) gives the same order as the
    # former "stable ascending sort, then reverse" in a single pass.
    connues.sort(key=lambda triplet: (triplet[0], triplet[1]), reverse=True)
    return [serie for _, _, serie in connues]

In [12]:
# Recommendations with the popularity heuristic: when at least nb_reco
# series share the highest predicted rating, rank those by popularity
# instead of returning them in arbitrary order.
path_mat_corpus = "/Vrac/PLDAC_addic7ed/pickles_new/pickles/mat_voc_50000_mindf_20_max_df_0.4.p"
with open(path_mat_corpus, 'rb') as pickle_file:
    mat_corpus = pickle.load(pickle_file)

sim = similarities_from_sparse_mat(mat_corpus)
nb_reco = 10
username = 'shannen-l-c'
print('user = ', username)

uid = d_username_id[username]
deja_notees = d_user[username]

# Predicted rating of every series the user has not rated yet, capped at 10.
d_notes = dict()
for serie, iid in d_itemname_id.items():
    if serie in deja_notees:
        continue
    p = pred_content(
        uid, iid,
        d_name, d_user, d_ind,
        d_titre_filename, d_filename_titre,
        d_id_username, d_id_serie,
        sim,
    )
    d_notes[serie] = min(p, 10)

# Highest predicted rating first. Sort + reverse is kept on purpose so the
# order among equal ratings stays the same as before.
sorted_x = sorted(d_notes.items(), key=lambda kv: kv[1])
sorted_x.reverse()
sorted_dict = OrderedDict(sorted_x)

# No candidate at all (the user rated every series): there is no top rating
# and the recommendation list ends up empty instead of raising IndexError.
note_max = sorted_x[0][1] if sorted_x else None
liste_notes_max = [serie for serie, note in sorted_x if note == note_max]

if len(liste_notes_max) < nb_reco:
    # Few series at the top rating: keep the plain rating order.
    reco = list(sorted_dict)
else:
    # Many series tied at the top rating: rank them by popularity.
    reco = tri_par_pop(liste_notes_max, d_pop)

# The name says "top3" but the list holds up to nb_reco series; the name is
# kept because the ndcg cell below reads it.
top3_reco = reco[:nb_reco]

print("recommandations = ", top3_reco)

user =  shannen-l-c
recommandations =  ['dexter', 'house-of-cards', 'nova', 'a-place-to-call-home', 'victorious', 'the-sarah-silverman-program', 'texas-rising', 'count-arthur-strong', 'how-not-to-live-your-life', 'joan-of-arcadia']


In [13]:
# Score the popularity-aware recommendations with the same ndcg helper and
# the same user as the baseline, so both values can be compared.
ndcg(top3_reco, username, d_pert_user, d_user)

0.05265332855649792

Très grosse amélioration !

In [None]:
    # NOTE(review): unexecuted scratch cell. It repeats the body of
    # tri_par_pop with liste_notes_max in place of the liste_series
    # parameter. Because of its leading indentation it cannot run as a
    # standalone cell; call tri_par_pop(liste_notes_max, d_pop) instead.
    l_pop = dict()
    i = 0
    for serie in liste_notes_max :
        # Series without a popularity entry are skipped, but the index still
        # advances so that i keeps matching the position in liste_notes_max.
        if serie not in d_pop.keys() :
            i+= 1
            continue
        l_pop[i] = d_pop[serie]
        i += 1
    # Ascending sort followed by reverse: most popular first.
    sorted_x = sorted(l_pop.items(), key=lambda kv: kv[1])
    sorted_x.reverse()

    sorted_dict = OrderedDict(sorted_x)
    reco = [liste_notes_max[i] for i, pop in sorted_x]