# LOG6308 : TP2 - Approches contenu et techniques de graphes

- Clément Bernard (2096223)
- Ghaith Dekhili (1858454)

## Importations 

In [379]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt 
import os 
from scipy.spatial import distance
from sklearn.metrics.pairwise import cosine_similarity
import time
from sklearn.cluster import KMeans
from sklearn import preprocessing
from numpy.linalg import multi_dot

## Data

In [380]:
# Directory that contains the CSV data files read below
PATH_DATA = "data"

In [381]:
# Load the three pipe-separated tables, in order: items, users, votes
items, u, votes = (
    pd.read_csv(os.path.join(PATH_DATA, filename), sep='|')
    for filename in ('items.csv', 'u.csv', 'votes.csv')
)

# Q1

## Creation of sparse matrix : User-item matrix

In [382]:
# Matrix dimensions: one more slot than there are rows in each table, so that
# position 0 exists alongside the ids (presumably ids start at 1 - confirm)
N_USERS = len(u) + 1
N_ITEMS = len(items) + 1

In [383]:
def create_sparse_matrix(votes):
    ''' Build the user x item rating table from the votes table.

    votes must expose the columns 'user.id', 'item.id' and 'rating'.
    The result is a DataFrame with N_USERS rows and N_ITEMS columns
    (module-level constants); cells without a vote hold NaN.
    '''
    # One column per item position, pre-filled with NaN for every user position
    ratings_by_item = {item_id: [np.nan] * N_USERS for item_id in range(N_ITEMS)}
    # Each vote overwrites the NaN at (user position, item column)
    for _, vote in votes.iterrows():
        ratings_by_item[vote['item.id']][vote['user.id']] = vote['rating']
    return pd.DataFrame(ratings_by_item)

In [384]:
# Build the user x item rating table (NaN where a user has not rated an item)
user_item = create_sparse_matrix(votes)

In [385]:
# Display the raw matrix, which still includes position 0 on both axes
user_item

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
0,,,,,,,,,,,...,,,,,,,,,,
1,,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,...,,,,,,,,,,
2,,4.0,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,,5.0,...,,,,,,,,,,
940,,,,,2.0,,,4.0,5.0,3.0,...,,,,,,,,,,
941,,5.0,,,,,,4.0,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [386]:
# Drop the first row AND the first column (both slices start at position 1):
# these are the user 0 / item 0 slots, presumably unused because ids start at 1
user_item= user_item.iloc[1:,1:]

In [387]:
def convert_to_binary(user_item):
    """
    Return a like/dislike copy of the rating matrix.

    Ratings of 4 or more become 1, ratings below 4 become 0. Missing
    ratings (NaN) satisfy neither comparison and are left untouched.
    The input matrix is not modified.
    """
    # Both masks are taken from the untouched ratings
    disliked = user_item < 4
    liked = user_item >= 4
    binary = user_item.copy()
    binary[disliked] = 0
    binary[liked] = 1
    return binary

In [388]:
def count_global_likes(user_item):
    """
    Binarise the ratings and tally, per item, how often each binary value occurs.

    Returns a tuple (user_item_binary, items_global_likes):
    - user_item_binary: result of convert_to_binary(user_item)
    - items_global_likes: one row per item id in 1..N_ITEMS-1 and two
      columns labelled 0 and 1. Callers read column 0 as the number of
      dislikes and column 1 as the number of likes.
    """
    binary = convert_to_binary(user_item)
    item_ids = range(1, N_ITEMS)
    tally = pd.DataFrame(np.zeros((N_ITEMS - 1, 2)), index=item_ids)
    for item_id in item_ids:
        # value_counts is indexed by the binary value (0 / 1); the row
        # assignment appears to leave NaN when a value never occurs - callers
        # replace NaN by 0 before using the counts
        tally.loc[item_id, :] = binary[item_id].value_counts(normalize=False)
    return binary, tally

In [390]:
def labelize_age(u):
    """
    Label every user as 'old' (age > 25) or 'young' (age <= 25).

    u must expose the columns 'id ' and ' age '.
    Returns a DataFrame (not a Series) with the columns 'id ' (ids cast to
    int) and ' age ' (the labels), indexed 1..len(u). Missing ages stay NaN.
    """
    ages = u[' age ']
    # Build the labels in a fresh object array rather than writing strings
    # into a copy of the numeric column (an upcast pandas deprecates)
    labels = np.where(ages > 25, 'old', 'young').astype(object)
    # A missing age satisfies neither comparison in the original logic
    labels[ages.isna().values] = np.nan
    return pd.DataFrame(
        {'id ': [int(idx) for idx in u['id '].values], ' age ': labels},
        # Size the index from the argument itself, not from a module-level constant
        index=range(1, len(u) + 1),
    )

In [391]:
def return_dict_feature_ids(u,column, age = False):
    """
    Map each category of a user feature to the ids of the users in it.

    u      : users table; must expose the column 'id ' and `column`
    column : name of the feature column to group on
    age    : when True, the feature table comes from labelize_age(u)
             (labels 'old' / 'young' in column ' age '), so `column`
             is expected to be ' age '

    Returns a dict {category: numpy array of user ids}.
    """
    if age:
        df_feature = labelize_age(u)
    else:
        df_feature = u[['id ', column]]
    # Group on the bare column name: grouping on a one-element list makes
    # recent pandas yield 1-tuples as keys, which breaks lookups by label
    return {name: group['id '].values for name, group in df_feature.groupby(column)}

In [393]:
def return_likes_dislikes_per_feature(u, user_item_binary, feature_per_ids):
    """
    Count likes and dislikes per item for every category of a user feature.

    u                : unused; kept so existing callers keep working
    user_item_binary : user x item matrix of 1 (like), 0 (dislike) or NaN,
                       indexed by user id
    feature_per_ids  : dict {category: user ids belonging to it}

    Returns (likes, dislikes): two DataFrames with one row per category and
    one column per column of user_item_binary.
    """
    categories = list(feature_per_ids)
    # Take the item labels from the matrix itself instead of a module-level constant
    item_labels = user_item_binary.columns
    likes = pd.DataFrame(np.nan, index=categories, columns=item_labels)
    dislikes = pd.DataFrame(np.nan, index=categories, columns=item_labels)
    for category, ids in feature_per_ids.items():
        # Select the category's users once and reuse the slice
        group_votes = user_item_binary.loc[ids]
        # NaN cells compare False to both 1 and 0, so they are never counted
        likes.loc[category] = (group_votes == 1).sum()
        dislikes.loc[category] = (group_votes == 0).sum()

    return likes, dislikes

In [394]:
# Binarise the ratings and tally dislikes (column 0) / likes (column 1) per item
user_item_binary, items_global_likes = count_global_likes(user_item)
# Global odds of liking each item: likes divided by dislikes
odds_likes = items_global_likes.iloc[:, 1].div(items_global_likes.iloc[:, 0])
# Odds converted to a probability
proba_likes = odds_likes.div(odds_likes.add(1))

In [395]:
# Map every job label to the ids of the users holding it
jobs_per_ids = return_dict_feature_ids(u, ' job ')
# Per-job like / dislike counts for every item
item_likes_per_job, item_dislikes_per_job = return_likes_dislikes_per_feature(u, user_item_binary, jobs_per_ids)
# Smoothed ratios (group count + 1) / (global count + 2); the per-item global
# counts are broadcast across the job rows
users_items_likes = (item_likes_per_job + 1).div(
    (items_global_likes.iloc[:, 1].replace(np.nan, 0) + 2).values, axis='columns')
users_items_dislikes = (item_dislikes_per_job + 1).div(
    (items_global_likes.iloc[:, 0].replace(np.nan, 0) + 2).values, axis='columns')
# Odds factor contributed by the job, per item
odds_jobs = users_items_likes / users_items_dislikes

In [396]:
# Map every gender label to the ids of the users having it
gender_per_ids = return_dict_feature_ids(u, ' gender ')
# Per-gender like / dislike counts for every item
item_likes_per_gender, item_dislikes_per_gender = return_likes_dislikes_per_feature(u, user_item_binary, gender_per_ids)
# Smoothed ratios (group count + 1) / (global count + 2); the per-item global
# counts are broadcast across the gender rows
users_items_likes_gender = (item_likes_per_gender + 1).div(
    (items_global_likes.iloc[:, 1].replace(np.nan, 0) + 2).values, axis='columns')
users_items_dislikes_gender = (item_dislikes_per_gender + 1).div(
    (items_global_likes.iloc[:, 0].replace(np.nan, 0) + 2).values, axis='columns')
# Odds factor contributed by the gender, per item
odds_gender = users_items_likes_gender / users_items_dislikes_gender

In [397]:
# Map every age label ('old' / 'young') to the ids of the users having it
age_per_ids = return_dict_feature_ids(u, ' age ', age = True)
# Per-age-group like / dislike counts for every item
item_likes_per_age, item_dislikes_per_age = return_likes_dislikes_per_feature(u, user_item_binary, age_per_ids)
# Smoothed ratios (group count + 1) / (global count + 2); the per-item global
# counts are broadcast across the age-group rows
users_items_likes_age = (item_likes_per_age + 1).div(
    (items_global_likes.iloc[:, 1].replace(np.nan, 0) + 2).values, axis='columns')
users_items_dislikes_age = (item_dislikes_per_age + 1).div(
    (items_global_likes.iloc[:, 0].replace(np.nan, 0) + 2).values, axis='columns')
# Odds factor contributed by the age group, per item
odds_age = users_items_likes_age / users_items_dislikes_age

## Recommandation de 10 films sur la base de trois catégories

In [398]:
def return_recommendations(job, gender, age, odds_likes, odds_jobs, odds_gender, odds_age, top_n=10):
    """
    Return the items most likely to be liked by a user profile.

    job, gender, age : row labels selecting the profile in odds_jobs,
                       odds_gender and odds_age respectively
    odds_likes       : Series of global like odds, indexed by item
    odds_jobs, odds_gender, odds_age : DataFrames of odds factors with one
                       row per category and one column per item
    top_n            : number of items to return (default 10)

    Returns a Series of like probabilities indexed by item, sorted in
    decreasing order and truncated to the first top_n entries.
    """
    # Combined odds: global odds multiplied by one factor per profile attribute
    odds = odds_likes * odds_jobs.T[job] * odds_gender.T[gender] * odds_age.T[age]
    probabilities = odds / (1 + odds)
    # head() is positional, which is what is wanted on an item-id index
    return probabilities.sort_values(ascending=False).head(top_n)

In [399]:
# Example: the ten items with the highest like probability for a young female engineer
return_recommendations('engineer', 'F', 'young', odds_likes = odds_likes, odds_jobs = odds_jobs, odds_gender = odds_gender, odds_age = odds_age)

483    0.967840
480    0.950883
115    0.928483
83     0.927111
107    0.927056
478    0.913225
114    0.911541
302    0.911324
918    0.908075
487    0.904690
dtype: float64

## Calcul des votes et du RMSE moyen 

In [400]:
def calculate_votes(user_item = user_item, new_u = None, odds_jobs = odds_jobs, odds_gender = odds_gender, odds_age = odds_age):
    """
    Predict a vote for every (user, item) pair from the user's profile.

    user_item   : user x item rating matrix (NaN where there is no vote)
    new_u       : ignored; the profile table is always rebuilt from the
                  module-level users table `u`. Kept for backward compatibility.
    odds_jobs, odds_gender, odds_age : DataFrames of odds factors with one
                  row per category and one column per item

    Returns a DataFrame indexed by user id 1..N_USERS-1 with columns
    1..N_ITEMS-1. Each prediction is
    p * mean(liked ratings) + (1 - p) * mean(disliked ratings) for the item,
    where p comes from the product of the three odds factors.
    """
    # Output table, one row per user and one column per item
    all_votes_predictions = pd.DataFrame(np.zeros((user_item.shape[0],user_item.shape[1])), index = range(1,N_USERS) ,columns = range(1,N_ITEMS))
    # Profile table: labelled age, gender and job of every user
    s = pd.DataFrame(u[' gender '].values, index = range(1,N_USERS), columns = [' gender '])
    j = pd.DataFrame(u[' job '].values, index = range(1,N_USERS), columns = [' job '])
    a_c = labelize_age(u)
    profiles = pd.concat([a_c , s , j], axis = 1)
    # Per-item average of the liked (>= 4) and disliked (< 4) ratings
    likes_avgs = user_item[user_item >= 4].mean()
    dislikes_avgs = user_item[user_item < 4].mean()
    # The prediction depends only on the (job, gender, age) profile, so it is
    # computed once per distinct profile and reused for every matching user
    votes_by_profile = {}
    for index, row in profiles.iterrows():
        profile = (row[' job '], row[' gender '], row[' age '])
        if profile not in votes_by_profile:
            # NOTE(review): the global odds_likes term used by
            # return_recommendations is not part of this product - confirm intended
            all_odds = (odds_jobs.T[profile[0]])*(odds_gender.T[profile[1]])*(odds_age.T[profile[2]])
            # Odds converted to a probability of liking
            all_probs = all_odds/(1+all_odds)
            # Expected vote: probability-weighted mix of the two averages
            votes_by_profile[profile] = (all_probs*likes_avgs + (1-all_probs)*dislikes_avgs)
        all_votes_predictions.loc[index,:] = votes_by_profile[profile]

    return all_votes_predictions

In [407]:
def compute_rmse(prediction , user_item = user_item ) :
    ''' Return the per-column RMSE between the prediction and the user-item matrix.

    Only cells where user_item holds a vote contribute. The result is a
    Series with one RMSE per column of the matrices, not a single number.
    '''
    # Keep only the cells that hold an actual vote
    observed = user_item.notnull()
    residuals = prediction.where(observed) - user_item.where(observed)
    # mean() averages down each column and skips the masked (NaN) cells
    return np.sqrt((residuals ** 2).mean())

In [405]:
# calculate_votes rebuilds the profile table itself and ignores its new_u
# argument, so that argument is not passed here
all_votes_predictions = calculate_votes(user_item, odds_jobs = odds_jobs, odds_gender = odds_gender, odds_age = odds_age)

In [406]:
# compute_rmse returns one RMSE per item column; average them into a single score
compute_rmse(all_votes_predictions, user_item).mean()

1.021163785917258