
# **Projet Factorisation Matricielle** 

> Implémentation de Baseline Estimates et SVD++




## Lecture des fichiers à partir du Google Drive                     (Authentification puis Téléchargement des données)

In [None]:
!pip install PyDrive

In [None]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

In [None]:
# Authenticate the Colab user, then hand the application-default credentials
# to PyDrive so that `drive` can be used to fetch files below.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [None]:
# (Google Drive file id, local file name) for every dataset used by the notebook.
_DRIVE_FILES = (
    ("1ZNkUgjXSTmJk773T876mRRxBb8NJz4n8", 'ratings.csv'),               # ratings file
    ("1AVLQIgjfwphguD2Zk4CYg6mZx1jQUJWP", 'favorites.csv'),             # favorites file
    ("1nhHkhSwS4-an-CRrnXbPY4TzLDQcUNA1", 'bookmarks.csv'),             # bookmarks file
    ("1SVbczf812-0krGTuaIFyw68MvXbAMgUt", 'bookmarks_idx_train.npy'),   # row labels of the training split
    ("1ystO6DcVsiovAWr_JmgsulYEDKIj_-WG", 'bookmarks_idx_test.npy'),    # row labels of the test split
)

# Download each file into the current working directory under its local name.
for _file_id, _local_name in _DRIVE_FILES:
    downloaded = drive.CreateFile({'id': _file_id})
    downloaded.GetContentFile(_local_name)





## Implémentation



Dans tout ce qui suit, nous avons travaillé seulement avec les deux bibliothèques **Numpy** et **Pandas**

In [None]:
# NOTE(review): `df` is not assigned anywhere in the visible source before this
# cell, so evaluating it raises NameError — presumably a leftover; confirm and remove.
df 



---


**Data_Description() :** La fonction responsable de la lecture des données, la génération de **Interest Dataset** et la génération du **Training** & **Test** Datasets. 


---
Pour la génération du Interest Dataset, la formule utilisée est la suivante :   
$$
  r_{ui} = w_{ui} + n_{ui} + f_{ui}
$$



In [None]:
def Data_Description():
    """Build the interest dataset and split it into training and test sets.

    Reads ``ratings.csv``, ``favorites.csv``, ``bookmarks.csv`` and the two
    ``bookmarks_idx_*.npy`` files from the current working directory.

    Every bookmark row receives an interest of

        1  (the bookmark itself)
        + 5 if its (id_profile, id_asset) pair appears in the favorites file
        + the ``score`` of that pair if it appears in the ratings file.

    When the ratings file lists the same pair several times, the first score
    is used.  Repeated bookmark rows each receive the full interest of their
    pair.

    Returns:
        tuple: ``(Trainset, Testset)``, two DataFrames with the columns
        ``id_profile``, ``id_asset`` and ``interest`` (float), obtained by
        selecting the row labels stored in ``bookmarks_idx_train.npy`` and
        ``bookmarks_idx_test.npy``.  Row labels are the row numbers of
        ``bookmarks.csv``.

    Raises:
        KeyError: if a stored row label does not exist in the bookmarks data.
    """
    keys = ['id_profile', 'id_asset']

    ratings = pd.read_csv("ratings.csv")
    favorites = pd.read_csv("favorites.csv")
    indexes_test = np.load('bookmarks_idx_test.npy')
    indexes_train = np.load('bookmarks_idx_train.npy')

    # One score per pair, so the lookup below is unambiguous even when the
    # bookmarks contain the same pair more than once.
    scores = ratings.drop_duplicates(subset=keys).set_index(keys)['score']
    favorite_pairs = pd.MultiIndex.from_frame(favorites[keys])

    chunks = []
    # The bookmarks file is streamed chunk by chunk; chunk row labels continue
    # from one chunk to the next, so they stay valid keys for the .npy indexes.
    for bookmarks in pd.read_csv("bookmarks.csv", chunksize=1000000):
        pairs = pd.MultiIndex.from_frame(bookmarks[keys])

        interest = np.ones(len(bookmarks), dtype=float)
        interest += 5.0 * pairs.isin(favorite_pairs)
        # Pairs without a rating come back as NaN and contribute nothing.
        interest += np.asarray(scores.reindex(pairs).fillna(0).values, dtype=float)

        chunks.append(bookmarks[keys].assign(interest=interest))

    if chunks:
        # Single concatenation instead of growing a frame inside the loop.
        Data = pd.concat(chunks)
    else:
        Data = pd.DataFrame({'id_profile': [], 'id_asset': [], 'interest': []})

    Trainset = Data.loc[indexes_train]
    Testset = Data.loc[indexes_test]

    return Trainset, Testset

**Create_bui() :** La fonction responsable de la création des matrices 
$$ B_{ui} $$    et  $$ R_{ui} $$ 


In [None]:
import decimal

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

def create_bui(Trainset,Bu,Bi,mu,UsersDic,MoviesDic):
    """Build the dense rating and baseline matrices of one mini-batch.

    Rows and columns of the returned matrices are *local* to the batch: users
    and items are numbered in order of first appearance in ``Trainset``.

    Args:
        Trainset: DataFrame with the columns ``id_profile``, ``id_asset`` and
            ``interest``.  It is only read, never modified.
        Bu: array of user biases, indexed by the global user position.
        Bi: array of item biases, indexed by the global item position.
        mu: global mean interest.
        UsersDic: mapping ``id_profile -> position in Bu``.
        MoviesDic: mapping ``id_asset -> position in Bi``.

    Returns:
        tuple: ``(Rui, Bui, buIndex, biIndex)`` where ``Rui`` holds the
        observed interests, ``Bui`` the baseline ``mu + Bu[u] + Bi[i]`` for the
        observed cells (0 elsewhere), and ``buIndex`` / ``biIndex`` map each
        local row / column back to its position in ``Bu`` / ``Bi``.
    """
    local_users = Trainset['id_profile'].unique()
    local_items = Trainset['id_asset'].unique()

    # Local (batch) positions, in order of first appearance.
    j_index = {user: row for row, user in enumerate(local_users.astype(int))}
    k_index = {item: col for col, item in enumerate(local_items)}

    shape = (len(local_users), len(local_items))
    Rui = np.zeros(shape)
    Bui = np.zeros(shape)
    buIndex = np.zeros(shape[0], dtype=int)
    biIndex = np.zeros(shape[1], dtype=int)

    # Iterate over plain column arrays: this avoids re-indexing the caller's
    # frame in place and is much cheaper than DataFrame.iterrows().
    rows = zip(Trainset['id_profile'].values,
               Trainset['id_asset'].values,
               Trainset['interest'].values)
    for u, i, interest in rows:
        U = UsersDic[u]
        I = MoviesDic[i]
        j = j_index[u]
        k = k_index[i]
        Rui[j, k] = interest
        Bui[j, k] = mu + Bu[U] + Bi[I]
        buIndex[j] = U
        biIndex[k] = I

    return Rui, Bui, buIndex, biIndex


**gradient_J() :**  La fonction qui calcule les dérivées par rapport à Bu et Bi. 

In [None]:
def gradient_J(Rui, Bui, Bu, Bi, BuI, BiI, reg=0.002, batch_size=512):
    """Return the gradient of the batch cost with respect to the biases.

    Args:
        Rui: observed interests of the batch (users x items).
        Bui: baseline predictions of the batch, same shape as ``Rui``.
        Bu: array of all user biases.
        Bi: array of all item biases.
        BuI: positions in ``Bu`` of the batch rows.
        BiI: positions in ``Bi`` of the batch columns.
        reg: regularisation weight (default 0.002, the value previously
            hard-coded).
        batch_size: divisor applied to the regularisation term (default 512,
            the value previously hard-coded).

    Returns:
        tuple: ``(bJu, bJi)``, one gradient entry per batch row and per batch
        column respectively.
    """
    residual = Bui - Rui
    scale = (2 / batch_size) * reg

    # Vectorised row/column sums replace the per-row Python calls made by
    # np.apply_along_axis(np.sum, ...).
    bJu = 2 * residual.sum(axis=1) + scale * Bu[BuI]
    bJi = 2 * residual.sum(axis=0) + scale * Bi[BiI]

    return bJu, bJi

**mse() :** calcule la MSE (erreur quadratique moyenne) sur le Testset, qui a pour but d'évaluer la performance des résultats obtenus.


In [None]:
def mse(Testset, Trainset, Bu, Bi, mu):
    """Mean squared error of the baseline predictor on the test set.

    The prediction for a test row is ``mu + Bu[user] + Bi[item]``.  Biases are
    matched to ids through the order of ``unique()`` on the training columns.
    Test rows whose user or item does not occur in ``Trainset`` are dropped by
    the inner merges and therefore do not contribute to the error.

    Returns the mean of the squared differences (no square root is taken).
    """
    item_bias = pd.DataFrame({'Bi': Bi, 'id_asset': Trainset.id_asset.unique()})
    user_bias = pd.DataFrame({'Bu': Bu, 'id_profile': Trainset.id_profile.unique()})

    # Attach the item bias first, then the user bias, to each test row.
    scored = Testset.merge(item_bias, on='id_asset').merge(user_bias, on='id_profile')
    scored['Rui'] = scored['Bu'] + scored['Bi'] + mu

    predicted = scored.Rui.values
    observed = scored.interest.values
    squared_error = np.square(observed - predicted)
    return (1 / len(predicted)) * np.sum(squared_error)

**L'algorithme de la descente du gradient stochastique**  est présenté dans la fonction suivante, qui prend en paramètre le Trainset et le Testset, et utilise la technique des mini-batchs. 

In [None]:
def gradient_descent(Trainset, Testset, max_iter=10, learning_rate=0.001):
    """Fit the user and item biases by mini-batch gradient descent.

    The training rows are processed in consecutive batches of 512 rows (the
    last batch may be shorter).  At the start of every epoch the test MSE of
    the current biases is printed.

    Args:
        Trainset: DataFrame with the columns ``id_profile``, ``id_asset`` and
            ``interest``.  It is not modified.
        Testset: DataFrame with the same columns, used only for the printed MSE.
        max_iter: number of passes over the training set (default 10).
        learning_rate: step size of the bias updates (default 0.001).

    Returns:
        tuple: ``(Bu, Bi)``; ``Bu[k]`` is the bias of the k-th distinct
        ``id_profile`` of ``Trainset`` (order of first appearance) and
        ``Bi[k]`` the bias of the k-th distinct ``id_asset``.
    """
    # gradient_J divides its regularisation term by 512, so the batch length
    # is kept in step with that value.
    batch_size = 512

    # Work on a positionally indexed copy so the caller's frame is untouched.
    data = Trainset.reset_index(drop=True)

    user_ids = list(data.id_profile.unique().astype(int))
    UsersDic = dict(zip(user_ids, range(len(user_ids))))

    movie_ids = list(data.id_asset.unique())
    MoviesDic = dict(zip(movie_ids, range(len(movie_ids))))

    Bu = np.zeros(len(user_ids))
    Bi = np.zeros(len(movie_ids))

    average = np.average(data.interest)

    for _ in range(max_iter):
        print('Mse = {}'.format(mse(Testset, data, Bu, Bi, average)))

        for begin in range(0, len(data), batch_size):
            # The final, possibly shorter, batch goes through the same path and
            # uses the same global mean as every other batch.
            batch = data.iloc[begin:begin + batch_size].copy()
            Rui, Bui, BuI, BiI = create_bui(batch, Bu, Bi, average, UsersDic, MoviesDic)
            Bju, Bji = gradient_J(Rui, Bui, Bu, Bi, BuI, BiI)
            Bu[BuI] -= learning_rate * Bju
            Bi[BiI] -= learning_rate * Bji

    return Bu, Bi
      
      


On extrait le Trainset et le Testset à l'aide de la fonction Data_Description(). Toutes les valeurs dupliquées en fonction de l'index ("id_profile","id_asset") seront également supprimées.


In [None]:
# Build both splits, then keep only the first row for every repeated index label.
# NOTE(review): the de-duplication is on the frames' index as returned by
# Data_Description(), not on the (id_profile, id_asset) columns — confirm this is intended.
Trainset, Testset = Data_Description()
Trainset = Trainset.loc[~Trainset.index.duplicated(keep='first')]
Testset = Testset.loc[~Testset.index.duplicated(keep='first')]

On exécute l'algorithme de la descente du gradient stochastique et on obtient comme résultat Bu et Bi. La performance du modèle est mesurée avec la fonction mse(), qui est déjà appelée dans la fonction gradient_descent() : la MSE sur le Testset est affichée à chaque itération (un epoch).

In [None]:
# Fit the user and item biases; gradient_descent prints the test MSE at the start of each epoch.
Bu,Bi = gradient_descent(Trainset,Testset)

### **SVD++**

Dans cette partie, nous avons implémenté la méthode SVD++ de Koren 

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Number of latent factors: first dimension of P and Q and second dimension of Y in svdpp().
numFactors =  20

Dans cette fonction on crée deux matrices de training et de test 

In [None]:
def train_test_split(n_users,n_items):
    """Load the interest data and turn it into dense train / test matrices.

    Ids are used as 1-based positions: a row with ``id_profile = p`` and
    ``id_asset = a`` is stored at ``[p - 1, a - 1]``.

    Args:
        n_users: number of rows of the returned matrices.
        n_items: number of columns of the returned matrices.

    Returns:
        tuple: ``(R, T, average_train, average_test, item_by_users,
        users_by_item, rate_by_users)`` where ``R`` / ``T`` are the train /
        test interest matrices, the averages are the mean interest of each
        split, and the three dictionaries (built from the training split only,
        keyed by 0-based position) list, per user, the items and the interests,
        and, per item, the users.

    Raises:
        IndexError: if an id exceeds ``n_users`` / ``n_items``.
        ZeroDivisionError: if one of the splits is empty.
    """
    Trainset, Testset = Data_Description()
    train_data = Trainset[~Trainset.index.duplicated(keep='first')]
    test_data = Testset[~Testset.index.duplicated(keep='first')]

    R = np.zeros((n_users, n_items))
    T = np.zeros((n_users, n_items))
    item_by_users = {}
    users_by_item = {}
    rate_by_users = {}

    sum_train = 0
    train_rows = zip(train_data['id_profile'], train_data['id_asset'], train_data['interest'])
    for profile, asset, interest in train_rows:
        # Ids may have been read back as floats; NumPy only accepts integer indices.
        u = int(profile) - 1
        i = int(asset) - 1
        R[u, i] = interest
        sum_train += interest
        item_by_users.setdefault(u, []).append(i)
        users_by_item.setdefault(i, []).append(u)
        rate_by_users.setdefault(u, []).append(interest)
    average_train = float(sum_train / len(train_data))

    sum_test = 0
    test_rows = zip(test_data['id_profile'], test_data['id_asset'], test_data['interest'])
    for profile, asset, interest in test_rows:
        T[int(profile) - 1, int(asset) - 1] = interest
        sum_test += interest
    average_test = float(sum_test / len(test_data))

    return R, T, average_train, average_test, item_by_users, users_by_item, rate_by_users

# Indicator matrices for the training and test data
def index_matrix(R,T):
    """Return copies of ``R`` and ``T`` in which every positive entry is 1.

    Zero entries stay 0; any other entry is left as it is.  The inputs are not
    modified.
    """
    indicators = []
    for matrix in (R, T):
        flags = matrix.copy()
        flags[flags > 0] = 1
        flags[flags == 0] = 0
        indicators.append(flags)
    return indicators[0], indicators[1]

def prediction(P,Q):
    """Return the dot product of the transpose of ``P`` with ``Q``."""
    transposed = P.T
    return transposed.dot(Q)



1.   **rmse() :** calcule la RMSE (racine de l'erreur quadratique moyenne), qui a pour but d'évaluer la performance des résultats obtenus.
2.   **plotRMSE() :** affiche l'erreur en fonction du nombre d'epochs





In [None]:
def rmse(I, R, item_by_users,average, Q, P, Y, B_U, B_I):
    """Root mean squared error of the SVD++ model on the non-zero cells of ``R``.

    The prediction for a cell ``(u, i)`` is

        average + B_U[u] + B_I[i] + (P[:, u] + |N(u)|^-1/2 * sum_j Y[j]) . Q[:, i]

    where ``N(u) = item_by_users[u]``.  Users absent from ``item_by_users``
    get no implicit-feedback term.

    Args:
        I: indicator matrix of ``R``; accepted for interface compatibility and
            not used.
        R: interest matrix (users x items); only its non-zero cells are scored.
        item_by_users: mapping ``user position -> list of item positions``.
        average: global mean interest.
        Q: item factors, shape (factors, items).
        P: user factors, shape (factors, users).
        Y: implicit item factors, shape (items, factors).
        B_U: user biases.
        B_I: item biases.

    Returns:
        float: the RMSE, or ``nan`` when ``R`` has no positive cell.
    """
    users, items = R.nonzero()
    n_factors = Y.shape[1]

    # The implicit-feedback part depends on the user only, so it is computed
    # once per user instead of once per rating.
    implicit_by_user = {}
    squared_error = 0.0
    for u, i in zip(users, items):
        if R[u, i] > 5 or R[u, i] < 0:
            # str() is required: concatenating the float directly raises TypeError.
            print("R[" + str(u) + "," + str(i) + "]=" + str(R[u, i]))

        if u not in implicit_by_user:
            rated = item_by_users.get(u, [])
            if len(rated) > 0:
                implicit_by_user[u] = Y[rated, :].sum(axis=0) / np.sqrt(len(rated))
            else:
                implicit_by_user[u] = np.zeros(n_factors)

        user_vector = implicit_by_user[u] + P[:, u]
        estimate = average + B_U[u] + B_I[i] + np.dot(user_vector, Q[:, i])
        squared_error += (R[u, i] - estimate) ** 2

    n_ratings = len(R[R > 0])
    if n_ratings == 0:
        return float('nan')
    return np.sqrt(squared_error / n_ratings)

def plotRMSE(n_epochs,train_errors,test_errors):
    """Plot the training and test error curves against the epoch number."""
    curves = (
        (train_errors, 'o', 'Training Data'),
        (test_errors, 'v', 'Test_data'),
    )
    for errors, marker, label in curves:
        plt.plot(range(n_epochs), errors, marker=marker, label=label)

    plt.xlabel('Number of Epochs')
    plt.ylabel('RMSE')
    plt.legend()
    plt.grid()
    plt.show()

**svdpp() :** est la fonction SVD++ 

In [None]:
def svdpp():
    """Train an SVD++ model by stochastic gradient descent and plot its RMSE.

    For every observed training cell ``(u, i)`` the prediction is

        mu + b_u + b_i + q_i . (p_u + |N(u)|^-1/2 * sum_{j in N(u)} y_j)

    and, with ``e`` the prediction error, the parameters are updated as

        b_u += gama1 * (e - lambda6 * b_u)
        b_i += gama1 * (e - lambda6 * b_i)
        p_u += gama2 * (e * q_i - lambda7 * p_u)
        q_i += gama2 * (e * (p_u + |N(u)|^-1/2 * sum y_j) - lambda7 * q_i)
        y_j += gama2 * (e * |N(u)|^-1/2 * q_i - lambda7 * y_j)   for j in N(u)

    All updates of one step use the parameter values from before the step.
    ``N(u)`` is the list of training items of user ``u``.

    The train and test RMSE are printed after every epoch and plotted at the
    end.  Nothing is returned.
    """
    ratings = pd.read_csv("ratings.csv")
    # NOTE(review): the matrix size is the number of distinct ids in
    # ratings.csv while cells are addressed by ``id - 1``; this presumably
    # assumes contiguous 1-based ids shared with bookmarks.csv — confirm.
    n_users = ratings['id_profile'].unique().shape[0]
    n_items = ratings['id_asset'].unique().shape[0]

    R, T, average_train, average_test, item_by_users, users_by_item, rate_by_users = train_test_split(n_users, n_items)
    I, I2 = index_matrix(R, T)

    gama1 = 0.01
    gama2 = 0.01
    lambda6 = 0.05
    lambda7 = 0.1
    num_epochs = 20

    m, n = R.shape
    users, items = R.nonzero()
    train_errors = []
    test_errors = []

    P = np.random.rand(numFactors, m)
    Q = np.random.rand(numFactors, n)
    Y = np.random.rand(n, numFactors)
    B_U = np.random.rand(m)
    B_I = np.random.rand(n)

    # Implicit-feedback items and their normalisation, prepared once per user
    # that actually has training data.
    implicit_items = {}
    implicit_norm = {}
    for u in np.unique(users):
        rated = np.asarray(item_by_users[u], dtype=int)
        implicit_items[u] = rated
        implicit_norm[u] = 1 / np.sqrt(len(rated))

    for epoch in range(num_epochs):

        print("epoch=" + str(epoch))
        for u, i in zip(users, items):
            rated = implicit_items[u]
            norm = implicit_norm[u]

            # Recomputed from the current Y at every step: Y changes during
            # training, so a cached sum would go stale.
            user_vector = P[:, u] + norm * Y[rated, :].sum(axis=0)
            q_i = Q[:, i].copy()

            error = R[u, i] - (average_train + B_U[u] + B_I[i] + prediction(user_vector, q_i))

            P[:, u] += gama2 * (error * q_i - lambda7 * P[:, u])
            Q[:, i] += gama2 * (error * user_vector - lambda7 * q_i)
            # The implicit factors move along q_i, the factors of the rated item.
            Y[rated, :] += gama2 * (error * norm * q_i - lambda7 * Y[rated, :])

            B_U[u] += gama1 * (error - lambda6 * B_U[u])
            B_I[i] += gama1 * (error - lambda6 * B_I[i])

        train_rmse = rmse(I, R, item_by_users, average_train, Q, P, Y, B_U, B_I)
        print("train_rmse=" + str(train_rmse))
        test_rmse = rmse(I2, T, item_by_users, average_train, Q, P, Y, B_U, B_I)
        print("test_rmse=" + str(test_rmse))
        train_errors.append(train_rmse)
        test_errors.append(test_rmse)

    print("train_errors=" + str(train_errors))
    print("test_errors=" + str(test_errors))
    plotRMSE(num_epochs, train_errors, test_errors)



# Run the SVD++ training when this file is executed as a script.
if __name__ == '__main__':
    
    svdpp()
