In [1]:
import pandas as pd
import numpy as np
from numpy.random import seed
seed(1)

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from sklearn.neighbors import NearestNeighbors
from sklearn.model_selection import train_test_split

from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

import time
import multiprocessing as mp
import time
from itertools import product

In [4]:
!pip freeze | grep multiprocessing

'grep' n'est pas reconnu en tant que commande interne
ou externe, un programme ex‚cutable ou un fichier de commandes.


### 1. user-based recommendation algorithm

Quelques précision sur notre implémentation:

* Le code est organisé en classe avec des fonctions pour effectuer des transformations et des opération: fit, predict, evaluate
* Les paramètres de la classe sont les suivants:
    * ratings: la matrice de rating (userId, movieId, rating) - input data 
    * movies: movies names - input data 
    * tags: Users tags - input data 
    * similarity='cosine'  or 'distances'
    * Liste des nombres de voisins a tester Ex: [5, 10, 50]. Le paramètre avec le meilleur résultat sera sélectionné automatiquement
    * Size in percent of test data Ex 0.1 for 10%
    * score: Score to use for evaluating model on test data.  "precision", "recall", "f1_score"
    * top_n: Nombre de vidéos à recommander
* Les prédictions sont effectuées en utilisant la formule suivante:

    <img src="images/Capture.png">
    
    avec:
         u: l'utilisateur à qui nous devons recommander des vidéos
         i: la vidéo à recommander
         s(u,i): le score de recommandation calculé
         W_uv: la similarité entre l'utilisateur u et v
         r_u: la moyenne des notes de l'utilisateur u
         r_vi: la note de l'uitlisateur v sur la vidéo i
         


In [2]:
ratings = pd.read_csv('data/ml-latest-small/ratings.csv')
movies = pd.read_csv('data/ml-latest-small/movies.csv')
tags = pd.read_csv('data/ml-latest-small/tags.csv')

In [3]:
class User_cf():
    
    def __init__(self, ratings, movies, tags, 
                 similarity='cosine', # 'cosine' or 'distances'
                 n_neighbors = [1],
                 test_size = 0.1,
                 score = "precision",
                 top_n = 5
                ):
        
        self.ratings = ratings
        self.movies = movies
        self.tags = tags
        print("ratings.shape: %s, movies.shape: %s, tags.shape: %s"%(ratings.shape, movies.shape, tags.shape))
        
        self.similarity = similarity
        self.n_neighbors = n_neighbors
        self.test_size = test_size
        self.score = score
        self.top_n = top_n
        
        self.ratings_matrix = None
        self.similarity_matrix = None
        self.similarity_matrix = None
        self.best_k = None
        self.best_perf_test = None
        self.perf_result = []
        self.final_nearestNeighbors= None
        
        self.fit()
        
    def fit(self):

        # Pivot ratings
        self.ratings_matrix = pd.pivot_table(self.ratings, values='rating', index='userId', columns='movieId', aggfunc=np.mean)
       
        self.liked = (self.ratings_matrix.apply(lambda row: row>(row.mean()), axis=1)).astype(int)
#         print(self.liked.head())
        
        self.ratings_matrix = self.process_ratings_matrix(self.ratings_matrix)
        
        self.X_train, self.X_test = train_test_split(self.ratings_matrix, test_size=self.test_size, shuffle=False)
        print("X_train.shape: %s, X_test.shape: %s X_test_index_min %s"%(self.X_train.shape, self.X_test.shape, self.X_test.index.min()))
    
        self.liked_train, self.liked_test = train_test_split(self.liked, test_size=self.test_size, shuffle=False)
        
        
        ##### compute similarity
        
        if self.similarity == 'cosine':
            self.X_train_similarity = cosine_similarity(self.X_train)
            np.fill_diagonal(self.X_train_similarity, 0)
            self.X_test_similarity = cosine_similarity(self.X_train, self.X_test)
        else:
            self.X_train_similarity =  pairwise_distances(self.X_train)
            np.fill_diagonal(self.X_train_similarity, 0)
            self.X_test_similarity =  pairwise_distances(self.X_train, self.X_test)
            
        print("X_train_similarity.shape: %s, X_test_similarity.shape: %s"%(self.X_train_similarity.shape, 
                                                                           self.X_test_similarity.shape)
             )
        
        
        
        for k in self.n_neighbors:
            
            print('----->    k = %s'%k)
            
            # fit nearestNeighbors
            nearestNeighbors = NearestNeighbors(k)
            nearestNeighbors.fit(self.X_train)
            
            # compute Neighbors
            
            X_train_distances, X_train_indices = nearestNeighbors.kneighbors(self.X_train)
            X_test_distances, X_test_indices = nearestNeighbors.kneighbors(self.X_test)
            
            print("k: %s, X_train_indices.shape: %s, X_test_indices.shape: %s"%(k, 
                                                                                X_train_indices.shape, 
                                                                                X_test_indices.shape)
                 )
            
            # filter similarity only on Neighbors
            
            X_train_similarity_by_indice = self.get_similarity_of_neighbors(self.X_train_similarity, 
                                                                        X_train_indices
                                                                       )
            X_test_similarity_by_indice = self.get_similarity_of_neighbors(self.X_test_similarity, 
                                                                           X_test_indices
                                                                          )

    #         X_train_similarity_by_indice = self.X_train_similarity
    #         X_test_similarity_by_indice = self.X_test_similarity
    
    
            print("X_train_similarity_by_indice.shape: %s, X_test_similarity_by_indice.shape: %s"%
                  (X_train_similarity_by_indice.shape, X_test_similarity_by_indice.shape))
            
            
            # predict
    
            self.predict_train = self.predict(self.X_train.values.T, X_train_similarity_by_indice)
        
            self.predict_test = self.predict(self.X_train.values.T, X_test_similarity_by_indice)
            
            print("predict_train.shape: %s, predict_test.shape: %s"%
                  (self.predict_train.shape, self.predict_test.shape))
            
            # compute perf (R2, rmse) on train & test
            
            perf_train, perf_test = self.evaluate(self.predict_train, self.predict_test)
            
            if self.best_k == None:
                self.best_k = k
                self.best_perf_test = perf_test
            else:
                if perf_test > self.best_perf_test:
                    self.best_k = k
                    self.best_perf_test = perf_test
            
            self.perf_result.append((k, perf_train, perf_test))
            
            print("%s train: %s, %s test: %s"%(self.score, perf_train, self.score, perf_test))
            
            print("=====================================================================")
            
        print("best_k %s perf_test %s"%(self.best_k, self.best_perf_test))
        
        self.final_nearestNeighbors = self.fit_NearestNeighbors(self.best_k, self.ratings_matrix)
        
        self.perf_result = pd.DataFrame(self.perf_result, columns=['k', self.score+"_train", self.score+"_test"])
          
    def process_ratings_matrix(self, ratings_matrix):
        
        ##### compute ratings matrix
       
        ratings_matrix = ratings_matrix.apply(lambda row: row.fillna(row.mean()), axis=1)
        ratings_matrix = ratings_matrix.apply(lambda row: row-row.mean(), axis=1)
        
        # Drop Users & Movies with No ratings
#         ratings_matrix = ratings_matrix.dropna(how='all')
#         ratings_matrix = ratings_matrix.dropna(how='all', axis=1)
        
        return ratings_matrix


    def fit_NearestNeighbors(self, n_neighbors, data):
        
        nearestNeighbors = NearestNeighbors(n_neighbors=n_neighbors)
        nearestNeighbors.fit(data)
        
        return nearestNeighbors

    def get_similarity_of_neighbors(self, similarity, indices_neighbors):
        
        n = similarity.shape[0]
        m = similarity.shape[1]
            
        for i in range(m):
            row = np.zeros(n)
            row[indices_neighbors[i]] = 1
            row = row.reshape(len(row), 1)
            if i==0:
                X_indices_similarity = row
            else:
                X_indices_similarity = np.concatenate((X_indices_similarity, row), axis=1)
        
        return np.multiply(X_indices_similarity, similarity)
    
    def predict(self, ratings_data_processed, similarity_by_indice):
        
        predict = (np.dot(ratings_data_processed, similarity_by_indice))
        
        predict = predict / (0.00000001 + similarity_by_indice.sum(axis=0))
        
        return predict.T
    
    def evaluate(self, predict_train, predict_test):
        
        def get_n_predict(row, top_n=5):
    
            new_row = pd.Series(np.zeros(len(row)), index = row.index)

            arg_sort = np.argsort(row.values)[::-1]

            for i in range(top_n):
                new_row[row.index[arg_sort[i]]] = 1

            return new_row
        
        self.predict_train = pd.DataFrame(self.predict_train, index=self.liked_train.index, columns=self.liked_train.columns)
        self.predict_test = pd.DataFrame(self.predict_test, index=self.liked_test.index, columns=self.liked_test.columns)
        
        predict_train = self.predict_train.apply(lambda row: get_n_predict(row, top_n=self.top_n), axis=1)
        predict_test = self.predict_test .apply(lambda row: get_n_predict(row, top_n=self.top_n), axis=1)
        
#         print(predict_train.sum(axis=1)[:3])
#         print(self.liked_train.sum(axis=1)[:3])
        
#         print(predict_test.sum(axis=1)[:3])
#         print(self.liked_test.sum(axis=1)[:3])
        
        if (self.score == "precision"):
            train_score = precision_score(list(self.liked_train.values.flat), list(predict_train.values.flat), average='binary')
            test_score = precision_score(list(self.liked_test.values.flat), list(predict_test.values.flat), average='binary')
            return train_score, test_score
        elif (self.score == "recall"):
            train_score = recall_score(list(self.liked_train.values.flat), list(predict_train.values.flat), average='binary')
            test_score = recall_score(list(self.liked_test.values.flat), list(predict_test.values.flat), average='binary')
            return train_score, test_score
        elif (self.score == "f1_score"):
            train_score = f1_score(list(self.liked_train.values.flat), list(predict_train.values.flat), average='binary')
            test_score = f1_score(list(self.liked_test.values.flat), list(predict_test.values.flat), average='binary')
            return train_score, test_score
        else:
            raise Exception('Give a correct score')
      
    def get_recommandation_new_user(self, user_data_ratings, size_liked, size_recommended):
        
        user_data_ratings_matrix = pd.DataFrame(columns=self.ratings_matrix.columns, index=user_data_ratings.userId.unique())
        for i, row in user_data_ratings.iterrows():
            if row['movieId'] in list(self.ratings_matrix.columns):
                user_data_ratings_matrix.loc[row['userId'], row['movieId']] = row['rating']
            
        user_data_ratings_matrix = self.process_ratings_matrix(user_data_ratings_matrix)

#         print("user_data_ratings_matrix.shape: %s, self.ratings_matrix.shape: %s"%
#                   (user_data_ratings_matrix.shape, self.ratings_matrix.shape))
        
#         print(set(user_data_ratings_matrix.columns).difference(set(self.ratings_matrix.columns)))
        
        ##### compute similarity
        
        if self.similarity == 'cosine':
            similarity = cosine_similarity(self.ratings_matrix, user_data_ratings_matrix)
        else: # distance
            similarity =  pairwise_distances(self.ratings_matrix, user_data_ratings_matrix)

        ##### compute Neighbors
            
        distances, indices = self.final_nearestNeighbors.kneighbors(user_data_ratings_matrix)
        
        ##### filter similarity only on Neighbors

        similarity_by_indice = self.get_similarity_of_neighbors(similarity, indices)
#         similarity_by_indice = similarity

        ##### predict
        
        predict = self.predict(self.ratings_matrix.T, similarity_by_indice)
#         print(predict.shape)
        
        for user_id in user_data_ratings.userId.unique():
            
#             print("User_id %s"%user_id)
            
            print("\n ===> User %s \n\n %s best videos:\n"%(user_id, size_liked))
        
            user_movies = user_data_ratings[user_data_ratings['userId'] == user_id]
            user_movies = user_movies.merge(self.movies, on='movieId')
            user_movies = user_movies.sort_values('rating', ascending=False)
            user_movies#.userId.unique()

    #         print(user_movies.head(self.size_liked))

            for i, movie in user_movies.iloc[:size_liked].iterrows():
                print ('movieId: {:6}  /// rating: {:2.2f}   /// title: {:10s}  /// genres: {:10s}'.format(movie['movieId'], 
                                                                                            movie['rating'], 
                                                                                            movie['title'],
                                                                                            movie['genres'])
                      )

            print("\n ===> We recommand the %s following videos:\n"%size_recommended)

            # do not recommand videos ever seen by this user
            do_not_recommand = list(user_movies['movieId'].unique())
#             print(do_not_recommand)

            pos_user = np.where(user_data_ratings_matrix.index==user_id)[0][0]
#             print("pos_user %s %s"%(pos_user, predict[pos_user].shape))
    
            user_ratings = pd.Series(predict[pos_user], index = self.ratings_matrix.columns)
            user_ratings = user_ratings.sort_values(ascending=False)
            
            # remove movies seen by user
            user_ratings = user_ratings[~user_ratings.index.isin(do_not_recommand)]
            user_ratings = (pd.DataFrame({'rating': user_ratings.values}, index= user_ratings.index)).reset_index()

            user_movies_to_recommand = user_ratings.merge(self.movies, on='movieId')
            user_movies_to_recommand = user_movies_to_recommand.sort_values('rating', ascending=False)

            for i, movie in user_movies_to_recommand.iloc[:size_recommended].iterrows():
                print ('movieId: {:6}  /// title: {:10s}  /// genres: {:10s}'.format(movie['movieId'], 
#                                                                                      movie['rating'], 
                                                                                     movie['title'],
                                                                                     movie['genres']
                                                                                    )
                          )
                
            print("=======================================================================================")
                


In [4]:
# Fitting model

u_u_cf = User_cf(ratings, 
                      movies, 
                      tags ,  
                      similarity='cosine', # 'cosine' or 'distance'
                      n_neighbors = [10, 50, 200, 400],
                      test_size = 0.2,
                      score = "precision",
                      top_n = 5
                     )

ratings.shape: (100836, 4), movies.shape: (9742, 3), tags.shape: (3683, 4)
X_train.shape: (488, 9724), X_test.shape: (122, 9724) X_test_index_min 489
X_train_similarity.shape: (488, 488), X_test_similarity.shape: (488, 122)
----->    k = 10
k: 10, X_train_indices.shape: (488, 10), X_test_indices.shape: (122, 10)
X_train_similarity_by_indice.shape: (488, 488), X_test_similarity_by_indice.shape: (488, 122)
predict_train.shape: (488, 9724), predict_test.shape: (122, 9724)
precision train: 0.47090163934426227, precision test: 0.4737704918032787
----->    k = 50
k: 50, X_train_indices.shape: (488, 50), X_test_indices.shape: (122, 50)
X_train_similarity_by_indice.shape: (488, 488), X_test_similarity_by_indice.shape: (488, 122)
predict_train.shape: (488, 9724), predict_test.shape: (122, 9724)
precision train: 0.6282786885245901, precision test: 0.6147540983606558
----->    k = 200
k: 200, X_train_indices.shape: (488, 200), X_test_indices.shape: (122, 200)
X_train_similarity_by_indice.shape: (

In [None]:
# Get performance based on number of neighbors. The best 'k' is selected automatically

u_u_cf.perf_result.sort_values('k').set_index('k').plot()

In [6]:

# predict on some users from dataset

syntetic_ratings = ratings[ratings['userId'].isin([552, ])]
u_u_cf.get_recommandation_new_user(syntetic_ratings, 5, 5)


 ===> User 552 

 5 best videos:

movieId:   2502  /// rating: 5.00   /// title: Office Space (1999)  /// genres: Comedy|Crime
movieId:   7153  /// rating: 5.00   /// title: Lord of the Rings: The Return of the King, The (2003)  /// genres: Action|Adventure|Drama|Fantasy
movieId:   1196  /// rating: 5.00   /// title: Star Wars: Episode V - The Empire Strikes Back (1980)  /// genres: Action|Adventure|Sci-Fi
movieId:   1136  /// rating: 5.00   /// title: Monty Python and the Holy Grail (1975)  /// genres: Adventure|Comedy|Fantasy
movieId:   4878  /// rating: 5.00   /// title: Donnie Darko (2001)  /// genres: Drama|Mystery|Sci-Fi|Thriller

 ===> We recommand the 5 following videos:

movieId:    318  /// title: Shawshank Redemption, The (1994)  /// genres: Crime|Drama
movieId:     50  /// title: Usual Suspects, The (1995)  /// genres: Crime|Mystery|Thriller
movieId:    593  /// title: Silence of the Lambs, The (1991)  /// genres: Crime|Horror|Thriller
movieId:   4993  /// title: Lord of t

In [7]:

# predict on syntetic user

syntetic_ratings = pd.DataFrame({'userId':["Eric", "Eric", "Eric"], 
                                 'movieId':[58559, 91529, 98124], 
                                 'rating':[5, 5, 5], 
                                })

u_u_cf.get_recommandation_new_user(syntetic_ratings, 5, 5)


 ===> User Eric 

 5 best videos:

movieId:  58559  /// rating: 5.00   /// title: Dark Knight, The (2008)  /// genres: Action|Crime|Drama|IMAX
movieId:  91529  /// rating: 5.00   /// title: Dark Knight Rises, The (2012)  /// genres: Action|Adventure|Crime|IMAX
movieId:  98124  /// rating: 5.00   /// title: Batman: The Dark Knight Returns, Part 1 (2012)  /// genres: Action|Animation|Sci-Fi

 ===> We recommand the 5 following videos:

movieId: 193609  /// title: Andrew Dice Clay: Dice Rules (1991)  /// genres: Comedy    
movieId: 189713  /// title: BlacKkKlansman (2018)  /// genres: Comedy|Crime|Drama
movieId: 190183  /// title: The Darkest Minds (2018)  /// genres: Sci-Fi|Thriller
movieId: 193585  /// title: Flint (2017)  /// genres: Drama     
movieId: 193583  /// title: No Game No Life: Zero (2017)  /// genres: Animation|Comedy|Fantasy


In [8]:

# predict on syntetic user

syntetic_ratings = pd.DataFrame({'userId':["Eric", "Eric", "Eric"], 
                                 'movieId':[58559, 91529, 98124], 
                                 'rating':[5, 5, 5], 
                                })

u_u_cf.get_recommandation_new_user(syntetic_ratings, 5, 5)


 ===> User Eric 

 5 best videos:

movieId:  58559  /// rating: 5.00   /// title: Dark Knight, The (2008)  /// genres: Action|Crime|Drama|IMAX
movieId:  91529  /// rating: 5.00   /// title: Dark Knight Rises, The (2012)  /// genres: Action|Adventure|Crime|IMAX
movieId:  98124  /// rating: 5.00   /// title: Batman: The Dark Knight Returns, Part 1 (2012)  /// genres: Action|Animation|Sci-Fi

 ===> We recommand the 5 following videos:

movieId: 193609  /// title: Andrew Dice Clay: Dice Rules (1991)  /// genres: Comedy    
movieId: 189713  /// title: BlacKkKlansman (2018)  /// genres: Comedy|Crime|Drama
movieId: 190183  /// title: The Darkest Minds (2018)  /// genres: Sci-Fi|Thriller
movieId: 193585  /// title: Flint (2017)  /// genres: Drama     
movieId: 193583  /// title: No Game No Life: Zero (2017)  /// genres: Animation|Comedy|Fantasy


### 2.  item-based  recommendation algorithm

Quelques précision sur notre implémentation:

* Le code est organisé en classe avec des fonctions pour effectuer des transformations et des opération: fit, predict, evaluate
* Les paramètres de la classe sont les suivants:
    * ratings: la matrice de rating (userId, movieId, rating) - input data 
    * movies: movies names - input data 
    * tags: Users tags - input data 
    * similarity='cosine'  or 'distances'
    * Liste des nombres de voisins a tester Ex: [5, 10, 50]. Le paramètre avec le meilleur résultat sera sélectionné automatiquement
    * Size in percent of test data Ex 0.1 for 10%
    * score: Score to use for evaluating model on test data.  "precision", "recall", "f1_score"
    * top_n: Nombre de vidéos à recommander
* Les prédictions sont effectuées en utilisant la formule suivante:

    <img src="images/Capture1.png">
    
    avec:
         u: l'utilisateur à qui nous devons recommander des vidéos
         i: la vidéo à recommander
         N(i,u): les vidéos similaires à la vidéos i parmis les vidéos notées par u
         s(u,i): le score de recommandation calculé
         W_ij: la similarité entre les vidéos i et j
         r_j: la moyenne des notes de la vidéo j
         r_ui: la note de l'utlisateur u sur la vidéo i
         


In [34]:
class Item_cf():
    
    def __init__(self, ratings, movies, tags, 
                 similarity='cosine', # 'cosine' or 'distances'
                 n_neighbors = [1],
                 test_size = 0.1,
                 score = "precision",
                 top_n = 5,
                 num_procs = 5,
                 N = 50
                ):
        
        self.ratings = ratings
        self.movies = movies
        self.tags = tags
        print("ratings.shape: %s, movies.shape: %s, tags.shape: %s"%(ratings.shape, movies.shape, tags.shape))
        
        self.similarity = similarity
        self.n_neighbors = n_neighbors
        self.test_size = test_size
        self.score = score
        self.top_n = top_n
        self.num_procs = num_procs
        
        self.ratings_matrix = None
        self.X_train_similarity = None
        self.best_k = None
        self.best_perf_test = None
        self.perf_result = []
        
        self.transform_input_data()
        
        self.compute_similarity()
        
        start_time = time.time()
        
        self.N = N
        
        for k in self.n_neighbors:
            
            
            predict_train_score = self.predict(self.X_train.iloc[:], k)
            predict_test_score = self.predict(self.X_test.iloc[:], k)
            
#             predict_train_score = self.format_and_predict_parallel(self.X_train.iloc[:], k)
            
#             predict_train_score = pd.DataFrame(predict_train_score, 
#                                          index=self.X_train.iloc[:].index
#                                         )
            
#             predict_train_score.to_csv("data/predict/predict_train_score_k="+str(k)+".csv")

#             predict_test_score = self.format_and_predict_parallel(self.X_test.iloc[:], k)
            
#             predict_test_score = pd.DataFrame(predict_test_score, 
#                                          index=self.X_test.iloc[:].index
#                                         )
            
#             predict_test_score.to_csv("data/predict/predict_test_score_k="+str(k)+".csv")
            
            
            perf_train, perf_test = self.evaluate(self.liked_train.iloc[:], predict_train_score, 
                                                  self.liked_test.iloc[:], predict_test_score)
            
            print("\nk = %s --- train: %s test: %s\n"%(k, perf_train, perf_test))
            
            if self.best_k == None:
                self.best_k = k
                self.best_perf_test = perf_test
            else:
                if perf_test > self.best_perf_test:
                    self.best_k = k
                    self.best_perf_test = perf_test
            
            self.perf_result.append((k, perf_train, perf_test))
            
        
        self.perf_result = pd.DataFrame(self.perf_result, columns=['k', self.score+"_train", self.score+"_test"])

        elapsed_time = time.time() - start_time

        print("\n elapsed_time for k=%s : %s\n" % (k, elapsed_time))
        
    def evaluate(self, true_train, predict_train, true_test, predict_test):
        
        def get_n_predict(row, top_n=5):
    
            new_row = pd.Series(np.zeros(len(row)), index = row.index)

            arg_sort = np.argsort(row.values)[::-1]

            for i in range(top_n):
                new_row[row.index[arg_sort[i]]] = 1

            return new_row
        
        predict_train = predict_train.apply(lambda row: get_n_predict(row, top_n=self.top_n), axis=1)
        predict_test = predict_test .apply(lambda row: get_n_predict(row, top_n=self.top_n), axis=1)
        
        if (self.score == "precision"):
            train_score = precision_score(list(true_train.values.flat), list(predict_train.values.flat), average='binary')
            test_score = precision_score(list(true_test.values.flat), list(predict_test.values.flat), average='binary')
            return train_score, test_score
        elif (self.score == "recall"):
            train_score = recall_score(list(true_train.values.flat), list(predict_train.values.flat), average='binary')
            test_score = recall_score(list(true_test.values.flat), list(predict_test.values.flat), average='binary')
            return train_score, test_score
        elif (self.score == "f1_score"):
            train_score = f1_score(list(true_train.values.flat), list(predict_train.values.flat), average='binary')
            test_score = f1_score(list(true_test.values.flat), list(predict_test.values.flat), average='binary')
            return train_score, test_score
        else:
            raise Exception('Give a correct score')
            
            
        
        
    def predict(self, items, k):
        
        return items.apply(lambda user: self.get_score_by_user(user, k), axis=1)
    
    
    def format_and_predict_parallel(self, data, k):
        
        params = list(product(list(data.itertuples(name=False)), 
                                  [list(data.columns)],
                                  [k]
                                 )
                         )
  
        with mp.Pool(self.num_procs) as pool:
            result = pool.imap(self.predict_parallel, params, chunksize=int(len(data)/self.num_procs)+1)
            data_score = [x for x in result]
            
        return data_score
            
            
    def predict_parallel(self, params):
        
        user = params[0]
        columns = params[1]
        k = params[2]
        
        print(user[:5], k, columns[:5])
        
        user = pd.Series(user[1:], index=columns)
#         return user
        
        return self.get_score_by_user(user, k)
    
    
    
    def get_score_by_user(self, user, k):
        
        rated_videos = list(user[user!=0].index)
        
        score_by_movie_id = map(lambda movie_id: self.get_score_by_movie_id(user, movie_id, rated_videos, k), 
                                    self.X_train.columns)
        
        return pd.Series(score_by_movie_id, index = self.X_train.columns)
        
    
    def get_score_by_movie_id(self, user, movie_id, rated_videos, k):
        
        rated_videos_without_movie_id = list(set(rated_videos).difference(set([movie_id])))
        ranked_rated_videos = self.nn_ranked(movie_id, rated_videos_without_movie_id, k)
                
        Wij = self.X_train_similarity.loc[movie_id, ranked_rated_videos[:k]]
        Ruj = user[ranked_rated_videos[:k]]
        Rj = self.X_train[ranked_rated_videos[:k]].mean()
        Ri = self.X_train[movie_id].mean()
                
        score = (np.dot(Wij, (Ruj-Rj)) / (sum(np.abs(Wij)) + 0.0000001) ) + Ri
        
        return score
        
            
    
    def nn_ranked(self, movie_id, rated_videos, k):
        
        samples = self.X_train[rated_videos].transpose()
        input_sample = self.X_train[[movie_id]].transpose()
        
        neigh = NearestNeighbors(k)
        neigh.fit(samples.values)
        
        index_neighbors = neigh.kneighbors(input_sample, min(len(samples), k), return_distance=False)
        
        return list(samples.iloc[index_neighbors[0]].index)
        
        
#         self.fit()
        
    def transform_input_data(self):
        # Pivot ratings
        self.ratings_matrix = pd.pivot_table(self.ratings, values='rating', index='userId', columns='movieId', aggfunc=np.mean)
        # get rated videos. Videos where rating is not null
        self.rated = (~self.ratings_matrix.isna()).astype(int)
        # get liked videos. Videos where rating > mean_rating
        self.liked = (self.ratings_matrix.apply(lambda row: row>(row.mean()), axis=1)).astype(int)
        # filled not rated videos by 0
        self.ratings_matrix = self.ratings_matrix.fillna(0)
        # Create Train and test set
        self.X_train, self.X_test = train_test_split(self.ratings_matrix, test_size=self.test_size, shuffle=False)
        print("X_train.shape: %s, X_test.shape: %s"%(self.X_train.shape, self.X_test.shape))
        
        self.liked_train, self.liked_test = train_test_split(self.liked, test_size=self.test_size, shuffle=False)
        
        
    def compute_similarity(self):
        
        X_train_transpose = self.X_train.values.T
        
        if self.similarity == 'cosine':
            self.X_train_similarity = cosine_similarity(X_train_transpose)
            np.fill_diagonal(self.X_train_similarity, 0)
        elif self.similarity == 'distances':
            self.X_train_similarity =  pairwise_distances(X_train_transpose)
            np.fill_diagonal(self.X_train_similarity, 0)
        else:
            raise Exception('Give correct similarity')
            
        self.X_train_similarity = pd.DataFrame(self.X_train_similarity, 
                                               index=self.X_train.columns, columns=self.X_train.columns
                                              )
        
      
    def get_recommandation_new_user(self, user_data_ratings, size_liked, size_recommended):
        
        user_data_ratings_matrix = pd.DataFrame(columns=self.ratings_matrix.columns, index=user_data_ratings.userId.unique())
        for i, row in user_data_ratings.iterrows():
            if row['movieId'] in list(self.ratings_matrix.columns):
                user_data_ratings_matrix.loc[row['userId'], row['movieId']] = row['rating']
            
        # filled not rated videos by 0
        user_data_ratings_matrix = user_data_ratings_matrix.fillna(0)

        ##### predict
        
        predict = self.format_and_predict_parallel(user_data_ratings_matrix, self.best_k)
        predict = pd.DataFrame(predict, index=user_data_ratings_matrix.index)
        
        for user_id in user_data_ratings.userId.unique():
            
#             print("User_id %s"%user_id)
            
            print("\n ===> User %s \n\n %s best videos:\n"%(user_id, size_liked))
        
            user_movies = user_data_ratings[user_data_ratings['userId'] == user_id]
            user_movies = user_movies.merge(self.movies, on='movieId')
            user_movies = user_movies.sort_values('rating', ascending=False)

            for i, movie in user_movies.iloc[:size_liked].iterrows():
                print ('movieId: {:6}  /// rating: {:2.2f}   /// title: {:10s}  /// genres: {:10s}'.format(movie['movieId'], 
                                                                                            movie['rating'], 
                                                                                            movie['title'],
                                                                                            movie['genres'])
                      )

            print("\n ===> We recommand the %s following videos:\n"%size_recommended)

            # do not recommand videos ever seen by this user
            do_not_recommand = list(user_movies['movieId'].unique())
#             print(do_not_recommand)
        
            user_ratings = predict.loc[user_id]
        
            user_ratings = user_ratings.sort_values(ascending=False)
            
            # remove movies seen by user
            user_ratings = user_ratings[~user_ratings.index.isin(do_not_recommand)]
            user_ratings = (pd.DataFrame({'rating': user_ratings.values}, index= user_ratings.index)).reset_index()

            user_movies_to_recommand = user_ratings.merge(self.movies, on='movieId')
            user_movies_to_recommand = user_movies_to_recommand.sort_values('rating', ascending=False)

            for i, movie in user_movies_to_recommand.iloc[:size_recommended].iterrows():
                print ('movieId: {:6}  /// title: {:10s}  /// genres: {:10s}'.format(movie['movieId'], 
#                                                                                      movie['rating'], 
                                                                                     movie['title'],
                                                                                     movie['genres']
                                                                                    )
                          )
                
            print("=======================================================================================")

        
    

In [38]:
# Fitting model

i_i_cf = Item_cf(ratings.iloc[:3000], 
                 movies, 
                 tags ,  
                 similarity='cosine', # 'cosine' or 'distance'
                 n_neighbors = [3],
#                  n_neighbors = [3],
                 test_size = 0.2,
                 score = "precision",
                 top_n = 5,
                 num_procs = 2,
                 N = 1
                )

ratings.shape: (3000, 4), movies.shape: (9742, 3), tags.shape: (3683, 4)
X_train.shape: (16, 1768), X_test.shape: (4, 1768)

k = 3 --- train: 0.775 test: 0.5


 elapsed_time for k=3 : 90.87523698806763



In [39]:
# # Get performance based on number of neighbors. The best 'k' is selected automatically

# i_i_cf.perf_result.sort_values('k').set_index('k').plot()

In [40]:

# predict on some users from dataset

syntetic_ratings = ratings[ratings['userId'].isin([552, ])]
i_i_cf.get_recommandation_new_user(syntetic_ratings, 5, 5)

(552, 0, 0, 1.0, 0) 3 [1, 2, 3, 4, 5]

 ===> User 552 

 5 best videos:

movieId:   2502  /// rating: 5.00   /// title: Office Space (1999)  /// genres: Comedy|Crime
movieId:   7153  /// rating: 5.00   /// title: Lord of the Rings: The Return of the King, The (2003)  /// genres: Action|Adventure|Drama|Fantasy
movieId:   1196  /// rating: 5.00   /// title: Star Wars: Episode V - The Empire Strikes Back (1980)  /// genres: Action|Adventure|Sci-Fi
movieId:   1136  /// rating: 5.00   /// title: Monty Python and the Holy Grail (1975)  /// genres: Adventure|Comedy|Fantasy
movieId:   4878  /// rating: 5.00   /// title: Donnie Darko (2001)  /// genres: Drama|Mystery|Sci-Fi|Thriller

 ===> We recommand the 5 following videos:

movieId:    608  /// title: Fargo (1996)  /// genres: Comedy|Crime|Drama|Thriller
movieId:   1198  /// title: Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)  /// genres: Action|Adventure
movieId:   2858  /// title: American Beauty (1999)  /

In [41]:

# # predict on syntetic user

# syntetic_ratings = pd.DataFrame({'userId':["Eric", "Eric", "Eric"], 
#                                  'movieId':[1, 2, 3], 
#                                  'rating':[5, 5, 5], 
#                                 })

# i_i_cf.get_recommandation_new_user(syntetic_ratings, 5, 5)

('Eric', 5, 5, 5, 0) 3 [1, 2, 3, 4, 5]

 ===> User Eric 

 5 best videos:

movieId:      1  /// rating: 5.00   /// title: Toy Story (1995)  /// genres: Adventure|Animation|Children|Comedy|Fantasy
movieId:      2  /// rating: 5.00   /// title: Jumanji (1995)  /// genres: Adventure|Children|Fantasy
movieId:      3  /// rating: 5.00   /// title: Grumpier Old Men (1995)  /// genres: Comedy|Romance

 ===> We recommand the 5 following videos:

movieId:    356  /// title: Forrest Gump (1994)  /// genres: Comedy|Drama|Romance|War
movieId:    593  /// title: Silence of the Lambs, The (1991)  /// genres: Crime|Horror|Thriller
movieId:    318  /// title: Shawshank Redemption, The (1994)  /// genres: Crime|Drama
movieId:     47  /// title: Seven (a.k.a. Se7en) (1995)  /// genres: Mystery|Thriller
movieId:    527  /// title: Schindler's List (1993)  /// genres: Drama|War 
