# Pre-processing

In [842]:
import random
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.metrics.pairwise import pairwise_distances

In [846]:
user_profiles_detailed = pd.read_csv("data/user_profiles_hospitality_dataset_2020.csv", index_col="user_id")
user_profiles_detailed.head()

Unnamed: 0_level_0,9731_read,9731_likes,9731_posts,9731_comments,7174_read,7174_likes,7174_posts,7174_comments,12807_read,12807_likes,...,132077_posts,132077_comments,132078_read,132078_likes,132078_posts,132078_comments,50683_read,50683_likes,50683_posts,50683_comments
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
80c79718-b5ae-4e79-9b1a-b42461b934d0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
fee5578c-cbcd-402d-a698-db9a58af6fb2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
284881a1-833d-49d9-9b7f-42094fdbbca1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
be9ce283-a20f-4110-9523-e1c70d657add,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
0fe64dcb-547e-4f4f-a158-a66b5edd422f,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [847]:
def compute_streams_score(df):
    """
    Compute for all user/stream a single value score for the interaction
    that a user has with a stream by combining the 
    - number of reads
    - number of likes
    - number of comments
    - number of posts
    """
    df_stream_score = pd.DataFrame()
    df_columns = df.columns
    for i in range(0, len(df_columns), 4):
        stream = df_columns[i][:-5]
        df_stream_score[stream] = df[df_columns[i]] * 1 \
                                + df[df_columns[i+1]] * 2 \
                                + df[df_columns[i+2]] * 3 \
                                + df[df_columns[i+3]] * 5
    return df_stream_score

In [848]:
user_profiles = compute_streams_score(user_profiles_detailed)
user_profiles.head()

Unnamed: 0_level_0,9731,7174,12807,138248,18966,11297,11298,29731,29733,11302,...,132070,132071,132072,132073,132074,132075,132076,132077,132078,50683
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
80c79718-b5ae-4e79-9b1a-b42461b934d0,0,0,0,0,0,0,0,0,6,0,...,0,0,0,0,0,0,0,0,0,0
fee5578c-cbcd-402d-a698-db9a58af6fb2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
284881a1-833d-49d9-9b7f-42094fdbbca1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
be9ce283-a20f-4110-9523-e1c70d657add,0,0,0,0,0,0,0,9,0,0,...,0,0,0,0,0,0,0,0,0,0
0fe64dcb-547e-4f4f-a158-a66b5edd422f,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [849]:
user_profiles.shape

(7767, 129)

# Evaluation

In [920]:
class ModelEvaluator:
    """
    Class that perform evaluation of models for a given dataframe
    """
    
    def __init__(self, df):
        self.df = df
        
    
    def compute_ap(self, model_recs, rank, eval_i_stream_pool):
        """
        Compute the average precision
        """
        ap_at_rank = 0
        
        rec_r_ir_history = []
        for rec in model_recs[:rank]:
            if rec in eval_i_stream_pool:
                rec_r_ir_history.append(1)
            else :
                rec_r_ir_history.append(0)
            
            p = sum(rec_r_ir_history) / len(rec_r_ir_history)
            
            ap_at_rank += p
        
        ap_at_rank /= rank
        
        return ap_at_rank
    
    
    def evaluate_model_for_user(self, model, user, nb_i_stream_eval_per_user, nb_u_stream_eval_per_user):
        """
        Ask a given model to produce recommendations for a given user
        and evaluate it
        """
        user_interacted_streams = [stream for stream in self.df.columns if self.df.loc[user, stream] > 0]
        user_uninteracted_streams = [stream for stream in self.df.columns if stream not in user_interacted_streams]
        
        eval_i_stream_pool = []
        eval_u_stream_pool = []
        try :
            eval_i_stream_pool = random.sample(user_interacted_streams, nb_i_stream_eval_per_user)
        except :
            eval_i_stream_pool = user_interacted_streams
            nb_i_stream_eval_per_user = len(user_interacted_streams)
        
        try :
            eval_u_stream_pool = random.sample(user_uninteracted_streams, nb_u_stream_eval_per_user)
        except :
            eval_u_stream_pool = user_uninteracted_streams
            nb_u_stream_eval_per_user = len(user_uninteracted_streams)
        
        eval_stream_pool = eval_i_stream_pool + eval_u_stream_pool
        
        model_recs = model.recommend_for_user(user, stream_pool=eval_stream_pool, top_n=nb_i_stream_eval_per_user + \
                                             nb_u_stream_eval_per_user)

        rec_at_5 = sum([1 for stream in eval_i_stream_pool if stream in model_recs[:5]]) / nb_i_stream_eval_per_user
        rec_at_10 = sum([1 for stream in eval_i_stream_pool if stream in model_recs[:10]]) / nb_i_stream_eval_per_user
        
        ap_at_5 = self.compute_ap(model_recs, 5, eval_i_stream_pool)
        ap_at_10 = self.compute_ap(model_recs, 10, eval_i_stream_pool)
        
        return {"R@5": rec_at_5, "R@10": rec_at_10, "AP@5": ap_at_5, "AP@10": ap_at_10}


    def evaluate_model(self, model, nb_user_eval, nb_i_stream_eval_per_user, nb_u_stream_eval_per_user):
        """
        Evaluate a model by evaluating it on a number of random user
        and averaging the metrics
        """
        avg_rec_at_5 = 0
        avg_rec_at_10 = 0
        map_at_5 = 0
        map_at_10 = 0
        
        users_with_enough_interacted_streams = [user for user in self.df.index if \
                                                sum([1 for stream in self.df.columns if self.df.loc[user, stream] > 0]) \
                                                > nb_i_stream_eval_per_user]
        
        for _ in tqdm(range(nb_user_eval)):
            user = users_with_enough_interacted_streams[np.random.randint(0, len(users_with_enough_interacted_streams))]
            model_user_eval = self.evaluate_model_for_user(model, user, nb_i_stream_eval_per_user, nb_u_stream_eval_per_user)
            avg_rec_at_5 += model_user_eval["R@5"]
            avg_rec_at_10 += model_user_eval["R@10"]
            map_at_5 += model_user_eval["AP@5"]
            map_at_10 += model_user_eval["AP@10"]

        avg_rec_at_5 /= nb_user_eval
        avg_rec_at_10 /= nb_user_eval
        map_at_5 /= nb_user_eval
        map_at_10 /= nb_user_eval

        return {"R@5": avg_rec_at_5, "R@10": avg_rec_at_10, "MAP@5": map_at_5, "MAP@10": map_at_10}

In [921]:
model_evaluator = ModelEvaluator(user_profiles)

# Popularity recommendation system
Rank the stream regarding how much the users interacts with it and recommend the streams with the most interactions

In [1058]:
class PopularityRecommender:
    """
    Class that recommend streams to user based on stream popularity
    """
    
    def __init__(self, df):
        self.df = df
    
    
    def rank_streams_by_popularity(self):
        """
        Rank stream by popularity
        """
        stream_popularity = {}
        for stream in self.df.columns:
            stream_popularity[stream] = self.df[stream].sum()

        ranked_streams = sorted(stream_popularity, key=stream_popularity.get, reverse=True)

        return ranked_streams
    

    def recommend_for_user(self, user, stream_pool=[], top_n=10):
        """
        Recommend top n popular stream to a user
        """
        if len(stream_pool) == 0:
            stream_pool = self.df.columns
        
        ranked_streams = self.rank_streams_by_popularity()
            
        possible_recommendations = [stream for stream in ranked_streams if stream in stream_pool]
        
        return possible_recommendations[:top_n]

In [1059]:
popularity_model = PopularityRecommender(user_profiles)
popularity_model.recommend_for_user("80c79718-b5ae-4e79-9b1a-b42461b934d0", top_n=10)

['2516',
 '33581',
 '14746',
 '4908',
 '10116',
 '9273',
 '10320',
 '68664',
 '11302',
 '24639']

In [1060]:
model_evaluator.evaluate_model(popularity_model, 100, 5, 45)

100%|███████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 119.81it/s]


{'R@5': 0.33999999999999986,
 'R@10': 0.5260000000000001,
 'MAP@5': 0.47249999999999964,
 'MAP@10': 0.37782976190476186}

# Collaborative filtering recommendation system

## user-user approach
Look a the most similar user and recommend the streams he interacts with

take the most interacted ones first in the similar userq

In [1061]:
class CFUser:
    """
    Class that recommend streams to user based on
    user similarity
    """
    
    def __init__(self, df):
        self.df = df
        self.users_similarities = pairwise_distances(df.to_numpy(), metric='cosine')

    
    def recommend_for_user(self, user, stream_pool=[], top_n=10):
        """
        Recommend top n popular stream to a user
        based on user similarity
        """
        
        if len(stream_pool) == 0:
            stream_pool = self.df.columns
            
        user_index = self.df.index.to_list().index(user)
        user_similarity = self.users_similarities[user_index].tolist()
        user_similarity[user_index] = 1.0
        
        recommended_streams = []
        
        while len(recommended_streams) < top_n:
            most_similar_user = user_similarity.index(min(user_similarity))

            if user_similarity[most_similar_user] >= 1.0: # TODO add sim treshold
                break

            for stream in self.df.columns:
                if self.df[stream][most_similar_user] > 0 and stream not in recommended_streams and stream in stream_pool:
                    recommended_streams.append(stream)

            user_similarity[most_similar_user] = 1.
            
        return recommended_streams

In [1062]:
cf_user_model = CFUser(user_profiles)
cf_user_model.recommend_for_user("80c79718-b5ae-4e79-9b1a-b42461b934d0", top_n=10)

['2513',
 '2516',
 '9273',
 '12807',
 '14888',
 '35676',
 '9731',
 '16032',
 '14666',
 '17762']

In [1081]:
model_evaluator.evaluate_model(cf_user_model, 100, 5, 45)

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [07:54<00:00,  4.74s/it]


{'R@5': 0.6359999999999999,
 'R@10': 0.8099999999999993,
 'MAP@5': 0.7648666666666659,
 'MAP@10': 0.6253698412698415}

## item-item approach
Look a the most interacted stream, and recommend the streams that are similar to this one 

while not enough recommendation :
    take most interacted stream
    take similar stream (<0.5)
    take second most interacted stream
    take similar ones 
    .
    .
    .

In [1064]:
class CFItem:
    """
    Class that recommend streams to user based on
    stream similarity
    """
    
    def __init__(self, df):
        self.df = df
        self.streams_similarities = pairwise_distances(df.to_numpy().T, metric='cosine')

    
    def recommend_for_user(self, user, stream_pool=[], top_n=10):
        """
        Recommend top n popular stream to a user
        based on stream similarity
        """
    
        if len(stream_pool) == 0:
            stream_pool = self.df.columns
            
        user_streams_indexes = {stream: self.df.columns.to_list().index(stream) for stream in self.df.columns \
                                if self.df.loc[user, stream] > 0}
        
        stream_similarity = {stream: self.streams_similarities[stream_index] for stream, stream_index \
                             in user_streams_indexes.items()}
                
        recommended_streams = []
        for user_stream in stream_similarity.keys():
            for similarity in stream_similarity[user_stream]:
                if similarity < 0.9: # add treshold 
                    stream_index = stream_similarity[user_stream].tolist().index(similarity)
                    stream = self.df.columns[stream_index]
                    if stream not in recommended_streams and stream in stream_pool:
                        recommended_streams.append(stream)
                if len(recommended_streams) >= top_n:
                    break
            
        return recommended_streams

In [1065]:
cf_item_model = CFItem(user_profiles)
cf_item_model.recommend_for_user("80c79718-b5ae-4e79-9b1a-b42461b934d0", top_n=10)

['29733',
 '125545',
 '16032',
 '35676',
 '11298',
 '14888',
 '68664',
 '9273',
 '124493',
 '10320']

In [1066]:
model_evaluator.evaluate_model(cf_item_model, 100, 5, 45)

100%|███████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 271.29it/s]


{'R@5': 0.4700000000000001,
 'R@10': 0.728,
 'MAP@5': 0.5433333333333332,
 'MAP@10': 0.47401507936507953}

# Clusters of User recommendation system

In [1067]:
user_clusters = pd.read_csv("data/users_cluster_hospitality_dataset_2020.csv", index_col="user_id")
user_clusters.head()

Unnamed: 0_level_0,cluster_id
user_id,Unnamed: 1_level_1
80c79718-b5ae-4e79-9b1a-b42461b934d0,4
fee5578c-cbcd-402d-a698-db9a58af6fb2,8
284881a1-833d-49d9-9b7f-42094fdbbca1,8
be9ce283-a20f-4110-9523-e1c70d657add,4
0fe64dcb-547e-4f4f-a158-a66b5edd422f,29


## clusters only (for user similarity)
compute most popular streams for the cluster of the user

In [1068]:
class ClusterRecommender:
    """
    Class that recommend streams to user based on
    popularity of stream inside the user cluster
    """
    
    def __init__(self, df, users_clusters):
        self.df = df
        self.user_clusters = users_clusters
        self.clusters = self.compute_clusters()
        self.cluster_streams_popularities = {}
        
        
    def compute_clusters(self):
        """
        Compute a list of user per cluster
        """
        
        cluster_ids = set(self.user_clusters["cluster_id"])
        
        clusters = {}
        for cluster_id in cluster_ids:
            clusters[cluster_id] = [user for user in self.user_clusters.index if \
                                    user_clusters.loc[user, "cluster_id"] == cluster_id]
            
        return clusters
            
        
    def compute_cluster_stream_popularity(self, cluster_id):
        """
        Compute a ranking of popuar stream inside a cluster
        """
        
        try :
            return self.cluster_streams_popularities[cluster_id]
        except :
            streams_nb_interactions = {}
            
            for user in self.clusters[cluster_id]:
                for stream in self.df.columns:
                    nb_interactions = self.df.loc[user, stream]
                    if nb_interactions > 0:
                        try :
                            streams_nb_interactions[stream] += self.df.loc[user, stream]
                        except :
                            streams_nb_interactions[stream] = self.df.loc[user, stream]
           
            cluster_populars_streams = [stream for stream, popularity in \
                                        sorted(streams_nb_interactions.items(), key=lambda item: item[1], reverse=True)]
            
            return cluster_populars_streams
        
    
    def recommend_for_user(self, user, stream_pool=[], top_n=10):
        """
        Recommend streams to user based on
        cluster popularity
        """
        
        if len(stream_pool) == 0:
            stream_pool = self.df.columns
            
        user_cluster = self.user_clusters.loc[user, "cluster_id"]
            
        recommended_streams = self.compute_cluster_stream_popularity(user_cluster)
        
        recommended_streams = [stream for stream in recommended_streams if stream in stream_pool]
            
        return recommended_streams[:top_n]

In [1069]:
cluster_model = ClusterRecommender(user_profiles, user_clusters)
cluster_model.recommend_for_user("80c79718-b5ae-4e79-9b1a-b42461b934d0", top_n=10)

['2516',
 '68664',
 '76208',
 '2513',
 '24639',
 '10381',
 '46810',
 '30935',
 '11297',
 '4908']

In [1082]:
model_evaluator.evaluate_model(cluster_model, 100, 5, 45)

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:35<00:00,  2.82it/s]


{'R@5': 0.5919999999999996,
 'R@10': 0.8099999999999997,
 'MAP@5': 0.7155666666666665,
 'MAP@10': 0.5655115079365082}

## Clusters and CF-User model
union or intersection, use score in both ?

In [1071]:
class ClusterCFUserRecommender:
    """
    Class that recommend streams to user based on a combination of
    user similarity and popularity in the cluster
    """
    
    def __init__(self, df, users_clusters):
        self.df = df
        self.cluster_model = ClusterRecommender(self.df, users_clusters)
        self.cf_user_model = CFUser(self.df)
        
    
    def recommend_for_user(self, user, stream_pool=[], top_n=10):
        """
        Recommend streams to user based on a combination of
        user similarity and popularity in the cluster
        """
        
        if len(stream_pool) == 0:
            stream_pool = self.df.columns
            
        cluster_recs = self.cluster_model.recommend_for_user(user, stream_pool=stream_pool, top_n=top_n)
        cf_user_recs = self.cf_user_model.recommend_for_user(user, stream_pool=stream_pool, top_n=top_n)
            
        stream_scores = {}
        for i, stream in enumerate(cluster_recs):
            if stream not in cf_user_recs:
                stream_score = (i + top_n) / 2
                stream_scores[stream] = stream_score
            else :
                stream_score = (i + cf_user_recs.index(stream)) / 2
                stream_scores[stream] = stream_score
        for i, stream in enumerate(cf_user_recs):
            if stream not in cluster_recs:
                stream_score = (i + top_n) / 2
                stream_scores[stream] = stream_score
        
        recommended_streams = [stream for stream, score in sorted(stream_scores.items(), key=lambda item: item[1])]
            
        return recommended_streams[:top_n]

In [1072]:
cluster_cf_user_model = ClusterCFUserRecommender(user_profiles, user_clusters)

In [1073]:
cluster_cf_user_model.recommend_for_user("80c79718-b5ae-4e79-9b1a-b42461b934d0", top_n=10)

['2516',
 '2513',
 '68664',
 '76208',
 '9273',
 '12807',
 '24639',
 '14888',
 '10381',
 '35676']

In [1075]:
model_evaluator.evaluate_model(cluster_cf_user_model, 100, 5, 45)

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [09:18<00:00,  5.58s/it]


{'R@5': 0.6399999999999999,
 'R@10': 0.8419999999999994,
 'MAP@5': 0.7784999999999994,
 'MAP@10': 0.6399531746031745}

## Clusters and CF-Item model
union or intersection, use score in both ?

In [1077]:
class ClusterCFUserRecommender:
    """
    Class that recommend streams to user based on a combination of
    stream similarity and popularity in the cluster
    """
    
    def __init__(self, df, users_clusters):
        self.df = df
        self.cluster_model = ClusterRecommender(self.df, users_clusters)
        self.cf_item_model = CFItem(self.df)
        
    
    def recommend_for_user(self, user, stream_pool=[], top_n=10):
        """
        Recommend streams to user based on a combination of
        user similarity and popularity in the cluster
        """
        
        if len(stream_pool) == 0:
            stream_pool = self.df.columns
            
        cluster_recs = self.cluster_model.recommend_for_user(user, stream_pool=stream_pool, top_n=top_n)
        cf_item_recs = self.cf_item_model.recommend_for_user(user, stream_pool=stream_pool, top_n=top_n)
            
        stream_scores = {}
        for i, stream in enumerate(cluster_recs):
            if stream not in cf_item_recs:
                stream_score = (i + top_n) / 2
                stream_scores[stream] = stream_score
            else :
                stream_score = (i + cf_item_recs.index(stream)) / 2
                stream_scores[stream] = stream_score
        for i, stream in enumerate(cf_item_recs):
            if stream not in cluster_recs:
                stream_score = (i + top_n) / 2
                stream_scores[stream] = stream_score
        
        recommended_streams = [stream for stream, score in sorted(stream_scores.items(), key=lambda item: item[1])]
            
        return recommended_streams[:top_n]

In [1078]:
cluster_cf_item_model = ClusterCFUserRecommender(user_profiles, user_clusters)

In [1079]:
cluster_cf_item_model.recommend_for_user("80c79718-b5ae-4e79-9b1a-b42461b934d0", top_n=10)

['68664',
 '2516',
 '29733',
 '125545',
 '76208',
 '16032',
 '2513',
 '35676',
 '24639',
 '11298']

In [1080]:
model_evaluator.evaluate_model(cluster_cf_item_model, 100, 5, 45)

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:44<00:00,  2.25it/s]


{'R@5': 0.6779999999999997,
 'R@10': 0.9139999999999994,
 'MAP@5': 0.7527666666666663,
 'MAP@10': 0.6486567460317457}