<a href="https://colab.research.google.com/github/grayhacked/AI/blob/main/recsys_TP2_solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Recommender systems: Content-based and hybrid models

Dataset: [Articles Sharing and Reading from CI&T Deskdrop](https://www.kaggle.com/gspmoreira/articles-sharing-reading-from-cit-deskdrop).

This dataset contains a real sample of **12 months of logs (Mar. 2016 - Feb. 2017)** from CI&T's Internal Communication platform (DeskDrop), corresponding to about **73k logged user interactions** on more than **3k public articles** shared in the platform.

In [None]:
import pandas as pd
import numpy as np
from tqdm.notebook import trange, tqdm

## Load data

In [None]:
%%capture
# Download the dataset archive and unpack it into the current working directory.
!wget "https://github.com/eishkina-estia/ML2023/raw/main/data/CI&T DeskDrop.zip"
!unzip "CI&T DeskDrop.zip"

In [None]:
# Read the interaction log from the folder created by the unzip step above.
# The path is relative to the working directory (where the archive was unpacked)
# so the notebook is not tied to Colab's /content directory.
interactions_df = pd.read_csv('CI&T DeskDrop/users_interactions.csv')
interactions_df.head()

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry
0,1465413032,VIEW,-3499919498720038879,-8845298781299428018,1264196770339959068,,,
1,1465412560,VIEW,8890720798209849691,-1032019229384696495,3621737643587579081,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US
2,1465416190,VIEW,310515487419366995,-1130272294246983140,2631864456530402479,,,
3,1465413895,FOLLOW,310515487419366995,344280948527967603,-3167637573980064150,,,
4,1465412290,VIEW,-7820640624231356730,-445337111692715325,5611481178424124714,,,


In [None]:
# Ids are used purely as labels, so store them as strings.
for id_col in ('personId', 'contentId'):
    interactions_df[id_col] = interactions_df[id_col].astype(str)

In [None]:
# Number of distinct articles and users present in the interaction log.
nb_articles = interactions_df['contentId'].nunique(dropna=False)
nb_users = interactions_df['personId'].nunique(dropna=False)

for label, count in (('articles', nb_articles), ('users', nb_users)):
    print(f'Distinct {label}: \t{count}')

Distinct articles: 	2987
Distinct users: 	1895


## Data preprocessing

There are different types of interactions (`eventType` column) logged in the dataset:

* `VIEW`: The user opened the article.
* `LIKE`: The user liked the article.
* `COMMENT CREATED`: The user created a comment in the article.
* `FOLLOW`: The user chose to be notified on any new comment in the article.
* `BOOKMARK`: The user bookmarked the article for easy return in the future.

We need to get some integrated quantitative measure to represent the interaction between a user and an article (an item).

Do the following preprocessing:

* Encode interaction types with numbers:
  * `VIEW`: `1.0`
  * `LIKE`: `2.0`
  * `BOOKMARK`: `2.5`
  * `FOLLOW`: `3.0`
  * `COMMENT CREATED`: `4.0`

* For each tuple (user, article) calculate:
  * base-2 logarithm of 1 plus the sum of all the numeric representations of the corresponding interactions,
  * timestamp corresponding to the last interaction.

* Keep in the dataset only those users who interacted with at least 5 articles (i.e. remove "cold" users). Display the percentage of dropped rows.

* Encapsulate all the previous steps in a function to be able to easily replace the logic if needed.

### Encode interaction types with numbers

In [None]:
# How many logged events there are of each interaction type.
interactions_df['eventType'].value_counts()

VIEW               61086
LIKE                5745
BOOKMARK            2463
COMMENT CREATED     1611
FOLLOW              1407
Name: eventType, dtype: int64

In [None]:
# Numeric weight given to each value of the `eventType` column.
# These weights are looked up per event and later summed per <user, article> pair.
event_type_strength = {
   'VIEW': 1.0,
   'LIKE': 2.0,
   'BOOKMARK': 2.5,
   'FOLLOW': 3.0,
   'COMMENT CREATED': 4.0,
}

In [None]:
# Translate every event label into its numeric weight (an unknown label raises KeyError).
interactions_df['eventStrength'] = [
    event_type_strength[event_type] for event_type in interactions_df['eventType']
]
interactions_df['eventStrength']

0        1.0
1        1.0
2        1.0
3        3.0
4        1.0
        ... 
72307    2.0
72308    1.0
72309    1.0
72310    1.0
72311    1.0
Name: eventStrength, Length: 72312, dtype: float64

### Merge different interactions for each `<user,article>` pair

In [None]:
def _log2_of_sum_plus_one(strengths):
    """Return log2(1 + sum of the interaction strengths of one <user, article> pair)."""
    return np.log2(1 + strengths.sum())


# One row per <user, article> pair: smoothed total strength and time of the last interaction.
pair_groups = interactions_df.groupby(['personId', 'contentId'])
interactions_df = pair_groups.agg(
    eventStrength=('eventStrength', _log2_of_sum_plus_one),
    last_timestamp=('timestamp', 'max'),
).reset_index()

interactions_df

Unnamed: 0,personId,contentId,eventStrength,last_timestamp
0,-1007001694607905623,-5065077552540450930,1.000000,1470395911
1,-1007001694607905623,-6623581327558800021,1.000000,1487240080
2,-1007001694607905623,-793729620925729327,1.000000,1472834892
3,-1007001694607905623,1469580151036142903,1.000000,1487240062
4,-1007001694607905623,7270966256391553686,1.584963,1485994342
...,...,...,...,...
40705,998688566268269815,-401664538366009049,1.000000,1474567449
40706,998688566268269815,3456674717452933449,2.584963,1478802088
40707,998688566268269815,6881796783400625893,1.000000,1474567675
40708,998688566268269815,7174452660053929140,2.321928,1478812905


### Remove "cold" users

In [None]:
# Number of rows per user; after the aggregation above each row is one distinct article.
users_interactions_count = interactions_df.groupby('personId').size()
users_interactions_count

personId
-1007001694607905623      6
-1032019229384696495    648
-108842214936804958     270
-1093393486211919385      2
-1110220372195277179      3
                       ... 
987030274299018507        3
989049974880576288       19
98958074799194811         4
997469202936578234       17
998688566268269815        6
Length: 1895, dtype: int64

In [None]:
# Minimum number of distinct articles a user must have interacted with to be kept.
threshold_non_cold_users = 5

users_interactions_count_non_cold = users_interactions_count[users_interactions_count >= threshold_non_cold_users]
# Series.isin replaces np.in1d, which is deprecated in NumPy 2.0.
interactions_non_cold = interactions_df.loc[interactions_df['personId'].isin(users_interactions_count_non_cold.index)]

nb_users = len(users_interactions_count)
nb_users_non_cold = len(users_interactions_count_non_cold)

nb_ratings = len(interactions_df)
nb_ratings_non_cold_users = len(interactions_non_cold)

print(f'number of users: {nb_users}')
print(f'number of ratings (<user,item> pairs): {nb_ratings}')
print()

print(f'number of users with at least {threshold_non_cold_users} interactions: {nb_users_non_cold} ({nb_users_non_cold*100/nb_users:.1f}%)')
print(f'number of ratings (<user,item> pairs) for users with at least {threshold_non_cold_users} interactions: {nb_ratings_non_cold_users} ({nb_ratings_non_cold_users*100/nb_ratings:.1f}%)')

number of users: 1895
number of ratings (<user,item> pairs): 40710

number of users with at least 5 interactions: 1140 (60.2%)
number of ratings (<user,item> pairs) for users with at least 5 interactions: 39106 (96.1%)


## Modeling and evaluation

### Adding features

* Load metadata about the articles shared in the platform (`shared_articles.csv`)

* Keep only the articles corresponding to `CONTENT SHARED` event type. As you can see, there are two possible event types:
  * `CONTENT SHARED`: The article was shared in the platform and is available for users.
  * `CONTENT REMOVED`: The article was removed from the platform and not available for further recommendation.

* Merge articles metadata with interactions data.

* Do feature engineering. Explore the following variables and build a set of features based on them:

  * content type,
  * language,
  * title: define a few popular topics based on the [notebook](https://www.kaggle.com/code/gspmoreira/deskdrop-articles-topic-modeling/notebook),
  * domain: define a few popular domains based on the [notebook](https://www.kaggle.com/code/gspmoreira/deskdrop-datasets-eda/notebook),
  * define a few features representing user interests, for example by presence of keywords in titles weighted with interaction strength.


In [None]:
# Load metadata about the articles shared in the platform.
# The path is relative to the working directory (where the archive was unpacked)
# so the notebook is not tied to Colab's /content directory.
articles_df = pd.read_csv('CI&T DeskDrop/shared_articles.csv')
articles_df.head(2)

Unnamed: 0,timestamp,eventType,contentId,authorPersonId,authorSessionId,authorUserAgent,authorRegion,authorCountry,contentType,url,title,text,lang
0,1459192779,CONTENT REMOVED,-6451309518266745024,4340306774493623681,8940341205206233829,,,,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
1,1459193988,CONTENT SHARED,-4110354420726924665,4340306774493623681,8940341205206233829,,,,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en


In [None]:
# Keep only the articles corresponding to CONTENT SHARED event type.
# Rows and columns are selected in a single .loc call and the result of .astype is a
# new frame, so the column assignments in the following cells do not write into a
# slice of the raw frame (which triggers SettingWithCopyWarning).
article_columns = ['contentId', 'contentType', 'url', 'title', 'text', 'lang']
articles_df = (
    articles_df
    .loc[articles_df['eventType'] == 'CONTENT SHARED', article_columns]
    .astype({'contentId': str})
)
articles_df.head(2)

Unnamed: 0,contentId,contentType,url,title,text,lang
1,-4110354420726924665,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
2,-7292285110016212249,HTML,http://cointelegraph.com/news/bitcoin-future-w...,Bitcoin Future: When GBPcoin of Branson Wins O...,The alarm clock wakes me at 8:00 with stream o...,en


In [None]:
# content type: distribution of the `contentType` values among the kept articles
articles_df['contentType'].value_counts()

HTML     3027
RICH       10
VIDEO      10
Name: contentType, dtype: int64

In [None]:
# Replace the content type by a single binary flag: 1 for HTML, 0 for anything else.
html_flag = articles_df['contentType'].eq('HTML').astype(int)
articles_df = articles_df.drop(columns='contentType').assign(is_HTML=html_flag)
articles_df['is_HTML'].value_counts()

1    3027
0      20
Name: is_HTML, dtype: int64

In [None]:
# language: distribution of the `lang` values among the kept articles
articles_df['lang'].value_counts()

en    2211
pt     829
la       3
es       2
ja       2
Name: lang, dtype: int64

In [None]:
# One binary flag per language of interest; every other language is all zeros.
for lang_code in ('en', 'pt'):
    articles_df[f'lang_{lang_code}'] = articles_df['lang'].eq(lang_code).astype(int)
articles_df = articles_df.drop(columns='lang')

In [None]:
# title: define a few popular topics
import re

topics = {
    'business': ['company', 'companies', 'business', 'customers', 'consumers', 'product'],
    'ai': ['ai', 'machine learning', 'deep learning', 'data'],
    'software': ['software', 'system', 'systems', 'api', 'service', 'process', 'database', 'code']
}

articles_df['title'] = articles_df['title'].str.lower()

for topic, keywords in topics.items():
    # Match keywords as whole words only. A bare substring search flags e.g. "email",
    # "said" or "blockchain" as 'ai' and "rapid" as 'api'. Keywords are escaped so
    # they are always treated literally.
    keywords_re = r'\b(?:' + '|'.join(re.escape(keyword) for keyword in keywords) + r')\b'
    # na=False: an article without a title belongs to no topic.
    is_topic = articles_df['title'].str.contains(keywords_re, regex=True, na=False).astype(int)
    articles_df[f'topic_{topic}'] = is_topic

    print('*' * 40)
    print(topic)
    print(is_topic.value_counts())

****************************************
business
0    2932
1     115
Name: title, dtype: int64
****************************************
ai
0    2460
1     587
Name: title, dtype: int64
****************************************
software
0    2819
1     228
Name: title, dtype: int64


In [None]:
# domain: define a few popular domains
import re

from urllib.parse import urlparse

# Extract the host name with the standard URL parser. The previous pattern
# `[\w\.]*` stopped at the first hyphen, truncating hosts such as "my-site.com".
# `hostname` is lower-cased and excludes port and credentials; it is None when the
# URL has no network location, which the check below reports.
articles_df['urlDomain'] = articles_df['url'].apply(lambda url: urlparse(url).hostname)
print(articles_df['urlDomain'].isna().any())
articles_df[['urlDomain','url']].head()

False


Unnamed: 0,urlDomain,url
1,www.nytimes.com,http://www.nytimes.com/2016/03/28/business/dea...
2,cointelegraph.com,http://cointelegraph.com/news/bitcoin-future-w...
3,cloudplatform.googleblog.com,https://cloudplatform.googleblog.com/2016/03/G...
4,bitcoinmagazine.com,https://bitcoinmagazine.com/articles/ibm-wants...
5,www.coindesk.com,http://www.coindesk.com/ieee-blockchain-oxford...


In [None]:
# Keep the 10 most frequent domains and bucket every other domain as 'other'.
# (The former `articles_df.drop(columns='url')` call here discarded its result and
# had no effect; the raw columns are dropped once all features are built.)
domains_to_keep = articles_df['urlDomain'].value_counts().index[:10]
articles_df.loc[~articles_df['urlDomain'].isin(domains_to_keep), 'urlDomain'] = 'other'
articles_df['urlDomain'].value_counts()

other                           2368
techcrunch.com                   185
medium.com                       127
cloudplatform.googleblog.com      72
startupi.com.br                   50
www.imdb.com                      47
googlediscovery.com               43
exame.abril.com.br                43
www.mckinsey.com                  41
www.businessinsider.com           38
www.linkedin.com                  33
Name: urlDomain, dtype: int64

In [None]:
# One binary column per kept domain; articles bucketed as 'other' are all zeros.
for d in domains_to_keep:
    articles_df[f'urldomain_{d}'] = (articles_df['urlDomain'] == d).astype(int)

    print('*' * 40)
    print(d)
    print(articles_df[f'urldomain_{d}'].value_counts())

****************************************
techcrunch.com
0    2862
1     185
Name: urldomain_techcrunch.com, dtype: int64
****************************************
medium.com
0    2920
1     127
Name: urldomain_medium.com, dtype: int64
****************************************
cloudplatform.googleblog.com
0    2975
1      72
Name: urldomain_cloudplatform.googleblog.com, dtype: int64
****************************************
startupi.com.br
0    2997
1      50
Name: urldomain_startupi.com.br, dtype: int64
****************************************
www.imdb.com
0    3000
1      47
Name: urldomain_www.imdb.com, dtype: int64
****************************************
googlediscovery.com
0    3004
1      43
Name: urldomain_googlediscovery.com, dtype: int64
****************************************
exame.abril.com.br
0    3004
1      43
Name: urldomain_exame.abril.com.br, dtype: int64
****************************************
www.mckinsey.com
0    3006
1      41
Name: urldomain_www.mckinsey.com, dtype

In [None]:
# Drop the raw columns the features were derived from and index the item profiles
# by article id so they can be joined on `contentId`.
articles_df = articles_df.drop(columns = ['url', 'urlDomain', 'title', 'text'])
articles_df = articles_df.set_index('contentId')
articles_df.head()

Unnamed: 0_level_0,is_HTML,lang_en,lang_pt,topic_business,topic_ai,topic_software,urldomain_techcrunch.com,urldomain_medium.com,urldomain_cloudplatform.googleblog.com,urldomain_startupi.com.br,urldomain_www.imdb.com,urldomain_googlediscovery.com,urldomain_exame.abril.com.br,urldomain_www.mckinsey.com,urldomain_www.businessinsider.com,urldomain_www.linkedin.com
contentId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
-4110354420726924665,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-7292285110016212249,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-6151852268067518688,1,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0
2448026894306402386,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0
-2826566343807132236,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# Attach the item features to every interaction. The two printed lengths show
# whether the join changed the number of rows.
print(len(interactions_non_cold))
interactions_content_based = interactions_non_cold.join(articles_df, on='contentId')
print(len(interactions_content_based))
# display(interactions_content_based)

39106
39106


In [None]:
# define a few features representing user interests, for example by presence of keywords in titles weighted with interaction strength.

# Average interaction strength over all interactions with articles of each topic.
for topic in topics:
    has_topic = interactions_content_based[f'topic_{topic}'] == 1
    mean_rating = interactions_content_based.loc[has_topic, 'eventStrength'].mean()
    print(f'Mean rating for topic={topic}: {mean_rating:.2f}')

Mean rating for topic=business: 1.55
Mean rating for topic=ai: 1.50
Mean rating for topic=software: 1.47


In [None]:
# build user profile: user coefs by topic

def build_user_profile(x):
    """Summarise one user's interest in every topic.

    Parameters
    ----------
    x : pd.DataFrame
        Interactions of a single user, with an `eventStrength` column and one
        `topic_<name>` column per key of the global `topics` dict.

    Returns
    -------
    pd.Series
        One `user_topic_<name>` entry per topic: the mean `eventStrength` over the
        rows whose topic flag equals 1, or 0.0 when there is no such row.
    """
    strengths = x['eventStrength']
    profile = {}
    for topic_name in topics.keys():
        topic_col = 'topic_' + topic_name
        topic_strengths = strengths[x[topic_col] == 1]
        profile['user_' + topic_col] = topic_strengths.mean() if len(topic_strengths) != 0 else 0.0
    return pd.Series(profile, dtype=float)

# test: build the profile of the user who owns the first row
user_0 = interactions_content_based['personId'].eq(interactions_content_based['personId'].iloc[0])
interactions_user_0 = interactions_content_based.loc[user_0]
build_user_profile(interactions_user_0)

user_topic_business    0.0
user_topic_ai          1.0
user_topic_software    1.0
dtype: float64

In [None]:
# One profile row per user, indexed by `personId`.
# NOTE(review): profiles are computed from all interactions in
# `interactions_content_based`, i.e. before the train/test split performed further
# down, so later interactions contribute to the user features. Confirm this is intended.
user_profiles = interactions_content_based.groupby('personId').apply(build_user_profile)
user_profiles

Unnamed: 0_level_0,user_topic_business,user_topic_ai,user_topic_software
personId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
-1007001694607905623,0.000000,1.000000,1.000000
-1032019229384696495,2.244670,2.175039,2.163026
-108842214936804958,1.707673,1.580566,1.491726
-1119397949556155765,0.000000,0.000000,1.000000
-1130272294246983140,1.194988,1.307492,1.476723
...,...,...,...
953707509720613429,2.000000,0.000000,1.000000
983095443598229476,0.000000,1.000000,1.000000
989049974880576288,0.000000,1.194988,1.438722
997469202936578234,0.000000,2.584963,0.000000


In [None]:
# Attach the user profile columns to every interaction row of that user.
interactions_content_based = interactions_content_based.join(user_profiles, on='personId')
interactions_content_based

Unnamed: 0,personId,contentId,eventStrength,last_timestamp,is_HTML,lang_en,lang_pt,topic_business,topic_ai,topic_software,...,urldomain_startupi.com.br,urldomain_www.imdb.com,urldomain_googlediscovery.com,urldomain_exame.abril.com.br,urldomain_www.mckinsey.com,urldomain_www.businessinsider.com,urldomain_www.linkedin.com,user_topic_business,user_topic_ai,user_topic_software
0,-1007001694607905623,-5065077552540450930,1.000000,1470395911,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
1,-1007001694607905623,-6623581327558800021,1.000000,1487240080,1.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2,-1007001694607905623,-793729620925729327,1.000000,1472834892,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
3,-1007001694607905623,1469580151036142903,1.000000,1487240062,1.0,1.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
4,-1007001694607905623,7270966256391553686,1.584963,1485994342,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40705,998688566268269815,-401664538366009049,1.000000,1474567449,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
40706,998688566268269815,3456674717452933449,2.584963,1478802088,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
40707,998688566268269815,6881796783400625893,1.000000,1474567675,1.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
40708,998688566268269815,7174452660053929140,2.321928,1478812905,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


### Build a classifier (content-based model)

* Split data into train (75%) and test (25%) sets by timestamp.

* Build a classifier: predict whether there would be an interaction (whatever its strength) or not

  * You will need to add samples of negative class (no interaction). Add $4 \times \text {train length}$ random combinations of user id and article id that do not appear in the train set.
  
  * Fit a classifier using only the processed features.

* Evaluate the content-based model using precision@10 metric:

  * To select 10 articles to recommend, use the probabilities corresponding to the positive class.

In [None]:
%%capture
# Download the helper module that is imported as `recsys` in the next cell.
!wget https://raw.githubusercontent.com/eishkina-estia/ML2023/main/recsys_bihar.py

In [None]:
# Module containing classes and functions defined in the previous assignments
import recsys_bihar as recsys

In [None]:
# Split data into train (75%) and test (25%) sets by timestamp.
# NOTE(review): the split logic lives in recsys_bihar.train_test_split, which is not
# shown in this notebook; presumably it splits each user's interactions by
# `last_timestamp` - verify against that module.
X_train, X_test, y_train, y_test = recsys.train_test_split(
    interactions_content_based, test_size=0.25,
    user_col='personId', item_col='contentId',
    rating_col='eventStrength', time_col='last_timestamp')

X_train.shape, X_test.shape, y_train.shape, y_test.shape

users:   0%|          | 0/1140 [00:00<?, ?it/s]

((28869, 2), (10237, 2), (28869,), (10237,))

In [None]:
# Add samples of the negative class (no interaction): 4 x train length random
# <user, article> combinations that never occur in the training interactions.
import itertools, random

# Fixed seed so the sampled negatives (and the fitted model) are reproducible.
NEGATIVE_SAMPLING_SEED = 42

users = X_train['personId'].unique()
items = X_train['contentId'].unique()

index_columns = ['personId', 'contentId']
# all <user, article> combinations ...
X_train_neg = pd.DataFrame(itertools.product(users, items), columns=index_columns).set_index(index_columns)
# ... minus the ones observed in the train set
drop_interactions_nonzero = X_train_neg.index.intersection(X_train.set_index(index_columns).index)
X_train_neg = X_train_neg.drop(drop_interactions_nonzero).reset_index()
# Sample without replacement; raises ValueError if fewer candidates than requested exist.
X_train_neg = X_train_neg.sample(n=4 * len(X_train), random_state=NEGATIVE_SAMPLING_SEED)
len(X_train), len(X_train_neg)

(28869, 115476)

In [None]:
# Positives first, sampled negatives second; the labels are built in the same order.
nb_positive, nb_negative = len(X_train), len(X_train_neg)

# merge with item profiles (articles_df), user profiles, and drop ids
X_train_clf = (
    pd.concat([X_train, X_train_neg], axis=0)
    .join(articles_df, on='contentId')
    .join(user_profiles, on='personId')
    .drop(columns=['contentId','personId'])
    .reset_index(drop=True)
)

y_train_clf = pd.Series([1]*nb_positive + [0]*nb_negative)

# nan for removed articles? Keep only the rows where every feature is present.
rows_complete = X_train_clf.notna().all(axis=1)
X_train_clf = X_train_clf.loc[rows_complete]
y_train_clf = y_train_clf.loc[rows_complete]

X_train_clf.shape, y_train_clf.shape

((143962, 19), (143962,))

In [None]:
# Preview of the feature matrix fed to the classifier (ids already dropped).
X_train_clf.head()

Unnamed: 0,is_HTML,lang_en,lang_pt,topic_business,topic_ai,topic_software,urldomain_techcrunch.com,urldomain_medium.com,urldomain_cloudplatform.googleblog.com,urldomain_startupi.com.br,urldomain_www.imdb.com,urldomain_googlediscovery.com,urldomain_exame.abril.com.br,urldomain_www.mckinsey.com,urldomain_www.businessinsider.com,urldomain_www.linkedin.com,user_topic_business,user_topic_ai,user_topic_software
1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.116993,1.42091,1.538749
2,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.116993,1.42091,1.538749
3,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.116993,1.42091,1.538749
4,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.116993,1.42091,1.538749
5,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.116993,1.42091,1.538749


In [None]:
# Fit a classifier using only the processed features.
import xgboost as xgb

# Default hyper-parameters; target is 1 for an observed interaction, 0 for a sampled negative.
model_xgb = xgb.XGBClassifier()
model_xgb.fit(X_train_clf, y_train_clf)

#### Evaluation

Evaluate the content-based model using precision@10 metric

In [None]:
# Column names shared (as globals) by the evaluation helpers defined below.
user_col, item_col, rating_col = 'personId', 'contentId', 'proba_interaction'

# Users and items seen in training; recommendations are restricted to these.
users = X_train[user_col].unique()
items = X_train[item_col].unique()

def get_recommended_items_clf(model_clf, user, k=None):
    """Rank the items a user has not interacted with in the train set.

    Relies on the notebook globals `users`, `items`, `X_train`, `articles_df`,
    `user_profiles`, `user_col`, `item_col` and `rating_col`.

    Parameters
    ----------
    model_clf : fitted classifier exposing `predict_proba`.
    user : id of the user to recommend for.
    k : int or None
        Number of items to return; None returns every candidate.

    Returns
    -------
    pd.DataFrame
        Columns `item_col` and `rating_col` (positive-class probability), sorted by
        decreasing probability. Empty when the user is unknown or has no candidate.
    """
    if user not in users:
        return pd.DataFrame(columns=[item_col, rating_col])

    # every <user, item> combination for this user
    X_user = pd.DataFrame(itertools.product([user], items), columns=[user_col, item_col]).set_index([user_col, item_col])

    # remove the combinations already present in the train set
    drop_permutations = X_user.index.intersection(X_train.set_index([user_col, item_col]).index)
    X_user = X_user.drop(drop_permutations).reset_index()

    # merge with item profiles (articles_df) and user profiles, then drop rows with
    # missing features. The index is reset AFTER dropna so it is contiguous and lines
    # up with the predictions below. With the gaps left by dropna, pd.concat(axis=1)
    # paired items with the wrong probabilities and produced NaN rows.
    X_user = (X_user
        .join(articles_df, on=item_col)
        .join(user_profiles, on=user_col)
        .dropna(axis=0)
        .reset_index(drop=True))

    if X_user.empty:
        return pd.DataFrame(columns=[item_col, rating_col])

    y_pred = pd.Series(
        model_clf.predict_proba(X_user.drop(columns=[item_col,user_col]))[:,1],
        index=X_user.index, name=rating_col)

    recommended_items = pd.concat([X_user[item_col], y_pred], axis=1).sort_values(rating_col, ascending=False)
    if k is not None:
        recommended_items = recommended_items[:k]

    return recommended_items

def mean_precision_at_k_clf(model_clf, X_test, y_test, k, nb_random_users=None, selected_users=None):
    """Mean precision@k of the classifier-based recommender.

    Relies on the notebook globals `users`, `items`, `X_train`, `articles_df`,
    `user_profiles`, `user_col`, `item_col` and `rating_col`.

    Parameters
    ----------
    model_clf : fitted classifier exposing `predict_proba`.
    X_test : pd.DataFrame with the `user_col` and `item_col` columns of the test interactions.
    y_test : pd.Series aligned with `X_test` (only carried along, not used for scoring).
    k : int, number of recommendations per user.
    nb_random_users : int or None
        When given, that many training users are drawn at random and `selected_users`
        is ignored.
    selected_users : list or None
        Users to evaluate when `nb_random_users` is None; defaults to all training users.

    Returns
    -------
    (float, pd.DataFrame)
        The mean precision, and a frame indexed by user with the `recommendations`
        and `precision` columns.
    """
    if nb_random_users is None:
        if selected_users is None:
            selected_users = users.tolist()
    else:
        selected_users = random.sample(users.tolist(), nb_random_users)

    # every <user, item> combination for the selected users
    X_test_all = pd.DataFrame(itertools.product(selected_users, items), columns=[user_col, item_col]).set_index([user_col, item_col])

    # remove the combinations already present in the train set
    drop_permutations = X_test_all.index.intersection(X_train.set_index([user_col, item_col]).index)
    X_test_all = X_test_all.drop(drop_permutations).reset_index()

    # merge with item profiles (articles_df) and user profiles, then drop rows with
    # missing features. The index is reset AFTER dropna so it is contiguous and lines
    # up with the predictions below. With the gaps left by dropna, pd.concat(axis=1)
    # paired <user, item> rows with the wrong probabilities.
    X_test_all = (X_test_all
        .join(articles_df, on=item_col)
        .join(user_profiles, on=user_col)
        .dropna(axis=0)
        .reset_index(drop=True))

    y_pred_all = pd.Series(
        model_clf.predict_proba(X_test_all.drop(columns=[item_col,user_col]))[:,1],
        index=X_test_all.index, name=rating_col)

    data_pred = pd.concat([X_test_all[[user_col, item_col]], y_pred_all], axis=1)
    data_test = pd.concat([X_test[[user_col, item_col]], y_test.rename(rating_col)], axis=1)

    # sort predicted ratings
    data_pred = data_pred.sort_values(rating_col, ascending=False)

    recommendations_by_user = {}
    precision_by_user = {}

    for u in tqdm(selected_users, desc='users'):
        # top-k predicted items vs. items the user actually interacted with in the test set
        R_u_k = data_pred.loc[data_pred[user_col] == u, item_col][:k]
        L_u = data_test.loc[data_test[user_col] == u, item_col]
        recommendations_by_user[u] = R_u_k.tolist()

        if len(R_u_k) != 0:
            precision_by_user[u] = sum(np.isin(R_u_k, L_u)) / len(R_u_k)
        else:
            precision_by_user[u] = 0

    recommendations_by_user = pd.Series(recommendations_by_user, name='recommendations')
    precision_by_user = pd.Series(precision_by_user, name='precision')
    res = pd.concat([recommendations_by_user, precision_by_user], axis=1)

    mean_precision = np.mean(precision_by_user)

    return mean_precision, res

In [None]:
# test: top-10 recommendations for the first training user
# (this cell leaves `user` defined as a global for the rest of the notebook)
user = users[0]
get_recommended_items_clf(model_xgb, user, k=10)

Unnamed: 0,contentId,proba_interaction
302,4634963407423735625,0.848098
1786,4315784099325221836,0.848098
403,-6778286518036232293,0.848098
1478,2736373711035411385,0.848098
1940,-6479058008972757948,0.848098
1846,-4748027149000767298,0.848098
796,1415230502586719648,0.848098
826,-454649054276160610,0.848098
671,-2208293144000550811,0.848098
287,3548262914600772288,0.848098


In [None]:
# precision@10 of the content-based model on 100 randomly drawn training users
# (the draw uses `random.sample`; no seed is set for it in this notebook).
mean_precision_at_10_xgb, details_xgb = mean_precision_at_k_clf(model_xgb, X_test, y_test, k=10, nb_random_users=100)
mean_precision_at_10_xgb

users:   0%|          | 0/100 [00:00<?, ?it/s]

0.005

### Build a hybrid model

* Merge outputs from the content-based and the collaborative filtering models to get $k=10$ recommendations per user (define your own algorithm).

* Evaluate the hybrid model using precision@k metric.

#### Collaborative filtering

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
%%time
print('start fitting...')
cf = recsys.CollaborativeFiltering(sim_method='pearson', user_based=True)
cf.fit(X_train, y_train, user_col='personId', item_col='contentId')

print('start predicting...')
y_pred = cf.predict(X_test)
rmse_score = mean_squared_error(y_test, y_pred, squared=False)
print(f'rmse = {rmse_score:.2f}')

start fitting...
start predicting...


predictions:   0%|          | 0/10237 [00:00<?, ?it/s]

rmse = 0.90
CPU times: user 4.3 s, sys: 128 ms, total: 4.43 s
Wall time: 4.42 s


In [None]:
# user0 = X_train['personId'].unique()[0]
# cf.get_recommended_items(user0)

In [None]:
# precision@10 of the collaborative filtering model; `details_cf` holds the per-user
# recommendations and precision that the hybrid model reuses below.
mean_precision_at_10, details_cf = cf.mean_precision_at_k(X_test, y_test, k=10, nb_random_users=100)
mean_precision_at_10

predictions:   0%|          | 0/268499 [00:00<?, ?it/s]

users:   0%|          | 0/100 [00:00<?, ?it/s]

0.01

In [None]:
# Per-user recommendation lists and precision for the collaborative filtering model.
details_cf

Unnamed: 0,recommendations,precision
-4045556372014952225,"[-4336877432539963613, 2857117417189640073, -3...",0.0
-4952379459094765124,"[1854874463930846880, -4503975842879662368, 53...",0.0
-7103674836099895927,"[1854874463930846880, 943818026930898372, -152...",0.0
-375620451534537810,"[8749720044741011597, -330801551666885085, 865...",0.1
2446947580409722972,"[5338677278233757627, 5928346445655989915, 410...",0.0
...,...,...
2207651454681143681,"[2857117417189640073, 310515487419366995, -255...",0.0
-5873562008332486480,"[8657408509986329668, -1199490911632553070, 60...",0.0
1908339160857512799,"[4184543400419595673, 7414483722019578252, -16...",0.0
2553895156129400476,"[3306277069425849869, 2372438485070148864, 258...",0.0


#### Hybrid model

In [None]:
import math

def get_recommended_items_hybrid(recommended_items_ubcf, recommended_items_xgb, k=None):
    """Merge two recommendation lists into one hybrid list.

    Items recommended by both models come first (in the sorted order returned by
    `np.intersect1d`), followed by the items only the collaborative filtering
    model recommended, then the items only the classifier recommended. The
    model-specific items keep their original order.

    Parameters
    ----------
    recommended_items_ubcf : array-like of item ids from the collaborative filtering model.
    recommended_items_xgb : array-like of item ids from the classifier.
    k : int or None
        Maximum length of the result. None returns every item. When given, the
        slots left after the common items are shared between the two models
        (the collaborative filtering list gets the extra slot when the number is
        odd), and a model that runs out of items hands its unused slots to the other.

    Returns
    -------
    np.ndarray
        The merged item ids; never longer than `k` when `k` is given.
    """
    recommended_items_ubcf = np.asarray(recommended_items_ubcf)
    recommended_items_xgb = np.asarray(recommended_items_xgb)

    common_items = np.intersect1d(recommended_items_ubcf, recommended_items_xgb)

    append_ubcf = recommended_items_ubcf[~np.isin(recommended_items_ubcf, common_items)]
    append_xgb = recommended_items_xgb[~np.isin(recommended_items_xgb, common_items)]

    if k is not None:
        # Common items alone may already exceed k.
        common_items = common_items[:max(k, 0)]
        nb_remaining = max(k - len(common_items), 0)

        # Split the remaining slots, never taking more than k items in total.
        # (Taking ceil(remaining / 2) from BOTH lists overshoots k by one whenever
        # the number of remaining slots is odd.)
        nb_ubcf = min(len(append_ubcf), math.ceil(nb_remaining / 2))
        nb_xgb = min(len(append_xgb), nb_remaining - nb_ubcf)
        # If the classifier list ran short, give its unused slots back.
        nb_ubcf = min(len(append_ubcf), nb_remaining - nb_xgb)

        append_ubcf = append_ubcf[:nb_ubcf]
        append_xgb = append_xgb[:nb_xgb]

    recommended_items = np.concatenate([common_items, append_ubcf, append_xgb])

    return recommended_items

def mean_precision_at_k_hybrid(X_test, recommended_items_by_user_ubcf, recommended_items_by_user_xgb, k):
    """Mean precision@k of the hybrid recommender.

    Only users present in both recommendation dicts are evaluated. Relies on the
    notebook globals `user_col` and `item_col`.

    Parameters
    ----------
    X_test : pd.DataFrame with the `user_col` and `item_col` columns of the test interactions.
    recommended_items_by_user_ubcf : dict mapping user id to the collaborative filtering items.
    recommended_items_by_user_xgb : dict mapping user id to the classifier items.
    k : int, number of hybrid recommendations per user.

    Returns
    -------
    (float, pd.DataFrame)
        The mean precision, and a frame indexed by user with the `recommendations`
        and `precision` columns.
    """
    users_in_both = np.intersect1d(
        list(recommended_items_by_user_ubcf.keys()),
        list(recommended_items_by_user_xgb.keys())
    )

    hybrid_lists = {}
    precisions = {}

    for u in tqdm(users_in_both, desc='users'):
        hybrid_items = get_recommended_items_hybrid(recommended_items_by_user_ubcf[u], recommended_items_by_user_xgb[u], k)
        relevant_items = X_test.loc[X_test[user_col] == u, item_col]
        hybrid_lists[u] = hybrid_items.tolist()

        nb_recommended = len(hybrid_items)
        # share of recommended items the user actually interacted with in the test set
        precisions[u] = sum(np.isin(hybrid_items, relevant_items)) / nb_recommended if nb_recommended != 0 else 0

    precision_series = pd.Series(precisions, name='precision')
    res = pd.concat([pd.Series(hybrid_lists, name='recommendations'), precision_series], axis=1)

    return np.mean(precision_series), res

In [None]:
# test: merge two small lists, first without a limit, then with k=5
print(get_recommended_items_hybrid([1,2,3,4,5], [3,4,1,6,7]))
print(get_recommended_items_hybrid([1,2,3,4,5], [3,4,1,6,7], k=5))

[1 3 4 2 5 6 7]
[1 3 4 2 6]


In [None]:
# Reuse the users evaluated for collaborative filtering so both models cover the same users.
recommended_items_by_user_ubcf = details_cf['recommendations'].to_dict()
selected_users = list(recommended_items_by_user_ubcf.keys())

recommended_items_by_user_xgb = {}
for u in tqdm(selected_users, desc='users'):
    # Pass the loop variable `u`. The stale global `user` left over from an earlier
    # test cell gave every user the recommendations of that single user.
    recommended_items_by_user_xgb[u] = get_recommended_items_clf(model_xgb, u, k=10)[item_col]

users:   0%|          | 0/100 [00:00<?, ?it/s]

In [None]:
# precision@10 of the hybrid model on the users shared by both recommendation dicts.
mean_precision_hybrid, details_hybrid = mean_precision_at_k_hybrid(X_test, recommended_items_by_user_ubcf, recommended_items_by_user_xgb, k=10)
mean_precision_hybrid

users:   0%|          | 0/100 [00:00<?, ?it/s]

0.005