# Hybrid Recommendation System with LightFM

In ths notebook, we use [LightMF library](https://lightfm.readthedocs.io/ "LightFM library") to train a hybrid model using users and articles features to predict the ratings.


In [1]:
!cd .. && make dataset && cd notebooks

>>> Downloading and extracting data files...
Data files already downloaded.
>>> OK.



In [2]:
import os
import sys

# Add source directory to python path
sys.path.append(os.path.abspath("../"))


import logging
import random
from datetime import datetime
from pathlib import Path

import implicit
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import src.data.helpers as data_helpers
from implicit.als import AlternatingLeastSquares
from lightfm import LightFM
from pandas.api.types import is_numeric_dtype
from scipy import sparse
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from tqdm import tqdm

# Plotly as Pandas plotting backend
pd.options.plotting.backend = "plotly"

DATA_PATH = Path("../data/")
RAW_DATA_PATH = Path(DATA_PATH, "raw")

CACHE = dict()

In [3]:
NUM_EMBEDDINGS = 250

articles = pd.concat(
    [
        pd.read_csv(
            Path(RAW_DATA_PATH, "articles_metadata.csv"),
            parse_dates=["created_at_ts"],
            date_parser=lambda x: datetime.fromtimestamp(int(x) / 1000),
            dtype={
                "article_id": "category",
                "category_id": "category",
                "publisher_id": "category",
                "words_count": "int",
            },
        ),
        pd.DataFrame(
            pd.read_pickle(Path(RAW_DATA_PATH, "articles_embeddings.pickle")),
            columns=["embedding_" + str(i) for i in range(NUM_EMBEDDINGS)],
        ),
    ],
    axis=1,
).set_index("article_id")

articles = data_helpers.reduce_dataframe_memory_usage(
    articles.astype({"created_at_ts": "datetime64[ns]"})
)

articles_sample = articles.sample(frac=0.01, random_state=42)

articles.describe(include="all", datetime_is_numeric=True)

Unnamed: 0,category_id,created_at_ts,publisher_id,words_count,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,...,embedding_240,embedding_241,embedding_242,embedding_243,embedding_244,embedding_245,embedding_246,embedding_247,embedding_248,embedding_249
count,364047.0,364047,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,...,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0
unique,461.0,,1.0,,,,,,,,...,,,,,,,,,,
top,281.0,,0.0,,,,,,,,...,,,,,,,,,,
freq,12817.0,,364047.0,,,,,,,,...,,,,,,,,,,
mean,,2016-09-17 01:25:54.949498624,,190.897727,-0.238647,-0.963,0.118547,-0.279296,-0.068579,0.045944,...,-0.133286,-0.081914,-0.060347,0.023003,0.076946,0.084603,0.062819,0.099768,0.155917,-0.041092
min,,2006-09-27 13:14:35,,0.0,-0.991183,-0.996455,-0.968431,-0.994966,-0.994489,-0.985974,...,-0.990412,-0.989408,-0.990432,-0.993626,-0.989042,-0.996902,-0.992921,-0.984733,-0.976071,-0.988213
25%,,2015-10-15 18:00:43.500000,,159.0,-0.620072,-0.974056,-0.289953,-0.718816,-0.503425,-0.354579,...,-0.547684,-0.445079,-0.479989,-0.404508,-0.248653,-0.267072,-0.306548,-0.313598,-0.201402,-0.420694
50%,,2017-03-13 17:27:29,,186.0,-0.302581,-0.967605,0.124339,-0.391535,-0.093734,0.062636,...,-0.175781,-0.094113,-0.078034,0.000726,0.105649,0.133525,0.083315,0.128757,0.188355,-0.015232
75%,,2017-11-05 15:09:11,,218.0,0.098015,-0.959061,0.545112,0.10832,0.345024,0.446979,...,0.250641,0.270006,0.341105,0.459386,0.417347,0.461466,0.441831,0.531453,0.538111,0.334226
max,,2018-03-13 13:12:30,,6690.0,0.983694,-0.514728,0.998341,0.978092,0.996798,0.996343,...,0.996401,0.981789,0.991332,0.995299,0.978823,0.989324,0.991445,0.997583,0.990507,0.968462


In [4]:
articles_dummy = pd.get_dummies(articles.astype({"created_at_ts": "int"}))
articles_std = StandardScaler().fit_transform(articles_dummy)

pca = PCA(n_components=100, random_state=42)
pca.fit(articles_std)


px.line(
    x=range(1, len(pca.explained_variance_ratio_) + 1),
    y=pca.explained_variance_ratio_,
    title="PCA - Explained variance ratio",
    labels={"x": "Number of components", "y": "Explained variance ratio"},
)

In [5]:
articles_pca = PCA(n_components=25, random_state=42).fit_transform(articles_std)

articles_features_df = pd.DataFrame(
    articles_pca,
    columns=[f"pca_{i}" for i in range(articles_pca.shape[1])],
    index=articles.index,
)
articles_features_df

Unnamed: 0_level_0,pca_0,pca_1,pca_2,pca_3,pca_4,pca_5,pca_6,pca_7,pca_8,pca_9,...,pca_15,pca_16,pca_17,pca_18,pca_19,pca_20,pca_21,pca_22,pca_23,pca_24
article_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-1.904215,6.079079,1.537288,3.803354,-1.437657,0.300117,0.414942,1.152953,-0.518427,-1.580563,...,2.452330,-2.933797,-2.891124,2.202463,-0.271904,1.209298,-4.123378,3.279035,0.938082,-1.514533
1,-5.347634,-4.245106,-2.292850,3.663286,0.028960,3.627160,1.995622,-2.004382,0.694724,-3.107926,...,0.388457,-4.107143,1.576031,2.298082,1.705790,-3.961541,-3.657167,1.620356,-0.775952,0.864433
2,-3.696949,-0.448304,-6.074375,-0.811972,-0.613765,4.426349,-1.923623,-0.186540,-3.575506,-3.095512,...,-5.183629,-2.143729,-2.083800,-0.625227,-1.672333,-1.826249,-4.812046,0.313483,1.508683,-0.257272
3,-1.240911,-7.545885,-5.644175,1.761390,-2.215093,-1.675385,0.931860,-2.734978,0.209532,-4.369860,...,-5.014966,-0.547995,-1.997234,-2.591177,0.256643,-3.149014,-2.178933,-2.217641,0.947933,2.530837
4,-1.214285,-2.792402,-2.859502,0.522252,-3.329006,5.188328,0.040030,0.870313,-0.855068,-2.437743,...,-2.765760,-3.909355,-0.510423,-1.422974,0.831243,-3.846287,-2.838117,0.252872,3.382752,1.958392
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
364042,2.081017,-7.314420,8.566579,-0.309297,1.824057,-1.800716,1.581949,0.381537,1.649165,-1.735415,...,0.143416,-1.719001,0.596776,0.885950,-0.203514,0.243687,-0.569196,1.108324,-1.715699,0.259476
364043,0.884326,-2.186222,6.148029,3.325571,14.074191,4.150537,1.042010,3.243976,2.587766,-1.147832,...,0.266136,1.974945,1.283965,-1.979753,0.285304,2.080461,4.952106,1.876104,-1.604289,-1.634543
364044,1.386953,-9.478232,3.365109,3.498530,2.230966,0.297405,-2.376174,-1.420260,3.050013,4.069092,...,-3.047975,1.621384,0.588765,2.007028,2.347073,-2.376871,1.161152,-1.663997,-2.283985,-0.219270
364045,11.516807,-1.684870,0.044962,-0.397379,-2.364314,-2.258477,-2.525652,0.415384,1.558364,-2.119358,...,-1.849083,1.184907,0.586788,-0.456083,0.367469,-0.804240,1.702255,-0.819011,-2.050353,-1.100136


In [6]:
num_clusters = np.logspace(1, 7, num=25, base=2).astype(int)
inertia = []
calinski_harabasz = []
davies_bouldin = []
for k in tqdm(num_clusters):
    kmeans = MiniBatchKMeans(n_clusters=k, random_state=42).fit(articles_pca)
    inertia.append(kmeans.inertia_)
    calinski_harabasz.append(calinski_harabasz_score(articles_pca, kmeans.labels_))
    davies_bouldin.append(davies_bouldin_score(articles_pca, kmeans.labels_))


fig = px.line(
    x=num_clusters,
    y=inertia,
    title="Inertia",
    labels={"x": "Number of clusters", "y": "Inertia"},
)
fig.show()

fig = px.line(
    x=num_clusters,
    y=calinski_harabasz,
    title="Calinski-Harabasz",
    labels={"x": "Number of clusters", "y": "Calinski-Harabasz"},
)
fig.show()

fig = px.line(
    x=num_clusters,
    y=davies_bouldin,
    title="Davies-Bouldin",
    labels={"x": "Number of clusters", "y": "Davies-Bouldin"},
)
fig.show()

100%|██████████| 25/25 [01:51<00:00,  4.46s/it]


In [7]:
articles_clusters = MiniBatchKMeans(n_clusters=30, random_state=42).fit_predict(
    articles_pca
)

articles_clusters

array([19,  5, 29, ..., 17,  8,  4], dtype=int32)

In [8]:
clicks = pd.concat(
    [
        pd.read_csv(
            click_file_path,
            parse_dates=["session_start", "click_timestamp"],
            date_parser=lambda x: datetime.fromtimestamp(int(int(x) / 1000)),
            dtype={
                "user_id": "category",
                "session_id": "category",
                "session_size": "int",
                "click_article_id": "category",
                "click_environment": "category",
                "click_deviceGroup": "category",
                "click_os": "category",
                "click_country": "category",
                "click_region": "category",
                "click_referrer_type": "category",
            },
        ).replace(
            {
                "click_environment": {
                    "1": "1 - Facebook Instant Article",
                    "2": "2 - Mobile App",
                    "3": "3 - AMP (Accelerated Mobile Pages)",
                    "4": "4 - Web",
                },
                "click_deviceGroup": {
                    "1": "1 - Tablet",
                    "2": "2 - TV",
                    "3": "3 - Empty",
                    "4": "4 - Mobile",
                    "5": "5 - Desktop",
                },
                "click_os": {
                    "1": "1 - Other",
                    "2": "2 - iOS",
                    "3": "3 - Android",
                    "4": "4 - Windows Phone",
                    "5": "5 - Windows Mobile",
                    "6": "6 - Windows",
                    "7": "7 - Mac OS X",
                    "8": "8 - Mac OS",
                    "9": "9 - Samsung",
                    "10": "10 - FireHbbTV",
                    "11": "11 - ATV OS X",
                    "12": "12 - tvOS",
                    "13": "13 - Chrome OS",
                    "14": "14 - Debian",
                    "15": "15 - Symbian OS",
                    "16": "16 - BlackBerry OS",
                    "17": "17 - Firefox OS",
                    "18": "18 - Android",
                    "19": "19 - Brew MP",
                    "20": "20 - Chromecast",
                    "21": "21 - webOS",
                    "22": "22 - Gentoo",
                    "23": "23 - Solaris",
                },
            }
        )
        for click_file_path in tqdm(
            sorted(Path(RAW_DATA_PATH, "clicks/clicks").glob("clicks_hour_*.csv"))
        )
    ],
    sort=False,
    ignore_index=True,
    verify_integrity=True,
)

clicks = data_helpers.reduce_dataframe_memory_usage(
    clicks.astype(
        {
            "session_start": "datetime64[ns]",
            "session_size": "int",
            "click_timestamp": "datetime64[ns]",
        }
    )
)

clicks.describe(include="all", datetime_is_numeric=True)

100%|██████████| 385/385 [01:16<00:00,  5.06it/s]


Unnamed: 0,user_id,session_id,session_start,session_size,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
count,2988181.0,2988181.0,2988181,2988181.0,2988181.0,2988181,2988181,2988181,2988181,2988181.0,2988181.0,2988181.0
unique,322897.0,1048594.0,,,46033.0,,3,5,8,11.0,28.0,7.0
top,5890.0,1507563657895091.0,,,160974.0,,4 - Web,1 - Tablet,17 - Firefox OS,1.0,25.0,2.0
freq,1232.0,124.0,,,37213.0,,2904478,1823162,1738138,2852406.0,804985.0,1602601.0
mean,,,2017-10-08 16:17:08.013155328,3.901885,,2017-10-08 16:51:05.070374400,,,,,,
min,,,2017-10-01 04:37:03,2.0,,2017-10-01 05:00:00,,,,,,
25%,,,2017-10-04 15:35:52,2.0,,2017-10-04 16:20:52,,,,,,
50%,,,2017-10-08 22:09:00,3.0,,2017-10-08 22:35:30,,,,,,
75%,,,2017-10-11 21:16:54,4.0,,2017-10-11 21:43:24,,,,,,
max,,,2017-10-17 05:36:19,124.0,,2017-11-13 21:04:14,,,,,,


In [9]:
if Path(DATA_PATH, "processed/users.csv").exists():
    users = pd.read_csv(
        Path(DATA_PATH, "processed/users.csv"),
        index_col="user_id",
        parse_dates=["MEAN_click_timestamp"],
        dtype={
            "user_id": "category",
            "COUNT_clicks": "int",
            "TOP_click_environment": "category",
            "TOP_click_deviceGroup": "category",
            "TOP_click_os": "category",
            "TOP_click_country": "category",
            "TOP_click_region": "category",
            "TOP_click_referrer_type": "category",
        },
    )
else:
    users = (
        clicks.reset_index()
        .groupby(["user_id"])
        .agg(
            COUNT_clicks=("index", "count"),
            MEAN_click_timestamp=("click_timestamp", "mean"),
            TOP_click_environment=("click_environment", lambda x: x.mode()[0]),
            TOP_click_deviceGroup=("click_deviceGroup", lambda x: x.mode()[0]),
            TOP_click_os=("click_os", lambda x: x.mode()[0]),
            TOP_click_country=("click_country", lambda x: x.mode()[0]),
            TOP_click_region=("click_region", lambda x: x.mode()[0]),
            TOP_click_referrer_type=("click_referrer_type", lambda x: x.mode()[0]),
        )
    )
    users.to_csv(Path(DATA_PATH, "processed/users.csv"))

users = data_helpers.reduce_dataframe_memory_usage(
    users.astype(
        {
            "COUNT_clicks": "int",
            "MEAN_click_timestamp": "datetime64[ns]",
        }
    )
)

users.describe(include="all", datetime_is_numeric=True)

Unnamed: 0,COUNT_clicks,MEAN_click_timestamp,TOP_click_environment,TOP_click_deviceGroup,TOP_click_os,TOP_click_country,TOP_click_region,TOP_click_referrer_type
count,322897.0,322897,322897,322897,322897,322897.0,322897.0,322897.0
unique,,,3,4,8,11.0,28.0,7.0
top,,,4 - Web,1 - Tablet,17 - Firefox OS,1.0,25.0,2.0
freq,,,313195,178033,168975,309375.0,88147.0,199292.0
mean,9.254285,2017-10-08 14:16:32.833416192,,,,,,
min,2.0,2017-10-01 05:00:16,,,,,,
25%,2.0,2017-10-06 00:11:28,,,,,,
50%,4.0,2017-10-08 13:04:36.333333248,,,,,,
75%,10.0,2017-10-10 21:33:16.037037056,,,,,,
max,1232.0,2017-10-31 19:35:18.750000128,,,,,,


In [10]:
users_dummy = pd.get_dummies(users.astype({"MEAN_click_timestamp": "int"}))
users_std = StandardScaler().fit_transform(users_dummy)

pca = PCA(n_components=50, random_state=42)
pca.fit(users_std)


px.line(
    x=range(1, len(pca.explained_variance_ratio_) + 1),
    y=pca.explained_variance_ratio_,
    title="PCA - Explained variance ratio",
    labels={"x": "Number of components", "y": "Explained variance ratio"},
)

In [11]:
users_pca = PCA(n_components=10, random_state=42).fit_transform(users_std)

users_features_df = pd.DataFrame(
    users_pca,
    columns=[f"pca_{i}" for i in range(users_pca.shape[1])],
    index=users.index,
)
users_features_df

Unnamed: 0_level_0,pca_0,pca_1,pca_2,pca_3,pca_4,pca_5,pca_6,pca_7,pca_8,pca_9
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,1.699164,-0.397796,-0.446491,-1.127296,0.663800,0.483216,0.715028,0.924418,-0.468858,-1.221070
1,-1.655270,0.228214,0.374687,-1.113772,0.300500,-0.940414,-0.742250,0.540289,0.539775,-0.487475
10,-2.056614,0.091305,-0.038654,2.007981,-0.688700,-0.454136,0.822261,-2.200997,-1.495055,0.471278
100,1.601323,-1.244009,-1.178582,1.780181,-0.127551,1.288546,-0.374174,-0.314391,0.227090,0.544389
1000,1.874431,-1.109712,-0.910541,-1.110672,-0.504261,1.237397,-0.263376,-0.316927,0.462357,0.520649
...,...,...,...,...,...,...,...,...,...,...
99995,-1.697570,0.243863,0.353953,-1.424104,-0.581714,-0.291931,0.416355,0.257983,0.681366,0.670510
99996,-1.649689,0.228985,0.373032,-1.153030,0.248697,-0.953786,-0.674827,0.588817,0.570464,-0.528344
99997,-1.729577,0.249486,0.358148,-1.269522,-0.346360,-0.214800,0.165836,0.072994,0.574554,0.876380
99998,1.791673,-0.467449,-0.416061,-1.212874,0.771840,-0.298143,0.487644,2.078244,-0.108107,-1.373768


In [12]:
num_clusters = np.logspace(1, 7, num=25, base=2).astype(int)
inertia = []
calinski_harabasz = []
davies_bouldin = []
for k in tqdm(num_clusters):
    kmeans = MiniBatchKMeans(n_clusters=k, random_state=42).fit(users_pca)
    inertia.append(kmeans.inertia_)
    calinski_harabasz.append(calinski_harabasz_score(users_pca, kmeans.labels_))
    davies_bouldin.append(davies_bouldin_score(users_pca, kmeans.labels_))


fig = px.line(
    x=num_clusters,
    y=inertia,
    title="Inertia",
    labels={"x": "Number of clusters", "y": "Inertia"},
)
fig.show()

fig = px.line(
    x=num_clusters,
    y=calinski_harabasz,
    title="Calinski-Harabasz",
    labels={"x": "Number of clusters", "y": "Calinski-Harabasz"},
)
fig.show()

fig = px.line(
    x=num_clusters,
    y=davies_bouldin,
    title="Davies-Bouldin",
    labels={"x": "Number of clusters", "y": "Davies-Bouldin"},
)
fig.show()

100%|██████████| 25/25 [00:18<00:00,  1.35it/s]


In [13]:
users_clusters = MiniBatchKMeans(n_clusters=30, random_state=42).fit_predict(users_pca)

users_clusters

array([21, 23, 11, ...,  2, 26, 12], dtype=int32)

In [14]:
def get_ratings_from_clicks(clicks):
    count_user_article_clicks = (
        clicks.reset_index()
        .groupby(["user_id", "click_article_id"])
        .agg(
            COUNT_user_article_clicks=("index", "count"),
        )
    )

    count_user_clicks = (
        clicks.reset_index()
        .groupby(["user_id"])
        .agg(
            COUNT_user_clicks=("index", "count"),
        )
    )

    ratings = count_user_article_clicks.join(count_user_clicks, on="user_id")
    ratings["rating"] = (
        ratings["COUNT_user_article_clicks"] / ratings["COUNT_user_clicks"]
    )

    ratings = data_helpers.reduce_dataframe_memory_usage(
        ratings["rating"]
        .reset_index()
        .rename({"click_article_id": "article_id"}, axis=1)
    )

    return ratings


ratings = get_ratings_from_clicks(clicks)

ratings_sample = ratings.sample(frac=0.01, random_state=42)

ratings.describe(include="all", datetime_is_numeric=True)

Unnamed: 0,user_id,article_id,rating
count,2950710.0,2950710.0,2950710.0
unique,322897.0,46033.0,
top,5890.0,160974.0,
freq,1048.0,34145.0,
mean,,,0.1093554
std,,,0.1293055
min,,,0.0008116883
25%,,,0.02702703
50%,,,0.05882353
75%,,,0.125


In [15]:
from lightfm.data import Dataset


dataset = Dataset()

dataset.fit(
    users=list(users.index),
    items=list(articles.index),
    user_features=users_features_df.columns.tolist(),
    item_features=articles_features_df.columns.tolist(),
)

dataset.interactions_shape()

(322897, 364047)

In [16]:
(interactions, weights) = dataset.build_interactions(
    (row.user_id, row.article_id, row.rating) for row in tqdm(ratings.itertuples())
)

2950710it [00:08, 355857.48it/s]


In [17]:
user_features = dataset.build_user_features(
    (user_id, dict(user_feats))
    for user_id, user_feats in tqdm(users_features_df.iterrows())
)

322897it [00:20, 15462.67it/s]


In [18]:
item_features = dataset.build_item_features(
    (article_id, dict(article_feats))
    for article_id, article_feats in tqdm(articles_features_df.iterrows())
)

364047it [00:40, 9040.42it/s]


In [19]:
model = LightFM(loss="warp", random_state=42)
model.fit(
    interactions=interactions,
    user_features=user_features,
    item_features=item_features,
    sample_weight=weights,
    epochs=20,
    num_threads=8,
    verbose=True,
)

Epoch: 100%|██████████| 20/20 [06:28<00:00, 19.42s/it]


<lightfm.lightfm.LightFM at 0x7fca94179c70>

In [20]:
user_id = "5890"

user_idx = dataset._user_id_mapping[user_id]

pred = model.predict(
    user_ids=user_idx,
    item_ids=list(dataset._item_id_mapping.values()),
)

In [21]:
pred_df = pd.DataFrame(
    {
        "article_id": list(dataset._item_id_mapping.keys()),
        "pred": pred,
    }
)

pred_df.sort_values("pred", ascending=False).reset_index(drop=True).head(20)

Unnamed: 0,article_id,pred
0,272143,40.422104
1,123909,38.064671
2,168623,37.152534
3,336221,35.968914
4,336245,34.323261
5,183176,33.101089
6,96210,32.374023
7,160417,32.352455
8,331116,31.572062
9,336223,30.77297


In [22]:
def aggregate_articles(articles):
    return articles.groupby(lambda x: True).agg(
        {
            col: "mean"
            if is_numeric_dtype(articles.dtypes[col])
            else lambda x: x.mode()[0]
            for col in articles.columns
        }
    )


def get_user_interest(user_id, clicks, articles, strategy="last_click"):
    user_id = str(user_id)

    if strategy == "last_click":
        last_clicked_article_id = (
            clicks.query("user_id == @user_id")
            .sort_values("click_timestamp", ascending=False)
            .reset_index(drop=True)
            .iloc[0]["click_article_id"]
        )
        interest = articles.query("article_id == @last_clicked_article_id")

    elif strategy == "last_session":
        last_session_id = (
            clicks.query("user_id == @user_id")
            .sort_values("click_timestamp", ascending=False)
            .reset_index(drop=True)
            .iloc[0]["session_id"]
        )
        session_article_ids = clicks.query("session_id == @last_session_id")[
            "click_article_id"
        ]
        interest = aggregate_articles(
            articles.query("article_id in @session_article_ids")
        ).drop(["article_id"], axis=1)

    elif strategy == "all_clicks":
        all_article_ids = clicks.query("user_id == @user_id")["click_article_id"]
        interest = aggregate_articles(
            articles.query("article_id in @all_article_ids")
        ).drop(["article_id"], axis=1)

    else:
        raise NotImplementedError

    return interest


def prepare_for_scale(articles, category_id):
    articles_copy = articles.drop(["article_id", "similarity"], axis=1, errors="ignore")
    articles_copy["category_id"] = articles_copy["category_id"].apply(
        lambda x: category_id if int(x) == category_id else 0
    )
    articles_copy["created_at_ts"] = articles_copy["created_at_ts"].apply(
        lambda x: x.value
    )

    return articles_copy


def get_closest_articles(interest, articles, n=10):
    category_id = interest["category_id"].iloc[0]

    scaler = StandardScaler()
    articles_std = scaler.fit_transform(prepare_for_scale(articles, category_id))
    interest_std = scaler.transform(prepare_for_scale(interest, category_id))

    articles = articles.copy()
    articles["similarity"] = cosine_similarity(interest_std, articles_std)[0]

    return (
        articles.sort_values("similarity", ascending=False).iloc[:n],
        scaler,
        articles_std,
        interest_std,
    )


def get_collaborative_reco(user_id, model, dataset, n=10):
    user_idx = dataset._user_id_mapping[user_id]

    pred = model.predict(
        user_ids=user_idx,
        item_ids=list(dataset._item_id_mapping.values()),
    )

    pred_df = pd.DataFrame(
        {
            "article_id": list(dataset._item_id_mapping.keys()),
            "pred": pred,
        }
    )

    return list(pred_df.sort_values("pred", ascending=False).head(n)["article_id"])

In [23]:
user_id = "5890"

interest = get_user_interest(
    user_id, clicks, articles.reset_index(), strategy="all_clicks"
)
category_id = interest["category_id"].iloc[0]

closest_article_ids = get_collaborative_reco(user_id, model, dataset)

scaler = StandardScaler()
articles_std = scaler.fit_transform(prepare_for_scale(articles, category_id))
articles_sample_std = scaler.transform(prepare_for_scale(articles_sample, category_id))

interest_std = scaler.transform(prepare_for_scale(interest, category_id))

closest_articles = articles.iloc[closest_article_ids].reset_index()
closest_articles_std = scaler.transform(
    prepare_for_scale(closest_articles, category_id)
)

In [24]:
pca = PCA(n_components=2)
articles_pca = pca.fit_transform(articles_sample_std)
interest_pca = pca.transform(interest_std)
closest_articles_pca = pca.transform(closest_articles_std)


# Plot the data in the PCA space
fig = px.scatter(
    x=articles_pca[:, 0],
    y=articles_pca[:, 1],
    color=articles_sample["category_id"],
    symbol=articles_sample["category_id"],
    title="PCA 2D",
    opacity=0.3,
    width=1200,
    height=800,
)
fig.add_scatter(
    x=interest_pca[:, 0],
    y=interest_pca[:, 1],
    mode="markers",
    marker=dict(color="green", size=30),
    text=f"User interest \n user_id: {user_id} \n category_id: {interest['category_id'].iloc[0]}",
)
fig.add_scatter(
    x=closest_articles_pca[:, 0],
    y=closest_articles_pca[:, 1],
    mode="markers",
    marker=dict(color=list(range(len(closest_articles_pca))), size=20),
    text=[
        f"rank: {i} / article_id: {a.article_id} / category_id: {a.category_id}"
        for i, a in enumerate(closest_articles.itertuples())
    ],
)
fig.show()

In [25]:
tsne = TSNE(n_components=2)
articles_tsne = tsne.fit_transform(
    np.concatenate((articles_sample_std, closest_articles_std, interest_std))
)

interest_tsne = articles_tsne[-1:]
articles_tsne = articles_tsne[:-1]

closest_articles_tsne = articles_tsne[-len(closest_articles) :]
articles_tsne = articles_tsne[: -len(closest_articles)]


# Plot the data in the t-SNE space
fig = px.scatter(
    x=articles_tsne[:, 0],
    y=articles_tsne[:, 1],
    color=articles_sample["category_id"],
    symbol=articles_sample["category_id"],
    title="t-SNE 2D",
    opacity=0.3,
    width=1200,
    height=800,
)
fig.add_scatter(
    x=interest_tsne[:, 0],
    y=interest_tsne[:, 1],
    mode="markers",
    marker=dict(color="green", size=30),
    text=f"User interest \n user_id: {user_id} \n category_id: {interest['category_id'].iloc[0]}",
)
fig.add_scatter(
    x=closest_articles_tsne[:, 0],
    y=closest_articles_tsne[:, 1],
    mode="markers",
    marker=dict(color=list(range(len(closest_articles_tsne))), size=20),
    text=[
        f"rank: {i} / article_id: {a.article_id} / category_id: {a.category_id}"
        for i, a in enumerate(closest_articles.itertuples())
    ],
)
fig.show()

In [26]:
users_last_click = (
    clicks.reset_index()
    .rename(columns={"index": "click_id"})
    .sort_values(by="click_timestamp")
    .groupby(["user_id"])
    .last()
)

X = get_ratings_from_clicks(clicks.drop(list(users_last_click["click_id"])))
y_true = dict(users_last_click["click_article_id"])

In [28]:
(interactions, weights) = dataset.build_interactions(
    (row.user_id, row.article_id, row.rating) for row in tqdm(X.itertuples())
)

model = LightFM(loss="warp", random_state=42)
model.fit(
    interactions=interactions,
    user_features=user_features,
    item_features=item_features,
    sample_weight=weights,
    epochs=20,
    num_threads=8,
    verbose=True,
)

2629418it [00:07, 337280.07it/s]
Epoch: 100%|██████████| 20/20 [05:32<00:00, 16.64s/it]


<lightfm.lightfm.LightFM at 0x7fca9886a430>

In [29]:
test_sample = random.sample(list(y_true.keys()), k=1000)

y_pred = dict(
    {
        user_id: get_collaborative_reco(user_id, model, dataset, n=1000)
        for user_id in tqdm(test_sample)
    }
)

100%|██████████| 1000/1000 [02:45<00:00,  6.05it/s]


In [30]:
def score_reco(y_true, y_pred):
    score = 0
    count = 0
    for user_id, pred_article_ids in y_pred.items():
        if user_id not in y_true.keys():
            logging.info(f"User {user_id} not found in true values")
            continue

        true_article_id = str(y_true[user_id])

        if true_article_id not in pred_article_ids:
            logging.info(
                f"Article {true_article_id} not found in predictions for user {user_id}"
            )
            continue

        rank = pred_article_ids.index(true_article_id) + 1
        score += 1 / rank
        count += 1

    # In range [0 , 1], higher is better
    return score / count


def mean_rank(y_true, y_pred):
    sum = 0
    count = 0
    for user_id, pred_article_ids in y_pred.items():
        if user_id not in y_true.keys():
            logging.info(f"User {user_id} not found in true values")
            continue

        true_article_id = str(y_true[user_id])

        if true_article_id not in pred_article_ids:
            logging.info(
                f"Article {true_article_id} not found in predictions for user {user_id}"
            )
            continue

        rank = pred_article_ids.index(true_article_id) + 1
        sum += rank
        count += 1

    # In range [1 , +Inf[, lower is better
    return sum / count


def mean_average_precision(y_true, y_pred, articles, k=10):
    average_precision = 0
    for user_id, pred_article_ids in y_pred.items():
        if user_id not in y_true.keys():
            logging.warning(f"User {user_id} not found in true values")
            continue

        true_category_id = articles.iloc[int(y_true[user_id])].category_id
        pred_categories = articles.iloc[
            [int(id) for id in pred_article_ids[:k]]
        ].category_id

        average_precision = (
            len(pred_categories[pred_categories == true_category_id]) / k
        )

    return average_precision / len(y_pred)

In [31]:
print(f"Score : {score_reco(y_true, y_pred)}")
print(f"Mean Rank : {mean_rank(y_true, y_pred)}")
print(
    f"Mean Average Precision : {mean_average_precision(y_true, y_pred, articles, k=1000)}"
)

Score : 0.038784880471139364
Mean Rank : 228.97051282051282
Mean Average Precision : 0.0001
