In [1]:
!cd .. && make dataset && cd notebooks

>>> Downloading and extracting data files...
Data files already downloaded.
>>> OK.



In [2]:
import os
import sys

# Add source directory to python path
sys.path.append(os.path.abspath("../"))


import logging
import random
from datetime import datetime
from pathlib import Path

import implicit
import numpy as np
import pandas as pd
import plotly.express as px
import src.data.helpers as data_helpers
from implicit.als import AlternatingLeastSquares
from pandas.api.types import is_numeric_dtype
from scipy import sparse
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from tqdm import tqdm

# Plotly as Pandas plotting backend
pd.options.plotting.backend = "plotly"

DATA_PATH = Path("../data/")
RAW_DATA_PATH = Path(DATA_PATH, "raw")

CACHE = dict()


In [3]:
NUM_EMBEDDINGS = 250

articles = pd.concat(
    [
        pd.read_csv(
            Path(RAW_DATA_PATH, "articles_metadata.csv"),
            parse_dates=["created_at_ts"],
            date_parser=lambda x: datetime.fromtimestamp(int(x) / 1000),
            dtype={
                "article_id": "category",
                "category_id": "category",
                "publisher_id": "category",
                "words_count": "int",
            },
        ),
        pd.DataFrame(
            pd.read_pickle(Path(RAW_DATA_PATH, "articles_embeddings.pickle")),
            columns=["embedding_" + str(i) for i in range(NUM_EMBEDDINGS)],
        ),
    ],
    axis=1,
).set_index("article_id")

articles = data_helpers.reduce_dataframe_memory_usage(
    articles.astype({"created_at_ts": "datetime64[ns]"})
)

articles_sample = articles.sample(frac=0.01, random_state=42)

articles.describe(include="all", datetime_is_numeric=True)


Unnamed: 0,category_id,created_at_ts,publisher_id,words_count,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,...,embedding_240,embedding_241,embedding_242,embedding_243,embedding_244,embedding_245,embedding_246,embedding_247,embedding_248,embedding_249
count,364047.0,364047,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,...,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0
unique,461.0,,1.0,,,,,,,,...,,,,,,,,,,
top,281.0,,0.0,,,,,,,,...,,,,,,,,,,
freq,12817.0,,364047.0,,,,,,,,...,,,,,,,,,,
mean,,2016-09-17 01:25:54.949498624,,190.897727,-0.238647,-0.963,0.118547,-0.279296,-0.068579,0.045944,...,-0.133286,-0.081914,-0.060347,0.023003,0.076946,0.084603,0.062819,0.099768,0.155917,-0.041092
min,,2006-09-27 13:14:35,,0.0,-0.991183,-0.996455,-0.968431,-0.994966,-0.994489,-0.985974,...,-0.990412,-0.989408,-0.990432,-0.993626,-0.989042,-0.996902,-0.992921,-0.984733,-0.976071,-0.988213
25%,,2015-10-15 18:00:43.500000,,159.0,-0.620072,-0.974056,-0.289953,-0.718816,-0.503425,-0.354579,...,-0.547684,-0.445079,-0.479989,-0.404508,-0.248653,-0.267072,-0.306548,-0.313598,-0.201402,-0.420694
50%,,2017-03-13 17:27:29,,186.0,-0.302581,-0.967605,0.124339,-0.391535,-0.093734,0.062636,...,-0.175781,-0.094113,-0.078034,0.000726,0.105649,0.133525,0.083315,0.128757,0.188355,-0.015232
75%,,2017-11-05 15:09:11,,218.0,0.098015,-0.959061,0.545112,0.10832,0.345024,0.446979,...,0.250641,0.270006,0.341105,0.459386,0.417347,0.461466,0.441831,0.531453,0.538111,0.334226
max,,2018-03-13 13:12:30,,6690.0,0.983694,-0.514728,0.998341,0.978092,0.996798,0.996343,...,0.996401,0.981789,0.991332,0.995299,0.978823,0.989324,0.991445,0.997583,0.990507,0.968462


In [4]:
articles_dummy = pd.get_dummies(articles.astype({"created_at_ts": "int"}))
articles_std = StandardScaler().fit_transform(articles_dummy)
articles_pca = PCA(n_components=80, random_state=42).fit_transform(articles_std)


articles_clusters = MiniBatchKMeans(
    n_clusters=80, random_state=42, verbose=1
).fit_predict(articles_pca)

articles_clusters


Init 1/3 with method: k-means++
Inertia for init 1/3: 33238.775954
Init 2/3 with method: k-means++
Inertia for init 2/3: 35120.645117
Init 3/3 with method: k-means++
Inertia for init 3/3: 39650.874525
Minibatch iteration 1/364100: mean batch inertia: 185.091758, ewa inertia: 185.091758 
Minibatch iteration 2/364100: mean batch inertia: 137.810366, ewa inertia: 185.065783 
Minibatch iteration 3/364100: mean batch inertia: 133.232467, ewa inertia: 185.037307 
Minibatch iteration 4/364100: mean batch inertia: 169.132695, ewa inertia: 185.028569 
Minibatch iteration 5/364100: mean batch inertia: 146.469057, ewa inertia: 185.007385 
Minibatch iteration 6/364100: mean batch inertia: 182.037095, ewa inertia: 185.005753 
Minibatch iteration 7/364100: mean batch inertia: 122.643684, ewa inertia: 184.971493 
Minibatch iteration 8/364100: mean batch inertia: 125.671083, ewa inertia: 184.938915 
Minibatch iteration 9/364100: mean batch inertia: 134.266886, ewa inertia: 184.911077 
[MiniBatchKMeans

array([33, 40, 40, ..., 11, 37, 41], dtype=int32)

In [5]:
clicks = pd.concat(
    [
        pd.read_csv(
            click_file_path,
            parse_dates=["session_start", "click_timestamp"],
            date_parser=lambda x: datetime.fromtimestamp(int(int(x) / 1000)),
            dtype={
                "user_id": "category",
                "session_id": "category",
                "session_size": "int",
                "click_article_id": "category",
                "click_environment": "category",
                "click_deviceGroup": "category",
                "click_os": "category",
                "click_country": "category",
                "click_region": "category",
                "click_referrer_type": "category",
            },
        ).replace(
            {
                "click_environment": {
                    "1": "1 - Facebook Instant Article",
                    "2": "2 - Mobile App",
                    "3": "3 - AMP (Accelerated Mobile Pages)",
                    "4": "4 - Web",
                },
                "click_deviceGroup": {
                    "1": "1 - Tablet",
                    "2": "2 - TV",
                    "3": "3 - Empty",
                    "4": "4 - Mobile",
                    "5": "5 - Desktop",
                },
                "click_os": {
                    "1": "1 - Other",
                    "2": "2 - iOS",
                    "3": "3 - Android",
                    "4": "4 - Windows Phone",
                    "5": "5 - Windows Mobile",
                    "6": "6 - Windows",
                    "7": "7 - Mac OS X",
                    "8": "8 - Mac OS",
                    "9": "9 - Samsung",
                    "10": "10 - FireHbbTV",
                    "11": "11 - ATV OS X",
                    "12": "12 - tvOS",
                    "13": "13 - Chrome OS",
                    "14": "14 - Debian",
                    "15": "15 - Symbian OS",
                    "16": "16 - BlackBerry OS",
                    "17": "17 - Firefox OS",
                    "18": "18 - Android",
                    "19": "19 - Brew MP",
                    "20": "20 - Chromecast",
                    "21": "21 - webOS",
                    "22": "22 - Gentoo",
                    "23": "23 - Solaris",
                },
            }
        )
        for click_file_path in tqdm(
            sorted(
                Path(RAW_DATA_PATH, "clicks/clicks").glob("clicks_hour_*.csv")
            )
        )
    ],
    sort=False,
    ignore_index=True,
    verify_integrity=True,
)

clicks = data_helpers.reduce_dataframe_memory_usage(
    clicks.astype(
        {
            "session_start": "datetime64[ns]",
            "session_size": "int",
            "click_timestamp": "datetime64[ns]",
        }
    )
)

clicks.describe(include="all", datetime_is_numeric=True)


100%|██████████| 385/385 [01:15<00:00,  5.11it/s]


Unnamed: 0,user_id,session_id,session_start,session_size,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
count,2988181.0,2988181.0,2988181,2988181.0,2988181.0,2988181,2988181,2988181,2988181,2988181.0,2988181.0,2988181.0
unique,322897.0,1048594.0,,,46033.0,,3,5,8,11.0,28.0,7.0
top,5890.0,1507563657895091.0,,,160974.0,,4 - Web,1 - Tablet,17 - Firefox OS,1.0,25.0,2.0
freq,1232.0,124.0,,,37213.0,,2904478,1823162,1738138,2852406.0,804985.0,1602601.0
mean,,,2017-10-08 16:17:08.013155328,3.901885,,2017-10-08 16:51:05.070374400,,,,,,
min,,,2017-10-01 04:37:03,2.0,,2017-10-01 05:00:00,,,,,,
25%,,,2017-10-04 15:35:52,2.0,,2017-10-04 16:20:52,,,,,,
50%,,,2017-10-08 22:09:00,3.0,,2017-10-08 22:35:30,,,,,,
75%,,,2017-10-11 21:16:54,4.0,,2017-10-11 21:43:24,,,,,,
max,,,2017-10-17 05:36:19,124.0,,2017-11-13 21:04:14,,,,,,


In [6]:
if Path(DATA_PATH, "processed/users.csv").exists():
    users = pd.read_csv(
        Path(DATA_PATH, "processed/users.csv"),
        index_col="user_id",
        parse_dates=["MEAN_click_timestamp"],
        dtype={
            "user_id": "category",
            "COUNT_clicks": "int",
            "TOP_click_environment": "category",
            "TOP_click_deviceGroup": "category",
            "TOP_click_os": "category",
            "TOP_click_country": "category",
            "TOP_click_region": "category",
            "TOP_click_referrer_type": "category",
        },
    )
else:
    users = (
        clicks.reset_index()
        .groupby(["user_id"])
        .agg(
            COUNT_clicks=("index", "count"),
            MEAN_click_timestamp=("click_timestamp", "mean"),
            TOP_click_environment=("click_environment", lambda x: x.mode()[0]),
            TOP_click_deviceGroup=("click_deviceGroup", lambda x: x.mode()[0]),
            TOP_click_os=("click_os", lambda x: x.mode()[0]),
            TOP_click_country=("click_country", lambda x: x.mode()[0]),
            TOP_click_region=("click_region", lambda x: x.mode()[0]),
            TOP_click_referrer_type=("click_referrer_type", lambda x: x.mode()[0]),
        )
    )
    users.to_csv(Path(DATA_PATH, "processed/users.csv"))

users = data_helpers.reduce_dataframe_memory_usage(
    users.astype(
        {
            "COUNT_clicks": "int",
            "MEAN_click_timestamp": "datetime64[ns]",
        }
    )
)

users.describe(include="all", datetime_is_numeric=True)


Unnamed: 0,COUNT_clicks,MEAN_click_timestamp,TOP_click_environment,TOP_click_deviceGroup,TOP_click_os,TOP_click_country,TOP_click_region,TOP_click_referrer_type
count,322897.0,322897,322897,322897,322897,322897.0,322897.0,322897.0
unique,,,3,4,8,11.0,28.0,7.0
top,,,4 - Web,1 - Tablet,17 - Firefox OS,1.0,25.0,2.0
freq,,,313195,178033,168975,309375.0,88147.0,199292.0
mean,9.254285,2017-10-08 14:16:32.833416192,,,,,,
min,2.0,2017-10-01 05:00:16,,,,,,
25%,2.0,2017-10-06 00:11:28,,,,,,
50%,4.0,2017-10-08 13:04:36.333333248,,,,,,
75%,10.0,2017-10-10 21:33:16.037037056,,,,,,
max,1232.0,2017-10-31 19:35:18.750000128,,,,,,


In [7]:
users_dummy = pd.get_dummies(users.astype({"MEAN_click_timestamp": "int"}))
users_std = StandardScaler().fit_transform(users_dummy)
users_pca = PCA(n_components=20, random_state=42).fit_transform(users_std)


users_clusters = MiniBatchKMeans(n_clusters=20, random_state=42).fit_predict(
    users_pca
)

users_clusters


array([1, 2, 8, ..., 2, 1, 2], dtype=int32)

In [8]:
def get_ratings_from_clicks(clicks):
    count_user_article_clicks = (
        clicks.reset_index()
        .groupby(["user_id", "click_article_id"])
        .agg(
            COUNT_user_article_clicks=("index", "count"),
        )
    )

    count_user_clicks = (
        clicks.reset_index()
        .groupby(["user_id"])
        .agg(
            COUNT_user_clicks=("index", "count"),
        )
    )

    ratings = count_user_article_clicks.join(count_user_clicks, on="user_id")
    ratings["rating"] = (
        ratings["COUNT_user_article_clicks"] / ratings["COUNT_user_clicks"]
    )

    ratings = data_helpers.reduce_dataframe_memory_usage(
        ratings["rating"]
        .reset_index()
        .rename({"click_article_id": "article_id"}, axis=1)
    )

    return ratings


ratings = get_ratings_from_clicks(clicks)

ratings_sample = ratings.sample(frac=0.01, random_state=42)

ratings.describe(include="all", datetime_is_numeric=True)


Unnamed: 0,user_id,article_id,rating
count,2950710.0,2950710.0,2950710.0
unique,322897.0,46033.0,
top,5890.0,160974.0,
freq,1048.0,34145.0,
mean,,,0.1093554
std,,,0.1293055
min,,,0.0008116883
25%,,,0.02702703
50%,,,0.05882353
75%,,,0.125


In [9]:
from lightfm.data import Dataset


dataset = Dataset()

dataset.fit(
    users=list(users.index),
    items=list(articles.index),
    user_features=users_clusters,
    item_features=articles_clusters,
)

dataset.interactions_shape()

(322897, 364047)

In [10]:
(interactions, weights) = dataset.build_interactions(
    (row.user_id, row.article_id, row.rating)
    for row in tqdm(ratings.itertuples())
)


2950710it [00:08, 347314.11it/s]


In [13]:
user_features = dataset.build_user_features(
    (users.index[i], [users_clusters[i]])
    for i in tqdm(range(len(users)))
)


100%|██████████| 322897/322897 [00:02<00:00, 150354.94it/s]


In [12]:
item_features = dataset.build_item_features(
    (articles.index[i], [articles_clusters[i]])
    for i in tqdm(range(len(articles)))
)


100%|██████████| 364047/364047 [00:02<00:00, 158916.37it/s]


In [16]:
from lightfm import LightFM


model = LightFM(random_state=42)
model.fit(
    interactions=interactions,
    user_features=user_features,
    item_features=item_features,
    sample_weight=weights,
    epochs=100,
    num_threads=8,
    verbose=True,
)


Epoch: 100%|██████████| 100/100 [02:13<00:00,  1.33s/it]


<lightfm.lightfm.LightFM at 0x7fd3d3ee5550>

In [26]:
user_id = "5890"

user_idx = dataset._user_id_mapping[user_id]

pred = model.predict(
    user_ids=0,
    item_ids=list(dataset._item_id_mapping.values()),
)


In [34]:
pred_df = pd.DataFrame(
    {
        "article_id": list(dataset._item_id_mapping.keys()),
        "pred": pred,
    }
)


In [40]:
pred_df.sort_values("pred", ascending=False).reset_index(drop=True).head(20)

Unnamed: 0,article_id,pred
0,183176,1.798231
1,272143,1.520692
2,96210,1.468045
3,20691,1.457591
4,234698,1.433725
5,257291,1.403257
6,218028,1.370147
7,199198,1.36051
8,300082,1.281321
9,124350,1.26467


In [None]:
--------------------------------------------------------------------------------

SyntaxError: invalid syntax (2557283705.py, line 1)

In [None]:
enc = OrdinalEncoder(dtype="int")


ratings_enc = pd.DataFrame(
    enc.fit_transform(ratings[["user_id", "article_id"]]),
    columns=["user_idx", "article_idx"],
)
ratings_enc["rating"] = ratings["rating"]


ratings_sparse = sparse.csr_matrix(
    (
        ratings_enc["rating"].values,
        (ratings_enc["user_idx"].values, ratings_enc["article_idx"].values),
    ),
    shape=(
        ratings_enc["user_idx"].max() + 1,
        ratings_enc["article_idx"].max() + 1,
    ),
)

ratings_sparse


<322897x46033 sparse matrix of type '<class 'numpy.float32'>'
	with 2950710 stored elements in Compressed Sparse Row format>

In [None]:
model = AlternatingLeastSquares(factors=64, regularization=0.05)
model.fit(ratings_sparse)




  0%|          | 0/15 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
def aggregate_articles(articles):
    return articles.groupby(lambda x: True).agg(
        {
            col: "mean"
            if is_numeric_dtype(articles.dtypes[col])
            else lambda x: x.mode()[0]
            for col in articles.columns
        }
    )


def get_user_interest(user_id, clicks, articles, strategy="last_click"):
    user_id = str(user_id)

    if strategy == "last_click":
        last_clicked_article_id = (
            clicks.query("user_id == @user_id")
            .sort_values("click_timestamp", ascending=False)
            .reset_index(drop=True)
            .iloc[0]["click_article_id"]
        )
        interest = articles.query("article_id == @last_clicked_article_id")

    elif strategy == "last_session":
        last_session_id = (
            clicks.query("user_id == @user_id")
            .sort_values("click_timestamp", ascending=False)
            .reset_index(drop=True)
            .iloc[0]["session_id"]
        )
        session_article_ids = clicks.query("session_id == @last_session_id")[
            "click_article_id"
        ]
        interest = aggregate_articles(
            articles.query("article_id in @session_article_ids")
        ).drop(["article_id"], axis=1)

    elif strategy == "all_clicks":
        all_article_ids = clicks.query("user_id == @user_id")[
            "click_article_id"
        ]
        interest = aggregate_articles(
            articles.query("article_id in @all_article_ids")
        ).drop(["article_id"], axis=1)

    else:
        raise NotImplementedError

    return interest


def prepare_for_scale(articles, category_id):
    articles_copy = articles.drop(
        ["article_id", "similarity"], axis=1, errors="ignore"
    )
    articles_copy["category_id"] = articles_copy["category_id"].apply(
        lambda x: category_id if int(x) == category_id else 0
    )
    articles_copy["created_at_ts"] = articles_copy["created_at_ts"].apply(
        lambda x: x.value
    )

    return articles_copy


def get_closest_articles(interest, articles, n=10):
    category_id = interest["category_id"].iloc[0]

    scaler = StandardScaler()
    articles_std = scaler.fit_transform(
        prepare_for_scale(articles, category_id)
    )
    interest_std = scaler.transform(prepare_for_scale(interest, category_id))

    articles = articles.copy()
    articles["similarity"] = cosine_similarity(interest_std, articles_std)[0]

    return (
        articles.sort_values("similarity", ascending=False).iloc[:n],
        scaler,
        articles_std,
        interest_std,
    )


def get_collaborative_reco(user_idx, model, ratings_sparse, n=10):
    article_idxs, _ = model.recommend(
        user_idx,
        ratings_sparse[user_idx],
        N=n,
        filter_already_liked_items=False,
    )

    return article_idxs


In [None]:
user_id = "5890"
user_idx = enc.categories_[0].tolist().index(user_id)

interest = get_user_interest(user_id, clicks, articles, strategy="all_clicks")
category_id = interest["category_id"].iloc[0]

closest_article_idxs = get_collaborative_reco(user_idx, model, ratings_sparse)
closest_article_ids = enc.categories_[1][closest_article_idxs]

scaler = StandardScaler()
articles_std = scaler.fit_transform(prepare_for_scale(articles, category_id))
articles_sample_std = scaler.transform(
    prepare_for_scale(articles_sample, category_id)
)

interest_std = scaler.transform(prepare_for_scale(interest, category_id))

closest_articles = articles[articles["article_id"].isin(closest_article_ids)]
closest_articles_std = scaler.transform(
    prepare_for_scale(closest_articles, category_id)
)


In [None]:
pca = PCA(n_components=2)
articles_pca = pca.fit_transform(articles_sample_std)
interest_pca = pca.transform(interest_std)
closest_articles_pca = pca.transform(closest_articles_std)


# Plot the data in the PCA space
fig = px.scatter(
    x=articles_pca[:, 0],
    y=articles_pca[:, 1],
    color=articles_sample["category_id"],
    symbol=articles_sample["category_id"],
    title="PCA 2D",
    opacity=0.3,
    width=1200,
    height=800,
)
fig.add_scatter(
    x=interest_pca[:, 0],
    y=interest_pca[:, 1],
    mode="markers",
    marker=dict(color="green", size=30),
    text=f"User interest \n user_id: {user_id} \n category_id: {interest['category_id'].iloc[0]}",
)
fig.add_scatter(
    x=closest_articles_pca[:, 0],
    y=closest_articles_pca[:, 1],
    mode="markers",
    marker=dict(color=list(range(len(closest_articles_pca))), size=20),
    text=[
        f"rank: {i} / article_id: {a.article_id} / category_id: {a.category_id}"
        for i, a in enumerate(closest_articles.itertuples())
    ],
)
fig.show()


In [None]:
tsne = TSNE(n_components=2)
articles_tsne = tsne.fit_transform(
    np.concatenate((articles_sample_std, closest_articles_std, interest_std))
)

interest_tsne = articles_tsne[-1:]
articles_tsne = articles_tsne[:-1]

closest_articles_tsne = articles_tsne[-len(closest_articles) :]
articles_tsne = articles_tsne[: -len(closest_articles)]


# Plot the data in the t-SNE space
fig = px.scatter(
    x=articles_tsne[:, 0],
    y=articles_tsne[:, 1],
    color=articles_sample["category_id"],
    symbol=articles_sample["category_id"],
    title="t-SNE 2D",
    opacity=0.3,
    width=1200,
    height=800,
)
fig.add_scatter(
    x=interest_tsne[:, 0],
    y=interest_tsne[:, 1],
    mode="markers",
    marker=dict(color="green", size=30),
    text=f"User interest \n user_id: {user_id} \n category_id: {interest['category_id'].iloc[0]}",
)
fig.add_scatter(
    x=closest_articles_tsne[:, 0],
    y=closest_articles_tsne[:, 1],
    mode="markers",
    marker=dict(color=list(range(len(closest_articles_tsne))), size=20),
    text=[
        f"rank: {i} / article_id: {a.article_id} / category_id: {a.category_id}"
        for i, a in enumerate(closest_articles.itertuples())
    ],
)
fig.show()


In [None]:
users_last_click = (
    clicks.reset_index()
    .rename(columns={"index": "click_id"})
    .sort_values(by="click_timestamp")
    .groupby(["user_id"])
    .last()
)

X = get_ratings_from_clicks(clicks.drop(list(users_last_click["click_id"])))
y_true = dict(users_last_click["click_article_id"])


In [None]:
enc = OrdinalEncoder(dtype="int")


X_enc = pd.DataFrame(
    enc.fit_transform(X[["user_id", "article_id"]]),
    columns=["user_idx", "article_idx"],
)
X_enc["rating"] = X["rating"]


X_sparse = sparse.csr_matrix(
    (
        X_enc["rating"].values,
        (X_enc["user_idx"].values, X_enc["article_idx"].values),
    ),
    shape=(
        X_enc["user_idx"].max() + 1,
        X_enc["article_idx"].max() + 1,
    ),
)


In [None]:
model = AlternatingLeastSquares(factors=64, regularization=0.05)
model.fit(X_sparse)


  0%|          | 0/15 [00:00<?, ?it/s]

In [None]:
test_sample = random.sample(list(y_true.keys()), k=10000)

y_pred = dict(
    {
        user_id: list(
            enc.categories_[1][
                get_collaborative_reco(
                    enc.categories_[0].tolist().index(user_id),
                    model,
                    ratings_sparse,
                )
            ]
        )
        for user_id in tqdm(test_sample)
    }
)


100%|██████████| 10000/10000 [08:40<00:00, 19.21it/s]


In [None]:
def score_reco(y_true, y_pred):
    score = 0
    count = 0
    for user_id, pred_article_ids in y_pred.items():
        if user_id not in y_true.keys():
            logging.info(f"User {user_id} not found in true values")
            continue

        true_article_id = str(y_true[user_id])

        if true_article_id not in pred_article_ids:
            logging.info(
                f"Article {true_article_id} not found in predictions for user {user_id}"
            )
            continue

        rank = pred_article_ids.index(true_article_id) + 1
        score += 1 / rank
        count += 1

    # In range [0 , 1], higher is better
    return score / count


def mean_rank(y_true, y_pred):
    sum = 0
    count = 0
    for user_id, pred_article_ids in y_pred.items():
        if user_id not in y_true.keys():
            logging.info(f"User {user_id} not found in true values")
            continue

        true_article_id = str(y_true[user_id])

        if true_article_id not in pred_article_ids:
            logging.info(
                f"Article {true_article_id} not found in predictions for user {user_id}"
            )
            continue

        rank = pred_article_ids.index(true_article_id) + 1
        sum += rank
        count += 1

    # In range [1 , +Inf[, lower is better
    return sum / count


def mean_average_precision(y_true, y_pred, articles, k=10):
    average_precision = 0
    for user_id, pred_article_ids in y_pred.items():
        if user_id not in y_true.keys():
            logging.warning(f"User {user_id} not found in true values")
            continue

        true_category_id = articles.iloc[int(y_true[user_id])].category_id
        pred_categories = articles.iloc[
            [int(id) for id in pred_article_ids[:k]]
        ].category_id

        average_precision = (
            len(pred_categories[pred_categories == true_category_id]) / k
        )

    return average_precision / len(y_pred)


In [None]:
print(f"Score : {score_reco(y_true, y_pred)}")
print(f"Mean Rank : {mean_rank(y_true, y_pred)}")
print(
    f"Mean Average Precision : {mean_average_precision(y_true, y_pred, articles, k=1000)}"
)


Score : 0.38104122245913313
Mean Rank : 4.345997286295794
Mean Average Precision : 8e-07
