In [1]:
!cd .. && make dataset && cd notebooks

>>> Downloading and extracting data files...
Data files already downloaded.
>>> OK.



In [2]:
import os
import sys

# Add source directory to python path
sys.path.append(os.path.abspath("../"))


import logging
import random
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import src.data.helpers as data_helpers
from tqdm import tqdm

# Plotly as Pandas plotting backend
pd.options.plotting.backend = "plotly"

RAW_DATA_PATH = "../data/raw"

CACHE = dict()

In [3]:
NUM_EMBEDDINGS = 250

articles = pd.concat(
    [
        pd.read_csv(
            Path(RAW_DATA_PATH, "articles_metadata.csv"),
            parse_dates=["created_at_ts"],
            date_parser=lambda x: datetime.fromtimestamp(int(x) / 1000),
            dtype={
                "article_id": "category",
                "category_id": "category",
                "publisher_id": "category",
                "words_count": "int",
            },
        ),
        pd.DataFrame(
            pd.read_pickle(Path(RAW_DATA_PATH, "articles_embeddings.pickle")),
            columns=["embedding_" + str(i) for i in range(NUM_EMBEDDINGS)],
        ),
    ],
    axis=1,
)

articles = data_helpers.reduce_dataframe_memory_usage(
    articles.astype({"created_at_ts": "datetime64[ns]"})
)

articles_sample = articles.sample(frac=0.01, random_state=42)

articles.describe(include="all", datetime_is_numeric=True)

Unnamed: 0,article_id,category_id,created_at_ts,publisher_id,words_count,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,...,embedding_240,embedding_241,embedding_242,embedding_243,embedding_244,embedding_245,embedding_246,embedding_247,embedding_248,embedding_249
count,364047.0,364047.0,364047,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,...,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0
unique,364047.0,461.0,,1.0,,,,,,,...,,,,,,,,,,
top,0.0,281.0,,0.0,,,,,,,...,,,,,,,,,,
freq,1.0,12817.0,,364047.0,,,,,,,...,,,,,,,,,,
mean,,,2016-09-17 01:25:54.949498624,,190.897727,-0.238647,-0.963,0.118547,-0.279296,-0.068579,...,-0.133286,-0.081914,-0.060347,0.023003,0.076946,0.084603,0.062819,0.099768,0.155917,-0.041092
min,,,2006-09-27 13:14:35,,0.0,-0.991183,-0.996455,-0.968431,-0.994966,-0.994489,...,-0.990412,-0.989408,-0.990432,-0.993626,-0.989042,-0.996902,-0.992921,-0.984733,-0.976071,-0.988213
25%,,,2015-10-15 18:00:43.500000,,159.0,-0.620072,-0.974056,-0.289953,-0.718816,-0.503425,...,-0.547684,-0.445079,-0.479989,-0.404508,-0.248653,-0.267072,-0.306548,-0.313598,-0.201402,-0.420694
50%,,,2017-03-13 17:27:29,,186.0,-0.302581,-0.967605,0.124339,-0.391535,-0.093734,...,-0.175781,-0.094113,-0.078034,0.000726,0.105649,0.133525,0.083315,0.128757,0.188355,-0.015232
75%,,,2017-11-05 15:09:11,,218.0,0.098015,-0.959061,0.545112,0.10832,0.345024,...,0.250641,0.270006,0.341105,0.459386,0.417347,0.461466,0.441831,0.531453,0.538111,0.334226
max,,,2018-03-13 13:12:30,,6690.0,0.983694,-0.514728,0.998341,0.978092,0.996798,...,0.996401,0.981789,0.991332,0.995299,0.978823,0.989324,0.991445,0.997583,0.990507,0.968462


In [4]:
clicks = pd.concat(
    [
        pd.read_csv(
            click_file_path,
            parse_dates=["session_start", "click_timestamp"],
            date_parser=lambda x: datetime.fromtimestamp(int(int(x) / 1000)),
            dtype={
                "user_id": "category",
                "session_id": "category",
                "session_size": "int",
                "click_article_id": "category",
                "click_environment": "category",
                "click_deviceGroup": "category",
                "click_os": "category",
                "click_country": "category",
                "click_region": "category",
                "click_referrer_type": "category",
            },
        ).replace(
            {
                "click_environment": {
                    "1": "1 - Facebook Instant Article",
                    "2": "2 - Mobile App",
                    "3": "3 - AMP (Accelerated Mobile Pages)",
                    "4": "4 - Web",
                },
                "click_deviceGroup": {
                    "1": "1 - Tablet",
                    "2": "2 - TV",
                    "3": "3 - Empty",
                    "4": "4 - Mobile",
                    "5": "5 - Desktop",
                },
                "click_os": {
                    "1": "1 - Other",
                    "2": "2 - iOS",
                    "3": "3 - Android",
                    "4": "4 - Windows Phone",
                    "5": "5 - Windows Mobile",
                    "6": "6 - Windows",
                    "7": "7 - Mac OS X",
                    "8": "8 - Mac OS",
                    "9": "9 - Samsung",
                    "10": "10 - FireHbbTV",
                    "11": "11 - ATV OS X",
                    "12": "12 - tvOS",
                    "13": "13 - Chrome OS",
                    "14": "14 - Debian",
                    "15": "15 - Symbian OS",
                    "16": "16 - BlackBerry OS",
                    "17": "17 - Firefox OS",
                    "18": "18 - Android",
                    "19": "19 - Brew MP",
                    "20": "20 - Chromecast",
                    "21": "21 - webOS",
                    "22": "22 - Gentoo",
                    "23": "23 - Solaris",
                },
            }
        )
        for click_file_path in tqdm(
            sorted(Path(RAW_DATA_PATH, "clicks/clicks").glob("clicks_hour_*.csv"))
        )
    ],
    sort=False,
    ignore_index=True,
    verify_integrity=True,
)

clicks = data_helpers.reduce_dataframe_memory_usage(
    clicks.astype(
        {"session_start": "datetime64[ns]", "click_timestamp": "datetime64[ns]"}
    )
)

clicks.describe(include="all", datetime_is_numeric=True)

100%|██████████| 385/385 [00:55<00:00,  6.88it/s]


Unnamed: 0,user_id,session_id,session_start,session_size,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
count,2988181.0,2988181.0,2988181,2988181.0,2988181.0,2988181,2988181,2988181,2988181,2988181.0,2988181.0,2988181.0
unique,322897.0,1048594.0,,,46033.0,,3,5,8,11.0,28.0,7.0
top,5890.0,1507563657895091.0,,,160974.0,,4 - Web,1 - Tablet,17 - Firefox OS,1.0,25.0,2.0
freq,1232.0,124.0,,,37213.0,,2904478,1823162,1738138,2852406.0,804985.0,1602601.0
mean,,,2017-10-08 16:17:08.013155328,3.901885,,2017-10-08 16:51:05.070374400,,,,,,
min,,,2017-10-01 04:37:03,2.0,,2017-10-01 05:00:00,,,,,,
25%,,,2017-10-04 15:35:52,2.0,,2017-10-04 16:20:52,,,,,,
50%,,,2017-10-08 22:09:00,3.0,,2017-10-08 22:35:30,,,,,,
75%,,,2017-10-11 21:16:54,4.0,,2017-10-11 21:43:24,,,,,,
max,,,2017-10-17 05:36:19,124.0,,2017-11-13 21:04:14,,,,,,


In [5]:
def get_ratings_from_clicks(clicks):
    count_user_article_clicks = (
        clicks.reset_index()
        .groupby(["user_id", "click_article_id"])
        .agg(
            COUNT_user_article_clicks=("index", "count"),
        )
    )

    count_user_clicks = (
        clicks.reset_index()
        .groupby(["user_id"])
        .agg(
            COUNT_user_clicks=("index", "count"),
        )
    )

    ratings = count_user_article_clicks.join(count_user_clicks, on="user_id")
    ratings["rating"] = (
        ratings["COUNT_user_article_clicks"] / ratings["COUNT_user_clicks"]
    )

    ratings = data_helpers.reduce_dataframe_memory_usage(
        ratings["rating"]
        .reset_index()
        .rename({"click_article_id": "article_id"}, axis=1)
    )

    return ratings


ratings = get_ratings_from_clicks(clicks)

ratings_sample = ratings.sample(frac=0.01, random_state=42)

ratings

Unnamed: 0,user_id,article_id,rating
0,0,157541,0.125000
1,0,160158,0.125000
2,0,233470,0.125000
3,0,313996,0.125000
4,0,68866,0.125000
...,...,...,...
2950705,99998,64329,0.071429
2950706,99999,168784,0.250000
2950707,99999,225055,0.250000
2950708,99999,272143,0.250000


In [6]:
import torch
import cornac
import pandas as pd
from recommenders.datasets.python_splitters import python_random_split
from recommenders.evaluation.python_evaluation import (
    map_at_k,
    ndcg_at_k,
    precision_at_k,
    recall_at_k,
)
from recommenders.models.cornac.cornac_utils import predict_ranking

print("System version: {}".format(sys.version))
print("PyTorch version: {}".format(torch.__version__))
print("Cornac version: {}".format(cornac.__version__))

System version: 3.9.7 (default, Sep 10 2021, 14:59:43) 
[GCC 11.2.0]
PyTorch version: 1.11.0+cu102
Cornac version: 1.14.2


In [7]:
# top k items to recommend
TOP_K = 10

In [8]:
train, test = python_random_split(ratings.sample(frac=0.01, random_state=42))

In [9]:
train_set = cornac.data.Dataset.from_uir(train.itertuples(index=False), seed=42)

print("Number of users: {}".format(train_set.num_users))
print("Number of items: {}".format(train_set.num_items))

Number of users: 20069
Number of items: 4063


In [10]:
bivae = cornac.models.BiVAECF(
    seed=42, use_gpu=torch.cuda.is_available(), verbose=True
).fit(train_set)

100%|██████████| 100/100 [08:31<00:00,  5.11s/it, loss_i=0.647, loss_u=0.15]


In [11]:
all_predictions = data_helpers.reduce_dataframe_memory_usage(
    predict_ranking(
        bivae,
        test,
        usercol="user_id",
        itemcol="article_id",
        # remove_seen=True,
    ).astype({"user_id": "string", "article_id": "string"})
)
all_predictions

Unnamed: 0,user_id,article_id,prediction
0,58930,283837,0.000035
1,58930,354086,0.000651
2,58930,209122,0.000047
3,58930,15220,0.000036
4,58930,161585,0.000019
...,...,...,...
81540342,30077,176850,0.000014
81540343,30077,132650,0.000016
81540344,30077,32547,0.000009
81540345,30077,97480,0.000007


In [12]:
test

Unnamed: 0,user_id,article_id,rating
2816674,90065,285877,0.007874
2154054,51648,64329,0.015625
2940472,99145,118813,0.010000
2802585,8907,336286,0.005319
132470,111228,224658,0.222222
...,...,...,...
1564117,276316,58180,0.500000
1201096,22054,83622,0.083333
2399811,64044,236596,0.009709
1719346,307458,83647,0.500000


In [13]:
eval_map = map_at_k(
    test,
    all_predictions,
    col_user="user_id",
    col_item="article_id",
    col_prediction="prediction",
    k=TOP_K,
)
eval_ndcg = ndcg_at_k(
    test,
    all_predictions,
    col_user="user_id",
    col_item="article_id",
    col_prediction="prediction",
    k=TOP_K,
)
eval_precision = precision_at_k(
    test,
    all_predictions,
    col_user="user_id",
    col_item="article_id",
    col_prediction="prediction",
    k=TOP_K,
)
eval_recall = recall_at_k(
    test,
    all_predictions,
    col_user="user_id",
    col_item="article_id",
    col_prediction="prediction",
    k=TOP_K,
)

print(
    "MAP:\t%f" % eval_map,
    "NDCG:\t%f" % eval_ndcg,
    "Precision@K:\t%f" % eval_precision,
    "Recall@K:\t%f" % eval_recall,
    sep="\n",
)