In [1]:
!cd .. && make dataset && cd notebooks

>>> Downloading and extracting data files...
Data files already downloaded.
>>> OK.



In [2]:
from datetime import datetime
from pathlib import Path

import pandas as pd
import plotly.express as px
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

pd.options.plotting.backend = "plotly"


RAW_DATA_PATH = "../data/raw"

In [3]:
articles_metadata = pd.read_csv(
    Path(RAW_DATA_PATH, "articles_metadata.csv"),
    parse_dates=["created_at_ts"],
    date_parser=lambda x: datetime.fromtimestamp(int(x) / 1000),
    dtype={
        "article_id": "category",
        "category_id": "category",
        "publisher_id": "category",
        "words_count": "int",
    },
)

articles_metadata = articles_metadata.astype({"created_at_ts": "datetime64[ns]"})

articles_metadata.describe(include="all", datetime_is_numeric=True)

Unnamed: 0,article_id,category_id,created_at_ts,publisher_id,words_count
count,364047.0,364047.0,364047,364047.0,364047.0
unique,364047.0,461.0,,1.0,
top,0.0,281.0,,0.0,
freq,1.0,12817.0,,364047.0,
mean,,,2016-09-17 01:25:54.949498624,,190.897727
min,,,2006-09-27 13:14:35,,0.0
25%,,,2015-10-15 18:00:43.500000,,159.0
50%,,,2017-03-13 17:27:29,,186.0
75%,,,2017-11-05 15:09:11,,218.0
max,,,2018-03-13 13:12:30,,6690.0


In [4]:
clicks = pd.concat(
    [
        pd.read_csv(
            click_file_path,
            parse_dates=["session_start", "click_timestamp"],
            date_parser=lambda x: datetime.fromtimestamp(int(int(x) / 1000)),
            dtype={
                "user_id": "category",
                "session_id": "category",
                "session_size": "int",
                "click_article_id": "category",
                "click_environment": "category",
                "click_deviceGroup": "category",
                "click_os": "category",
                "click_country": "category",
                "click_region": "category",
                "click_referrer_type": "category",
            },
        ).replace(
            {
                "click_environment": {
                    "1": "1 - Facebook Instant Article",
                    "2": "2 - Mobile App",
                    "3": "3 - AMP (Accelerated Mobile Pages)",
                    "4": "4 - Web",
                },
                "click_deviceGroup": {
                    "1": "1 - Tablet",
                    "2": "2 - TV",
                    "3": "3 - Empty",
                    "4": "4 - Mobile",
                    "5": "5 - Desktop",
                },
                "click_os": {
                    "1": "1 - Other",
                    "2": "2 - iOS",
                    "3": "3 - Android",
                    "4": "4 - Windows Phone",
                    "5": "5 - Windows Mobile",
                    "6": "6 - Windows",
                    "7": "7 - Mac OS X",
                    "8": "8 - Mac OS",
                    "9": "9 - Samsung",
                    "10": "10 - FireHbbTV",
                    "11": "11 - ATV OS X",
                    "12": "12 - tvOS",
                    "13": "13 - Chrome OS",
                    "14": "14 - Debian",
                    "15": "15 - Symbian OS",
                    "16": "16 - BlackBerry OS",
                    "17": "17 - Firefox OS",
                    "18": "18 - Android",
                    "19": "19 - Brew MP",
                    "20": "20 - Chromecast",
                    "21": "21 - webOS",
                    "22": "22 - Gentoo",
                    "23": "23 - Solaris",
                },
            }
        )
        for click_file_path in tqdm(
            sorted(Path(RAW_DATA_PATH, "clicks/clicks").glob("clicks_hour_*.csv"))
        )
    ],
    sort=False,
    ignore_index=True,
    verify_integrity=True,
)

clicks = clicks.astype(
    {"session_start": "datetime64[ns]", "click_timestamp": "datetime64[ns]"}
)

clicks.describe(include="all", datetime_is_numeric=True)

100%|██████████| 385/385 [01:09<00:00,  5.56it/s]


Unnamed: 0,user_id,session_id,session_start,session_size,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
count,2988181.0,2988181.0,2988181,2988181.0,2988181.0,2988181,2988181,2988181,2988181,2988181.0,2988181.0,2988181.0
unique,322897.0,1048594.0,,,46033.0,,3,5,8,11.0,28.0,7.0
top,5890.0,1507563657895091.0,,,160974.0,,4 - Web,1 - Tablet,17 - Firefox OS,1.0,25.0,2.0
freq,1232.0,124.0,,,37213.0,,2904478,1823162,1738138,2852406.0,804985.0,1602601.0
mean,,,2017-10-08 16:17:08.013155328,3.901885,,2017-10-08 16:51:05.070374400,,,,,,
min,,,2017-10-01 04:37:03,2.0,,2017-10-01 05:00:00,,,,,,
25%,,,2017-10-04 15:35:52,2.0,,2017-10-04 16:20:52,,,,,,
50%,,,2017-10-08 22:09:00,3.0,,2017-10-08 22:35:30,,,,,,
75%,,,2017-10-11 21:16:54,4.0,,2017-10-11 21:43:24,,,,,,
max,,,2017-10-17 05:36:19,124.0,,2017-11-13 21:04:14,,,,,,


In [5]:
articles_embeddings = pd.read_pickle(Path(RAW_DATA_PATH, "articles_embeddings.pickle"))

articles = pd.DataFrame(
    articles_embeddings,
    columns=["embedding_" + str(i) for i in range(articles_embeddings.shape[1])],
)
articles["words_count"] = articles_metadata["words_count"]
articles["category_id"] = articles_metadata["category_id"]
articles["article_id"] = articles_metadata["article_id"]

articles.describe(include="all", datetime_is_numeric=True)

articles_sample = articles.sample(frac=0.01)

In [9]:
def get_user_interest(user_id, articles, strategy="last_click"):
    user_id = str(user_id)

    if strategy == "last_click":
        last_clicked_article_id = (
            clicks.query("user_id == @user_id")
            .sort_values("click_timestamp", ascending=False)
            .reset_index(drop=True)
            .iloc[0]["click_article_id"]
        )
        interest = articles.query("article_id == @last_clicked_article_id")

    elif strategy == "last_session_mean":
        last_session_id = (
            clicks.query("user_id == @user_id")
            .sort_values("click_timestamp", ascending=False)
            .reset_index(drop=True)
            .iloc[0]["session_id"]
        )
        session_article_ids = clicks.query("session_id == @last_session_id")[
            "click_article_id"
        ]
        session_articles = articles.query("article_id in @session_article_ids")

        interest = session_articles.drop(["article_id", "category_id"], axis=1).mean(
            axis=0
        )
        interest["category_id"] = (
            session_articles["category_id"].value_counts().idxmax()
        )
        interest = interest.to_frame().T

    else:
        raise NotImplementedError

    return interest


def get_closest_articles(interest, articles, n=10):
    articles = articles.copy()
    articles["similarity"] = cosine_similarity(
        interest.drop("article_id", axis=1, errors="ignore"),
        articles.drop("article_id", axis=1),
    )[0]

    return articles.sort_values("similarity", ascending=False).iloc[1 : n + 1]


user_id = "5890"

interest = get_user_interest(user_id, articles, strategy="last_session_mean")
print(interest)

closest_articles = get_closest_articles(interest, articles)
closest_articles.head()

  embedding_0 embedding_1 embedding_2 embedding_3 embedding_4 embedding_5  \
0   -0.311286   -0.967772    0.051973   -0.317989    -0.26122     0.04942   

  embedding_6 embedding_7 embedding_8 embedding_9  ... embedding_242  \
0   -0.265075   -0.081486    0.000408     0.09121  ...      0.118577   

  embedding_243 embedding_244 embedding_245 embedding_246 embedding_247  \
0     -0.027042      0.071259      0.207788     -0.073207      0.270678   

  embedding_248 embedding_249 words_count category_id  
0      0.204427      0.050999      212.25         331  

[1 rows x 252 columns]
        embedding_0  embedding_1  embedding_2  embedding_3  embedding_4  \
327511    -0.288757    -0.967714    -0.203851    -0.660697    -0.487163   
206586    -0.432658    -0.965757     0.010155     0.123008    -0.426574   
247186    -0.271355    -0.958472     0.204601    -0.467751     0.048008   
283684    -0.585605    -0.960567    -0.140772    -0.191224    -0.539724   
247907    -0.557934    -0.967221     0

In [12]:
pca = PCA(n_components=2)
articles_pca = pca.fit_transform(
    articles_sample[
        ["embedding_" + str(i) for i in range(articles_embeddings.shape[1])]
    ]
)

In [33]:
interest_pca = pca.transform(
    interest[["embedding_" + str(i) for i in range(articles_embeddings.shape[1])]]
)

closest_articles_pca = pca.transform(
    closest_articles[
        ["embedding_" + str(i) for i in range(articles_embeddings.shape[1])]
    ]
)

# Plot the data in the PCA space
fig = px.scatter(
    x=articles_pca[:, 0],
    y=articles_pca[:, 1],
    color=articles_sample["category_id"],
    symbol=articles_sample["category_id"],
    title="PCA 2D",
    opacity=0.3,
    width=1200,
    height=800,
)
fig.add_scatter(
    x=interest_pca[:, 0],
    y=interest_pca[:, 1],
    mode="markers",
    marker=dict(color="green", size=30),
    text=f"User interest \n user_id: {user_id} \n category_id: {interest['category_id'].iloc[0]}",
)
fig.add_scatter(
    x=closest_articles_pca[:, 0],
    y=closest_articles_pca[:, 1],
    mode="markers",
    marker=dict(color=list(range(len(closest_articles_pca))), size=20),
    text=[
        f"rank: {i} / article_id: {a.article_id} / category_id: {a.category_id}"
        for i, a in enumerate(closest_articles.itertuples())
    ],
)
fig.show()