In [1]:
!cd .. && make dataset && cd notebooks

>>> Downloading and extracting data files...
Data files already downloaded.
>>> OK.



In [2]:
# Base
import os
import sys

sys.path.append(os.path.abspath(".."))

from datetime import datetime
from pathlib import Path

import pandas as pd
from tqdm import tqdm


pd.options.plotting.backend = "plotly"


RAW_DATA_PATH = "../data/raw"


In [3]:
articles_metadata = pd.read_csv(
    Path(RAW_DATA_PATH, "articles_metadata.csv"),
    parse_dates=["created_at_ts"],
    date_parser=lambda x: datetime.fromtimestamp(int(x) / 1000),
    dtype={
        "article_id": "category",
        "category_id": "category",
        "publisher_id": "category",
        "words_count": "int",
    },
)
articles_metadata.describe(include="all")


  articles_metadata.describe(include="all")


Unnamed: 0,article_id,category_id,created_at_ts,publisher_id,words_count
count,364047.0,364047.0,364047,364047.0,364047.0
unique,364047.0,461.0,359552,1.0,
top,0.0,281.0,2015-02-28 01:00:03,0.0,
freq,1.0,12817.0,11,364047.0,
first,,,2006-09-27 13:14:35,,
last,,,2018-03-13 13:12:30,,
mean,,,,,190.897727
std,,,,,59.502766
min,,,,,0.0
25%,,,,,159.0


In [4]:
clicks = pd.concat(
    [
        pd.read_csv(
            click_file_path,
            parse_dates=["session_start", "click_timestamp"],
            date_parser=lambda x: datetime.fromtimestamp(int(int(x) / 1000)),
            dtype={
                "user_id": "category",
                "session_id": "category",
                "session_size": "int",
                "click_article_id": "category",
                "click_environment": "category",
                "click_deviceGroup": "category",
                "click_os": "category",
                "click_country": "category",
                "click_region": "category",
                "click_referrer_type": "category",
            },
        ).replace(
            {
                "click_environment": {
                    "1": "1 - Facebook Instant Article",
                    "2": "2 - Mobile App",
                    "3": "3 - AMP (Accelerated Mobile Pages)",
                    "4": "4 - Web",
                },
                "click_deviceGroup": {
                    "1": "1 - Tablet",
                    "2": "2 - TV",
                    "3": "3 - Empty",
                    "4": "4 - Mobile",
                    "5": "5 - Desktop",
                },
                "click_os": {
                    "1": "1 - Other",
                    "2": "2 - iOS",
                    "3": "3 - Android",
                    "4": "4 - Windows Phone",
                    "5": "5 - Windows Mobile",
                    "6": "6 - Windows",
                    "7": "7 - Mac OS X",
                    "8": "8 - Mac OS",
                    "9": "9 - Samsung",
                    "10": "10 - FireHbbTV",
                    "11": "11 - ATV OS X",
                    "12": "12 - tvOS",
                    "13": "13 - Chrome OS",
                    "14": "14 - Debian",
                    "15": "15 - Symbian OS",
                    "16": "16 - BlackBerry OS",
                    "17": "17 - Firefox OS",
                    "18": "18 - Android",
                    "19": "19 - Brew MP",
                    "20": "20 - Chromecast",
                    "21": "21 - webOS",
                    "22": "22 - Gentoo",
                    "23": "23 - Solaris",
                },
            }
        )
        for click_file_path in tqdm(
            sorted(
                Path(RAW_DATA_PATH, "clicks/clicks").glob("clicks_hour_*.csv")
            )
        )
    ],
    sort=False,
    ignore_index=True,
    verify_integrity=True,
)

clicks.describe(include="all")


100%|██████████| 385/385 [01:01<00:00,  6.28it/s]


Unnamed: 0,user_id,session_id,session_start,session_size,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
count,2988181.0,2988181.0,2988181,2988181.0,2988181.0,2988181,2988181,2988181,2988181,2988181.0,2988181.0,2988181.0
unique,322897.0,1048594.0,646874,,46033.0,1016184,3,5,8,11.0,28.0,7.0
top,5890.0,1507563657895091.0,2017-10-09 17:40:57,,160974.0,2017-10-10 13:27:25,4 - Web,1 - Tablet,17 - Firefox OS,1.0,25.0,2.0
freq,1232.0,124.0,127,,37213.0,24,2904478,1823162,1738138,2852406.0,804985.0,1602601.0
mean,,,,3.901885,,,,,,,,
std,,,,3.929941,,,,,,,,
min,,,,2.0,,,,,,,,
25%,,,,2.0,,,,,,,,
50%,,,,3.0,,,,,,,,
75%,,,,4.0,,,,,,,,


In [5]:
articles_embeddings = pd.read_pickle(
    Path(RAW_DATA_PATH, "articles_embeddings.pickle")
)

articles = pd.DataFrame(
    articles_embeddings,
    columns=[
        "embedding_" + str(i) for i in range(articles_embeddings.shape[1])
    ],
)
articles["words_count"] = articles_metadata["words_count"]
articles["category_id"] = articles_metadata["category_id"]
articles["article_id"] = articles_metadata["article_id"]

articles.describe(include="all")


Unnamed: 0,embedding_0,embedding_1,embedding_2,embedding_3,embedding_4,embedding_5,embedding_6,embedding_7,embedding_8,embedding_9,...,embedding_243,embedding_244,embedding_245,embedding_246,embedding_247,embedding_248,embedding_249,words_count,category_id,article_id
count,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,...,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0,364047.0
unique,,,,,,,,,,,...,,,,,,,,,461.0,364047.0
top,,,,,,,,,,,...,,,,,,,,,281.0,0.0
freq,,,,,,,,,,,...,,,,,,,,,12817.0,1.0
mean,-0.238645,-0.963335,0.118548,-0.279295,-0.068579,0.045945,-0.168319,-0.216564,-0.00681,-0.019156,...,0.023003,0.076947,0.084603,0.062819,0.099768,0.155917,-0.041094,190.897727,,
std,0.451497,0.022651,0.49505,0.505067,0.506723,0.483288,0.514183,0.456211,0.519036,0.448782,...,0.507181,0.426143,0.467998,0.462678,0.499648,0.454356,0.458819,59.502766,,
min,-0.991183,-0.996455,-0.968431,-0.994966,-0.994489,-0.985974,-0.99509,-0.995926,-0.991316,-0.990957,...,-0.993626,-0.989042,-0.996902,-0.992921,-0.984733,-0.976071,-0.988213,0.0,,
25%,-0.620072,-0.974056,-0.289953,-0.718816,-0.503425,-0.354579,-0.610391,-0.604471,-0.446108,-0.377319,...,-0.404508,-0.248653,-0.267072,-0.306548,-0.313598,-0.201402,-0.420694,159.0,,
50%,-0.302581,-0.967605,0.124339,-0.391535,-0.093734,0.062636,-0.259313,-0.257692,-0.025777,0.000807,...,0.000726,0.105649,0.133525,0.083315,0.128757,0.188355,-0.015232,186.0,,
75%,0.098015,-0.959061,0.545112,0.10832,0.345024,0.446979,0.244957,0.138783,0.432606,0.343574,...,0.459386,0.417347,0.461466,0.441831,0.531453,0.538111,0.334226,218.0,,


In [9]:
from sklearn.metrics.pairwise import cosine_similarity


def get_user_last_clicked_article(user_id):
    user_id = str(user_id)
    user_clicks = clicks.query("user_id == @user_id")
    user_clicks = user_clicks.sort_values(
        "click_timestamp", ascending=False
    ).reset_index(drop=True)
    return user_clicks.iloc[0]["click_article_id"]


def get_closest_articles(article_id, articles, n=10):
    article = articles.query("article_id == @article_id")

    similarities = cosine_similarity(
        article.drop("article_id", axis=1), articles.drop("article_id", axis=1)
    )
    similarities = similarities.reshape(similarities.shape[1])
    similarities = pd.DataFrame(
        {
            "article_id": articles["article_id"],
            "similarity": similarities,
        }
    )
    similarities = similarities.sort_values("similarity", ascending=False)
    print(similarities.iloc[1 : n + 1])
    return list(similarities.iloc[1 : n + 1]["article_id"])


user_id = "5890"

last_article_id = get_user_last_clicked_article(user_id)
print(last_article_id)

closest_articles = get_closest_articles(last_article_id, articles)
print(closest_articles)



66380
       article_id  similarity
125186     125186    0.999772
70310       70310    0.999768
73316       73316    0.999759
70405       70405    0.999758
71028       71028    0.999752
138157     138157    0.999749
67908       67908    0.999742
67423       67423    0.999737
150004     150004    0.999736
72222       72222    0.999734
['125186', '70310', '73316', '70405', '71028', '138157', '67908', '67423', '150004', '72222']
