In [1]:
from datetime import datetime
from itertools import product

import numpy as np
import pandas as pd
from rectools import Columns
%matplotlib notebook
from matplotlib import pyplot as plt
from tqdm.auto import tqdm

from metrics_impl import pfound, pfound_fast, mrr_naive, mrr_numba, mrr_pandas
from validation_impl import UsersKFoldPOut




# Homework 2.
## Pfound
### Just data preparation

In [2]:
# Load the three tab-separated input tables. Column labels are supplied via
# `names=`, so every line of each file is read as data.
qid_query = pd.read_csv(
    "hidden_task/qid_query.tsv",
    sep="\t",
    names=["qid", "query"],
)
qid_url_rating = pd.read_csv(
    "hidden_task/qid_url_rating.tsv",
    sep="\t",
    names=["qid", "url", "rating"],
)
hostid_url = pd.read_csv(
    "hidden_task/hostid_url.tsv",
    sep="\t",
    names=["hostid", "url"],
)

# Attach the host id to every (qid, url, rating) row by joining on "url"
# (the default inner join).
qid_url_rating_hostid = qid_url_rating.merge(hostid_url, on="url")
qid_url_rating_hostid

Unnamed: 0,qid,url,rating,hostid
0,402111,http://24-job.com/board/job_australia/232-1-2-...,0.07,7
1,402111,http://24-job.com/board/job_australia/232-1-2-...,0.07,7
2,402111,http://802351.info/5964-v-avstralii.html,0.00,13
3,402111,http://auscommunity.com/blog/jobs/,0.00,53
4,402111,http://auscommunity.com/tag/%D1%84%D0%BE%D1%82...,0.00,53
...,...,...,...,...
798,99543,http://www.youtube.com/watch?v=QDcomRWogFE,0.14,1155
799,99543,http://www.youtube.com/watch?v=Y3n47xZb0b4,0.14,1155
800,99543,http://www.yuga.ru/articles/culture?id=3378,0.00,1156
801,99543,http://www.zexe.de/modules.php?name=Pages&pa=s...,0.14,1160


### Assertion
Checking that the fast implementation returns the same values as the sample implementation.

In [3]:
# Group the rated URLs by query id and evaluate both pFound implementations
# once per query.
qid_pfound = qid_url_rating_hostid.groupby('qid')
pfound_vals = pd.DataFrame({
    "default": qid_pfound.apply(pfound),
    "fast": qid_pfound.apply(pfound_fast),
})

# Enforce the equality this section claims instead of relying on a visual
# comparison of the table; raises AssertionError if the implementations differ.
np.testing.assert_allclose(pfound_vals["fast"], pfound_vals["default"])
pfound_vals

Unnamed: 0_level_0,default,fast
qid,Unnamed: 1_level_1,Unnamed: 2_level_1
5308,0.41582,0.41582
48815,0.655448,0.655448
49587,0.493599,0.493599
55082,0.497771,0.497771
58989,0.497771,0.497771
60304,0.735836,0.735836
63179,0.460028,0.460028
70357,0.655448,0.655448
70618,0.263596,0.263596
79514,0.764755,0.764755


### Performance
Compare the basic and fast implementations with `%timeit`.

In [4]:
# Baseline: time the basic `pfound` applied to every qid group.
%timeit qid_pfound.apply(pfound)

10.2 ms ± 464 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [5]:
# Same measurement for `pfound_fast`, on the same grouped data.
%timeit qid_pfound.apply(pfound_fast)

7.96 ms ± 84.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## MRR
### Another data preparation

In [6]:
# Read the interaction log and map its columns onto the rectools naming
# scheme in one chained expression (no in-place mutation).
# NOTE(review): `rename` ignores labels that are absent from the frame by
# default, so the 'track_id' entry only has an effect if the CSV really has
# such a column -- confirm against the dataset.
interactions = pd.read_csv('kion_train/interactions.csv').rename(
    columns={
        'track_id': Columns.Item,
        'last_watch_dt': Columns.Datetime,
        'total_dur': Columns.Weight,
    }
)

# Convert the timestamp column to a pandas datetime dtype.
interactions[Columns.Datetime] = pd.to_datetime(interactions[Columns.Datetime])

In [7]:
def generate_subsample(users_count, top_k, seed=None):
    """Draw a random user subsample and random recommendations for it.

    Reads the module-level ``interactions`` frame.

    Parameters
    ----------
    users_count : int
        Number of distinct users to sample (without replacement).
    top_k : int
        Number of recommended items generated per user.
    seed : int, optional
        When given, all sampling goes through a dedicated
        ``np.random.RandomState(seed)`` so the subsample is reproducible.
        When ``None`` (default) the global NumPy random state is used,
        exactly as before this parameter existed.

    Returns
    -------
    df : pandas.DataFrame
        Interactions of the sampled users with the datetime, weight and
        ``watched_pct`` columns removed and a fresh ``0..n-1`` index.
    users : numpy.ndarray
        The sampled user ids.
    recs : numpy.ndarray
        Array of shape ``(users_count, top_k)`` whose entries are drawn
        (with replacement) from the item column of ``df``.
    """
    # `np.random` and `RandomState` expose the same `choice` API, so the
    # unseeded path stays identical to the previous behaviour.
    rng = np.random if seed is None else np.random.RandomState(seed)

    users = rng.choice(interactions[Columns.User].unique(), users_count, replace=False)
    df = (
        interactions[interactions[Columns.User].isin(users)]
        .drop(columns=[Columns.Datetime, Columns.Weight, 'watched_pct'])
        .reset_index(drop=True)
    )

    recs = rng.choice(df[Columns.Item], size=(users_count, top_k))
    return df, users, recs

In [8]:
# Fixed-size subsample used by the equality check and the %timeit cells:
# 10000 users, 10 recommendations each.
top_k = 10
df, users, recs = generate_subsample(users_count=10000, top_k=top_k)

# Plain NumPy view of the interactions for the array-based implementations.
target = df.values

### Assertion
Checking that all the implementations return the same result

In [9]:
# Evaluate every MRR implementation on the same subsample.
mrr_values = {
    "naive": mrr_naive(users, target, recs),
    "numba": mrr_numba(users, target, recs),
    "pandas": mrr_pandas(users, df, recs),
}

# Enforce the equality this section claims; raises AssertionError if any
# implementation deviates from the naive one beyond floating-point tolerance.
np.testing.assert_allclose(list(mrr_values.values()), mrr_values["naive"])

# Build the summary table in one step (rows keep the insertion order above).
mrr_table = pd.Series(mrr_values, name="value").to_frame()
mrr_table

Unnamed: 0,value
naive,0.075359
numba,0.075359
pandas,0.075359


### Performance
Comparing the *naive*, *numba* and *pandas* implementations with `%timeit`

In [10]:
# Time the naive MRR on the current `users` / `target` / `recs` subsample.
%timeit mrr_naive(users, target, recs)

1.86 s ± 79.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [11]:
# Same inputs as the naive run.
# NOTE(review): presumably `mrr_numba` is JIT-compiled; it has already been
# called once in the equality check, so compilation should not be part of this
# measurement -- confirm in metrics_impl.
%timeit mrr_numba(users, target, recs)

361 ms ± 37.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [12]:
# The pandas implementation takes the DataFrame `df` instead of the `target` array.
%timeit mrr_pandas(users, df, recs)

22.5 ms ± 1.09 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


### Visualisation
Preparing the data for the plots: the `timing` helper measures the approximate time a single evaluation takes

In [13]:
def timing(f, args):
    """Return how long a single call ``f(*args)`` takes, in seconds.

    Parameters
    ----------
    f : callable
        Function to measure.
    args : sequence
        Positional arguments unpacked into the call.

    Returns
    -------
    float
        Elapsed time of the one call, in (fractional) seconds. The return
        value of ``f`` is discarded.
    """
    # perf_counter is a monotonic, high-resolution clock meant for measuring
    # short durations; the wall clock from datetime.now() can jump (NTP/DST)
    # and has coarser resolution. Imported locally because the notebook's
    # import cell only brings in `datetime`.
    from time import perf_counter

    start = perf_counter()
    f(*args)
    return perf_counter() - start


In [14]:
# Benchmark grid axes: subsample sizes and recommendation-list lengths.
users_counts = [100, 1000, 10000, 100000]
top_ks = [10, 50, 100]

# Implementation label -> callable under test.
f_impls = {
    "naive": mrr_naive,
    "numba": mrr_numba,
    "pandas": mrr_pandas
}
# The labels, in dict insertion order: naive, numba, pandas.
impls = list(f_impls)

# One row per (users_count, top_k, implementation) combination; the
# implementation varies fastest because it is the last factor of the product.
scores = pd.DataFrame(
    list(product(users_counts, top_ks, impls)),
    columns=['users_count', 'top_k', "implementation"],
)

# Empty float array that the measurement cell fills with timings.
timings = np.empty(0)
scores

Unnamed: 0,users_count,top_k,implementation
0,100,10,naive
1,100,10,numba
2,100,10,pandas
3,100,50,naive
4,100,50,numba
5,100,50,pandas
6,100,100,naive
7,100,100,numba
8,100,100,pandas
9,1000,10,naive


In [15]:
# Collect the measurements in a list created inside this cell, so re-running
# the cell starts from scratch instead of appending to timings left over from
# a previous run (which would no longer match the number of rows in `scores`).
measured = []

for users_count in tqdm(users_counts):
    for top_k in tqdm(top_ks):
        df, users, recs = generate_subsample(users_count, top_k)
        target = df.values

        # The array-based implementations take `target`; the pandas one
        # takes the DataFrame itself.
        call_args = {
            "naive": [users, target, recs],
            "numba": [users, target, recs],
            "pandas": [users, df, recs],
        }
        # Iterate in `impls` order: `scores` was built from
        # product(users_counts, top_ks, impls), so this keeps every timing
        # aligned with its row.
        measured.extend(timing(f_impls[impl], call_args[impl]) for impl in impls)

timings = np.array(measured)
scores["time"] = timings
scores

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Unnamed: 0,users_count,top_k,implementation,time
0,100,10,naive,0.003048
1,100,10,numba,0.036429
2,100,10,pandas,0.005084
3,100,50,naive,0.02228
4,100,50,numba,0.064467
5,100,50,pandas,0.008199
6,100,100,naive,0.020522
7,100,100,numba,0.067199
8,100,100,pandas,0.00977
9,1000,10,naive,0.050333


In [16]:
# Marker colour for each implementation.
colors = {
    "naive": "red",
    "numba": "blue",
    "pandas": "green"
}

fig = plt.figure(figsize=(6, 6))
ax = fig.add_subplot(111, projection='3d')
ax.set_title("Metrics implementations comparison")
ax.set_xlabel("User count (log2(number))")
ax.set_ylabel("Top-k (number)")
ax.set_zlabel("Evaluation time (log2(ms))")

# The (users_count, top_k) grid is the same for every implementation, so the
# x/y coordinates are taken once from the "naive" rows.
naive_rows = scores["implementation"] == "naive"
xs = np.log2(scores.loc[naive_rows, "users_count"]).to_numpy()
ys = scores.loc[naive_rows, "top_k"].to_numpy()

for impl in impls:
    impl_rows = scores["implementation"] == impl
    # Seconds -> milliseconds, then log2 to match the z-axis label.
    zs = np.log2(1000 * scores.loc[impl_rows, "time"]).to_numpy()
    ax.scatter(xs=xs, ys=ys, zs=zs, c=colors[impl], label=impl)

ax.legend(loc="best")
# tight_layout only adjusts axes that already exist, so it has to run after
# the subplot has been created and populated.
fig.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

## UserKFolds
The data is split into k folds.
**Important:** according to the lecture, P-out means that there are at least P items for every user in *test*. This contradicts the task's description, so I am not yet sure whether this interpretation is right (TODO: confirm).

In [17]:
# Splitter configured with 5 folds and p=10.
cv = UsersKFoldPOut(n_folds=5, p=10)

# NOTE(review): the recorded output reports identical train/test sizes for
# all five folds and echoes the two mask-indexing lines (which looks like a
# pandas warning) -- verify that the masks are aligned with `interactions`
# and that the folds really differ.
for i, masks in enumerate(cv.split(interactions)):
    train_mask, test_mask = masks
    # Select the rows of each part with the masks yielded by the splitter.
    train = interactions[train_mask]
    test = interactions[test_mask]
    print(f'Fold#{i} | Train: {train.shape[0]}, Test: {test.shape[0]}')


  train = interactions[train_mask]
  test = interactions[test_mask]


Fold#0 | Train: 4830352, Test: 645899
Fold#1 | Train: 4830352, Test: 645899
Fold#2 | Train: 4830352, Test: 645899
Fold#3 | Train: 4830352, Test: 645899
Fold#4 | Train: 4830352, Test: 645899
