In [1]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
import pandas as pd
import cudf
import cupy as cp
import numpy as np
from tqdm import tqdm

sys.path.append('../')
from src.retriever import PopularItem, FavoriteItem, CoOccurrenceItem, ConcatRetriever
from src.utils import get_data_period, period_extraction

In [2]:
df = cudf.read_csv('../data/processed/train.csv', parse_dates=['time_stamp'])

In [3]:
date_th = '2017-04-16'
train_period = 7
eval_period = 7
top_n = 30

train_start, train_end, eval_start, eval_end = get_data_period(date_th, train_period, eval_period)
users = period_extraction(df, eval_start, eval_end)['user_id'].unique().to_arrow().tolist()

In [4]:
config_popular = {
    'train_start_date': train_start,
    'train_end_date': train_end,
    'eval_start_date': eval_start,
    'eval_end_date': eval_end,
    'top_n': top_n
}

retriever = PopularItem(**config_popular)
retriever.fit(df)
retriever.search(users)
scores = retriever.evaluate(df, verbose=True)

[PopularItem] n=12,011, n_items=30.0 max_ndcg=0.1111, recall=0.0709, precision=0.0201


In [5]:
config_favorite = {
    'train_start_date': train_start,
    'train_end_date': train_end,
    'eval_start_date': eval_start,
    'eval_end_date': eval_end,
    'top_n': top_n
}

retriever = FavoriteItem(**config_favorite)
retriever.fit(df)
retriever.search(users)
scores = retriever.evaluate(df, verbose=True)

[FavoriteItem] n=12,011, n_items=7.6 max_ndcg=0.2618, recall=0.1666, precision=0.1731


In [6]:
config_cooccurrence = {
    'train_start_date': train_start,
    'train_end_date': train_end,
    'eval_start_date': eval_start,
    'eval_end_date': eval_end,
    'top_n': top_n,
    'output_path': Path(f'co-occurrence_{date_th}_t{train_period}_e{eval_period}_n{top_n}.pickle'),
}

retriever = CoOccurrenceItem(**config_cooccurrence)
retriever.fit(df)
retriever.search(users)
scores = retriever.evaluate(df, verbose=True)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 295610/295610 [1:42:27<00:00, 48.08it/s]


[CoOccurrenceItem] n=12,011, n_items=23.2 max_ndcg=0.1935, recall=0.1370, precision=0.0434


In [7]:
retrievers = [
    FavoriteItem(**config_favorite),
    CoOccurrenceItem(**config_cooccurrence),
    PopularItem(**config_popular),
]
config_concat = {
    'train_start_date': train_start,
    'train_end_date': train_end,
    'eval_start_date': eval_start,
    'eval_end_date': eval_end,
    'top_n': top_n,
    'retrievers': retrievers,
}

retriever = ConcatRetriever(**config_concat)
retriever.fit(df)
retriever.search(users)
scores = retriever.evaluate(df, verbose=True)

pairs = retriever.get_pairs(df)
filename = f'pairs_{date_th}_t{train_period}_e{eval_period}_n{top_n}.csv'
pairs.to_csv(filename, index=False)
pairs.head()

[ConcatRetriever] n=12,011, n_items=30.0 max_ndcg=0.3645, recall=0.2508, precision=0.0708


Unnamed: 0,user_id,product_id,target,rated
0,0000019_B,00012052_b,0,0
1,0000019_B,00012487_b,0,0
2,0000019_B,00001760_b,0,0
3,0000019_B,00003917_b,0,0
4,0000019_B,00013563_b,0,0


In [12]:
date_th = '2017-04-30'
train_period = 7
eval_period = 7
top_n = 30

train_start, train_end, eval_start, eval_end = get_data_period(date_th, train_period, eval_period)
users = period_extraction(df, eval_start, eval_end)['user_id'].unique().to_arrow().tolist()


test_users = pd.read_csv('../data/raw/test.tsv', delimiter='\t')['user_id'].tolist()

retrievers = [
    FavoriteItem(**config_favorite),
    CoOccurrenceItem(**config_cooccurrence),
    PopularItem(**config_popular),
]
config_concat = {
    'train_start_date': train_start,
    'train_end_date': train_end,
    'eval_start_date': eval_start,
    'eval_end_date': eval_end,
    'top_n': top_n,
    'retrievers': retrievers,
}

retriever = ConcatRetriever(**config_concat)
retriever.fit(df)
retriever.search(test_users)
#scores = retriever.evaluate(df, verbose=True)

pairs = retriever.get_pairs(df)
filename = f'test_pairs_{date_th}_t{train_period}_e{eval_period}_n{top_n}.csv'
pairs.to_csv(filename, index=False)
pairs.head()

Unnamed: 0,user_id,product_id,target,rated
0,0028734_A,00009492_a,0,0
1,0028734_A,00009413_a,0,0
2,0028734_A,00002511_a,0,0
3,0028734_A,00010901_a,0,0
4,0028734_A,00012717_a,0,0
