In [1]:
%load_ext autoreload
%autoreload 2
import sys
from pathlib import Path
import pandas as pd
import cudf
import cupy as cp
import numpy as np
from tqdm import tqdm

sys.path.append('../')
from src.retriever import PopularItem, FavoriteItem, CoOccurrenceItem, ConcatRetriever
from src.utils import get_data_period, period_extraction

In [2]:
# Read the preprocessed training log into a cuDF DataFrame; the `time_stamp`
# column is parsed as datetimes at load time.
df = cudf.read_csv(
    '../data/processed/train.csv',
    parse_dates=['time_stamp'],
)

In [3]:
# Settings for the validation split: a cut-off date plus the train/eval window
# lengths handed to get_data_period (units presumably days -- TODO confirm in
# src.utils), and the `top_n` value passed to every retriever config.
date_th = '2017-04-09'
train_period, eval_period = 7, 7
top_n = 30

# get_data_period returns the four window boundaries in this order.
(
    train_start,
    train_end,
    eval_start,
    eval_end,
) = get_data_period(date_th, train_period, eval_period)

# Distinct user_ids among the rows period_extraction returns for the evaluation
# window, converted to a plain Python list for the retrievers' search() calls.
users = (
    period_extraction(df, eval_start, eval_end)['user_id']
    .unique()
    .to_arrow()
    .tolist()
)

In [4]:
# Constructor arguments for PopularItem. Kept in a named dict because the
# ConcatRetriever cell further down builds another PopularItem from it.
config_popular = dict(
    train_start_date=train_start,
    train_end_date=train_end,
    eval_start_date=eval_start,
    eval_end_date=eval_end,
    top_n=top_n,
)

# Baseline run: fit on `df`, retrieve candidates for the evaluation users, then
# score them. verbose=True produces the one-line metric summary in the output.
retriever = PopularItem(**config_popular)
retriever.fit(df)
retriever.search(users)
scores = retriever.evaluate(df, verbose=True)

[PopularItem] n=12,503, n_items=30.0 max_ndcg=0.1200, recall=0.0785, precision=0.0228


In [5]:
# Constructor arguments for FavoriteItem: same windows and top_n as the other
# retrievers. Reused later when the ConcatRetriever is assembled.
config_favorite = dict(
    train_start_date=train_start,
    train_end_date=train_end,
    eval_start_date=eval_start,
    eval_end_date=eval_end,
    top_n=top_n,
)

# Same fit -> search -> evaluate sequence as the other single-retriever runs;
# `retriever` and `scores` are rebound to this run's objects.
retriever = FavoriteItem(**config_favorite)
retriever.fit(df)
retriever.search(users)
scores = retriever.evaluate(df, verbose=True)

[FavoriteItem] n=12,503, n_items=8.0 max_ndcg=0.2741, recall=0.1755, precision=0.1736


In [6]:
# The file name encodes every setting that defines this run (cut-off date,
# window lengths, top_n). It is a relative path, so it resolves against the
# notebook's working directory.
cooccurrence_file = f'co-occurrence_{date_th}_t{train_period}_e{eval_period}_n{top_n}.pickle'

# Constructor arguments for CoOccurrenceItem. `output_path` is the one extra
# argument compared with the other retrievers (presumably where the retriever
# stores its co-occurrence result -- TODO confirm in src.retriever).
config_cooccurrence = dict(
    train_start_date=train_start,
    train_end_date=train_end,
    eval_start_date=eval_start,
    eval_end_date=eval_end,
    top_n=top_n,
    output_path=Path(cooccurrence_file),
)

# Fit -> search -> evaluate, mirroring the other single-retriever runs.
retriever = CoOccurrenceItem(**config_cooccurrence)
retriever.fit(df)
retriever.search(users)
scores = retriever.evaluate(df, verbose=True)

[CoOccurrenceItem] n=12,503, n_items=23.2 max_ndcg=0.2000, recall=0.1431, precision=0.0442


In [7]:
# Fresh, unfitted instances of the three base retrievers, built from the
# configs defined in the earlier cells. List order is Favorite, CoOccurrence,
# Popular.
favorite_retriever = FavoriteItem(**config_favorite)
cooccurrence_retriever = CoOccurrenceItem(**config_cooccurrence)
popular_retriever = PopularItem(**config_popular)
retrievers = [favorite_retriever, cooccurrence_retriever, popular_retriever]

# ConcatRetriever takes the same window/top_n arguments plus the retriever list.
config_concat = dict(
    train_start_date=train_start,
    train_end_date=train_end,
    eval_start_date=eval_start,
    eval_end_date=eval_end,
    top_n=top_n,
    retrievers=retrievers,
)

# Fit -> search -> evaluate on the validation users.
retriever = ConcatRetriever(**config_concat)
retriever.fit(df)
retriever.search(users)
scores = retriever.evaluate(df, verbose=True)

# Export the retrieved (user, product) pairs. With default arguments get_pairs
# also returns the `target` and `rated` columns visible in the preview below.
pairs = retriever.get_pairs(df)
filename = f'pairs_{date_th}_t{train_period}_e{eval_period}_n{top_n}.csv'
pairs.to_csv(filename, index=False)  # index=False: keep the row index out of the file
pairs.head()

[ConcatRetriever] n=12,503, n_items=30.0 max_ndcg=0.3784, recall=0.2623, precision=0.0732


Unnamed: 0,user_id,product_id,target,rated
0,0000029_C,00146634_c,0,0
1,0000029_C,00000312_c,0,0
2,0000029_C,00156487_c,0,0
3,0000029_C,00239358_c,0,0
4,0000029_C,00141797_c,0,0


In [10]:
# Candidate generation for the users listed in test.tsv, using the same
# Favorite + CoOccurrence + Popular stack as the validation run but with the
# cut-off date moved to 2017-04-30.
#
# NOTE: this cell rebinds date_th, the window boundaries and every config_*
# name used by the validation cells. Re-running those cells afterwards will
# silently pick up the values set here; restart and run top-to-bottom to
# reproduce the validation numbers.
date_th = '2017-04-30'
train_period = 7
eval_period = 7
top_n = 30

train_start, train_end, eval_start, eval_end = get_data_period(date_th, train_period, eval_period)

# Arguments shared by every retriever. Defined once so the windows and top_n
# cannot drift apart between the individual configs.
base_config = {
    'train_start_date': train_start,
    'train_end_date': train_end,
    'eval_start_date': eval_start,
    'eval_end_date': eval_end,
    'top_n': top_n,
}

# Each config is its own dict (a copy), so mutating one never affects another.
config_popular = dict(base_config)
config_favorite = dict(base_config)
config_cooccurrence = {
    **base_config,
    'output_path': Path(f'co-occurrence_{date_th}_t{train_period}_e{eval_period}_n{top_n}.pickle'),
}

retrievers = [
    FavoriteItem(**config_favorite),
    CoOccurrenceItem(**config_cooccurrence),
    PopularItem(**config_popular),
]
config_concat = {**base_config, 'retrievers': retrievers}

# user_ids exactly as they appear in the raw test file (tab-separated). No
# de-duplication is applied here.
test_users = pd.read_csv('../data/raw/test.tsv', delimiter='\t')['user_id'].tolist()

retriever = ConcatRetriever(**config_concat)
retriever.fit(df)
retriever.search(test_users)
# evaluate() is deliberately not called for this run, and get_pairs is asked
# for target=False (presumably because `df` holds no ground truth for the test
# window -- TODO confirm).

pairs = retriever.get_pairs(df, target=False)
filename = f'test_pairs_{date_th}_t{train_period}_e{eval_period}_n{top_n}.csv'
pairs.to_csv(filename, index=False)  # index=False: keep the row index out of the file
pairs.head()

Unnamed: 0,user_id,product_id
0,0000008_A,00008092_a
1,0000008_A,00006560_a
2,0000008_A,00006259_a
3,0000008_A,00004596_a
4,0000008_A,00000269_a
