In [1]:
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import cudf
from pathlib import Path

from src.utils import get_data_period
from src.features import UserActionScore, ItemActionScore, UserItemActionScore, ConcatFeatureTransformer
from src.features import UserAttribute, ItemAttribute

In [2]:
# --- Experiment configuration -------------------------------------------
# Raw interaction log, and the per-experiment directory that holds the
# candidate-pair inputs and receives the generated feature files.
TRAIN_PATH = "../data/processed/train.csv"
EXP_NO = "exp005"
OUTPUT_DIR = Path("../data") / EXP_NO

# Date thresholds to build features for; train_flag_list is parallel to
# date_th_list and marks which thresholds are treated as training splits.
date_th_list = ["2017-04-16", "2017-04-23", "2017-04-30"]
train_flag_list = [True] * 2 + [False]

# Window lengths handed to get_data_period (presumably in days -- confirm
# in src.utils) and the candidate count encoded in the pair/feature file names.
train_period, eval_period = 14, 7
top_n = 100

In [3]:
# Build and save one feature table per date threshold.
#
# For each (date_th, train_flag) pair this loop:
#   1. loads the interaction log and the candidate pairs for that threshold,
#   2. keeps only rows with rated == 1 when the threshold is a training split,
#   3. runs every feature transformer over the training window,
#   4. writes the pair key/label columns plus the features to OUTPUT_DIR.
for date_th, train_flag in zip(date_th_list, train_flag_list):
    print('='*50)
    print(f'# date_th={date_th}, train_flag={train_flag}')

    # Suffix shared by the pairs input file and the features output file.
    file_suffix = f'{date_th}_t{train_period}_e{eval_period}_n{top_n}.csv'

    # The log is re-read on every iteration so each split starts from a fresh
    # frame; whether fit_transform mutates its input is not visible from here.
    user_log = cudf.read_csv(TRAIN_PATH, parse_dates=['time_stamp'])
    pairs = cudf.read_csv(OUTPUT_DIR/f'pairs_{file_suffix}')
    if train_flag:
        # reset_index keeps the filtered pairs on a 0..n-1 index, which the
        # column-wise concat below relies on for row alignment.
        pairs = pairs[pairs['rated']==1].reset_index(drop=True)

    # Only the training window is needed; the evaluation bounds are unused.
    train_start_date, train_end_date, _, _ = get_data_period(date_th, train_period, eval_period)

    config_common = {
        'start_date': train_start_date,
        'end_date': train_end_date,
    }

    # The original names were swapped: the config called "decay" carried
    # decay_rate=1.0 and the one called "nondecay" carried 0.8. A rate of 1.0
    # presumably means no time decay (confirm in src.features), so the names
    # now match the values. Transformer order below is unchanged.
    config_nondecay = {'decay_rate':1.0, **config_common}
    config_decay = {'decay_rate':0.8, **config_common}

    config_itemattr = {
        'item_cluster_path': '../model/item2vec/exp005/item_cluster.csv',
        **config_common
    }

    config_userattr = {
        'model_dir': Path('../model/item2vec/exp005/'),
        **config_common
    }

    config_concat_feature_transformer = {
        'feature_transformers': [
            ItemAttribute(**config_itemattr),
            UserAttribute(**config_userattr),
            # decay_rate=1.0 scores first, then decay_rate=0.8 scores, so the
            # output column order (r1.0 before r0.8) stays the same.
            UserActionScore(**config_nondecay),
            ItemActionScore(**config_nondecay),
            UserItemActionScore(**config_nondecay),
            UserActionScore(**config_decay),
            ItemActionScore(**config_decay),
            UserItemActionScore(**config_decay),
        ],
        **config_common,
    }

    feature_transformer = ConcatFeatureTransformer(**config_concat_feature_transformer)
    features = feature_transformer.fit_transform(user_log, pairs)

    # Prepend the key and label columns to the generated features.
    # NOTE(review): assumes `features` is row-aligned (same index) with `pairs` -- confirm.
    features = cudf.concat([pairs[['user_id', 'product_id', 'target', 'rated']], features], axis=1)
    features.to_csv(OUTPUT_DIR/f'features_{file_suffix}', index=False)
    print(f'features.shape: {features.shape}')

# date_th=2017-04-16, train_flag=True
features.shape: (1201100, 31)
# date_th=2017-04-23, train_flag=True
features.shape: (1150100, 31)
# date_th=2017-04-30, train_flag=False
features.shape: (1159800, 31)


In [4]:
# Show the column names of `features`, which at this point is the frame left
# over from the last iteration of the feature-building loop.
[*features.columns]

['user_id',
 'product_id',
 'target',
 'rated',
 'item_cluster_id',
 'item_category',
 'user_cluster_id',
 'cv-score-r1.0_by_user',
 'click-score-r1.0_by_user',
 'pv-score-r1.0_by_user',
 'other-score-r1.0_by_user',
 'cv-score-r1.0_by_item',
 'click-score-r1.0_by_item',
 'pv-score-r1.0_by_item',
 'other-score-r1.0_by_item',
 'cv-score-r1.0_by_user-item',
 'click-score-r1.0_by_user-item',
 'pv-score-r1.0_by_user-item',
 'other-score-r1.0_by_user-item',
 'cv-score-r0.8_by_user',
 'click-score-r0.8_by_user',
 'pv-score-r0.8_by_user',
 'other-score-r0.8_by_user',
 'cv-score-r0.8_by_item',
 'click-score-r0.8_by_item',
 'pv-score-r0.8_by_item',
 'other-score-r0.8_by_item',
 'cv-score-r0.8_by_user-item',
 'click-score-r0.8_by_user-item',
 'pv-score-r0.8_by_user-item',
 'other-score-r0.8_by_user-item']