In [1]:
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import cudf
from pathlib import Path

from src.utils import get_data_period
from src.features import UserActionScore, ItemActionScore, UserItemActionScore, ConcatFeatureTransformer
from src.features import UserAttribute, ItemAttribute
from src.features import WeeklyActionSimilarity, TimelyActionSimilarity
from src.features import ItemGroupActionScore
from src.features import Item2ItemSimilarity

In [2]:
# --- Experiment identity and file locations ---------------------------------
EXP_NO = 'exp006'
TRAIN_PATH = '../data/processed/train.csv'
OUTPUT_DIR = Path('../data') / EXP_NO

# --- Window sizes and candidate count ---------------------------------------
# These three values are passed to get_data_period (the two periods) and are
# embedded in the pairs_/features_ file names.
# NOTE(review): units are presumably days — confirm against get_data_period.
train_period, eval_period = 14, 7
top_n = 100

# --- Splits ------------------------------------------------------------------
# Each row is (date_th, train_flag). The two lists are consumed in lockstep
# with zip(), so they are built from one table to keep them the same length.
date_th_list, train_flag_list = (
    list(column)
    for column in zip(
        ('2017-04-16', True),
        ('2017-04-23', True),
        ('2017-04-30', False),
    )
)

In [3]:
# Build and save one feature table per (date_th, train_flag) split.
#
# For every split this cell:
#   1. reads the action log and the candidate pairs file for that split,
#   2. keeps only rows with rated == 1 when the split is a training split,
#   3. fits/applies the concatenated feature transformers on the train window,
#   4. writes key columns + features to OUTPUT_DIR as features_<suffix>.csv.

# item2vec artifacts consumed by the attribute / similarity transformers.
# They are the same for every split, so they are defined once up front.
item2vec_dir = Path('../model/item2vec/exp006/')
item_cluster_path = '../model/item2vec/exp006/item_cluster.csv'

for date_th, train_flag in zip(date_th_list, train_flag_list):
    print('='*50)
    print(f'# date_th={date_th}, train_flag={train_flag}')

    # The log is re-read on every iteration, so each split works on a fresh frame.
    # NOTE(review): if the transformers never mutate user_log in place, this
    # read can be hoisted above the loop to avoid parsing the CSV three times.
    user_log = cudf.read_csv(TRAIN_PATH, parse_dates=['time_stamp'])

    # Shared tail of the pairs_/features_ file names for this split.
    file_suffix = f'{date_th}_t{train_period}_e{eval_period}_n{top_n}.csv'
    pairs_path = OUTPUT_DIR/f'pairs_{file_suffix}'
    pairs = cudf.read_csv(pairs_path)
    if train_flag:
        # reset_index gives the filtered frame a fresh 0..n-1 index, which the
        # column-wise concat below relies on.
        pairs = pairs[pairs['rated']==1].reset_index(drop=True)

    # Only the training window is needed here; the last two return values of
    # get_data_period are ignored.
    train_start_date, train_end_date, _, _ = get_data_period(date_th, train_period, eval_period)

    config_common = {
        'start_date': train_start_date,
        'end_date': train_end_date,
    }

    # Fix: the original bound decay_rate=1.0 to `config_decay` and 0.8 to
    # `config_nondecay`, i.e. the names were swapped.
    # NOTE(review): assumes decay_rate is a multiplicative weight, so 1.0 means
    # "no decay" — confirm in src.features.
    config_nondecay = {'decay_rate': 1.0, **config_common}
    config_decay = {'decay_rate': 0.8, **config_common}

    config_itemattr = {
        'item_cluster_path': item_cluster_path,
        **config_common
    }

    config_userattr = {
        'model_dir': item2vec_dir,
        **config_common
    }

    config_item2item = {
        'model_dir': item2vec_dir,
        **config_common
    }

    # Only referenced by the disabled ItemGroupActionScore entry below; kept so
    # that transformer can be re-enabled without further edits.
    config_itemgroup = {
        'decay_rate': 1.0,
        'item_cluster_path': item_cluster_path,
        **config_common
    }

    config_concat_feature_transformer = {
        'feature_transformers': [
            # Disabled transformers (originally configured with decay_rate=1.0):
            #WeeklyActionSimilarity(**config_nondecay),
            #TimelyActionSimilarity(**config_nondecay),
            Item2ItemSimilarity(**config_item2item),
            ItemAttribute(**config_itemattr),
            UserAttribute(**config_userattr),
            # Action scores: rate 1.0 first, then rate 0.8. This order is the
            # same as before the rename, so the output column order is unchanged.
            UserActionScore(**config_nondecay),
            ItemActionScore(**config_nondecay),
            UserItemActionScore(**config_nondecay),
            UserActionScore(**config_decay),
            ItemActionScore(**config_decay),
            UserItemActionScore(**config_decay),

            #ItemGroupActionScore(**config_itemgroup),
        ],
        **config_common,
    }

    feature_transformer = ConcatFeatureTransformer(**config_concat_feature_transformer)
    features = feature_transformer.fit_transform(user_log, pairs)

    # concat(axis=1) aligns on the index; a row-count mismatch would silently
    # produce null-padded rows, so fail loudly instead.
    # NOTE(review): also assumes `features` carries the same default index as
    # `pairs` — confirm in ConcatFeatureTransformer.
    assert len(features) == len(pairs), (
        f'feature rows ({len(features)}) != pair rows ({len(pairs)}) for date_th={date_th}'
    )
    features = cudf.concat([pairs[['user_id', 'product_id', 'target', 'rated']], features], axis=1)
    features.to_csv(OUTPUT_DIR/f'features_{file_suffix}', index=False)
    print(f'features.shape: {features.shape}')

# date_th=2017-04-16, train_flag=True
features.shape: (1201100, 33)
# date_th=2017-04-23, train_flag=True
features.shape: (1150100, 33)
# date_th=2017-04-30, train_flag=False
features.shape: (1159800, 33)


In [4]:
# Column names of the most recently built feature table. `features` is
# reassigned on every loop iteration, so this reflects the last split
# in date_th_list.
[*features.columns]

['user_id',
 'product_id',
 'target',
 'rated',
 'avg_sim',
 'max_sim',
 'item_cluster_id',
 'item_category',
 'user_cluster_id',
 'cv-score-r1.0_by_user',
 'click-score-r1.0_by_user',
 'pv-score-r1.0_by_user',
 'other-score-r1.0_by_user',
 'cv-score-r1.0_by_item',
 'click-score-r1.0_by_item',
 'pv-score-r1.0_by_item',
 'other-score-r1.0_by_item',
 'cv-score-r1.0_by_user-item',
 'click-score-r1.0_by_user-item',
 'pv-score-r1.0_by_user-item',
 'other-score-r1.0_by_user-item',
 'cv-score-r0.8_by_user',
 'click-score-r0.8_by_user',
 'pv-score-r0.8_by_user',
 'other-score-r0.8_by_user',
 'cv-score-r0.8_by_item',
 'click-score-r0.8_by_item',
 'pv-score-r0.8_by_item',
 'other-score-r0.8_by_item',
 'cv-score-r0.8_by_user-item',
 'click-score-r0.8_by_user-item',
 'pv-score-r0.8_by_user-item',
 'other-score-r0.8_by_user-item']

In [5]:
# Display the feature table left over from the final loop iteration
# (identifier columns followed by the generated feature columns).
features

Unnamed: 0,user_id,product_id,target,rated,avg_sim,max_sim,item_cluster_id,item_category,user_cluster_id,cv-score-r1.0_by_user,...,pv-score-r0.8_by_user,other-score-r0.8_by_user,cv-score-r0.8_by_item,click-score-r0.8_by_item,pv-score-r0.8_by_item,other-score-r0.8_by_item,cv-score-r0.8_by_user-item,click-score-r0.8_by_user-item,pv-score-r0.8_by_user-item,other-score-r0.8_by_user-item
0,0000008_A,00008092_a,0,0,0.468301,1.000000,2,a,6,0.0,...,6.130449,0.000000,3.086395,0.314083,41.081882,49.126359,0.0,0.0,0.643479,0.0
1,0000008_A,00006560_a,0,0,0.629110,1.000000,5,a,6,0.0,...,6.130449,0.000000,4.387759,1.334656,131.993558,55.493268,0.0,0.0,0.643173,0.0
2,0000008_A,00006259_a,0,0,0.561879,1.000000,5,a,6,0.0,...,6.130449,0.000000,5.408501,1.177420,120.660752,43.690444,0.0,0.0,1.173140,0.0
3,0000008_A,00004596_a,0,0,0.522222,1.000000,11,a,6,0.0,...,6.130449,0.000000,4.008576,3.911880,101.952618,93.917275,0.0,0.0,0.643040,0.0
4,0000008_A,00000269_a,0,0,0.355354,1.000000,5,a,6,0.0,...,6.130449,0.000000,9.796767,5.549411,290.481457,211.375620,0.0,0.0,0.529859,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1159795,0107518_D,00667822_d,0,0,0.993547,0.999546,17,d,17,0.0,...,2.030333,88.160648,0.000000,0.000000,4.688855,18.187748,0.0,0.0,0.000000,0.0
1159796,0107518_D,00606475_d,0,0,0.993124,0.999039,18,d,17,0.0,...,2.030333,88.160648,0.000000,0.000000,0.773574,0.000000,0.0,0.0,0.000000,0.0
1159797,0107518_D,00476436_d,0,0,0.988775,0.995000,0,d,17,0.0,...,2.030333,88.160648,0.000000,0.000000,0.071302,0.000000,0.0,0.0,0.000000,0.0
1159798,0107518_D,00707654_d,0,0,0.975349,0.980783,14,d,17,0.0,...,2.030333,88.160648,0.000000,0.000000,0.072660,0.000000,0.0,0.0,0.000000,0.0
