# Author information:
Fan Shengzhe, Shanghai Jiao Tong University, Shanghai, China  
Email: fanshengzhe@sjtu.edu.cn

# 1.读取数据

## 1.1 基本数据的读取

In [None]:
# Mount Google Drive inside the Colab runtime so the project files are reachable.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
import os
# Work from the project folder so the relative paths used below (e.g. './cache') resolve.
os.chdir("drive/My Drive/reco/agri-machine-reco")

In [None]:
import pandas as pd
import numpy as np
import os
import json
import tqdm
import gc
from functools import partial

# Directory the input tables are read from, and directory the outputs are written to.
data_dir = './cache'
save_dir = './cache'

# exist_ok avoids the check-then-create race and also creates missing parent directories.
os.makedirs(save_dir, exist_ok=True)

# Interaction log plus the user / item / consumer feature tables.
log_table = pd.read_csv(os.path.join(data_dir, 'ctx_info.csv'))
user_info = pd.read_csv(os.path.join(data_dir, 'user_info.csv'))
item_info = pd.read_csv(os.path.join(data_dir, 'item_info.csv'))
consumer_info = pd.read_csv(os.path.join(data_dir, 'consumer_info.csv'))

In [None]:
# Display the interaction log.
log_table

Unnamed: 0,event_time,event_type,item_id,region,price,user_id,user_session,is_nan_region,year,month,season,yearday
0,1514764800,intent,0,0.0,0.081670,0,0,0,2018,1,3,1
1,1514765777,intent,1,0.0,0.081670,0,0,0,2018,1,3,1
2,1514767080,intent,2,0.0,0.081670,0,0,0,2018,1,3,1
3,1514814325,intent,3,0.0,0.081670,0,0,0,2018,1,3,1
4,1514816931,intent,4,0.0,0.081670,0,0,0,2018,1,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...
422121,1609448447,view,10356,15.0,0.085841,54532,86243,0,2020,12,3,366
422122,1609448773,view,25183,15.0,0.090755,54532,86244,0,2020,12,3,366
422123,1609449425,view,2889,15.0,0.102280,54532,86245,0,2020,12,3,366
422124,1609449425,view,2886,15.0,0.085841,54532,86246,0,2020,12,3,366


In [None]:
# Memory-saving helper: downcast numeric columns to the smallest dtype that can hold them.
def reduce_mem(df):
  """Downcast the numeric columns of ``df`` in place and return ``df``.

  Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
  are touched. Columns whose min or max is null (e.g. all-NaN) are skipped.
  Integer columns go to the narrowest of int8/int16/int32/int64 whose range
  contains [min, max]; float columns go to float16, then float32, else float64.

  NOTE: the float downcast is lossy (float16 keeps roughly three significant
  decimal digits), so stored values may change slightly.

  Args:
    df: DataFrame to shrink; it is modified in place.

  Returns:
    The same DataFrame object, with downcast columns.
  """
  numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
  int_types = (np.int8, np.int16, np.int32, np.int64)
  float_types = (np.float16, np.float32)

  for col in df.columns:
    col_type = df[col].dtypes
    if col_type not in numerics:
      continue

    c_min = df[col].min()
    c_max = df[col].max()
    if pd.isnull(c_min) or pd.isnull(c_max):
      continue

    if str(col_type)[:3] == 'int':
      for int_type in int_types:
        limits = np.iinfo(int_type)
        # Inclusive bounds: a value equal to the dtype's min/max still fits.
        if limits.min <= c_min and c_max <= limits.max:
          df[col] = df[col].astype(int_type)
          break
    else:
      # float64 is the fallback when the range (e.g. +/-inf) fits nothing smaller.
      target_type = np.float64
      for float_type in float_types:
        limits = np.finfo(float_type)
        if limits.min <= c_min and c_max <= limits.max:
          target_type = float_type
          break
      df[col] = df[col].astype(target_type)
  return df

In [None]:
# Downcast the numeric columns of the four base tables to save memory.
log_table = reduce_mem(log_table)
user_info = reduce_mem(user_info)
item_info = reduce_mem(item_info)
consumer_info = reduce_mem(consumer_info)

In [None]:
try:
  from pandarallel import pandarallel
except:
  !pip install pandarallel
  from pandarallel import pandarallel
pandarallel.initialize(progress_bar=False)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pandarallel
  Downloading pandarallel-1.6.4.tar.gz (12 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pandarallel
  Building wheel for pandarallel (setup.py) ... [?25l[?25hdone
  Created wheel for pandarallel: filename=pandarallel-1.6.4-py3-none-any.whl size=16678 sha256=7d86c607bbe04f7b9ea93ca528dda664aa28822ec19d79177ef3ace68a1bde47
  Stored in directory: /root/.cache/pip/wheels/62/1e/e7/f9ee096e5cc02890a6934a5670ff6e45a3400f330605bd8210
Successfully built pandarallel
Installing collected packages: pandarallel
Successfully installed pandarallel-1.6.4
INFO: Pandarallel will run on 2 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


### 1.1.1 i侧 i: item_info倒排表

In [None]:
def get_item_info_dict(item_info_df):
  """Build an inverted index ``{feature_name: {item_id: value}}`` from an item table.

  Every column except ``item_id`` becomes one inner dict keyed by item id.
  Raises ValueError if the table has no ``item_id`` column.
  """
  feature_cols = [col for col in item_info_df.columns]
  feature_cols.remove('item_id')
  return {
    col: dict(zip(item_info_df['item_id'], item_info_df[col]))
    for col in feature_cols
  }

In [None]:
# Both tables are keyed by item_id, so the same builder serves the item and the consumer features.
item_info_dict = get_item_info_dict(item_info)
consumer_info_dict = get_item_info_dict(consumer_info)

In [None]:
# List the available item-side feature names.
item_info_dict.keys()

dict_keys(['i_deal_mean_year', 'i_deal_mean_month', 'i_deal_mean_season', 'i_deal_mean_yearday', 'i_pos_weight_mean_year', 'i_pos_weight_mean_month', 'i_pos_weight_mean_season', 'i_pos_weight_mean_yearday', 'i_deal_mode_region', 'i_pos_weight_mode_region', 'i_active_deal_nums', 'i_active_score', 'i_active_deal_day_nums', 'i_active_weight_day_nums', 'category_id', 'power', 'i_weighted_mean_price', 'i_weight_mode_region'])

In [None]:
# List the available consumer-side feature names.
consumer_info_dict.keys()

dict_keys(['c_deal_mean_u_deal_mean_year', 'c_deal_mean_u_deal_mean_month', 'c_deal_mean_u_deal_mean_season', 'c_deal_mean_u_deal_mean_yearday', 'c_pos_weight_mean_u_pos_weight_mean_year', 'c_pos_weight_mean_u_pos_weight_mean_month', 'c_pos_weight_mean_u_pos_weight_mean_season', 'c_pos_weight_mean_u_pos_weight_mean_yearday', 'c_deal_mean_u_deal_mean_power', 'c_pos_weight_mean_u_pos_weight_mean_power', 'c_deal_mode_u_deal_mode_category_id', 'c_pos_weight_mode_u_pos_weight_mode_category_id', 'c_deal_mode_u_deal_mode_region', 'c_pos_weight_mode_u_pos_weight_mode_region', 'c_deal_mean_u_deal_mean_price', 'c_pos_weight_mean_u_pos_weight_mean_price', 'c_deal_mean_u_active_deal_nums', 'c_pos_weight_mean_u_active_score', 'c_deal_mean_u_active_deal_day_nums', 'c_pos_weight_mean_u_active_weight_day_nums'])

### 1.1.2 u侧 u: user_info倒排表

In [None]:
def get_user_info_dict(user_info_df):
  """Build an inverted index ``{feature_name: {user_id: value}}`` from a per-user table.

  Every column except ``user_id`` becomes one inner dict keyed by user id.
  Raises ValueError if the table has no ``user_id`` column.
  """
  feature_names = list(user_info_df.columns)
  feature_names.remove('user_id')

  lookup = {}
  for name in feature_names:
    lookup[name] = dict(zip(user_info_df['user_id'], user_info_df[name]))
  return lookup

In [None]:
# Build the user-side inverted index and list its feature names.
user_info_dict = get_user_info_dict(user_info)
user_info_dict.keys()

dict_keys(['u_deal_mean_year', 'u_deal_mean_month', 'u_deal_mean_season', 'u_deal_mean_yearday', 'u_pos_weight_mean_year', 'u_pos_weight_mean_month', 'u_pos_weight_mean_season', 'u_pos_weight_mean_yearday', 'u_deal_mean_power', 'u_pos_weight_mean_power', 'u_deal_mode_category_id', 'u_pos_weight_mode_category_id', 'u_deal_mode_region', 'u_pos_weight_mode_region', 'u_deal_mean_price', 'u_pos_weight_mean_price', 'u_active_deal_nums', 'u_active_score', 'u_active_deal_day_nums', 'u_active_weight_day_nums'])

## 1.2 拆分history和label

In [None]:
# Load the history interactions and the per-user last interaction (used as the label source below).
hist_click_df = reduce_mem(pd.read_csv(os.path.join(data_dir, 'hist_click_df.csv')))
last_click_df = reduce_mem(pd.read_csv(os.path.join(data_dir, 'last_click_df.csv')))

In [None]:
# Display the history interactions.
hist_click_df

Unnamed: 0,event_time,event_type,item_id,region,price,user_id,user_session,is_nan_region,year,month,season,yearday
0,1514764800,intent,0,0.0,0.081665,0,0,0,2018,1,3,1
1,1515015684,deal,0,0.0,0.081665,0,0,0,2018,1,3,3
2,1521177331,view,0,0.0,0.081665,290,644,0,2018,3,0,75
3,1521184173,intent,0,0.0,0.081665,290,644,0,2018,3,0,75
4,1521272146,view,0,0.0,0.081665,290,644,0,2018,3,0,76
...,...,...,...,...,...,...,...,...,...,...,...,...
342478,1609237640,view,32779,9.0,0.124207,54481,86170,0,2020,12,3,364
342479,1609404135,remove_intent,32780,26.0,0.089783,54506,86208,0,2020,12,3,366
342480,1609404461,remove_intent,32780,26.0,0.089783,54506,86208,0,2020,12,3,366
342481,1609352004,view,32781,17.0,0.084778,54510,86212,0,2020,12,3,365


## 1.3 切分训练、验证用户

In [None]:
# Number of distinct users in the interaction log.
all_user_num = len(log_table['user_id'].unique())

In [None]:
import numpy as np

def trn_val_split(log_table, sample_user_nums=5000, random_state=None):
  """Randomly split the users of ``log_table`` into training and validation users.

  ``sample_user_nums`` users are drawn without replacement as validation
  candidates. For each candidate the chronologically last row is treated as the
  answer and removed; candidates with no remaining rows are dropped from the
  validation set. All users that were not sampled form the training set.

  Args:
    log_table: DataFrame with ``user_id``, ``item_id`` and ``event_time`` columns.
    sample_user_nums: number of users to sample as validation candidates.
    random_state: optional seed for a reproducible split. ``None`` (default)
      draws from numpy's global random state, as before.

  Returns:
    ``(trn_user, val_user)``: two Series of unique user ids.

  Raises:
    ValueError: from numpy if ``sample_user_nums`` exceeds the number of users.
  """
  all_user_ids = log_table.user_id.unique()

  if random_state is None:
    sampled_user_ids = np.random.choice(all_user_ids, size=sample_user_nums, replace=False)
  else:
    # A dedicated generator keeps the split reproducible without touching the global seed.
    rng = np.random.RandomState(random_state)
    sampled_user_ids = rng.choice(all_user_ids, size=sample_user_nums, replace=False)

  is_sampled = log_table['user_id'].isin(sampled_user_ids)
  log_val = log_table[is_sampled]

  # Take each validation user's last interaction out as the answer.
  log_val = log_val.sort_values(['user_id', 'event_time'])
  val_ans = log_val.groupby('user_id').tail(1)

  # Appending the answers twice and dropping every duplicated key removes them from log_val.
  log_val = pd.concat([log_val, val_ans, val_ans]).drop_duplicates(['user_id', 'item_id', 'event_time'], keep=False)
  # Keep only users that appear both in the answers and in the remaining validation history.
  val_ans = val_ans[val_ans.user_id.isin(log_val.user_id.unique())]
  log_val = log_val[log_val.user_id.isin(val_ans.user_id.unique())]

  log_trn = log_table[~is_sampled]

  return log_trn['user_id'].drop_duplicates(), log_val['user_id'].drop_duplicates()

In [None]:
# Sample 10% of all users as validation candidates; the users not sampled form the training set.
trn_user, val_user = trn_val_split(log_table, int(all_user_num * 0.1))

## 1.4 读取召回数据

### 1.4.1 读取召回字典

In [None]:
def int_keys(ordered_pairs):
  """``object_pairs_hook`` for ``json.load`` that turns integer-like keys into ints.

  Keys that ``int()`` rejects with ValueError are kept unchanged. When two keys
  collapse to the same value, the later pair wins.
  """
  def _coerce(raw_key):
    try:
      return int(raw_key)
    except ValueError:
      return raw_key

  return {_coerce(raw_key): value for raw_key, value in ordered_pairs}

In [None]:
# JSON object keys are always strings; int_keys converts them back to ints where possible.
with open(os.path.join(data_dir, 'recall_items_dict.json'), 'r', encoding="utf-8") as f:
  recall_list_dict = json.load(f, object_pairs_hook=int_keys)

### 1.4.2 召回字典转化为dataframe

In [None]:
# Convert the recall dictionary into a DataFrame.
def recall_dict2df(recall_list_dict):
  """Flatten ``{user: [(item, score), ...]}`` into a (user_id, item_id, score) DataFrame.

  One output row is produced per recalled (user, item) pair, in dictionary order.
  """
  rows = [
    [user, item, score]
    for user, recall_list in tqdm.tqdm(recall_list_dict.items())
    for item, score in recall_list
  ]
  return pd.DataFrame(rows, columns=['user_id', 'item_id', 'score'])

In [None]:
# Flatten the recall dictionary into a DataFrame and downcast its numeric columns.
recall_list_df = reduce_mem(recall_dict2df(recall_list_dict))

100%|██████████| 54533/54533 [00:10<00:00, 5030.70it/s] 


In [None]:
# The dictionary is no longer needed once the DataFrame exists; release its memory.
del recall_list_dict
gc.collect()

0

### 1.4.3 对召回列表打标、负采样

In [None]:
# Attach ranking labels to the recalled (user, item) pairs.
def get_rank_label_df(recall_list_df, label_df, is_test=False):
  """Label each recalled (user_id, item_id) pair by the event type found in ``label_df``.

  Labels: ``deal`` -> 4, ``intent`` -> 3, ``view`` -> 2, ``remove_intent`` -> 0,
  and 1 for pairs that do not appear in ``label_df``. An event type outside this
  set yields a missing label.

  Args:
    recall_list_df: DataFrame with at least ``user_id`` and ``item_id`` columns.
    label_df: DataFrame with ``user_id``, ``item_id`` and ``event_type`` columns.
    is_test: when True there are no labels; every row gets label -1.

  Returns:
    A new DataFrame equal to ``recall_list_df`` plus a ``label`` column. The
    input frame is not modified.
  """
  # The test set has no labels.
  if is_test:
    # assign() returns a new frame, so a sliced input is never written to.
    return recall_list_df.assign(label=-1)

  event_type2label = {'deal': 4, 'intent': 3, 'view': 2, 'remove_intent': 0}

  recall_list_df_ = recall_list_df.merge(label_df[['user_id', 'item_id', 'event_type']], \
                       how='left', on=['user_id', 'item_id'])
  event_type = recall_list_df_.pop('event_type')

  # Vectorised lookup replaces the per-row apply; pairs with no match in label_df get 1.
  label = event_type.map(event_type2label)
  label = label.where(event_type.notna(), 1)
  if label.notna().all():
    # map() yields floats when some rows were unmatched; restore an integer label column.
    label = label.astype('int64')
  recall_list_df_['label'] = label

  return recall_list_df_

In [None]:
# Down-sample the negative rows of the labelled recall list.
def neg_sample_recall_data(recall_items_df, sample_rate=0.001):
  """Keep all positive rows and a small sample of the negative rows.

  Rows with label 4, 3, 2 or 0 are treated as positives and kept as they are.
  Rows with label 1 are negatives: they are sampled once per user and once per
  item, the two samples are merged and de-duplicated on (user_id, item_id).

  Requires pandarallel to be initialised (uses ``parallel_apply``).

  Args:
    recall_items_df: DataFrame with ``user_id``, ``item_id``, ``score`` and ``label``.
    sample_rate: fraction of each group's negatives to draw; the per-group
      sample size is clamped to the range [1, 5].

  Returns:
    DataFrame with the positives followed by the sampled negatives (new index).
  """
  pos_labels = [4, 3, 2, 0]
  neg_labels = [1]

  def pos_neg_ratio(pos_num, neg_num):
    # Report NaN instead of raising ZeroDivisionError when there are no negatives.
    return pos_num / neg_num if neg_num else float('nan')

  pos_data = recall_items_df[recall_items_df['label'].isin(pos_labels)]
  neg_data = recall_items_df[recall_items_df['label'].isin(neg_labels)]

  print('before: pos_data_num:', len(pos_data), 'neg_data_num:', len(neg_data), 'pos/neg:', pos_neg_ratio(len(pos_data), len(neg_data)))

  # Per-group sampling function.
  def neg_sample_func(group_df):
    neg_num = len(group_df)
    sample_num = max(int(neg_num * sample_rate), 1) # at least one row per group
    sample_num = min(sample_num, 5) # at most five rows per group; tune as needed
    return group_df.sample(n=sample_num, replace=True)

  # Negative sampling, once grouped by user and once grouped by item.
  neg_data_user_sample = neg_data.groupby('user_id', group_keys=False).parallel_apply(neg_sample_func)
  neg_data_item_sample = neg_data.groupby('item_id', group_keys=False).parallel_apply(neg_sample_func)

  # Merge both samples (DataFrame.append no longer exists in pandas 2.x).
  neg_data_new = pd.concat([neg_data_user_sample, neg_data_item_sample])
  # The two passes are independent and sample with replacement, so the same row can be picked more than once.
  neg_data_new = neg_data_new.sort_values(['user_id', 'score']).drop_duplicates(['user_id', 'item_id'], keep='last')

  # Put the positives back together with the sampled negatives.
  data_new = pd.concat([pos_data, neg_data_new], ignore_index=True)

  new_pos_data = data_new[data_new['label'].isin(pos_labels)]
  new_neg_data = data_new[data_new['label'].isin(neg_labels)]
  print('after: pos_data_num:', len(new_pos_data), 'neg_data_num:', len(new_neg_data), 'pos/neg:', pos_neg_ratio(len(new_pos_data), len(new_neg_data)))

  return data_new

In [None]:
# Entry point: label the recall list, then down-sample the negatives of the training users.
def get_user_recall_item_label_df(hist_click_df, last_click_df, recall_list_df, sample_rate=0.001, is_test=False):
  """Label the recall list and negative-sample the training part of it.

  Reads the module-level ``trn_user`` and ``val_user`` to tell training users
  from validation users. Validation rows are kept unsampled.

  Args:
    hist_click_df: history interactions; only users present here are kept.
    last_click_df: per-user last interaction, used as the label source.
    recall_list_df: DataFrame of recalled (user_id, item_id, score) rows.
    sample_rate: forwarded to ``neg_sample_recall_data``.
    is_test: when True all rows get label -1 and no sampling is done.

  Returns:
    DataFrame of the recalled rows with a ``label`` column.
  """
  # Keep only the recall rows of users that have history.
  user_items_df = recall_list_df[recall_list_df['user_id'].isin(hist_click_df['user_id'].unique())]
  # Attach labels (-1 everywhere in test mode).
  user_item_label_df = get_rank_label_df(user_items_df, last_click_df, is_test=is_test)

  if is_test:
    # All labels are -1, so there are no label-1 negatives to sample and the
    # pos/neg ratio inside the sampler would divide by zero.
    return user_item_label_df

  # Negative sampling is applied to the training users only.
  trn_user_item_label_df = neg_sample_recall_data(user_item_label_df[user_item_label_df['user_id'].isin(trn_user)], sample_rate=sample_rate)
  val_user_item_label_df = user_item_label_df[user_item_label_df['user_id'].isin(val_user)]
  user_item_label_df = pd.concat([trn_user_item_label_df, val_user_item_label_df])
  return user_item_label_df

In [None]:
# Build the labelled, negative-sampled ranking table and downcast its numeric columns.
user_item_label_df = reduce_mem(get_user_recall_item_label_df(hist_click_df, last_click_df, recall_list_df, sample_rate=0.001))

before: pos_data_num: 43765 neg_data_num: 12481396 pos/neg: 0.0035064186730394582
after: pos_data_num: 43765 neg_data_num: 76754 pos/neg: 0.5701982958542877


### 1.4.4 拼接上下文特征

In [None]:
# Attach each user's last-interaction columns (all except item_id and event_type) to every row of that user.
user_item_label_df = reduce_mem(user_item_label_df.merge(last_click_df.drop(['item_id', 'event_type'], axis=1), how='left', on='user_id'))

# 2.特征工程

## 2.1 拼接召回特征

In [None]:
# Join the user features on user_id, then the item and consumer features on item_id.
user_item_label_df = pd.merge(user_item_label_df, user_info, how='left', on='user_id')
user_item_label_df = pd.merge(user_item_label_df, item_info, how='left', on='item_id')
user_item_label_df = reduce_mem(pd.merge(user_item_label_df, consumer_info, how='left', on='item_id'))

## 2.2 获取last ua item

In [None]:
# Sort each user's history chronologically, keep only deal / intent / view events
# (remove_intent is excluded), and take the last remaining row per user.
hist_last_click_df = hist_click_df.sort_values(by=['user_id', 'event_time'])
hist_last_click_df = hist_last_click_df[(hist_last_click_df['event_type']=='deal') 
            | (hist_last_click_df['event_type']=='intent') | 
            (hist_last_click_df['event_type']=='view')].groupby('user_id').tail(1)

In [None]:
# Each user has exactly one last-interaction row, so the inverted index can be built the same way as for user_info.
hist_last_click_dict = get_user_info_dict(hist_last_click_df)

In [None]:
# List the columns available for each user's last history interaction.
hist_last_click_dict.keys()

dict_keys(['event_time', 'event_type', 'item_id', 'region', 'price', 'user_session', 'is_nan_region', 'year', 'month', 'season', 'yearday'])

In [None]:
# Only the dictionary form is used from here on; release the DataFrame.
del hist_last_click_df
gc.collect()

0

## 2.3 recall item vs last ua item

### 2.3.1 item侧--dense

In [None]:
# Reminder of the item-side feature names used in this section.
item_info_dict.keys()

dict_keys(['i_deal_mean_year', 'i_deal_mean_month', 'i_deal_mean_season', 'i_deal_mean_yearday', 'i_pos_weight_mean_year', 'i_pos_weight_mean_month', 'i_pos_weight_mean_season', 'i_pos_weight_mean_yearday', 'i_deal_mode_region', 'i_pos_weight_mode_region', 'i_active_deal_nums', 'i_active_score', 'i_active_deal_day_nums', 'i_active_weight_day_nums', 'category_id', 'power', 'i_weighted_mean_price', 'i_weight_mode_region'])

In [None]:
def get_recall_item_last_ua_item_dense_fea_diff_core(user_item_label_row, fea_name, fea_dict):
  """Return the recalled item's feature minus the same feature of the user's last history item.

  The last history item comes from the module-level ``hist_last_click_dict``.

  Args:
    user_item_label_row: row with ``user_id`` and ``item_id``.
    fea_name: feature to compare; a key of ``fea_dict``.
    fea_dict: inverted index ``{feature_name: {item_id: value}}``.

  Returns:
    The difference, or NaN when a lookup or the subtraction fails.
  """
  try:
    user_id = user_item_label_row['user_id']
    recall_item_id = user_item_label_row['item_id']
    last_ua_item_id = hist_last_click_dict['item_id'][user_id]

    recall_item_fea = fea_dict[fea_name][recall_item_id]
    last_ua_item_fea = fea_dict[fea_name][last_ua_item_id]
    return recall_item_fea - last_ua_item_fea
  except Exception:
    # Best effort: a missing user/item yields NaN. `except Exception` (not a bare
    # except) lets KeyboardInterrupt / SystemExit stop a long-running apply.
    return np.nan


def get_recall_item_last_ua_item_dense_fea_diff(user_item_label_df, in_cols, fea_name, fea_dict):
  """Compute the 'recalled item vs. last history item' difference column for one dense feature.

  Args:
    user_item_label_df: ranking table with ``user_id`` and ``item_id``.
    in_cols: extra columns the row function needs; not modified.
    fea_name: feature to compare; a key of ``fea_dict``.
    fea_dict: inverted index ``{feature_name: {item_id: value}}``.

  Returns:
    DataFrame with ``user_id``, ``item_id`` and ``rc_i_diff_last_ua_i_<fea_name>``.
  """
  new_col = 'rc_i_diff_last_ua_i_' + fea_name
  # Build a fresh list: `in_cols += ...` would grow the caller's list in place.
  used_cols = list(in_cols) + ['user_id', 'item_id']
  # Work on an explicit copy so the assignment below does not write into a slice.
  sub_df = user_item_label_df[used_cols].copy()
  sub_df[new_col] = sub_df.apply(\
      partial(get_recall_item_last_ua_item_dense_fea_diff_core, fea_name=fea_name, fea_dict=fea_dict), axis=1)
  return sub_df[['user_id', 'item_id', new_col]]

In [None]:
# Numeric item features compared between the recalled item and the user's last history item.
dense_feas = ['i_deal_mean_year', 'i_deal_mean_month', 'i_deal_mean_season', 
        'i_deal_mean_yearday', 'i_pos_weight_mean_year', 'i_pos_weight_mean_month', 
        'i_pos_weight_mean_season', 'i_pos_weight_mean_yearday', 
        'i_active_deal_nums', 'i_active_score', 'i_active_deal_day_nums', 
        'i_active_weight_day_nums', 'power', 'i_weighted_mean_price']

for fea in tqdm.tqdm(dense_feas):
  # Compute one difference column and join it back on (user_id, item_id).
  fea_df = get_recall_item_last_ua_item_dense_fea_diff(user_item_label_df, [], fea, item_info_dict)
  user_item_label_df = reduce_mem(pd.merge(user_item_label_df, fea_df, how='left', on=['user_id', 'item_id']))
  gc.collect()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_item_label_df['new_fea'] = user_item_label_df.apply(\
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(
100%|██████████| 14/14 [06:19<00:00, 27.14s/it]


### 2.3.2 item侧--sparse

In [None]:
def get_recall_item_last_ua_item_sparse_fea_diff_core(user_item_label_row, fea_name, fea_dict):
  """Return 1 if the recalled item and the user's last history item share the feature value, else 0.

  The last history item comes from the module-level ``hist_last_click_dict``.

  Args:
    user_item_label_row: row with ``user_id`` and ``item_id``.
    fea_name: categorical feature to compare; a key of ``fea_dict``.
    fea_dict: inverted index ``{feature_name: {item_id: value}}``.

  Returns:
    1 or 0, or NaN when a lookup or the comparison fails.
  """
  try:
    user_id = user_item_label_row['user_id']
    recall_item_id = user_item_label_row['item_id']
    last_ua_item_id = hist_last_click_dict['item_id'][user_id]

    recall_item_fea = fea_dict[fea_name][recall_item_id]
    last_ua_item_fea = fea_dict[fea_name][last_ua_item_id]
    return int(recall_item_fea==last_ua_item_fea)
  except Exception:
    # Best effort: a missing user/item yields NaN. `except Exception` (not a bare
    # except) lets KeyboardInterrupt / SystemExit stop a long-running apply.
    return np.nan


def get_recall_item_last_ua_item_sparse_fea_diff(user_item_label_df, in_cols, fea_name, fea_dict):
  """Compute the 'recalled item vs. last history item' match flag for one sparse feature.

  Args:
    user_item_label_df: ranking table with ``user_id`` and ``item_id``.
    in_cols: extra columns the row function needs; not modified.
    fea_name: categorical feature to compare; a key of ``fea_dict``.
    fea_dict: inverted index ``{feature_name: {item_id: value}}``.

  Returns:
    DataFrame with ``user_id``, ``item_id`` and ``rc_i_diff_last_ua_i_<fea_name>``.
  """
  new_col = 'rc_i_diff_last_ua_i_' + fea_name
  # Build a fresh list: `in_cols += ...` would grow the caller's list in place.
  used_cols = list(in_cols) + ['user_id', 'item_id']
  # Work on an explicit copy so the assignment below does not write into a slice.
  sub_df = user_item_label_df[used_cols].copy()
  sub_df[new_col] = sub_df.apply(\
      partial(get_recall_item_last_ua_item_sparse_fea_diff_core, fea_name=fea_name, fea_dict=fea_dict), axis=1)
  return sub_df[['user_id', 'item_id', new_col]]

In [None]:
# Categorical item features compared for equality between the recalled item and the user's last history item.
sparse_feas = ['i_deal_mode_region', 'i_pos_weight_mode_region', 'category_id','i_weight_mode_region']

for fea in tqdm.tqdm(sparse_feas):
  # Compute one match-flag column and join it back on (user_id, item_id).
  fea_df = get_recall_item_last_ua_item_sparse_fea_diff(user_item_label_df, [], fea, item_info_dict)
  user_item_label_df = reduce_mem(pd.merge(user_item_label_df, fea_df, how='left', on=['user_id', 'item_id']))
  gc.collect()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_item_label_df['new_fea'] = user_item_label_df.apply(\
100%|██████████| 4/4 [01:52<00:00, 28.24s/it]


### 2.3.3 consumer侧--dense

In [None]:
# Reminder of the consumer-side feature names used in this section.
consumer_info_dict.keys()

dict_keys(['c_deal_mean_u_deal_mean_year', 'c_deal_mean_u_deal_mean_month', 'c_deal_mean_u_deal_mean_season', 'c_deal_mean_u_deal_mean_yearday', 'c_pos_weight_mean_u_pos_weight_mean_year', 'c_pos_weight_mean_u_pos_weight_mean_month', 'c_pos_weight_mean_u_pos_weight_mean_season', 'c_pos_weight_mean_u_pos_weight_mean_yearday', 'c_deal_mean_u_deal_mean_power', 'c_pos_weight_mean_u_pos_weight_mean_power', 'c_deal_mode_u_deal_mode_category_id', 'c_pos_weight_mode_u_pos_weight_mode_category_id', 'c_deal_mode_u_deal_mode_region', 'c_pos_weight_mode_u_pos_weight_mode_region', 'c_deal_mean_u_deal_mean_price', 'c_pos_weight_mean_u_pos_weight_mean_price', 'c_deal_mean_u_active_deal_nums', 'c_pos_weight_mean_u_active_score', 'c_deal_mean_u_active_deal_day_nums', 'c_pos_weight_mean_u_active_weight_day_nums'])

In [None]:
# Numeric consumer features compared between the recalled item and the user's last history item.
dense_feas = ['c_deal_mean_u_deal_mean_year', 'c_deal_mean_u_deal_mean_month', 
        'c_deal_mean_u_deal_mean_season', 'c_deal_mean_u_deal_mean_yearday', 
        'c_pos_weight_mean_u_pos_weight_mean_year', 'c_pos_weight_mean_u_pos_weight_mean_month', 
        'c_pos_weight_mean_u_pos_weight_mean_season', 'c_pos_weight_mean_u_pos_weight_mean_yearday', 
        'c_deal_mean_u_deal_mean_power', 'c_pos_weight_mean_u_pos_weight_mean_power', 
        'c_deal_mean_u_deal_mean_price', 'c_pos_weight_mean_u_pos_weight_mean_price', 
        'c_deal_mean_u_active_deal_nums', 'c_pos_weight_mean_u_active_score', 
        'c_deal_mean_u_active_deal_day_nums', 'c_pos_weight_mean_u_active_weight_day_nums']

for fea in tqdm.tqdm(dense_feas):
  # Same helper as the item side, fed with the consumer inverted index.
  fea_df = get_recall_item_last_ua_item_dense_fea_diff(user_item_label_df, [], fea, consumer_info_dict)
  user_item_label_df = reduce_mem(pd.merge(user_item_label_df, fea_df, how='left', on=['user_id', 'item_id']))
  gc.collect()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_item_label_df['new_fea'] = user_item_label_df.apply(\
100%|██████████| 16/16 [07:51<00:00, 29.47s/it]


### 2.3.4 consumer侧--sparse

In [None]:
# Categorical consumer features compared for equality between the recalled item and the user's last history item.
sparse_feas = ['c_deal_mode_u_deal_mode_category_id', 'c_pos_weight_mode_u_pos_weight_mode_category_id', 
         'c_deal_mode_u_deal_mode_region', 'c_pos_weight_mode_u_pos_weight_mode_region',]

for fea in tqdm.tqdm(sparse_feas):
  # Same helper as the item side, fed with the consumer inverted index.
  fea_df = get_recall_item_last_ua_item_sparse_fea_diff(user_item_label_df, [], fea, consumer_info_dict)
  user_item_label_df = reduce_mem(pd.merge(user_item_label_df, fea_df, how='left', on=['user_id', 'item_id']))
  gc.collect()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_item_label_df['new_fea'] = user_item_label_df.apply(\
100%|██████████| 4/4 [02:01<00:00, 30.47s/it]


## 2.4 recall item vs user info

In [None]:
# Reminder of the user-side feature names used in this section.
user_info_dict.keys()

dict_keys(['u_deal_mean_year', 'u_deal_mean_month', 'u_deal_mean_season', 'u_deal_mean_yearday', 'u_pos_weight_mean_year', 'u_pos_weight_mean_month', 'u_pos_weight_mean_season', 'u_pos_weight_mean_yearday', 'u_deal_mean_power', 'u_pos_weight_mean_power', 'u_deal_mode_category_id', 'u_pos_weight_mode_category_id', 'u_deal_mode_region', 'u_pos_weight_mode_region', 'u_deal_mean_price', 'u_pos_weight_mean_price', 'u_active_deal_nums', 'u_active_score', 'u_active_deal_day_nums', 'u_active_weight_day_nums'])

In [None]:
# Reminder of the item-side feature names used in this section.
item_info_dict.keys()

dict_keys(['i_deal_mean_year', 'i_deal_mean_month', 'i_deal_mean_season', 'i_deal_mean_yearday', 'i_pos_weight_mean_year', 'i_pos_weight_mean_month', 'i_pos_weight_mean_season', 'i_pos_weight_mean_yearday', 'i_deal_mode_region', 'i_pos_weight_mode_region', 'i_active_deal_nums', 'i_active_score', 'i_active_deal_day_nums', 'i_active_weight_day_nums', 'category_id', 'power', 'i_weighted_mean_price', 'i_weight_mode_region'])

In [None]:
# Reminder of the consumer-side feature names used in this section.
consumer_info_dict.keys()

dict_keys(['c_deal_mean_u_deal_mean_year', 'c_deal_mean_u_deal_mean_month', 'c_deal_mean_u_deal_mean_season', 'c_deal_mean_u_deal_mean_yearday', 'c_pos_weight_mean_u_pos_weight_mean_year', 'c_pos_weight_mean_u_pos_weight_mean_month', 'c_pos_weight_mean_u_pos_weight_mean_season', 'c_pos_weight_mean_u_pos_weight_mean_yearday', 'c_deal_mean_u_deal_mean_power', 'c_pos_weight_mean_u_pos_weight_mean_power', 'c_deal_mode_u_deal_mode_category_id', 'c_pos_weight_mode_u_pos_weight_mode_category_id', 'c_deal_mode_u_deal_mode_region', 'c_pos_weight_mode_u_pos_weight_mode_region', 'c_deal_mean_u_deal_mean_price', 'c_pos_weight_mean_u_pos_weight_mean_price', 'c_deal_mean_u_active_deal_nums', 'c_pos_weight_mean_u_active_score', 'c_deal_mean_u_active_deal_day_nums', 'c_pos_weight_mean_u_active_weight_day_nums'])

### 2.4.1 item侧--dense

In [None]:
# Item-side feature names that the 'i_' + suffix lookups below are matched against.
item_info_dict.keys()

dict_keys(['i_deal_mean_year', 'i_deal_mean_month', 'i_deal_mean_season', 'i_deal_mean_yearday', 'i_pos_weight_mean_year', 'i_pos_weight_mean_month', 'i_pos_weight_mean_season', 'i_pos_weight_mean_yearday', 'i_deal_mode_region', 'i_pos_weight_mode_region', 'i_active_deal_nums', 'i_active_score', 'i_active_deal_day_nums', 'i_active_weight_day_nums', 'category_id', 'power', 'i_weighted_mean_price', 'i_weight_mode_region'])

In [None]:
def get_recall_item_user_info_dense_fea_diff_core(user_item_label_row, fea_name_suff, consumer_fea):
  """Return the recalled item's (or its consumers') feature minus the user's matching feature.

  Reads the module-level ``user_info_dict``, ``item_info_dict`` and
  ``consumer_info_dict``. The user feature is ``'u_' + fea_name_suff``.

  Args:
    user_item_label_row: row with ``user_id`` and ``item_id``.
    fea_name_suff: feature name without its ``u_`` / ``i_`` prefix,
      e.g. ``'deal_mean_year'``.
    consumer_fea: False to compare against the item feature, True to compare
      against the consumer feature of the item.

  Returns:
    Item-side (or consumer-side) value minus user-side value.

  Raises:
    KeyError: if the user, the item or the resolved feature name is missing.
  """
  user_id = user_item_label_row['user_id']
  recall_item_id = user_item_label_row['item_id']

  user_fea = user_info_dict['u_'+fea_name_suff][user_id]
  if not consumer_fea:
    item_fea_dict = item_info_dict
    try:
      recall_item_fea = item_fea_dict['i_'+fea_name_suff][recall_item_id]
    except KeyError:
      # No direct 'i_' counterpart: fall back to the item's own attribute.
      if 'price' in fea_name_suff:
        recall_item_fea = item_fea_dict['i_weighted_mean_price'][recall_item_id]
      elif 'deal_mean' in fea_name_suff:
        # Strip 'deal_mean_' (10 characters), e.g. 'deal_mean_power' -> 'power'.
        recall_item_fea = item_fea_dict[fea_name_suff[10:]][recall_item_id]
      else:
        # Strip 'pos_weight_mean_' (16 characters), e.g. 'pos_weight_mean_power' -> 'power'.
        recall_item_fea = item_fea_dict[fea_name_suff[16:]][recall_item_id]
  else:
    item_fea_dict = consumer_info_dict
    # The activity counters have fixed consumer-side names; the others are prefixed by aggregation type.
    if 'deal_nums' in fea_name_suff:
      recall_item_fea = item_fea_dict['c_deal_mean_u_active_deal_nums'][recall_item_id]
    elif 'deal_day_nums' in fea_name_suff:
      recall_item_fea = item_fea_dict['c_deal_mean_u_active_deal_day_nums'][recall_item_id]
    elif 'deal_mean' in fea_name_suff:
      recall_item_fea = item_fea_dict['c_deal_mean_u_'+fea_name_suff][recall_item_id]
    else:
      recall_item_fea = item_fea_dict['c_pos_weight_mean_u_'+fea_name_suff][recall_item_id]
  return recall_item_fea - user_fea


def get_recall_item_user_info_dense_fea_diff(user_item_label_df, in_cols, fea_name_suff, consumer_fea):
  """Compute the 'recalled item vs. user profile' difference column for one dense feature.

  Args:
    user_item_label_df: ranking table with ``user_id`` and ``item_id``.
    in_cols: extra columns the row function needs; not modified.
    fea_name_suff: feature name without its ``u_`` / ``i_`` prefix.
    consumer_fea: False for the item feature, True for the consumer feature.

  Returns:
    DataFrame with ``user_id``, ``item_id`` and the new column, named
    ``rc_i_diff_u_<suffix>`` (item) or ``rc_c_diff_u_<suffix>`` (consumer).
  """
  prefix = 'rc_c_diff_u_' if consumer_fea else 'rc_i_diff_u_'
  new_col = prefix + fea_name_suff
  # Build a fresh list: `in_cols += ...` would grow the caller's list in place.
  used_cols = list(in_cols) + ['user_id', 'item_id']
  # Work on an explicit copy so the assignment below does not write into a slice.
  sub_df = user_item_label_df[used_cols].copy()
  sub_df[new_col] = sub_df.apply(\
      partial(get_recall_item_user_info_dense_fea_diff_core, fea_name_suff=fea_name_suff, consumer_fea=consumer_fea), axis=1)
  return sub_df[['user_id', 'item_id', new_col]]

In [None]:
# Feature suffixes (without the 'u_' / 'i_' prefix) compared between the recalled item and the user profile.
dense_feas = ['deal_mean_year', 'deal_mean_month', 'deal_mean_season', 
        'deal_mean_yearday', 'pos_weight_mean_year', 'pos_weight_mean_month', 
        'pos_weight_mean_season', 'pos_weight_mean_yearday', 
        'deal_mean_power', 
        'pos_weight_mean_power', 'deal_mean_price', 'pos_weight_mean_price', 
        'active_deal_nums', 'active_score', 'active_deal_day_nums', 'active_weight_day_nums']

for fea_suff in tqdm.tqdm(dense_feas):
  # consumer_fea=False: compare against the item's own features.
  fea_df = get_recall_item_user_info_dense_fea_diff(user_item_label_df, [], fea_suff, False)
  user_item_label_df = reduce_mem(pd.merge(user_item_label_df, fea_df, how='left', on=['user_id', 'item_id']))
  gc.collect()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_item_label_df['new_fea'] = user_item_label_df.apply(\
100%|██████████| 16/16 [08:38<00:00, 32.39s/it]


### 2.4.2 item侧--sparse

In [None]:
def get_recall_item_user_info_sparse_fea_diff_core(user_item_label_row, fea_name_suff, consumer_fea):
  """Return 1 if the recalled item's (or its consumers') feature equals the user's, else 0.

  Reads the module-level ``user_info_dict``, ``item_info_dict`` and
  ``consumer_info_dict``. The user feature is ``'u_' + fea_name_suff``.

  Args:
    user_item_label_row: row with ``user_id`` and ``item_id``.
    fea_name_suff: feature name without its ``u_`` / ``i_`` prefix,
      e.g. ``'deal_mode_region'``.
    consumer_fea: False to compare against the item feature, True to compare
      against the consumer feature of the item.

  Returns:
    1 when both values are equal, otherwise 0.

  Raises:
    KeyError: if the user, the item or the resolved feature name is missing.
  """
  user_id = user_item_label_row['user_id']
  recall_item_id = user_item_label_row['item_id']

  user_fea = user_info_dict['u_'+fea_name_suff][user_id]
  if not consumer_fea:
    item_fea_dict = item_info_dict
    try:
      recall_item_fea = item_fea_dict['i_'+fea_name_suff][recall_item_id]
    except KeyError:
      # No direct 'i_' counterpart: fall back to the item's own attribute.
      if 'deal_mode' in fea_name_suff:
        # Strip 'deal_mode_' (10 characters), e.g. 'deal_mode_category_id' -> 'category_id'.
        recall_item_fea = item_fea_dict[fea_name_suff[10:]][recall_item_id]
      else:
        # Strip 'pos_weight_mode_' (16 characters).
        recall_item_fea = item_fea_dict[fea_name_suff[16:]][recall_item_id]
  else:
    item_fea_dict = consumer_info_dict
    if 'deal_mode' in fea_name_suff:
      recall_item_fea = item_fea_dict['c_deal_mode_u_'+fea_name_suff][recall_item_id]
    else:
      recall_item_fea = item_fea_dict['c_pos_weight_mode_u_'+fea_name_suff][recall_item_id]
  return int(recall_item_fea==user_fea)


def get_recall_item_user_info_sparse_fea_diff(user_item_label_df, in_cols, fea_name_suff, consumer_fea):
  """Compute the 'recalled item vs. user profile' match flag for one sparse feature.

  Requires pandarallel to be initialised (uses ``parallel_apply``).

  Args:
    user_item_label_df: ranking table with ``user_id`` and ``item_id``.
    in_cols: extra columns the row function needs; not modified.
    fea_name_suff: feature name without its ``u_`` / ``i_`` prefix.
    consumer_fea: False for the item feature, True for the consumer feature.

  Returns:
    DataFrame with ``user_id``, ``item_id`` and the new column, named
    ``rc_i_diff_u_<suffix>`` (item) or ``rc_c_diff_u_<suffix>`` (consumer).
  """
  prefix = 'rc_c_diff_u_' if consumer_fea else 'rc_i_diff_u_'
  new_col = prefix + fea_name_suff
  # Build a fresh list: `in_cols += ...` would grow the caller's list in place.
  used_cols = list(in_cols) + ['user_id', 'item_id']
  # Work on an explicit copy so the assignment below does not write into a slice.
  sub_df = user_item_label_df[used_cols].copy()
  sub_df[new_col] = sub_df.parallel_apply(\
      partial(get_recall_item_user_info_sparse_fea_diff_core, fea_name_suff=fea_name_suff, consumer_fea=consumer_fea), axis=1)
  return sub_df[['user_id', 'item_id', new_col]]

In [None]:
# Categorical feature suffixes compared for equality between the recalled item and the user profile.
sparse_feas = ['deal_mode_category_id', 'pos_weight_mode_category_id', 
         'deal_mode_region', 'pos_weight_mode_region',]
for fea_suff in tqdm.tqdm(sparse_feas):
  # consumer_fea=False: compare against the item's own features.
  fea_df = get_recall_item_user_info_sparse_fea_diff(user_item_label_df, [], fea_suff, False)
  user_item_label_df = reduce_mem(pd.merge(user_item_label_df, fea_df, how='left', on=['user_id', 'item_id']))
  gc.collect()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_item_label_df['new_fea'] = user_item_label_df.parallel_apply(\
100%|██████████| 4/4 [01:43<00:00, 25.95s/it]


### 2.4.3 consumer侧--dense

In [None]:
# Feature suffixes compared between the recalled item's consumer features and the user profile.
# Unlike the item-side list, 'active_deal_nums' and 'active_score' are not included here.
dense_feas = ['deal_mean_year', 'deal_mean_month', 'deal_mean_season', 
        'deal_mean_yearday', 'pos_weight_mean_year', 'pos_weight_mean_month', 
        'pos_weight_mean_season', 'pos_weight_mean_yearday', 'deal_mean_power', 
        'pos_weight_mean_power', 'deal_mean_price', 'pos_weight_mean_price', 
        'active_deal_day_nums', 'active_weight_day_nums']

for fea_suff in tqdm.tqdm(dense_feas):
  # consumer_fea=True: compare against the consumer features of the item.
  fea_df = get_recall_item_user_info_dense_fea_diff(user_item_label_df, [], fea_suff, True)
  user_item_label_df = reduce_mem(pd.merge(user_item_label_df, fea_df, how='left', on=['user_id', 'item_id']))
  gc.collect()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_item_label_df['new_fea'] = user_item_label_df.apply(\
100%|██████████| 14/14 [08:22<00:00, 35.90s/it]


### 2.4.4 consumer侧--sparse

In [None]:
# Categorical feature suffixes compared for equality between the item's consumer features and the user profile.
sparse_feas =  ['deal_mode_category_id', 'pos_weight_mode_category_id', 
          'deal_mode_region', 'pos_weight_mode_region',]

for fea_suff in tqdm.tqdm(sparse_feas):
  # consumer_fea=True: compare against the consumer features of the item.
  fea_df = get_recall_item_user_info_sparse_fea_diff(user_item_label_df, [], fea_suff, True)
  user_item_label_df = reduce_mem(pd.merge(user_item_label_df, fea_df, how='left', on=['user_id', 'item_id']))
  gc.collect()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_item_label_df['new_fea'] = user_item_label_df.parallel_apply(\
100%|██████████| 4/4 [01:55<00:00, 28.82s/it]


## 2.5 ctx vs last ua ctx

In [None]:
# Context columns available for each user's last history interaction.
hist_last_click_dict.keys()

dict_keys(['event_time', 'event_type', 'item_id', 'region', 'price', 'user_session', 'is_nan_region', 'year', 'month', 'season', 'yearday'])

### 2.5.1 dense

In [None]:
def get_ctx_last_ua_ctx_dense_fea_diff_core(user_item_label_row, fea_name):
  """Return the row's context value minus the same value at the user's last history interaction.

  The last-interaction value comes from the module-level ``hist_last_click_dict``.

  Args:
    user_item_label_row: row with ``user_id`` and the ``fea_name`` column.
    fea_name: context column to compare.

  Returns:
    The difference, or NaN when a lookup or the subtraction fails.
  """
  try:
    user_id = user_item_label_row['user_id']
    ctx_fea = user_item_label_row[fea_name]
    last_ua_fea = hist_last_click_dict[fea_name][user_id]
    return ctx_fea - last_ua_fea
  except Exception:
    # Best effort: a missing user yields NaN. `except Exception` (not a bare
    # except) lets KeyboardInterrupt / SystemExit stop a long-running apply.
    return np.nan


def get_ctx_last_ua_ctx_dense_fea_diff(user_item_label_df, in_cols, fea_name):
  """Compute the 'current context vs. last history context' difference column for one dense feature.

  Args:
    user_item_label_df: ranking table with ``user_id``, ``item_id`` and ``fea_name``.
    in_cols: extra columns the row function needs; not modified.
    fea_name: context column to compare.

  Returns:
    DataFrame with ``user_id``, ``item_id`` and ``ctx_diff_last_ua_ctx_<fea_name>``.
  """
  new_col = 'ctx_diff_last_ua_ctx_' + fea_name
  # Build a fresh list: `in_cols += ...` would grow the caller's list in place.
  used_cols = list(in_cols) + ['user_id', 'item_id', fea_name]
  # Work on an explicit copy so the assignment below does not write into a slice.
  sub_df = user_item_label_df[used_cols].copy()
  sub_df[new_col] = sub_df.apply(\
      partial(get_ctx_last_ua_ctx_dense_fea_diff_core, fea_name=fea_name), axis=1)
  return sub_df[['user_id', 'item_id', new_col]]

In [None]:
# Numeric context columns compared between the current row and the user's last history interaction.
dense_feas = ['event_time', 'price', 'year', 'month', 'season', 'yearday']

for fea in tqdm.tqdm(dense_feas):
  # Compute one difference column and join it back on (user_id, item_id).
  fea_df = get_ctx_last_ua_ctx_dense_fea_diff(user_item_label_df, [], fea)
  user_item_label_df = reduce_mem(pd.merge(user_item_label_df, fea_df, how='left', on=['user_id', 'item_id']))
  gc.collect()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_item_label_df['new_fea'] = user_item_label_df.apply(\
100%|██████████| 6/6 [03:16<00:00, 32.69s/it]


### 2.5.2 sparse

In [None]:
def get_ctx_last_ua_ctx_sparse_fea_diff_core(user_item_label_row, fea_name):
  """Return 1 if the row's context value equals the value at the user's last history interaction, else 0.

  The last-interaction value comes from the module-level ``hist_last_click_dict``.

  Args:
    user_item_label_row: row with ``user_id`` and the ``fea_name`` column.
    fea_name: categorical context column to compare.

  Returns:
    1 or 0, or NaN when a lookup or the comparison fails.
  """
  try:
    user_id = user_item_label_row['user_id']
    ctx_fea = user_item_label_row[fea_name]
    last_ua_fea = hist_last_click_dict[fea_name][user_id]
    return int(ctx_fea==last_ua_fea)
  except Exception:
    # Best effort: a missing user yields NaN. `except Exception` (not a bare
    # except) lets KeyboardInterrupt / SystemExit stop a long-running apply.
    return np.nan


def get_ctx_last_ua_ctx_sparse_fea_diff(user_item_label_df, in_cols, fea_name):
  """Compute the 'current context vs. last history context' match flag for one sparse feature.

  Requires pandarallel to be initialised (uses ``parallel_apply``).

  Args:
    user_item_label_df: ranking table with ``user_id``, ``item_id`` and ``fea_name``.
    in_cols: extra columns the row function needs; not modified.
    fea_name: categorical context column to compare.

  Returns:
    DataFrame with ``user_id``, ``item_id`` and ``ctx_diff_last_ua_ctx_<fea_name>``.
  """
  new_col = 'ctx_diff_last_ua_ctx_' + fea_name
  # Build a fresh list: `in_cols += ...` would grow the caller's list in place.
  used_cols = list(in_cols) + ['user_id', 'item_id', fea_name]
  # Work on an explicit copy so the assignment below does not write into a slice.
  sub_df = user_item_label_df[used_cols].copy()
  sub_df[new_col] = sub_df.parallel_apply(\
      partial(get_ctx_last_ua_ctx_sparse_fea_diff_core, fea_name=fea_name), axis=1)
  return sub_df[['user_id', 'item_id', new_col]]

In [None]:
# Categorical context columns compared for equality with the user's last history interaction.
sparse_feas = ['region', 'user_session', 'is_nan_region',]

for fea in tqdm.tqdm(sparse_feas):
  # Compute one match-flag column and join it back on (user_id, item_id).
  fea_df = get_ctx_last_ua_ctx_sparse_fea_diff(user_item_label_df, [], fea)
  user_item_label_df = reduce_mem(pd.merge(user_item_label_df, fea_df, how='left', on=['user_id', 'item_id']))
  gc.collect()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_item_label_df['new_fea'] = user_item_label_df.parallel_apply(\
100%|██████████| 3/3 [01:23<00:00, 27.73s/it]


# 3.保存特征

## 3.1 保存提取特征后的表

In [None]:
# Delete a feature file left over from a previous run before writing the new one.
if os.path.exists(os.path.join(save_dir, 'user_item_rank_feats_df.csv')):
  os.remove(os.path.join(save_dir, 'user_item_rank_feats_df.csv'))

In [None]:
# Write the final ranking feature table without the index column.
user_item_label_df.to_csv(os.path.join(save_dir, 'user_item_rank_feats_df.csv'), index=False)

## 3.2 保存训练集、验证集用户列表

In [None]:
# Persist the training user ids, replacing any file from a previous run.
if os.path.exists(os.path.join(save_dir, 'trn_user.csv')):
  os.remove(os.path.join(save_dir, 'trn_user.csv'))
trn_user.to_csv(os.path.join(save_dir, 'trn_user.csv'), index=False)

# Persist the validation user ids the same way.
if os.path.exists(os.path.join(save_dir, 'val_user.csv')):
  os.remove(os.path.join(save_dir, 'val_user.csv'))
val_user.to_csv(os.path.join(save_dir, 'val_user.csv'), index=False)