# Author information:
Fan Shengzhe, Shanghai Jiao Tong University, Shanghai, China  
Email: fanshengzhe@sjtu.edu.cn

# 1.读取数据

## 1.1 基本数据的读取

In [None]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
import os
os.chdir("drive/My Drive/reco/veg-fru-reco")

In [None]:
# path_prefix = '.'
# When this notebook was tested on Colab in February 2023, relative paths
# sometimes failed to resolve; using an absolute path works around that bug.
path_prefix = '/content/drive/My Drive/reco/veg-fru-reco'

In [None]:
import pandas as pd
import numpy as np
import os
import json
import tqdm
import gc
from functools import partial

# Input tables are read from, and results are written to, the same cache folder.
data_dir = os.path.join(path_prefix, 'cache')
save_dir = os.path.join(path_prefix, 'cache')

# NOTE(review): data_dir and save_dir are the same path, so if the folder has
# to be created here the read_csv calls below will not find their inputs.
if not os.path.exists(save_dir):
  os.mkdir(save_dir)

# Click log plus the per-user, per-item and per-consumer feature tables.
log_table = pd.read_csv(os.path.join(data_dir, 'ctx_info.csv'))
user_info = pd.read_csv(os.path.join(data_dir, 'user_info.csv'))
item_info = pd.read_csv(os.path.join(data_dir, 'item_info.csv'))
consumer_info = pd.read_csv(os.path.join(data_dir, 'consumer_info.csv'))

In [None]:
# 节省内存的一个函数
# 减少内存
def reduce_mem(df):
  """Downcast the numeric columns of ``df`` in place and return ``df``.

  Only columns whose dtype is int16/32/64 or float16/32/64 are examined.
  Each one is cast to the narrowest candidate dtype whose representable
  range strictly contains the column's min and max. Columns whose min or max
  is null (e.g. all-NaN columns) and columns of any other dtype are left
  untouched.

  NOTE(review): float columns can end up as float16, which preserves the
  value range but only roughly three significant decimal digits -- confirm
  this precision is acceptable for every float feature passed in.
  """
  numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
  int_targets = (np.int8, np.int16, np.int32, np.int64)
  float_targets = (np.float16, np.float32)

  # Only needed by the (currently disabled) memory report at the bottom.
  start_mem = df.memory_usage().sum() / 1024**2

  for col in df.columns:
    col_type = df[col].dtypes
    if col_type not in numerics:
      continue

    c_min, c_max = df[col].min(), df[col].max()
    if pd.isnull(c_min) or pd.isnull(c_max):
      continue

    if str(col_type).startswith('int'):
      # Candidates are probed lazily, narrowest first; the first that fits wins.
      target = next(
        (t for t in int_targets if np.iinfo(t).min < c_min and c_max < np.iinfo(t).max),
        None)
      if target is not None:
        df[col] = df[col].astype(target)
    else:
      # Floats that fit neither float16 nor float32 are stored as float64.
      target = next(
        (t for t in float_targets if np.finfo(t).min < c_min and c_max < np.finfo(t).max),
        np.float64)
      df[col] = df[col].astype(target)

  end_mem = df.memory_usage().sum() / 1024**2
  # print('-- Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100*(start_mem-end_mem)/start_mem))
  return df

In [None]:
# Shrink the numeric dtypes of every loaded table before building features.
log_table = reduce_mem(log_table)
user_info = reduce_mem(user_info)
item_info = reduce_mem(item_info)
consumer_info = reduce_mem(consumer_info)

### 1.1.1 i侧 i: item_info倒排表

In [None]:
def get_item_info_dict(item_info_df):
  """Build an inverted index ``{feature_name: {item_id: value}}``.

  Every column of ``item_info_df`` except ``item_id`` becomes one top-level
  key; its value maps each ``item_id`` to that row's value in the column.
  Raises ``ValueError`` if the frame has no ``item_id`` column.
  """
  feature_cols = list(item_info_df.columns)
  feature_cols.remove('item_id')

  item_ids = item_info_df['item_id']
  return {col: dict(zip(item_ids, item_info_df[col])) for col in feature_cols}

In [None]:
# Both tables are indexed through their item_id column, so the item helper is
# reused for the consumer-side features as well.
item_info_dict = get_item_info_dict(item_info)
consumer_info_dict = get_item_info_dict(consumer_info)

In [None]:
item_info_dict.keys()

dict_keys(['category_id', 'prod_at_ts_ms', 'price', 'prod_at_ts_s', 'prod_year', 'prod_month', 'prod_day', 'prod_hour', 'prod_minute', 'prod_second', 'prod_season', 'prod_time_bucket', 'prod_weekday', 'prod_yearday', 'i_mean_day', 'i_mean_hour', 'i_mean_minute', 'i_mean_time_bucket', 'i_mean_weekday', 'i_mean_yearday', 'i_mode_environment', 'i_mode_device_group', 'i_mode_os', 'i_mode_province', 'i_mode_city', 'i_mode_referrer_type', 'i_active_nums', 'i_active_day_nums', 'i_mean_rev_ua_time', 'i_ua_score', 'i_mean_freshness'])

In [None]:
consumer_info_dict.keys()

dict_keys(['c_mean_u_mean_day', 'c_mean_u_mean_hour', 'c_mean_u_mean_minute', 'c_mean_u_mean_time_bucket', 'c_mean_u_mean_weekday', 'c_mean_u_mean_yearday', 'c_mean_u_mean_prod_year', 'c_mean_u_mean_prod_month', 'c_mean_u_mean_prod_day', 'c_mean_u_mean_prod_hour', 'c_mean_u_mean_prod_minute', 'c_mean_u_mean_prod_season', 'c_mean_u_mean_prod_time_bucket', 'c_mean_u_mean_prod_weekday', 'c_mean_u_mean_prod_yearday', 'c_mean_u_mean_price', 'c_mode_u_mode_category_id', 'c_mode_u_mode_environment', 'c_mode_u_mode_device_group', 'c_mode_u_mode_os', 'c_mode_u_mode_province', 'c_mode_u_mode_city', 'c_mode_u_mode_referrer_type', 'c_mean_u_active_nums', 'c_mean_u_active_day_nums', 'c_mean_u_mean_rev_ua_time', 'c_mean_u_ua_score', 'c_mean_u_mean_freshness'])

### 1.1.2 u侧 u: user_info倒排表

In [None]:
def get_user_info_dict(user_info_df):
  """Build an inverted index ``{feature_name: {user_id: value}}``.

  Every column of ``user_info_df`` except ``user_id`` becomes one top-level
  key; its value maps each ``user_id`` to that row's value in the column.
  If a ``user_id`` occurs on several rows, the last row wins.
  Raises ``ValueError`` if the frame has no ``user_id`` column.
  """
  feature_cols = list(user_info_df.columns)
  feature_cols.remove('user_id')

  user_ids = user_info_df['user_id']
  return {col: dict(zip(user_ids, user_info_df[col])) for col in feature_cols}

In [None]:
# Inverted index of the user features: {feature_name: {user_id: value}}.
user_info_dict = get_user_info_dict(user_info)
user_info_dict.keys()

dict_keys(['u_mean_day', 'u_mean_hour', 'u_mean_minute', 'u_mean_second', 'u_mean_time_bucket', 'u_mean_weekday', 'u_mean_yearday', 'u_mean_prod_year', 'u_mean_prod_month', 'u_mean_prod_day', 'u_mean_prod_hour', 'u_mean_prod_minute', 'u_mean_prod_season', 'u_mean_prod_time_bucket', 'u_mean_prod_weekday', 'u_mean_prod_yearday', 'u_mean_price', 'u_mode_category_id', 'u_mode_environment', 'u_mode_device_group', 'u_mode_os', 'u_mode_province', 'u_mode_city', 'u_mode_referrer_type', 'u_active_nums', 'u_active_day_nums', 'u_mean_rev_ua_time', 'u_ua_score', 'u_mean_freshness'])

In [None]:
try:
  from pandarallel import pandarallel
except:
  !pip install pandarallel
  from pandarallel import pandarallel
pandarallel.initialize(progress_bar=False)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pandarallel
  Downloading pandarallel-1.6.4.tar.gz (12 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pandarallel
  Building wheel for pandarallel (setup.py) ... [?25l[?25hdone
  Created wheel for pandarallel: filename=pandarallel-1.6.4-py3-none-any.whl size=16678 sha256=8f381c6ef19404040bceff441072b2d41b1a5658444f833501c16f6c99e6e30e
  Stored in directory: /root/.cache/pip/wheels/62/1e/e7/f9ee096e5cc02890a6934a5670ff6e45a3400f330605bd8210
Successfully built pandarallel
Installing collected packages: pandarallel
Successfully installed pandarallel-1.6.4
INFO: Pandarallel will run on 2 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [None]:
log_table

Unnamed: 0,user_id,item_id,timestamp_ms,environment,device_group,os,province,city,referrer_type,timestamp_s,day,hour,minute,second,time_bucket,weekday,yearday,freshness
0,0,160417,1507029570190,0,0,0,0,0,0,1507029570,3,11,19,30,3,1,276,87481190
1,0,5408,1507029571478,0,0,0,0,0,0,1507029571,3,11,19,31,3,1,276,35314478
2,0,50823,1507029601478,0,0,0,0,0,0,1507029601,3,11,20,1,3,1,276,15987478
3,1,157770,1507029532200,0,0,0,0,1,1,1507029532,3,11,18,52,3,1,276,45597200
4,1,96613,1507029671831,0,0,0,0,1,1,1507029671,3,11,21,11,3,1,276,91227831
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1630628,228075,70758,1508211323220,0,1,1,0,1,2,1508211323,17,3,35,23,0,1,290,23159220
1630629,242176,331116,1508211542618,0,1,1,0,1,0,1508211542,17,3,39,2,0,1,290,133923618
1630630,242176,234481,1508211850103,0,1,1,0,1,0,1508211850,17,3,44,10,0,1,290,30278103
1630631,242176,211442,1508212189949,0,1,1,0,1,0,1508212189,17,3,49,49,0,1,290,24469949


## 1.2 拆分history和label

In [None]:
# Pre-split click logs: hist_click_df is the history used for features and
# last_click_df is used as the label source when candidates are labelled below.
hist_click_df = pd.read_csv(os.path.join(data_dir, 'hist_click_df.csv'))
last_click_df = pd.read_csv(os.path.join(data_dir, 'last_click_df.csv'))

hist_click_df = reduce_mem(hist_click_df)
last_click_df = reduce_mem(last_click_df)

## 1.3 切分训练、验证用户

In [None]:
# Number of distinct users in the full click log.
all_user_num = len(log_table['user_id'].unique())

In [None]:
import numpy as np

def trn_val_split(log_table, sample_user_nums=5000, random_state=None):
  """Split the users of ``log_table`` into training and validation users.

  Args:
    log_table: click log with ``user_id``, ``item_id`` and ``timestamp_ms``.
    sample_user_nums: number of users to draw for validation. It is clamped
      to the number of distinct users so small tables do not raise.
    random_state: optional seed for a reproducible split. ``None`` (default)
      keeps the previous behaviour of drawing from NumPy's global RNG.

  Returns:
    ``(trn_users, val_users)`` -- two Series of unique user ids. Sampled users
    that have no click left after their last click is held out are in
    neither Series.
  """
  all_user_ids = log_table.user_id.unique()

  # Sampling without replacement raises if more users are requested than exist.
  sample_user_nums = min(sample_user_nums, len(all_user_ids))
  rng = np.random if random_state is None else np.random.RandomState(random_state)
  sampled_user_ids = rng.choice(all_user_ids, size=sample_user_nums, replace=False)

  log_val = log_table[log_table['user_id'].isin(sampled_user_ids)]

  # Take the last click of every validation user out as the answer.
  log_val = log_val.sort_values(['user_id', 'timestamp_ms'])
  val_ans = log_val.groupby('user_id').tail(1)

  # Appending val_ans twice and dropping every duplicated row removes the
  # answer rows from log_val.
  log_val = pd.concat([log_val, val_ans, val_ans]).drop_duplicates(['user_id', 'item_id', 'timestamp_ms'], keep=False)
  # A user with a single click would have no history left once that click is
  # moved to the answers (a cold-start user that complicates validation), so
  # keep only users that appear in both the answers and the remaining history.
  val_ans = val_ans[val_ans.user_id.isin(log_val.user_id.unique())]
  log_val = log_val[log_val.user_id.isin(val_ans.user_id.unique())]

  log_trn = log_table[~log_table['user_id'].isin(sampled_user_ids)]

  return log_trn['user_id'].drop_duplicates(), log_val['user_id'].drop_duplicates()

In [None]:
# Draw 10% of all users (by count) as the validation pool; the rest are training users.
trn_user, val_user = trn_val_split(log_table, int(all_user_num * 0.1))

## 1.4 读取召回数据

### 1.4.1 读取召回字典

In [None]:
def int_keys(ordered_pairs):
  """``object_pairs_hook`` for ``json.load`` that restores integer keys.

  Each key that ``int()`` accepts is converted to an int; keys that raise
  ``ValueError`` are kept unchanged. Values are passed through untouched.
  When two keys convert to the same int, the later pair wins.
  """
  def _as_int_if_possible(raw_key):
    try:
      return int(raw_key)
    except ValueError:
      return raw_key

  return {_as_int_if_possible(raw_key): value for raw_key, value in ordered_pairs}

In [None]:
# JSON object keys are always strings; int_keys turns numeric ones back into ints.
with open(os.path.join(data_dir, 'recall_items_dict.json'), 'r', encoding="utf-8") as f:
  recall_list_dict = json.load(f, object_pairs_hook=int_keys)

### 1.4.2 召回字典转化为dataframe

In [None]:
# 将召回列表转换成df的形式
def recall_dict2df(recall_list_dict):
  """Flatten a recall dictionary into a long DataFrame.

  ``recall_list_dict`` maps each user to an iterable of
  ``(item, recall_score)`` pairs. The result has one row per pair with the
  columns ``user_id``, ``item_id`` and ``recall_score``; a tqdm progress bar
  tracks the users processed.
  """
  rows = [
    [user, item, recall_score]
    for user, recall_list in tqdm.tqdm(recall_list_dict.items())
    for item, recall_score in recall_list
  ]
  return pd.DataFrame(rows, columns=['user_id', 'item_id', 'recall_score'])

In [None]:
# Long-format candidate table (user_id, item_id, recall_score) with shrunken dtypes.
recall_list_df = recall_dict2df(recall_list_dict)
recall_list_df = reduce_mem(recall_list_df)

100%|██████████| 250000/250000 [00:36<00:00, 6920.50it/s]


In [None]:
del recall_list_dict
gc.collect()

0

### 1.4.3 对召回列表打标、负采样

In [None]:
# 对召回数据打标签
def get_rank_label_df(recall_list_df, label_df, is_test=False):
  """Attach a ``label`` column to the recalled (user, item) candidates.

  Args:
    recall_list_df: candidates with at least ``user_id`` and ``item_id``.
    label_df: ground-truth clicks with ``user_id``, ``item_id`` and
      ``timestamp_ms``.
    is_test: when True no ground truth is used and every label is ``-1``.

  Returns:
    A new DataFrame with the columns of ``recall_list_df`` plus ``label``
    (1 if the pair matched a row of ``label_df`` with a non-null
    ``timestamp_ms``, otherwise 0). The input frame is not modified.
  """
  # The test set has no labels. assign() returns a copy, so a caller-provided
  # slice is neither mutated nor triggers SettingWithCopyWarning.
  if is_test:
    return recall_list_df.assign(label=-1)

  recall_list_df_ = recall_list_df.merge(label_df[['user_id', 'item_id', 'timestamp_ms']],
                       how='left', on=['user_id', 'item_id'])
  # After the left join a non-null timestamp marks a matched pair; the
  # vectorized check replaces a row-by-row parallel_apply.
  recall_list_df_['label'] = recall_list_df_['timestamp_ms'].notna().astype(int)
  del recall_list_df_['timestamp_ms']

  return recall_list_df_

In [None]:
# 对召回列表做负采样
def neg_sample_recall_data(recall_items_df, sample_rate=0.001):
  """Down-sample the negative candidates of a labelled recall table.

  Args:
    recall_items_df: candidates with ``user_id``, ``item_id``,
      ``recall_score`` and ``label`` (1 = positive, 0 = negative).
    sample_rate: fraction of each group's negatives to draw; every group
      contributes at least 1 and at most 5 draws.

  Returns:
    A DataFrame holding all positives plus the sampled negatives. Negatives
    are sampled once per user and once per item, so every user and every item
    that has a negative keeps at least one; the two samples are merged and
    de-duplicated on (user_id, item_id).
  """
  pos_data = recall_items_df[recall_items_df['label'].isin([1])]
  neg_data = recall_items_df[recall_items_df['label'].isin([0])]

  def pos_neg_ratio(pos_num, neg_num):
    # Guard the report against an input without any negatives.
    return pos_num / neg_num if neg_num else float('inf')

  print('before: pos_data_num:', len(pos_data), 'neg_data_num:', len(neg_data), 'pos/neg:', pos_neg_ratio(len(pos_data), len(neg_data)))

  # Per-group sampler.
  def neg_sample_func(group_df):
    neg_num = len(group_df)
    sample_num = max(int(neg_num * sample_rate), 1) # at least one per group
    sample_num = min(sample_num, 5) # at most five per group; tune as needed
    return group_df.sample(n=sample_num, replace=True)

  def sample_by(key):
    # Iterating the groups keeps the key column in every sampled frame,
    # which groupby.apply no longer guarantees in recent pandas versions.
    if neg_data.empty:
      return neg_data
    return pd.concat([neg_sample_func(group_df) for _, group_df in neg_data.groupby(key)])

  # Sample per user so that every user is still present afterwards.
  neg_data_user_sample = sample_by('user_id')
  # Sample per item so that every item is still present afterwards.
  neg_data_item_sample = sample_by('item_id')

  # Combine both samples (DataFrame.append was removed in pandas 2.0).
  neg_data_new = pd.concat([neg_data_user_sample, neg_data_item_sample])
  # The two passes are independent and each draws with replacement, so the
  # same row can be picked more than once; de-duplicate the merged result.
  neg_data_new = neg_data_new.sort_values(['user_id', 'recall_score']).drop_duplicates(['user_id', 'item_id'], keep='last')

  # Add the positives back.
  data_new = pd.concat([pos_data, neg_data_new], ignore_index=True)

  new_pos_data = data_new[data_new['label'].isin([1])]
  new_neg_data = data_new[data_new['label'].isin([0])]
  print('after: pos_data_num:', len(new_pos_data), 'neg_data_num:', len(new_neg_data), 'pos/neg:', pos_neg_ratio(len(new_pos_data), len(new_neg_data)))

  return data_new

In [None]:
# 打标 + 负采样的主调函数
def get_user_recall_item_label_df(hist_click_df, last_click_df, recall_list_df, sample_rate=0.001, is_test=False):
  """Label the recalled candidates and down-sample training negatives.

  Args:
    hist_click_df: historical clicks; only candidates of users present here
      are kept.
    last_click_df: clicks used as ground truth for labelling.
    recall_list_df: candidates with ``user_id``, ``item_id``, ``recall_score``.
    sample_rate: negative sampling rate forwarded to
      ``neg_sample_recall_data``.
    is_test: when True the candidates are returned with ``label == -1`` and
      no sampling is applied.

  Returns:
    The labelled candidates. Outside test mode this is the negative-sampled
    rows of the users in the module-level ``trn_user`` followed by all rows
    of the users in ``val_user``.
  """
  # Keep only candidates of users that have a click history.
  user_items_df = recall_list_df[recall_list_df['user_id'].isin(hist_click_df['user_id'].unique())]
  # Label the candidates (all -1 in test mode).
  user_item_label_df = get_rank_label_df(user_items_df, last_click_df, is_test=is_test)

  # Test data has neither positives nor negatives, so negative sampling would
  # divide by zero while reporting pos/neg counts; return it unsampled.
  if is_test:
    return user_item_label_df

  # Negative sampling is applied to training users only; validation users
  # keep their full candidate lists.
  trn_user_item_label_df = neg_sample_recall_data(user_item_label_df[user_item_label_df['user_id'].isin(trn_user)], sample_rate=sample_rate)
  val_user_item_label_df = user_item_label_df[user_item_label_df['user_id'].isin(val_user)]
  user_item_label_df = pd.concat([trn_user_item_label_df, val_user_item_label_df])
  return user_item_label_df

In [None]:
# Label the candidates, down-sample training negatives with sample_rate=0.01, then shrink dtypes.
user_item_label_df = reduce_mem(get_user_recall_item_label_df(hist_click_df, last_click_df, recall_list_df, sample_rate=0.01))

before: pos_data_num: 172447 neg_data_num: 33577553 pos/neg: 0.005135782229276803
after: pos_data_num: 172447 neg_data_num: 265104 pos/neg: 0.6504881103265133


In [None]:
del recall_list_df
gc.collect()

0

### 1.4.4 拼接上下文特征

In [None]:
# Left-join every column of last_click_df except item_id onto the candidates by
# user_id; item_id is dropped so the candidate's own item_id column is kept.
user_item_label_df = user_item_label_df.merge(last_click_df.drop(['item_id'], axis=1), how='left', on='user_id')

In [None]:
del last_click_df
gc.collect()

0

### 1.4.5 对打标后的召回列表做倒排表

In [None]:
# 将最终的召回的df数据转换成字典的形式做排序特征
def make_tuple_func(group_df):
  """Convert one user's candidate rows into ``(item_id, recall_score, label)`` tuples.

  ``item_id`` and ``label`` are returned as built-in ints. Reading the three
  columns directly (instead of iterating whole rows) keeps this working when
  the frame also holds non-numeric columns, where row values are plain Python
  objects without an ``astype`` method.
  """
  return [
    (int(item_id), recall_score, int(label))
    for item_id, recall_score, label
    in zip(group_df['item_id'], group_df['recall_score'], group_df['label'])
  ]

In [None]:
# user_item_label_tuples = user_item_label_df.groupby('user_id').parallel_apply(make_tuple_func).reset_index()
# user_item_label_tuples_dict = dict(zip(user_item_label_tuples['user_id'], user_item_label_tuples[0]))

# 2.特征工程

## 2.1 拼接召回特征

In [None]:
# Left-join the raw side features onto every candidate row: user features by
# user_id, item features and consumer features by item_id.
user_item_label_df = pd.merge(user_item_label_df, user_info, how='left', on='user_id')
user_item_label_df = pd.merge(user_item_label_df, item_info, how='left', on='item_id')
user_item_label_df = pd.merge(user_item_label_df, consumer_info, how='left', on='item_id')

In [None]:
del user_info
del item_info
del consumer_info
gc.collect()

0

## 2.2 获取last ua item

In [None]:
# Most recent historical click of every user (row with the largest timestamp_ms per user_id).
hist_last_click_df = hist_click_df.sort_values(by=['user_id', 'timestamp_ms']).groupby('user_id').tail(1)

In [None]:
# Each user has exactly one last click, so the inverted index can be built
# the same way as for the user info table.
hist_last_click_dict = get_user_info_dict(hist_last_click_df)

In [None]:
hist_last_click_dict.keys()

dict_keys(['item_id', 'timestamp_ms', 'environment', 'device_group', 'os', 'province', 'city', 'referrer_type', 'timestamp_s', 'day', 'hour', 'minute', 'second', 'time_bucket', 'weekday', 'yearday', 'freshness'])

In [None]:
del hist_last_click_df
gc.collect()

0

## 2.3 recall item vs last ua item

### 2.3.1 item侧--dense

In [None]:
item_info_dict.keys()

dict_keys(['category_id', 'prod_at_ts_ms', 'price', 'prod_at_ts_s', 'prod_year', 'prod_month', 'prod_day', 'prod_hour', 'prod_minute', 'prod_second', 'prod_season', 'prod_time_bucket', 'prod_weekday', 'prod_yearday', 'i_mean_day', 'i_mean_hour', 'i_mean_minute', 'i_mean_time_bucket', 'i_mean_weekday', 'i_mean_yearday', 'i_mode_environment', 'i_mode_device_group', 'i_mode_os', 'i_mode_province', 'i_mode_city', 'i_mode_referrer_type', 'i_active_nums', 'i_active_day_nums', 'i_mean_rev_ua_time', 'i_ua_score', 'i_mean_freshness'])

In [None]:
def get_recall_item_last_ua_item_dense_fea_diff_core(user_item_label_row, fea_name, fea_dict):
  """Return the candidate item's feature value minus that of the user's last clicked item.

  ``user_item_label_row`` supplies ``user_id`` and ``item_id``; the user's
  last clicked item comes from the module-level ``hist_last_click_dict``.
  ``fea_dict[fea_name]`` maps item ids to the feature value. Any missing
  user, item or feature name raises ``KeyError``.
  """
  uid, candidate_item = user_item_label_row['user_id'], user_item_label_row['item_id']
  anchor_item = hist_last_click_dict['item_id'][uid]

  value_by_item = fea_dict[fea_name]
  return value_by_item[candidate_item] - value_by_item[anchor_item]


def get_recall_item_last_ua_item_dense_fea_diff(user_item_label_df, in_cols, fea_name, fea_dict):
  """Compute the dense "candidate item vs. last clicked item" difference feature.

  Args:
    user_item_label_df: candidates containing ``user_id``, ``item_id`` and
      every column named in ``in_cols``.
    in_cols: extra columns to expose to the row function (not modified).
    fea_name: feature to compare; also used in the output column name.
    fea_dict: ``{feature_name: {item_id: value}}`` inverted index.

  Returns:
    A DataFrame with ``user_id``, ``item_id`` and
    ``rc_i_diff_last_ua_i_<fea_name>``, sharing the input's index.

  Requires pandarallel to be initialised, since ``parallel_apply`` is used.
  """
  # Build a new list so the caller's in_cols is left untouched.
  cols = list(in_cols) + ['user_id', 'item_id']
  out_col = 'rc_i_diff_last_ua_i_' + fea_name
  # Work on an explicit copy: assigning into a column slice is what raised
  # SettingWithCopyWarning.
  sub_df = user_item_label_df[cols].copy()
  sub_df[out_col] = sub_df.parallel_apply(
      partial(get_recall_item_last_ua_item_dense_fea_diff_core, fea_name=fea_name, fea_dict=fea_dict), axis=1)
  return sub_df[['user_id', 'item_id', out_col]]

In [None]:
dense_feas = ['prod_at_ts_ms', 'price', 'prod_year', 'prod_month', 'prod_day', 'prod_hour',
        'prod_minute', 'prod_second', 'prod_season', 'prod_time_bucket', 'prod_weekday', 
        'prod_yearday', 'i_mean_day', 'i_mean_hour', 'i_mean_minute', 'i_mean_time_bucket', 
        'i_mean_weekday', 'i_mean_yearday', 
        'i_active_nums', 'i_active_day_nums', 'i_mean_rev_ua_time', 'i_ua_score', 'i_mean_freshness']

# For every dense item feature, compute (candidate item - last clicked item)
# and join the new column back onto the candidates by (user_id, item_id).
for fea in tqdm.tqdm(dense_feas):
  fea_df = get_recall_item_last_ua_item_dense_fea_diff(user_item_label_df, [], fea, item_info_dict)
  user_item_label_df = reduce_mem(pd.merge(user_item_label_df, fea_df, how='left', on=['user_id', 'item_id']))
  gc.collect()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_item_label_df['new_fea'] = user_item_label_df.parallel_apply(\
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(
100%|██████████| 23/23 [48:58<00:00, 127.75s/it]


### 2.3.2 item侧--sparse

In [None]:
def get_recall_item_last_ua_item_sparse_fea_diff_core(user_item_label_row, fea_name, fea_dict):
  """Return 1 if the candidate item and the user's last clicked item share the feature value, else 0.

  ``user_item_label_row`` supplies ``user_id`` and ``item_id``; the user's
  last clicked item comes from the module-level ``hist_last_click_dict``.
  ``fea_dict[fea_name]`` maps item ids to the feature value. Any missing
  user, item or feature name raises ``KeyError``.
  """
  uid, candidate_item = user_item_label_row['user_id'], user_item_label_row['item_id']
  anchor_item = hist_last_click_dict['item_id'][uid]

  value_by_item = fea_dict[fea_name]
  is_same = value_by_item[candidate_item] == value_by_item[anchor_item]
  return int(is_same)


def get_recall_item_last_ua_item_sparse_fea_diff(user_item_label_df, in_cols, fea_name, fea_dict):
  """Compute the sparse "candidate item vs. last clicked item" match feature.

  Args:
    user_item_label_df: candidates containing ``user_id``, ``item_id`` and
      every column named in ``in_cols``.
    in_cols: extra columns to expose to the row function (not modified).
    fea_name: categorical feature to compare; also used in the output name.
    fea_dict: ``{feature_name: {item_id: value}}`` inverted index.

  Returns:
    A DataFrame with ``user_id``, ``item_id`` and
    ``rc_i_diff_last_ua_i_<fea_name>`` (1 = same value, 0 = different),
    sharing the input's index.

  Requires pandarallel to be initialised, since ``parallel_apply`` is used.
  """
  # Build a new list so the caller's in_cols is left untouched.
  cols = list(in_cols) + ['user_id', 'item_id']
  out_col = 'rc_i_diff_last_ua_i_' + fea_name
  # Work on an explicit copy: assigning into a column slice is what raised
  # SettingWithCopyWarning.
  sub_df = user_item_label_df[cols].copy()
  sub_df[out_col] = sub_df.parallel_apply(
      partial(get_recall_item_last_ua_item_sparse_fea_diff_core, fea_name=fea_name, fea_dict=fea_dict), axis=1)
  return sub_df[['user_id', 'item_id', out_col]]

In [None]:
sparse_feas = ['category_id', 'i_mode_environment', 'i_mode_device_group', 'i_mode_os', 
        'i_mode_province', 'i_mode_city', 'i_mode_referrer_type',]

# For every categorical item feature, flag whether the candidate item and the
# last clicked item share the value, then join the flag back by (user_id, item_id).
for fea in tqdm.tqdm(sparse_feas):
  fea_df = get_recall_item_last_ua_item_sparse_fea_diff(user_item_label_df, [], fea, item_info_dict)
  user_item_label_df = reduce_mem(pd.merge(user_item_label_df, fea_df, how='left', on=['user_id', 'item_id']))
  gc.collect()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_item_label_df['new_fea'] = user_item_label_df.parallel_apply(\
100%|██████████| 7/7 [15:25<00:00, 132.21s/it]


### 2.3.3 consumer侧--dense

In [None]:
consumer_info_dict.keys()

dict_keys(['c_mean_u_mean_day', 'c_mean_u_mean_hour', 'c_mean_u_mean_minute', 'c_mean_u_mean_time_bucket', 'c_mean_u_mean_weekday', 'c_mean_u_mean_yearday', 'c_mean_u_mean_prod_year', 'c_mean_u_mean_prod_month', 'c_mean_u_mean_prod_day', 'c_mean_u_mean_prod_hour', 'c_mean_u_mean_prod_minute', 'c_mean_u_mean_prod_season', 'c_mean_u_mean_prod_time_bucket', 'c_mean_u_mean_prod_weekday', 'c_mean_u_mean_prod_yearday', 'c_mean_u_mean_price', 'c_mode_u_mode_category_id', 'c_mode_u_mode_environment', 'c_mode_u_mode_device_group', 'c_mode_u_mode_os', 'c_mode_u_mode_province', 'c_mode_u_mode_city', 'c_mode_u_mode_referrer_type', 'c_mean_u_active_nums', 'c_mean_u_active_day_nums', 'c_mean_u_mean_rev_ua_time', 'c_mean_u_ua_score', 'c_mean_u_mean_freshness'])

In [None]:
dense_feas = ['c_mean_u_mean_day', 'c_mean_u_mean_hour', 'c_mean_u_mean_minute', 'c_mean_u_mean_time_bucket', 
        'c_mean_u_mean_weekday', 'c_mean_u_mean_yearday', 'c_mean_u_mean_prod_year', 'c_mean_u_mean_prod_month',
        'c_mean_u_mean_prod_day', 'c_mean_u_mean_prod_hour', 'c_mean_u_mean_prod_minute',
        'c_mean_u_mean_prod_season', 'c_mean_u_mean_prod_time_bucket', 'c_mean_u_mean_prod_weekday', 
        'c_mean_u_mean_prod_yearday', 'c_mean_u_mean_price', 'c_mean_u_active_nums', 'c_mean_u_active_day_nums', 
        'c_mean_u_mean_rev_ua_time', 'c_mean_u_ua_score', 'c_mean_u_mean_freshness']

# Same dense difference as on the item side, but looked up in the
# consumer-side inverted index (which is also keyed by item id).
for fea in tqdm.tqdm(dense_feas):
  fea_df = get_recall_item_last_ua_item_dense_fea_diff(user_item_label_df, [], fea, consumer_info_dict)
  user_item_label_df = reduce_mem(pd.merge(user_item_label_df, fea_df, how='left', on=['user_id', 'item_id']))
  gc.collect()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_item_label_df['new_fea'] = user_item_label_df.parallel_apply(\
100%|██████████| 21/21 [30:37<00:00, 87.49s/it]


### 2.3.4 consumer侧--sparse

In [None]:
sparse_feas = ['c_mode_u_mode_category_id', 'c_mode_u_mode_environment', 'c_mode_u_mode_device_group', 
        'c_mode_u_mode_os', 'c_mode_u_mode_province', 'c_mode_u_mode_city', 'c_mode_u_mode_referrer_type',]

# Same categorical match flag as on the item side, but looked up in the
# consumer-side inverted index (which is also keyed by item id).
for fea in tqdm.tqdm(sparse_feas):
  fea_df = get_recall_item_last_ua_item_sparse_fea_diff(user_item_label_df, [], fea, consumer_info_dict)
  user_item_label_df = reduce_mem(pd.merge(user_item_label_df, fea_df, how='left', on=['user_id', 'item_id']))
  gc.collect()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_item_label_df['new_fea'] = user_item_label_df.parallel_apply(\
100%|██████████| 7/7 [11:17<00:00, 96.83s/it]


### 2.3.5 召回相似性特征

In [None]:
# 读取itemcf sim
# Load the ItemCF similarity dictionary (numeric JSON keys restored to ints).
with open(os.path.join(data_dir, 'itemcf_sim_dict.json'), 'r', encoding="utf-8") as f:
  itemcf_sim_dict = json.load(f, object_pairs_hook=int_keys)
# Load the bi-network similarity dictionary.
with open(os.path.join(data_dir, 'binetwork_sim_dict.json'), 'r', encoding="utf-8") as f:
  binetwork_sim_dict = json.load(f, object_pairs_hook=int_keys)
# Load the word2vec embedding similarity dictionary.
with open(os.path.join(data_dir, 'word2vec_emb_sim_dict.json'), 'r', encoding="utf-8") as f:
  word2vec_emb_sim_dict = json.load(f, object_pairs_hook=int_keys)

In [None]:
def get_recall_item_last_ua_item_recall_sim_diff_core(user_item_label_row, sim_dict):
  """Return the similarity between the user's last clicked item and the candidate item.

  The value is read as ``sim_dict[last_clicked_item][candidate_item]``; the
  last clicked item comes from the module-level ``hist_last_click_dict``.
  Pairs that are not present in ``sim_dict`` score 0.
  """
  user_id = user_item_label_row['user_id']
  recall_item_id = user_item_label_row['item_id']
  last_ua_item_id = hist_last_click_dict['item_id'][user_id]
  try:
    res = sim_dict[last_ua_item_id][recall_item_id]
  except (LookupError, TypeError):
    # Only failed lookups fall back to 0; a bare except would also swallow
    # KeyboardInterrupt, MemoryError and genuine programming errors.
    res = 0
  return res

def get_recall_item_last_ua_item_recall_sim_diff(user_item_label_df, in_cols, sim_dict, recall_name):
  """Compute the recall-similarity feature between each candidate and the user's last clicked item.

  Args:
    user_item_label_df: candidates containing ``user_id``, ``item_id`` and
      every column named in ``in_cols``.
    in_cols: extra columns to expose to the row function (not modified).
    sim_dict: nested mapping read as ``sim_dict[last_item][candidate_item]``.
    recall_name: name of the recall channel, used in the output column name.

  Returns:
    A DataFrame with ``user_id``, ``item_id`` and
    ``rc_i_diff_last_ua_i_<recall_name>_sim``, sharing the input's index.

  Requires pandarallel to be initialised, since ``parallel_apply`` is used.
  """
  # Build a new list so the caller's in_cols is left untouched.
  cols = list(in_cols) + ['user_id', 'item_id']
  out_col = 'rc_i_diff_last_ua_i_' + recall_name + '_sim'
  # Work on an explicit copy: assigning into a column slice is what raised
  # SettingWithCopyWarning.
  sub_df = user_item_label_df[cols].copy()
  sub_df[out_col] = sub_df.parallel_apply(
      partial(get_recall_item_last_ua_item_recall_sim_diff_core, sim_dict=sim_dict), axis=1)
  return sub_df[['user_id', 'item_id', out_col]]

In [None]:
# One similarity feature per recall channel (itemcf, binetwork, word2vec),
# each joined back onto the candidates by (user_id, item_id).
fea_df = get_recall_item_last_ua_item_recall_sim_diff(user_item_label_df, [], itemcf_sim_dict, 'itemcf')
user_item_label_df = reduce_mem(pd.merge(user_item_label_df, fea_df, how='left', on=['user_id', 'item_id']))

fea_df = get_recall_item_last_ua_item_recall_sim_diff(user_item_label_df, [], binetwork_sim_dict, 'binetwork')
user_item_label_df = reduce_mem(pd.merge(user_item_label_df, fea_df, how='left', on=['user_id', 'item_id']))

fea_df = get_recall_item_last_ua_item_recall_sim_diff(user_item_label_df, [], word2vec_emb_sim_dict, 'word2vec')
user_item_label_df = reduce_mem(pd.merge(user_item_label_df, fea_df, how='left', on=['user_id', 'item_id']))

gc.collect()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_item_label_df['new_fea'] = user_item_label_df.parallel_apply(\


0

## 2.4 recall item vs user info

In [None]:
user_info_dict.keys()

dict_keys(['u_mean_day', 'u_mean_hour', 'u_mean_minute', 'u_mean_second', 'u_mean_time_bucket', 'u_mean_weekday', 'u_mean_yearday', 'u_mean_prod_year', 'u_mean_prod_month', 'u_mean_prod_day', 'u_mean_prod_hour', 'u_mean_prod_minute', 'u_mean_prod_season', 'u_mean_prod_time_bucket', 'u_mean_prod_weekday', 'u_mean_prod_yearday', 'u_mean_price', 'u_mode_category_id', 'u_mode_environment', 'u_mode_device_group', 'u_mode_os', 'u_mode_province', 'u_mode_city', 'u_mode_referrer_type', 'u_active_nums', 'u_active_day_nums', 'u_mean_rev_ua_time', 'u_ua_score', 'u_mean_freshness'])

In [None]:
item_info_dict.keys()

dict_keys(['category_id', 'prod_at_ts_ms', 'price', 'prod_at_ts_s', 'prod_year', 'prod_month', 'prod_day', 'prod_hour', 'prod_minute', 'prod_second', 'prod_season', 'prod_time_bucket', 'prod_weekday', 'prod_yearday', 'i_mean_day', 'i_mean_hour', 'i_mean_minute', 'i_mean_time_bucket', 'i_mean_weekday', 'i_mean_yearday', 'i_mode_environment', 'i_mode_device_group', 'i_mode_os', 'i_mode_province', 'i_mode_city', 'i_mode_referrer_type', 'i_active_nums', 'i_active_day_nums', 'i_mean_rev_ua_time', 'i_ua_score', 'i_mean_freshness'])

In [None]:
consumer_info_dict.keys()

dict_keys(['c_mean_u_mean_day', 'c_mean_u_mean_hour', 'c_mean_u_mean_minute', 'c_mean_u_mean_time_bucket', 'c_mean_u_mean_weekday', 'c_mean_u_mean_yearday', 'c_mean_u_mean_prod_year', 'c_mean_u_mean_prod_month', 'c_mean_u_mean_prod_day', 'c_mean_u_mean_prod_hour', 'c_mean_u_mean_prod_minute', 'c_mean_u_mean_prod_season', 'c_mean_u_mean_prod_time_bucket', 'c_mean_u_mean_prod_weekday', 'c_mean_u_mean_prod_yearday', 'c_mean_u_mean_price', 'c_mode_u_mode_category_id', 'c_mode_u_mode_environment', 'c_mode_u_mode_device_group', 'c_mode_u_mode_os', 'c_mode_u_mode_province', 'c_mode_u_mode_city', 'c_mode_u_mode_referrer_type', 'c_mean_u_active_nums', 'c_mean_u_active_day_nums', 'c_mean_u_mean_rev_ua_time', 'c_mean_u_ua_score', 'c_mean_u_mean_freshness'])

### 2.4.1 item侧--dense

In [None]:
def get_recall_item_user_info_dense_fea_diff_core(user_item_label_row, fea_name_suff, consumer_fea):
  """Return the candidate item's feature value minus the user's matching feature value.

  The user value is ``user_info_dict['u_' + fea_name_suff]``. On the item
  side (``consumer_fea`` false) the value comes from
  ``item_info_dict['i_' + fea_name_suff]`` when that feature exists,
  otherwise from the feature named by ``fea_name_suff`` without its first
  five characters (e.g. ``'mean_price'`` -> ``'price'``). On the consumer
  side it comes from ``consumer_info_dict['c_mean_u_' + fea_name_suff]``.
  Missing users, items or features raise ``KeyError``.
  """
  user_id = user_item_label_row['user_id']
  recall_item_id = user_item_label_row['item_id']

  user_fea = user_info_dict['u_'+fea_name_suff][user_id]
  if not consumer_fea:
    # Pick the feature name up front instead of wrapping the lookup in a bare
    # except, so a missing item id reports the real key rather than a
    # misleading KeyError on the fallback feature name.
    item_fea_name = 'i_'+fea_name_suff
    if item_fea_name not in item_info_dict:
      item_fea_name = fea_name_suff[5:]
    recall_item_fea = item_info_dict[item_fea_name][recall_item_id]
  else:
    recall_item_fea = consumer_info_dict['c_mean_u_'+fea_name_suff][recall_item_id]
  return recall_item_fea - user_fea


def get_recall_item_user_info_dense_fea_diff(user_item_label_df, in_cols, fea_name_suff, consumer_fea):
  """Compute the dense "candidate item vs. user profile" difference feature.

  Args:
    user_item_label_df: candidates containing ``user_id``, ``item_id`` and
      every column named in ``in_cols``.
    in_cols: extra columns to expose to the row function (not modified).
    fea_name_suff: feature suffix shared by the user and item/consumer tables.
    consumer_fea: if true, compare against the consumer-side feature and name
      the output ``rc_c_diff_u_<suffix>``; otherwise use the item-side
      feature and name it ``rc_i_diff_u_<suffix>``.

  Returns:
    A DataFrame with ``user_id``, ``item_id`` and the new feature column,
    sharing the input's index.

  Requires pandarallel to be initialised, since ``parallel_apply`` is used.
  """
  # Build a new list so the caller's in_cols is left untouched.
  cols = list(in_cols) + ['user_id', 'item_id']
  out_col = ('rc_c_diff_u_' if consumer_fea else 'rc_i_diff_u_') + fea_name_suff
  # Work on an explicit copy: assigning into a column slice is what raised
  # SettingWithCopyWarning.
  sub_df = user_item_label_df[cols].copy()
  sub_df[out_col] = sub_df.parallel_apply(
      partial(get_recall_item_user_info_dense_fea_diff_core, fea_name_suff=fea_name_suff, consumer_fea=consumer_fea), axis=1)
  return sub_df[['user_id', 'item_id', out_col]]

In [None]:
dense_feas = ['mean_day', 'mean_hour', 'mean_minute', 'mean_time_bucket',
        'mean_weekday', 'mean_yearday', 'mean_prod_year', 'mean_prod_month',
        'mean_prod_day', 'mean_prod_hour', 'mean_prod_minute', 'mean_prod_season', 
        'mean_prod_time_bucket', 'mean_prod_weekday', 'mean_prod_yearday', 'mean_price', 'mean_freshness']
# Item-side (consumer_fea=False) dense differences between the candidate item
# and the user's profile, joined back by (user_id, item_id).
for fea_suff in tqdm.tqdm(dense_feas):
  fea_df = get_recall_item_user_info_dense_fea_diff(user_item_label_df, [], fea_suff, False)
  user_item_label_df = reduce_mem(pd.merge(user_item_label_df, fea_df, how='left', on=['user_id', 'item_id']))
  gc.collect()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_item_label_df['new_fea'] = user_item_label_df.parallel_apply(\
100%|██████████| 17/17 [28:56<00:00, 102.12s/it]


### 2.4.2 item侧--sparse

In [None]:
def get_recall_item_user_info_sparse_fea_diff_core(user_item_label_row, fea_name_suff, consumer_fea):
  """Return 1 if the candidate item's categorical value equals the user's, else 0.

  The user value is ``user_info_dict['u_' + fea_name_suff]``. On the item
  side (``consumer_fea`` false) the value comes from
  ``item_info_dict['i_' + fea_name_suff]`` when that feature exists,
  otherwise from the feature named by ``fea_name_suff`` without its first
  five characters (e.g. ``'mode_category_id'`` -> ``'category_id'``). On the
  consumer side it comes from
  ``consumer_info_dict['c_mode_u_' + fea_name_suff]``.
  Missing users, items or features raise ``KeyError``.
  """
  user_id = user_item_label_row['user_id']
  recall_item_id = user_item_label_row['item_id']

  user_fea = user_info_dict['u_'+fea_name_suff][user_id]
  if not consumer_fea:
    # Pick the feature name up front instead of wrapping the lookup in a bare
    # except, so a missing item id reports the real key rather than a
    # misleading KeyError on the fallback feature name.
    item_fea_name = 'i_'+fea_name_suff
    if item_fea_name not in item_info_dict:
      item_fea_name = fea_name_suff[5:]
    recall_item_fea = item_info_dict[item_fea_name][recall_item_id]
  else:
    recall_item_fea = consumer_info_dict['c_mode_u_'+fea_name_suff][recall_item_id]
  return int(recall_item_fea==user_fea)


def get_recall_item_user_info_sparse_fea_diff(user_item_label_df, in_cols, fea_name_suff, consumer_fea):
  """Build a per-row 0/1 feature comparing a user's and a recalled item's sparse feature.

  Args:
    user_item_label_df: DataFrame containing at least 'user_id' and 'item_id'.
    in_cols: extra column names to expose to the row-wise function. The list
      is not modified.
    fea_name_suff: feature suffix forwarded to
      get_recall_item_user_info_sparse_fea_diff_core.
    consumer_fea: forwarded to the core function; also selects the output
      column prefix ('rc_c_diff_u_' if truthy, else 'rc_i_diff_u_').

  Returns:
    DataFrame with columns 'user_id', 'item_id' and the new feature column.
  """
  # Build a fresh, de-duplicated list: `in_cols += [...]` would mutate the
  # caller's list, and repeated column names would turn row['user_id'] into a
  # Series instead of a scalar.
  cols = list(dict.fromkeys([*in_cols, 'user_id', 'item_id']))
  # Explicit copy so that adding a column below does not write into a slice
  # of the caller's frame (the source of the SettingWithCopyWarning).
  feat_df = user_item_label_df[cols].copy()

  out_col = ('rc_c_diff_u_' if consumer_fea else 'rc_i_diff_u_') + fea_name_suff
  row_func = partial(get_recall_item_user_info_sparse_fea_diff_core,
                     fea_name_suff=fea_name_suff, consumer_fea=consumer_fea)
  feat_df[out_col] = feat_df.parallel_apply(row_func, axis=1)
  return feat_df[['user_id', 'item_id', out_col]]

In [None]:
# Item-side sparse features (consumer_fea=False): one 0/1 match column per
# suffix, joined back onto the main table by (user_id, item_id).
sparse_feas = [
    'mode_category_id',
    'mode_environment',
    'mode_device_group',
    'mode_os',
    'mode_province',
    'mode_city',
    'mode_referrer_type',
]
for fea_suff in tqdm.tqdm(sparse_feas):
  fea_df = get_recall_item_user_info_sparse_fea_diff(
      user_item_label_df, in_cols=[], fea_name_suff=fea_suff, consumer_fea=False)
  user_item_label_df = user_item_label_df.merge(fea_df, how='left', on=['user_id', 'item_id'])
  user_item_label_df = reduce_mem(user_item_label_df)
  gc.collect()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_item_label_df['new_fea'] = user_item_label_df.parallel_apply(\
100%|██████████| 7/7 [12:28<00:00, 106.97s/it]


### 2.4.3 consumer侧--dense

In [None]:
# Consumer-side dense features: same procedure as the earlier dense loop, but
# with the last positional argument set to True and a longer suffix list.
dense_feas = [
    # visit-time statistics
    'mean_day', 'mean_hour', 'mean_minute', 'mean_time_bucket',
    'mean_weekday', 'mean_yearday',
    # "prod" time statistics
    'mean_prod_year', 'mean_prod_month', 'mean_prod_day', 'mean_prod_hour',
    'mean_prod_minute', 'mean_prod_season', 'mean_prod_time_bucket',
    'mean_prod_weekday', 'mean_prod_yearday',
    # remaining statistics
    'mean_price', 'active_nums', 'active_day_nums',
    'mean_rev_ua_time', 'ua_score', 'mean_freshness',
]

for fea_suff in tqdm.tqdm(dense_feas):
  fea_df = get_recall_item_user_info_dense_fea_diff(user_item_label_df, [], fea_suff, True)
  # Left-join the new feature column onto the main table by (user_id, item_id).
  user_item_label_df = user_item_label_df.merge(fea_df, how='left', on=['user_id', 'item_id'])
  user_item_label_df = reduce_mem(user_item_label_df)
  gc.collect()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_item_label_df['new_fea'] = user_item_label_df.parallel_apply(\
100%|██████████| 21/21 [39:57<00:00, 114.17s/it]


### 2.4.4 consumer侧--sparse

In [None]:
# Consumer-side sparse features (consumer_fea=True): one 0/1 match column per
# suffix, joined back onto the main table by (user_id, item_id).
sparse_feas = [
    'mode_category_id',
    'mode_environment',
    'mode_device_group',
    'mode_os',
    'mode_province',
    'mode_city',
    'mode_referrer_type',
]

for fea_suff in tqdm.tqdm(sparse_feas):
  fea_df = get_recall_item_user_info_sparse_fea_diff(
      user_item_label_df, in_cols=[], fea_name_suff=fea_suff, consumer_fea=True)
  user_item_label_df = user_item_label_df.merge(fea_df, how='left', on=['user_id', 'item_id'])
  user_item_label_df = reduce_mem(user_item_label_df)
  gc.collect()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_item_label_df['new_fea'] = user_item_label_df.parallel_apply(\
100%|██████████| 7/7 [14:34<00:00, 124.99s/it]


## 2.5 ctx vs last ua ctx

In [None]:
# List the fields available in hist_last_click_dict; the functions below
# index it as hist_last_click_dict[fea_name][user_id].
hist_last_click_dict.keys()

dict_keys(['item_id', 'timestamp_ms', 'environment', 'device_group', 'os', 'province', 'city', 'referrer_type', 'timestamp_s', 'day', 'hour', 'minute', 'second', 'time_bucket', 'weekday', 'yearday', 'freshness'])

### 2.5.1 dense

In [None]:
def get_ctx_last_ua_ctx_dense_fea_diff_core(user_item_label_row, fea_name):
  """Return the row's `fea_name` value minus the value stored for the same user.

  Args:
    user_item_label_row: row-like object indexable by 'user_id' and `fea_name`.
    fea_name: key into both the row and hist_last_click_dict.

  Returns:
    row[fea_name] - hist_last_click_dict[fea_name][user_id]
  """
  uid = user_item_label_row['user_id']
  return user_item_label_row[fea_name] - hist_last_click_dict[fea_name][uid]


def get_ctx_last_ua_ctx_dense_fea_diff(user_item_label_df, in_cols, fea_name):
  """Build the per-row difference between a context feature and the user's stored value.

  Args:
    user_item_label_df: DataFrame containing at least 'user_id', 'item_id'
      and `fea_name`.
    in_cols: extra column names to expose to the row-wise function. The list
      is not modified.
    fea_name: context column to compare; forwarded to
      get_ctx_last_ua_ctx_dense_fea_diff_core.

  Returns:
    DataFrame with columns 'user_id', 'item_id' and
    'ctx_diff_last_ua_ctx_' + fea_name.
  """
  # Build a fresh, de-duplicated list: `in_cols += [...]` would mutate the
  # caller's list, and repeated column names would turn row lookups into
  # Series instead of scalars.
  cols = list(dict.fromkeys([*in_cols, 'user_id', 'item_id', fea_name]))
  # Explicit copy so that adding a column below does not write into a slice
  # of the caller's frame (the source of the SettingWithCopyWarning).
  feat_df = user_item_label_df[cols].copy()

  out_col = 'ctx_diff_last_ua_ctx_'+fea_name
  row_func = partial(get_ctx_last_ua_ctx_dense_fea_diff_core, fea_name=fea_name)
  feat_df[out_col] = feat_df.parallel_apply(row_func, axis=1)
  return feat_df[['user_id', 'item_id', out_col]]

In [None]:
# Context columns treated as dense: each yields one
# 'ctx_diff_last_ua_ctx_<name>' column joined back by (user_id, item_id).
dense_feas = [
    'timestamp_ms',
    'day', 'hour', 'minute', 'second',
    'time_bucket', 'weekday', 'yearday',
    'freshness',
]

for fea in tqdm.tqdm(dense_feas):
  fea_df = get_ctx_last_ua_ctx_dense_fea_diff(user_item_label_df, in_cols=[], fea_name=fea)
  user_item_label_df = user_item_label_df.merge(fea_df, how='left', on=['user_id', 'item_id'])
  user_item_label_df = reduce_mem(user_item_label_df)
  gc.collect()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_item_label_df['new_fea'] = user_item_label_df.parallel_apply(\
100%|██████████| 9/9 [18:36<00:00, 124.01s/it]


### 2.5.2 sparse

In [None]:
def get_ctx_last_ua_ctx_sparse_fea_diff_core(user_item_label_row, fea_name):
  """Return 1 if the row's `fea_name` value equals the value stored for the same user, else 0.

  Args:
    user_item_label_row: row-like object indexable by 'user_id' and `fea_name`.
    fea_name: key into both the row and hist_last_click_dict.

  Returns:
    int: int(row[fea_name] == hist_last_click_dict[fea_name][user_id]).
  """
  uid = user_item_label_row['user_id']
  same_as_stored = user_item_label_row[fea_name] == hist_last_click_dict[fea_name][uid]
  return int(same_as_stored)


def get_ctx_last_ua_ctx_sparse_fea_diff(user_item_label_df, in_cols, fea_name):
  """Build a per-row 0/1 flag: does a context feature equal the user's stored value?

  Args:
    user_item_label_df: DataFrame containing at least 'user_id', 'item_id'
      and `fea_name`.
    in_cols: extra column names to expose to the row-wise function. The list
      is not modified.
    fea_name: context column to compare; forwarded to
      get_ctx_last_ua_ctx_sparse_fea_diff_core.

  Returns:
    DataFrame with columns 'user_id', 'item_id' and
    'ctx_diff_last_ua_ctx_' + fea_name.
  """
  # Build a fresh, de-duplicated list: `in_cols += [...]` would mutate the
  # caller's list, and repeated column names would turn row lookups into
  # Series instead of scalars.
  cols = list(dict.fromkeys([*in_cols, 'user_id', 'item_id', fea_name]))
  # Explicit copy so that adding a column below does not write into a slice
  # of the caller's frame (the source of the SettingWithCopyWarning).
  feat_df = user_item_label_df[cols].copy()

  out_col = 'ctx_diff_last_ua_ctx_'+fea_name
  row_func = partial(get_ctx_last_ua_ctx_sparse_fea_diff_core, fea_name=fea_name)
  feat_df[out_col] = feat_df.parallel_apply(row_func, axis=1)
  return feat_df[['user_id', 'item_id', out_col]]

In [None]:
# Context columns treated as sparse: each yields one 0/1
# 'ctx_diff_last_ua_ctx_<name>' column joined back by (user_id, item_id).
sparse_feas = [
    'environment',
    'device_group',
    'os',
    'province',
    'city',
    'referrer_type',
]

for fea in tqdm.tqdm(sparse_feas):
  fea_df = get_ctx_last_ua_ctx_sparse_fea_diff(user_item_label_df, in_cols=[], fea_name=fea)
  user_item_label_df = user_item_label_df.merge(fea_df, how='left', on=['user_id', 'item_id'])
  user_item_label_df = reduce_mem(user_item_label_df)
  gc.collect()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_item_label_df['new_fea'] = user_item_label_df.parallel_apply(\
100%|██████████| 6/6 [13:06<00:00, 131.11s/it]


# 3.保存特征

## 3.1 保存提取特征后的表

In [None]:
# len(user_item_label_df)

In [None]:
import math

In [None]:
# The feature table is written out as `save_parts` CSV files; `part_len` is
# the row count per file, rounded up so the parts together cover every row.
save_parts = 3
total_rows = len(user_item_label_df)
part_len = math.ceil(total_rows / save_parts)

In [None]:
# Write each consecutive `part_len`-row slice of the feature table to its own
# CSV under save_dir, deleting any file left over from a previous run first.
for part_index in range(save_parts):
  part_path = os.path.join(save_dir, f'user_item_rank_feats_df_part_{part_index}.csv')
  if os.path.exists(part_path):
    os.remove(part_path)
  row_start = part_index * part_len
  part_df = user_item_label_df[row_start:row_start + part_len]
  part_df.to_csv(part_path, index=False)

## 3.2 保存训练集、测试集用户列表

In [None]:
# Save trn_user and val_user as CSV files in save_dir, replacing any
# existing file of the same name.
for user_df, file_name in ((trn_user, 'trn_user.csv'), (val_user, 'val_user.csv')):
  out_path = os.path.join(save_dir, file_name)
  if os.path.exists(out_path):
    os.remove(out_path)
  user_df.to_csv(out_path, index=False)