# Author information:
Fan Shengzhe, Shanghai Jiao Tong University, Shanghai, China  
Email: fanshengzhe@sjtu.edu.cn

# 1.读取数据

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
import os
# Work from the project folder on the mounted drive so that the relative
# paths used by later cells (e.g. ./cache) resolve inside it.
os.chdir("drive/My Drive/reco/agri-machine-reco")

In [None]:
import pandas as pd
from functools import partial
import os

# Load the interaction log that the rest of the notebook derives features from.
log_table = pd.read_csv('./cache/log_table.csv')
save_dir = './cache'

# exist_ok=True creates the directory only when it is missing, replacing the
# separate "exists?" check followed by mkdir.
os.makedirs(save_dir, exist_ok=True)

# Print column names, non-null counts and dtypes of the loaded log.
log_table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 422126 entries, 0 to 422125
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   event_time     422126 non-null  object 
 1   event_type     422126 non-null  object 
 2   item_id        422126 non-null  int64  
 3   power          422126 non-null  float64
 4   category_id    422126 non-null  int64  
 5   region         255106 non-null  float64
 6   price          422126 non-null  float64
 7   user_id        422126 non-null  int64  
 8   user_session   422126 non-null  int64  
 9   is_nan_region  422126 non-null  int64  
dtypes: float64(3), int64(5), object(2)
memory usage: 32.2+ MB


In [None]:
try:
  from pandarallel import pandarallel
except:
  !pip install pandarallel
  from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pandarallel
  Downloading pandarallel-1.6.4.tar.gz (12 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pandarallel
  Building wheel for pandarallel (setup.py) ... [?25l[?25hdone
  Created wheel for pandarallel: filename=pandarallel-1.6.4-py3-none-any.whl size=16678 sha256=fa2aa9eedc8579aa56bb6cc73fce1bb2070f3a2c039033789a8f0236316803ad
  Stored in directory: /root/.cache/pip/wheels/62/1e/e7/f9ee096e5cc02890a6934a5670ff6e45a3400f330605bd8210
Successfully built pandarallel
Installing collected packages: pandarallel
Successfully installed pandarallel-1.6.4
INFO: Pandarallel will run on 2 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## 1.1 归一化函数

In [None]:
def normalizer(df, fea_name):
  """Min-max scale the column ``fea_name`` of ``df`` in place.

  :param df: DataFrame owning the column; it is modified in place.
  :param fea_name: name of the numeric column to rescale.
  :return: None; the scaled values are written back to ``df[fea_name]``.

  A column whose maximum equals its minimum is mapped to 0.0 instead of
  being divided by zero (which would turn every value into NaN).
  """
  max_ = df[fea_name].max()
  min_ = df[fea_name].min()
  span = max_ - min_
  if span == 0:
    # Constant column: every non-missing value becomes 0.0, missing stays NaN.
    df[fea_name] = (df[fea_name] - min_).astype(float)
    return
  df[fea_name] = (df[fea_name] - min_) / span

# 2.时间类特征

## 2.1 基础时间特征：年、月、日、时、分、秒

In [None]:
import time
def get_year(timestamp):
  """Return the year of a ``"%Y-%m-%d %H:%M:%S"`` timestamp string."""
  return time.strptime(timestamp, "%Y-%m-%d %H:%M:%S").tm_year

def get_month(timestamp):
  """Return the month (1-12) of a ``"%Y-%m-%d %H:%M:%S"`` timestamp string."""
  return time.strptime(timestamp, "%Y-%m-%d %H:%M:%S").tm_mon

def get_day(timestamp):
  """Return the day of the month of a ``"%Y-%m-%d %H:%M:%S"`` timestamp string."""
  return time.strptime(timestamp, "%Y-%m-%d %H:%M:%S").tm_mday

def get_hour(timestamp):
  """Return the hour (0-23) of a ``"%Y-%m-%d %H:%M:%S"`` timestamp string."""
  return time.strptime(timestamp, "%Y-%m-%d %H:%M:%S").tm_hour

def get_minute(timestamp):
  """Return the minute (0-59) of a ``"%Y-%m-%d %H:%M:%S"`` timestamp string."""
  return time.strptime(timestamp, "%Y-%m-%d %H:%M:%S").tm_min

def get_second(timestamp):
  """Return the second of a ``"%Y-%m-%d %H:%M:%S"`` timestamp string."""
  return time.strptime(timestamp, "%Y-%m-%d %H:%M:%S").tm_sec

In [None]:
# Derive calendar fields from the event_time string; only year and month are
# currently enabled.
log_table['year'] = log_table['event_time'].map(get_year)
log_table['month'] = log_table['event_time'].map(get_month)
# The finer-grained fields below are left disabled.
# log_table['day'] = log_table['event_time'].map(get_day)
# log_table['hour'] = log_table['event_time'].map(get_hour)
# log_table['minute'] = log_table['event_time'].map(get_minute)
# log_table['second'] = log_table['event_time'].map(get_second)

## 2.2 季节类特征

In [None]:
def timestamp2season(timestamp):
  """Map a ``"%Y-%m-%d %H:%M:%S"`` timestamp string to a season code.

  0 = spring (Mar-May), 1 = summer (Jun-Aug), 2 = autumn (Sep-Nov),
  3 = winter (Dec-Feb).
  """
  month = time.strptime(timestamp, "%Y-%m-%d %H:%M:%S").tm_mon
  # Rotating the calendar so March becomes 0 puts each season into one
  # consecutive run of three months.
  return ((month - 3) % 12) // 3

In [None]:
# Add the season code (0-3) of every event.
log_table['season'] = log_table['event_time'].map(timestamp2season)

## 2.3 时段类特征

In [None]:
def timestamp2time_bucket(timestamp):
  """Map a ``"%Y-%m-%d %H:%M:%S"`` timestamp string to a time-of-day bucket.

  0 = before dawn (01-04h), 1 = early morning (05-07h), 2 = morning (08-10h),
  3 = noon (11-12h), 4 = afternoon (13-16h), 5 = dusk (17-18h),
  6 = evening (19-22h), 7 = midnight (23h and 00h).
  """
  # One entry per hour of the day, index 0..23.
  bucket_by_hour = (
      (7,) + (0,) * 4 + (1,) * 3 + (2,) * 3 + (3,) * 2
      + (4,) * 4 + (5,) * 2 + (6,) * 4 + (7,)
  )
  hour = time.strptime(timestamp, "%Y-%m-%d %H:%M:%S").tm_hour
  return bucket_by_hour[hour]

In [None]:
# log_table['time_bucket'] = log_table['event_time'].map(timestamp2time_bucket)

## 2.4 星期类特征

In [None]:
def timestamp2weekday(timestamp):
  """Return the weekday of a ``"%Y-%m-%d %H:%M:%S"`` timestamp (0 = Monday)."""
  parsed = time.strptime(timestamp, "%Y-%m-%d %H:%M:%S")
  weekday = parsed.tm_wday
  return weekday

In [None]:
# log_table['weekday'] = log_table['event_time'].map(timestamp2weekday)

## 2.5 年度特征（一年中的第几天）

In [None]:
def timestamp2yearday(timestamp):
  """Return the day of the year of a ``"%Y-%m-%d %H:%M:%S"`` timestamp.

  January 1st is day 1.
  """
  parsed = time.strptime(timestamp, "%Y-%m-%d %H:%M:%S")
  day_of_year = parsed.tm_yday
  return day_of_year

In [None]:
# Add the day-of-year of every event.
log_table['yearday'] = log_table['event_time'].map(timestamp2yearday)

## 2.6 拆分历史数据

将用户的点击log拆分为：
* 历史行为序列
* 最后一次交互的物品

In [None]:
def get_hist_and_last_click(all_click):
  """Split the log into per-user history and each user's last positive event.

  A positive event is a row whose event_type is 'deal', 'intent' or 'view'.

  :param all_click: event log with at least 'user_id', 'event_time' and
    'event_type' columns.
  :return: ``(click_hist_df, click_last_df)``. ``click_last_df`` holds, per
    user, the last positive row after sorting by user_id and event_time.
    ``click_hist_df`` holds every other row of each user; a user whose
    history would otherwise be empty keeps all of their rows.

  Relies on ``parallel_apply`` from pandarallel having been initialised.
  """
  all_click = all_click.sort_values(by=['user_id', 'event_time'])
  is_positive = all_click['event_type'].isin(['deal', 'intent', 'view'])
  click_last_df = all_click[is_positive].groupby('user_id').tail(1)

  def hist_func(group):
    positive_pos = group['event_type'].isin(['deal', 'intent', 'view']).to_numpy().nonzero()[0]
    if len(positive_pos) == 0:
      # No positive event to split off: the whole group is history.
      return group
    last_pos = positive_pos[-1]
    # Remove the last positive row by position. De-duplicating on row content
    # would also delete history rows that merely repeat each other.
    keep = [pos for pos in range(len(group)) if pos != last_pos]
    if not keep:
      # The last click was the user's only row; keep it so the user still
      # appears in the history frame.
      return group
    return group.iloc[keep]

  click_hist_df = all_click.groupby('user_id').parallel_apply(hist_func).reset_index(drop=True)

  return click_hist_df, click_last_df

In [None]:
# Split the full log into per-user history rows and each user's last positive event.
hist_click_df, last_click_df = get_hist_and_last_click(log_table)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=27267), Label(value='0 / 27267')))…

# 3.用户侧统计特征

In [None]:
# Number of distinct users in the full log.
log_table['user_id'].nunique()

54533

In [None]:
# One row per distinct user_id in the history log; the per-user features
# computed below are merged onto this frame.
user_info = hist_click_df['user_id'].drop_duplicates().to_frame()
# user_info.info()

## 3.1 用户的时间偏好

### 3.1.1 成交行为均值

In [None]:
## 定义工具函数
def agg_mean(group, key):
  """Return the mean of ``group[key]`` as a one-element Series labelled ``key``."""
  mean_value = group[key].mean()
  return pd.Series(data=mean_value, index=[key])

def get_user_time_fea_by_deal_mean(data, in_cols, fea_name):
  """Compute each user's mean of ``fea_name`` over their 'deal' events.

  :param data: event log containing 'user_id', ``fea_name`` and the columns
    named in ``in_cols``.
  :param in_cols: columns to keep besides 'user_id' and ``fea_name``; must
    include 'event_type', which the deal filter reads after the column
    selection. The list is not modified.
  :param fea_name: column to average.
  :return: DataFrame with columns 'user_id' and 'u_deal_mean_<fea_name>',
    one row per user that has at least one deal row.

  Relies on ``parallel_apply`` from pandarallel having been initialised.
  """
  # Build a new list: `in_cols += [...]` would append to the caller's list.
  cols = list(in_cols) + ['user_id', fea_name]
  out_name = 'u_deal_mean_' + fea_name

  deals = data[cols]
  deals = deals[deals['event_type'] == 'deal']
  agg_func = partial(agg_mean, key=fea_name)
  result = deals.groupby('user_id').parallel_apply(agg_func).reset_index()
  result = result.rename(columns={fea_name: out_name})

  return result[['user_id', out_name]]

In [None]:
## Start extracting features.
## Each per-user deal-mean feature is merged on user_id into both
## hist_click_df and user_info.
# 1. Basic time features
# 1.1 Year
u_deal_mean_year = get_user_time_fea_by_deal_mean(hist_click_df, ['event_type'], 'year')
hist_click_df = pd.merge(hist_click_df, u_deal_mean_year, how='outer', on='user_id')
user_info = pd.merge(user_info, u_deal_mean_year, how='outer', on='user_id')

# 1.2 Month
u_deal_mean_month = get_user_time_fea_by_deal_mean(hist_click_df, ['event_type'], 'month')
hist_click_df = pd.merge(hist_click_df, u_deal_mean_month, how='outer', on='user_id')
user_info = pd.merge(user_info, u_deal_mean_month, how='outer', on='user_id')

# # 1.3 Day
# u_deal_mean_day = get_user_time_fea_by_deal_mean(hist_click_df, ['event_type'], 'day')
# hist_click_df = pd.merge(hist_click_df, u_deal_mean_day, how='outer', on='user_id')
# user_info = pd.merge(user_info, u_deal_mean_day, how='outer', on='user_id')

# # 1.4 Hour
# u_deal_mean_hour = get_user_time_fea_by_deal_mean(hist_click_df, ['event_type'], 'hour')
# hist_click_df = pd.merge(hist_click_df, u_deal_mean_hour, how='outer', on='user_id')
# user_info = pd.merge(user_info, u_deal_mean_hour, how='outer', on='user_id')

# # 1.5 Minute
# u_deal_mean_minute = get_user_time_fea_by_deal_mean(hist_click_df, ['event_type'], 'minute')
# hist_click_df = pd.merge(hist_click_df, u_deal_mean_minute, how='outer', on='user_id')
# user_info = pd.merge(user_info, u_deal_mean_minute, how='outer', on='user_id')

# # 1.6 Second
# u_deal_mean_second = get_user_time_fea_by_deal_mean(hist_click_df, ['event_type'], 'second')
# hist_click_df = pd.merge(hist_click_df, u_deal_mean_second, how='outer', on='user_id')
# user_info = pd.merge(user_info, u_deal_mean_second, how='outer', on='user_id')

# 2. Season feature
u_deal_mean_season = get_user_time_fea_by_deal_mean(hist_click_df, ['event_type'], 'season')
hist_click_df = pd.merge(hist_click_df, u_deal_mean_season, how='outer', on='user_id')
user_info = pd.merge(user_info, u_deal_mean_season, how='outer', on='user_id')

# # 3. Time-of-day bucket feature
# u_deal_mean_time_bucket = get_user_time_fea_by_deal_mean(hist_click_df, ['event_type'], 'time_bucket')
# hist_click_df = pd.merge(hist_click_df, u_deal_mean_time_bucket, how='outer', on='user_id')
# user_info = pd.merge(user_info, u_deal_mean_time_bucket, how='outer', on='user_id')

# # 4. Weekday feature
# u_deal_mean_weekday = get_user_time_fea_by_deal_mean(hist_click_df, ['event_type'], 'weekday')
# hist_click_df = pd.merge(hist_click_df, u_deal_mean_weekday, how='outer', on='user_id')
# user_info = pd.merge(user_info, u_deal_mean_weekday, how='outer', on='user_id')

# 5. Day-of-year feature
u_deal_mean_yearday = get_user_time_fea_by_deal_mean(hist_click_df, ['event_type'], 'yearday')
hist_click_df = pd.merge(hist_click_df, u_deal_mean_yearday, how='outer', on='user_id')
user_info = pd.merge(user_info, u_deal_mean_yearday, how='outer', on='user_id')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1242), Label(value='0 / 1242'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1242), Label(value='0 / 1242'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1242), Label(value='0 / 1242'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1242), Label(value='0 / 1242'))), …

### 3.1.2 正向行为加权均值

In [None]:
## 先定义工具函数
def ua2weight(event_type):
  """Return the preference weight of an event type.

  'deal' weighs 3, 'intent' weighs 2, anything else weighs 1.
  """
  return 3 if event_type == 'deal' else (2 if event_type == 'intent' else 1)

def agg_weight_mean(group, key):
  """Return the weighted mean of ``group[key]`` using ``group['weight']``.

  :param group: DataFrame with the columns ``key`` and 'weight'.
  :param key: name of the value column.
  :return: one-element Series labelled ``key`` holding
    ``sum(value * weight) / sum(weight)``.

  The input frame is left untouched; the weighted values are computed on a
  temporary Series instead of being written back into ``group[key]``.
  """
  weight_sum = group['weight'].sum()
  weighted_total = (group[key] * group['weight']).sum()
  return pd.Series(weighted_total / weight_sum, index=[key])
  

def get_user_time_fea_by_weighted_mean(data, in_cols, fea_name):
  """Compute each user's weighted mean of ``fea_name`` over positive events.

  Positive events are rows whose event_type is 'deal', 'intent' or 'view';
  each row is weighted by ``ua2weight`` of its event_type.

  :param data: event log containing 'user_id', ``fea_name`` and the columns
    named in ``in_cols``.
  :param in_cols: columns to keep besides 'user_id' and ``fea_name``; must
    include 'event_type'. The list is not modified.
  :param fea_name: column to average.
  :return: DataFrame with columns 'user_id' and
    'u_pos_weight_mean_<fea_name>', one row per user with a positive event.

  Relies on ``parallel_apply`` from pandarallel having been initialised.
  """
  # Build a new list: `in_cols += [...]` would append to the caller's list.
  cols = list(in_cols) + ['user_id', fea_name]
  out_name = 'u_pos_weight_mean_' + fea_name

  events = data[cols]
  is_positive = events['event_type'].isin(['deal', 'intent', 'view'])
  # Explicit copy so the new 'weight' column is not assigned on a slice
  # (avoids pandas' SettingWithCopyWarning).
  events = events[is_positive].copy()
  events['weight'] = events['event_type'].parallel_apply(ua2weight)

  agg_func = partial(agg_weight_mean, key=fea_name)
  result = events.groupby('user_id', dropna=True).parallel_apply(agg_func).reset_index()
  result = result.rename(columns={fea_name: out_name})

  return result[['user_id', out_name]]

In [None]:
## Start extracting features.
## Each per-user weighted-mean feature is merged on user_id into both
## hist_click_df and user_info.
# 1. Basic time features
# 1.1 Year
u_pos_weight_mean_year = get_user_time_fea_by_weighted_mean(hist_click_df, ['event_type'], 'year')
hist_click_df = pd.merge(hist_click_df, u_pos_weight_mean_year, how='outer', on='user_id')
user_info = pd.merge(user_info, u_pos_weight_mean_year, how='outer', on='user_id')

# 1.2 Month
u_pos_weight_mean_month = get_user_time_fea_by_weighted_mean(hist_click_df, ['event_type'], 'month')
hist_click_df = pd.merge(hist_click_df, u_pos_weight_mean_month, how='outer', on='user_id')
user_info = pd.merge(user_info, u_pos_weight_mean_month, how='outer', on='user_id')

# # 1.3 Day
# u_pos_weight_mean_day = get_user_time_fea_by_weighted_mean(hist_click_df, ['event_type'], 'day')
# hist_click_df = pd.merge(hist_click_df, u_pos_weight_mean_day, how='outer', on='user_id')
# user_info = pd.merge(user_info, u_pos_weight_mean_day, how='outer', on='user_id')

# # 1.4 Hour
# u_pos_weight_mean_hour = get_user_time_fea_by_weighted_mean(hist_click_df, ['event_type'], 'hour')
# hist_click_df = pd.merge(hist_click_df, u_pos_weight_mean_hour, how='outer', on='user_id')
# # user_info = pd.merge(user_info, u_pos_weight_mean_hour, how='outer', on='user_id')

# # 1.5 Minute
# u_pos_weight_mean_minute = get_user_time_fea_by_weighted_mean(hist_click_df, ['event_type'], 'minute')
# hist_click_df = pd.merge(hist_click_df, u_pos_weight_mean_minute, how='outer', on='user_id')
# # user_info = pd.merge(user_info, u_pos_weight_mean_minute, how='outer', on='user_id')

# # 1.6 Second
# u_pos_weight_mean_second = get_user_time_fea_by_weighted_mean(hist_click_df, ['event_type'], 'second')
# hist_click_df = pd.merge(hist_click_df, u_pos_weight_mean_second, how='outer', on='user_id')
# # user_info = pd.merge(user_info, u_pos_weight_mean_second, how='outer', on='user_id')

# 2. Season feature
u_pos_weight_mean_season = get_user_time_fea_by_weighted_mean(hist_click_df, ['event_type'], 'season')
hist_click_df = pd.merge(hist_click_df, u_pos_weight_mean_season, how='outer', on='user_id')
user_info = pd.merge(user_info, u_pos_weight_mean_season, how='outer', on='user_id')

# # 3. Time-of-day bucket feature
# u_pos_weight_mean_time_bucket = get_user_time_fea_by_weighted_mean(hist_click_df, ['event_type'], 'time_bucket')
# hist_click_df = pd.merge(hist_click_df, u_pos_weight_mean_time_bucket, how='outer', on='user_id')
# user_info = pd.merge(user_info, u_pos_weight_mean_time_bucket, how='outer', on='user_id')

# # 4. Weekday feature
# u_pos_weight_mean_weekday = get_user_time_fea_by_weighted_mean(hist_click_df, ['event_type'], 'weekday')
# hist_click_df = pd.merge(hist_click_df, u_pos_weight_mean_weekday, how='outer', on='user_id')
# user_info = pd.merge(user_info, u_pos_weight_mean_weekday, how='outer', on='user_id')

# 5. Day-of-year feature
u_pos_weight_mean_yearday = get_user_time_fea_by_weighted_mean(hist_click_df, ['event_type'], 'yearday')
hist_click_df = pd.merge(hist_click_df, u_pos_weight_mean_yearday, how='outer', on='user_id')
user_info = pd.merge(user_info, u_pos_weight_mean_yearday, how='outer', on='user_id')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=153151), Label(value='0 / 153151')…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=27116), Label(value='0 / 27116')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=153151), Label(value='0 / 153151')…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=27116), Label(value='0 / 27116')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=153151), Label(value='0 / 153151')…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=27116), Label(value='0 / 27116')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=153151), Label(value='0 / 153151')…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=27116), Label(value='0 / 27116')))…

## 3.2 用户的动力偏好

### 3.2.1 成交的动力偏好

In [None]:
# Per-user mean 'power' over deal events, merged onto both frames by user_id.
u_deal_mean_power = get_user_time_fea_by_deal_mean(hist_click_df, ['event_type'], 'power')
hist_click_df = pd.merge(hist_click_df, u_deal_mean_power, how='outer', on='user_id')
user_info = pd.merge(user_info, u_deal_mean_power, how='outer', on='user_id')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1242), Label(value='0 / 1242'))), …

### 3.2.2 正反馈加权的动力偏好

In [None]:
# Per-user weighted mean 'power' over positive events, merged onto both frames by user_id.
u_pos_weight_mean_power = get_user_time_fea_by_weighted_mean(hist_click_df, ['event_type'], 'power')
hist_click_df = pd.merge(hist_click_df, u_pos_weight_mean_power, how='outer', on='user_id')
user_info = pd.merge(user_info, u_pos_weight_mean_power, how='outer', on='user_id')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=153151), Label(value='0 / 153151')…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=27116), Label(value='0 / 27116')))…

## 3.3 用户的类别偏好

### 3.3.1 成交最多的类别

In [None]:
import random
import numpy as np
def agg_mode(group, key):
  """Return the most frequent value of ``group[key]`` as a one-element Series.

  When several values tie for the mode, one of them is picked at random.
  When the column has no mode (``mode()`` returns nothing), the result is NaN.
  """
  modes = group[key].mode()
  if modes.empty:
    return pd.Series(np.nan, index=[key])
  picked = random.choice(modes)
  return pd.Series(picked, index=[key])

def get_user_time_fea_by_deal_mode(data, in_cols, fea_name):
  """Compute each user's most frequent ``fea_name`` over their 'deal' events.

  :param data: event log containing 'user_id', ``fea_name`` and the columns
    named in ``in_cols``.
  :param in_cols: columns to keep besides 'user_id' and ``fea_name``; must
    include 'event_type'. The list is not modified.
  :param fea_name: column whose mode is taken (ties are broken by ``agg_mode``).
  :return: DataFrame with columns 'user_id' and 'u_deal_mode_<fea_name>',
    one row per user that has at least one deal row.

  Relies on ``parallel_apply`` from pandarallel having been initialised.
  """
  # Build a new list: `in_cols += [...]` would append to the caller's list.
  cols = list(in_cols) + ['user_id', fea_name]
  out_name = 'u_deal_mode_' + fea_name

  deals = data[cols]
  deals = deals[deals['event_type'] == 'deal']
  agg_func = partial(agg_mode, key=fea_name)
  result = deals.groupby('user_id').parallel_apply(agg_func).reset_index()
  result = result.rename(columns={fea_name: out_name})

  return result[['user_id', out_name]]

In [None]:
# Per-user most frequently dealt category_id, merged onto both frames by user_id.
u_deal_mode_category_id = get_user_time_fea_by_deal_mode(hist_click_df, ['event_type'], 'category_id')
hist_click_df = pd.merge(hist_click_df, u_deal_mode_category_id, how='outer', on='user_id')
user_info = pd.merge(user_info, u_deal_mode_category_id, how='outer', on='user_id')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1242), Label(value='0 / 1242'))), …

### 3.3.2 正向行为评分最高的类别

In [None]:
## 先定义工具函数
def agg_weight_mode(group, key):
  """Return the value of ``group[key]`` with the largest total weight.

  :param group: DataFrame with the columns ``key`` and 'weight'.
  :param key: name of the value column.
  :return: one-element Series labelled ``key``. Ties go to the value seen
    first. Missing values are ignored; if no valid value exists the result
    is NaN.
  """
  totals = {}
  for item, weight in zip(group[key], group['weight']):
    # NaN never equals itself, so as a dict key it would create one entry
    # per row and could be reported as the preferred value.
    if pd.isna(item):
      continue
    totals[item] = totals.get(item, 0) + weight
  if not totals:
    return pd.Series(float('nan'), index=[key])
  # max() returns the first key reaching the maximum, i.e. the first one seen.
  best = max(totals, key=totals.get)
  return pd.Series(best, index=[key])
  

def get_user_time_fea_by_weighted_mode(data, in_cols, fea_name):
  """Compute each user's highest-weighted ``fea_name`` over positive events.

  Positive events are rows whose event_type is 'deal', 'intent' or 'view';
  each row is weighted by ``ua2weight`` and the value with the largest total
  weight per user is selected by ``agg_weight_mode``.

  :param data: event log containing 'user_id', ``fea_name`` and the columns
    named in ``in_cols``.
  :param in_cols: columns to keep besides 'user_id' and ``fea_name``; must
    include 'event_type'. The list is not modified.
  :param fea_name: column whose weighted mode is taken.
  :return: DataFrame with columns 'user_id' and
    'u_pos_weight_mode_<fea_name>', one row per user with a positive event.

  Relies on ``parallel_apply`` from pandarallel having been initialised.
  """
  # Build a new list: `in_cols += [...]` would append to the caller's list.
  cols = list(in_cols) + ['user_id', fea_name]
  out_name = 'u_pos_weight_mode_' + fea_name

  events = data[cols]
  is_positive = events['event_type'].isin(['deal', 'intent', 'view'])
  # Explicit copy so the new 'weight' column is not assigned on a slice
  # (avoids pandas' SettingWithCopyWarning).
  events = events[is_positive].copy()
  events['weight'] = events['event_type'].parallel_apply(ua2weight)

  agg_func = partial(agg_weight_mode, key=fea_name)
  result = events.groupby('user_id', dropna=True).parallel_apply(agg_func).reset_index()
  result = result.rename(columns={fea_name: out_name})

  return result[['user_id', out_name]]

In [None]:
# Per-user highest-weighted category_id over positive events, merged onto both frames by user_id.
u_weighted_mode_category_id = get_user_time_fea_by_weighted_mode(hist_click_df, ['event_type'], 'category_id')
hist_click_df = pd.merge(hist_click_df, u_weighted_mode_category_id, how='outer', on='user_id')
user_info = pd.merge(user_info, u_weighted_mode_category_id, how='outer', on='user_id')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=153151), Label(value='0 / 153151')…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=27116), Label(value='0 / 27116')))…

## 3.4 用户的地区偏好

### 3.4.1 成交的地区偏好

In [None]:
# Per-user most frequent region over deal events, merged onto both frames by user_id.
u_deal_mode_region = get_user_time_fea_by_deal_mode(hist_click_df, ['event_type'], 'region')
hist_click_df = pd.merge(hist_click_df, u_deal_mode_region, how='outer', on='user_id')
user_info = pd.merge(user_info, u_deal_mode_region, how='outer', on='user_id')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1242), Label(value='0 / 1242'))), …

### 3.4.2 正反馈加权的地区偏好

In [None]:
# Per-user highest-weighted region over positive events, merged onto both frames by user_id.
u_weighted_mode_region = get_user_time_fea_by_weighted_mode(hist_click_df, ['event_type'], 'region')
hist_click_df = pd.merge(hist_click_df, u_weighted_mode_region, how='outer', on='user_id')
user_info = pd.merge(user_info, u_weighted_mode_region, how='outer', on='user_id')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=153151), Label(value='0 / 153151')…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=27116), Label(value='0 / 27116')))…

## 3.5 用户的价格偏好

### 3.5.1 成交的价格偏好

In [None]:
# Per-user mean price over deal events, merged onto both frames by user_id.
u_deal_mean_price = get_user_time_fea_by_deal_mean(hist_click_df, ['event_type'], 'price')
hist_click_df = pd.merge(hist_click_df, u_deal_mean_price, how='outer', on='user_id')
user_info = pd.merge(user_info, u_deal_mean_price, how='outer', on='user_id')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1242), Label(value='0 / 1242'))), …

### 3.5.2 正反馈加权的价格偏好

In [None]:
# Per-user weighted mean price over positive events, merged onto both frames by user_id.
u_pos_weight_mean_price = get_user_time_fea_by_weighted_mean(hist_click_df, ['event_type'], 'price')
hist_click_df = pd.merge(hist_click_df, u_pos_weight_mean_price, how='outer', on='user_id')
user_info = pd.merge(user_info, u_pos_weight_mean_price, how='outer', on='user_id')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=153151), Label(value='0 / 153151')…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=27116), Label(value='0 / 27116')))…

## 3.6 用户的活跃度特征

活跃度：
1. 购买行为的个数
2. 所有行为加权活跃度
3. 购买行为活跃天数
4. 所有行为加权活跃天数

### 3.6.1 购买行为的活跃度

In [None]:
import time
## 定义工具函数
def str2timestamp(time_str):
  """Convert a ``"%Y-%m-%d %H:%M:%S"`` string to integer seconds since the epoch.

  The string is interpreted as local time, as ``time.mktime`` does.
  """
  parsed = time.strptime(time_str, "%Y-%m-%d %H:%M:%S")
  return int(time.mktime(parsed))

def get_ua_nums(group):
  """Return the number of rows in ``group`` as a Series labelled 'ua_nums'."""
  row_count = len(group)
  return pd.Series([row_count], index=['ua_nums'])

def get_user_active_by_deal_nums(data, in_cols):
  """Count each user's 'deal' events.

  :param data: event log containing 'user_id' and the columns in ``in_cols``.
  :param in_cols: columns to keep besides 'user_id'; must include
    'event_type' and 'event_time'. The list is not modified.
  :return: DataFrame with columns 'user_id' and 'u_active_deal_nums', one
    row per user that has at least one deal row.

  NOTE(review): the section 3.6.3 cell defines another function with this
  same name, which replaces this one once that cell has run.

  Relies on ``parallel_apply`` from pandarallel having been initialised.
  """
  # Build a new list: `in_cols += [...]` would append to the caller's list.
  cols = list(in_cols) + ['user_id']

  deals = data[cols]
  # Explicit copy so the column assignment below is not made on a slice
  # (avoids pandas' SettingWithCopyWarning).
  deals = deals[deals['event_type'] == 'deal'].copy()

  # NOTE(review): the converted timestamps are not read by get_ua_nums, which
  # only counts rows; kept so malformed event_time strings still raise here.
  deals['event_time'] = deals['event_time'].parallel_apply(str2timestamp)
  result = deals.groupby('user_id').parallel_apply(get_ua_nums).reset_index()
  result = result.rename(columns={'ua_nums': 'u_active_deal_nums'})

  return result[['user_id', 'u_active_deal_nums']]

In [None]:
# Per-user number of deal events, merged onto both frames by user_id.
u_active_deal_nums = get_user_active_by_deal_nums(hist_click_df, ['event_type', 'event_time'])
hist_click_df = pd.merge(hist_click_df, u_active_deal_nums, how='outer', on='user_id')
user_info = pd.merge(user_info, u_active_deal_nums, how='outer', on='user_id')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=11110), Label(value='0 / 11110')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1242), Label(value='0 / 1242'))), …

### 3.6.2 所有行为的加权活跃度

In [None]:
def ua_active_weight(ua):
  """Return the activity weight of an event type.

  'deal' weighs 5, 'intent' 4, 'view' 3, and any other event type 1.
  """
  for event_name, score in (('deal', 5), ('intent', 4), ('view', 3)):
    if ua == event_name:
      return score
  return 1

def agg_weight_sum(group):
  """Return the sum of ``group['weight']`` as a Series labelled 'ua_score'."""
  total_weight = group['weight'].sum()
  return pd.Series(data=total_weight, index=['ua_score'])

def get_user_active_by_weighted_mean(data, in_cols, fea_name=None):
  """Compute each user's activity score as the sum of per-event weights.

  Every row counts (no event-type filter); its weight comes from
  ``ua_active_weight``.

  :param data: event log containing 'user_id' and the columns in ``in_cols``.
  :param in_cols: columns to keep besides 'user_id'; must include
    'event_type'. The list is not modified.
  :param fea_name: unused; kept for signature compatibility.
  :return: DataFrame with columns 'user_id' and 'u_active_score'.

  NOTE(review): the section 3.6.4 cell defines another function with this
  same name, which replaces this one once that cell has run.

  Relies on ``parallel_apply`` from pandarallel having been initialised.
  """
  # Build a new list: `in_cols += [...]` would append to the caller's list.
  cols = list(in_cols) + ['user_id']
  # Explicit copy so the new 'weight' column is not assigned on a slice
  # (avoids pandas' SettingWithCopyWarning).
  events = data[cols].copy()

  events['weight'] = events['event_type'].parallel_apply(ua_active_weight)

  result = events.groupby('user_id', dropna=True).parallel_apply(agg_weight_sum).reset_index()
  result = result.rename(columns={'ua_score': 'u_active_score'})

  return result[['user_id', 'u_active_score']]

In [None]:
# Per-user weighted activity score over all events, merged onto both frames by user_id.
u_active_pos_score = get_user_active_by_weighted_mean(hist_click_df, ['event_type', 'event_time'])
hist_click_df = pd.merge(hist_click_df, u_active_pos_score, how='outer', on='user_id')
user_info = pd.merge(user_info, u_active_pos_score, how='outer', on='user_id')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=171242), Label(value='0 / 171242')…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['weight'] = data['event_type'].parallel_apply(ua_active_weight)


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=27267), Label(value='0 / 27267')))…

### 3.6.3 购买行为的活跃天数

In [None]:
import time
## 定义工具函数
def str2ymd(time_str):
  """Reduce a ``"%Y-%m-%d %H:%M:%S"`` string to its ``"%Y-%m-%d"`` date part."""
  return time.strftime("%Y-%m-%d", time.strptime(time_str, "%Y-%m-%d %H:%M:%S"))

def get_deal_day_num(group):
  """Return the number of distinct ``event_time`` values in ``group``.

  The result is a one-element Series labelled 'deal_day_num'.
  """
  distinct_days = group['event_time'].nunique()
  return pd.Series(data=distinct_days, index=['deal_day_num'])

def get_user_active_by_deal_nums(data, in_cols):
  """Count the distinct calendar days on which each user made a deal.

  :param data: event log containing 'user_id' and the columns in ``in_cols``.
  :param in_cols: columns to keep besides 'user_id'; must include
    'event_type' and 'event_time'. The list is not modified.
  :return: DataFrame with columns 'user_id' and 'u_active_deal_day_nums',
    one row per user that has at least one deal row.

  NOTE(review): this reuses the name of the function defined in the section
  3.6.1 cell and replaces it once this cell has run.

  Relies on ``parallel_apply`` from pandarallel having been initialised.
  """
  # Build a new list: `in_cols += [...]` would append to the caller's list.
  cols = list(in_cols) + ['user_id']

  deals = data[cols]
  # Explicit copy so the column assignment below is not made on a slice
  # (avoids pandas' SettingWithCopyWarning).
  deals = deals[deals['event_type'] == 'deal'].copy()

  # Truncate timestamps to the day so nunique() counts days, not instants.
  deals['event_time'] = deals['event_time'].parallel_apply(str2ymd)
  result = deals.groupby('user_id').parallel_apply(get_deal_day_num).reset_index()
  result = result.rename(columns={'deal_day_num': 'u_active_deal_day_nums'})

  return result[['user_id', 'u_active_deal_day_nums']]

In [None]:
# Per-user number of distinct deal days, merged onto both frames by user_id.
u_active_deal_day_nums = get_user_active_by_deal_nums(hist_click_df, ['event_type', 'event_time'])
hist_click_df = pd.merge(hist_click_df, u_active_deal_day_nums, how='outer', on='user_id')
user_info = pd.merge(user_info, u_active_deal_day_nums, how='outer', on='user_id')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=11110), Label(value='0 / 11110')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=1242), Label(value='0 / 1242'))), …

### 3.6.4 所有行为的加权活跃天数

In [None]:
def get_pos_day_num(group):
  """Score a user's active days, counting each day once at its best event type.

  Distinct ``event_time`` values are credited to the first event type, in the
  order deal (5), intent (4), view (3), remove_intent (1), that occurred on
  them. Other event types contribute nothing.

  :param group: DataFrame with 'event_type' and 'event_time' columns.
  :return: one-element Series labelled 'ua_score' holding the total score.
  """
  priorities = (('deal', 5), ('intent', 4), ('view', 3), ('remove_intent', 1))
  credited_days = set()
  score = 0
  for event_name, points in priorities:
    days = group.loc[group['event_type'] == event_name, 'event_time'].unique()
    # Only days not already credited to a higher-priority event type count.
    fresh_days = set(days) - credited_days
    score += points * len(fresh_days)
    credited_days |= fresh_days
  return pd.Series(score, index=['ua_score'])

def get_user_active_by_weighted_mean(data, in_cols, fea_name=None):
  """Compute each user's weighted count of active days.

  Timestamps are truncated to the day with ``str2ymd`` and each user's days
  are scored by ``get_pos_day_num``. No event-type filter is applied here.

  :param data: event log containing 'user_id' and the columns in ``in_cols``.
  :param in_cols: columns to keep besides 'user_id'; must include
    'event_type' and 'event_time'. The list is not modified.
  :param fea_name: unused; kept for signature compatibility.
  :return: DataFrame with columns 'user_id' and 'u_active_weight_day_nums'.

  NOTE(review): this reuses the name of the function defined in the section
  3.6.2 cell and replaces it once this cell has run.

  Relies on ``parallel_apply`` from pandarallel having been initialised.
  """
  # Build a new list: `in_cols += [...]` would append to the caller's list.
  cols = list(in_cols) + ['user_id']
  # Explicit copy so the column assignment below is not made on a slice
  # (avoids pandas' SettingWithCopyWarning).
  events = data[cols].copy()

  events['event_time'] = events['event_time'].parallel_apply(str2ymd)

  result = events.groupby('user_id', dropna=True).parallel_apply(get_pos_day_num).reset_index()
  result = result.rename(columns={'ua_score': 'u_active_weight_day_nums'})

  return result[['user_id', 'u_active_weight_day_nums']]

In [None]:
# Per-user weighted active-day score, merged onto both frames by user_id.
u_active_weight_day_nums = get_user_active_by_weighted_mean(hist_click_df, ['event_type', 'event_time'])
hist_click_df = pd.merge(hist_click_df, u_active_weight_day_nums, how='outer', on='user_id')
user_info = pd.merge(user_info, u_active_weight_day_nums, how='outer', on='user_id')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=171242), Label(value='0 / 171242')…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['event_time'] = data['event_time'].parallel_apply(str2ymd)


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=27267), Label(value='0 / 27267')))…

# 4.物品侧统计特征

In [None]:
# One row per distinct item_id in the history log; the per-item features
# computed below are merged onto this frame.
item_info = hist_click_df['item_id'].drop_duplicates().to_frame()

## 4.1 物品被购买的时间偏好

### 4.1.1 成交行为均值

In [None]:
# log_table.describe()

In [None]:
def get_item_time_fea_by_deal_mean(data, in_cols, fea_name):
  """Compute each item's mean of ``fea_name`` over its 'deal' events.

  :param data: event log containing 'item_id', ``fea_name`` and the columns
    named in ``in_cols``.
  :param in_cols: columns to keep besides 'item_id' and ``fea_name``; must
    include 'event_type'. The list is not modified.
  :param fea_name: column to average.
  :return: DataFrame with columns 'item_id' and 'i_deal_mean_<fea_name>',
    one row per item that has at least one deal row.

  Relies on ``parallel_apply`` from pandarallel having been initialised.
  """
  # Build a new list: `in_cols += [...]` would append to the caller's list.
  cols = list(in_cols) + ['item_id', fea_name]
  out_name = 'i_deal_mean_' + fea_name

  deals = data[cols]
  deals = deals[deals['event_type'] == 'deal']
  agg_func = partial(agg_mean, key=fea_name)
  result = deals.groupby('item_id').parallel_apply(agg_func).reset_index()
  result = result.rename(columns={fea_name: out_name})

  return result[['item_id', out_name]]

In [None]:
## Start extracting features.
## Each per-item deal-mean feature is merged on item_id into both
## hist_click_df and item_info.
# 1. Basic time features
# 1.1 Year
i_deal_mean_year = get_item_time_fea_by_deal_mean(hist_click_df, ['event_type'], 'year')
hist_click_df = pd.merge(hist_click_df, i_deal_mean_year, how='outer', on='item_id')
item_info = pd.merge(item_info, i_deal_mean_year, how='outer', on='item_id')

# 1.2 Month
i_deal_mean_month = get_item_time_fea_by_deal_mean(hist_click_df, ['event_type'], 'month')
hist_click_df = pd.merge(hist_click_df, i_deal_mean_month, how='outer', on='item_id')
item_info = pd.merge(item_info, i_deal_mean_month, how='outer', on='item_id')

# # 1.3 Day
# i_deal_mean_day = get_item_time_fea_by_deal_mean(hist_click_df, ['event_type'], 'day')
# hist_click_df = pd.merge(hist_click_df, i_deal_mean_day, how='outer', on='item_id')
# item_info = pd.merge(item_info, i_deal_mean_day, how='outer', on='item_id')

# # 1.4 Hour
# i_deal_mean_hour = get_item_time_fea_by_deal_mean(hist_click_df, ['event_type'], 'hour')
# hist_click_df = pd.merge(hist_click_df, i_deal_mean_hour, how='outer', on='item_id')
# item_info = pd.merge(item_info, i_deal_mean_hour, how='outer', on='item_id')

# # 1.5 Minute
# i_deal_mean_minute = get_item_time_fea_by_deal_mean(hist_click_df, ['event_type'], 'minute')
# hist_click_df = pd.merge(hist_click_df, i_deal_mean_minute, how='outer', on='item_id')
# item_info = pd.merge(item_info, i_deal_mean_minute, how='outer', on='item_id')

# # 1.6 Second
# i_deal_mean_second = get_item_time_fea_by_deal_mean(hist_click_df, ['event_type'], 'second')
# hist_click_df = pd.merge(hist_click_df, i_deal_mean_second, how='outer', on='item_id')
# item_info = pd.merge(item_info, i_deal_mean_second, how='outer', on='item_id')

# 2. Season feature
i_deal_mean_season = get_item_time_fea_by_deal_mean(hist_click_df, ['event_type'], 'season')
hist_click_df = pd.merge(hist_click_df, i_deal_mean_season, how='outer', on='item_id')
item_info = pd.merge(item_info, i_deal_mean_season, how='outer', on='item_id')

# # 3. Time-of-day bucket feature
# i_deal_mean_time_bucket = get_item_time_fea_by_deal_mean(hist_click_df, ['event_type'], 'time_bucket')
# hist_click_df = pd.merge(hist_click_df, i_deal_mean_time_bucket, how='outer', on='item_id')
# item_info = pd.merge(item_info, i_deal_mean_time_bucket, how='outer', on='item_id')

# # 4. Weekday feature
# i_deal_mean_weekday = get_item_time_fea_by_deal_mean(hist_click_df, ['event_type'], 'weekday')
# hist_click_df = pd.merge(hist_click_df, i_deal_mean_weekday, how='outer', on='item_id')
# item_info = pd.merge(item_info, i_deal_mean_weekday, how='outer', on='item_id')

# 5. Day-of-year feature
i_deal_mean_yearday = get_item_time_fea_by_deal_mean(hist_click_df, ['event_type'], 'yearday')
hist_click_df = pd.merge(hist_click_df, i_deal_mean_yearday, how='outer', on='item_id')
item_info = pd.merge(item_info, i_deal_mean_yearday, how='outer', on='item_id')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4499), Label(value='0 / 4499'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4499), Label(value='0 / 4499'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4499), Label(value='0 / 4499'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4499), Label(value='0 / 4499'))), …

### 4.1.2 正向行为加权均值

In [None]:
def get_item_time_fea_by_weighted_mean(data, in_cols, fea_name):
  """
  Build a per-item feature by aggregating ``fea_name`` over positive events.

  Rows whose ``event_type`` is 'deal', 'intent' or 'view' are kept, each row
  gets a ``weight`` computed by ``ua2weight(event_type)``, and
  ``agg_weight_mean`` (bound to ``key=fea_name``) is applied to every
  ``item_id`` group.
  NOTE(review): ``ua2weight`` and ``agg_weight_mean`` are defined elsewhere;
  presumably the latter returns the weighted mean under a key named
  ``fea_name`` -- confirm.

  :param data: event DataFrame containing 'event_type', 'item_id' and ``fea_name``
  :param in_cols: extra columns to select besides 'item_id' and ``fea_name``;
                  must include 'event_type'. The list is not modified.
  :param fea_name: name of the column to aggregate
  :return: DataFrame with columns 'item_id' and 'i_pos_weight_mean_<fea_name>'
  """
  # Build a fresh, de-duplicated column list instead of `in_cols += ...`,
  # which would mutate the caller's list in place.
  cols = list(dict.fromkeys([*in_cols, 'item_id', fea_name]))
  data = data[cols]
  # .copy() makes the filtered frame independent, so adding 'weight' below
  # does not trigger pandas' SettingWithCopyWarning.
  data = data[data['event_type'].isin(['deal', 'intent', 'view'])].copy()

  data['weight'] = data['event_type'].parallel_apply(ua2weight)

  agg_func = partial(agg_weight_mean, key=fea_name)
  data = data.groupby('item_id', dropna=True).parallel_apply(agg_func).reset_index()
  out_name = 'i_pos_weight_mean_' + fea_name
  data = data.rename(columns={fea_name: out_name})

  return data[['item_id', out_name]]

In [None]:
## Extract features
# Each enabled step computes one per-item table and outer-merges it on item_id
# into both hist_click_df and item_info.
# 1. Basic time features
# 1.1 Year
i_pos_weight_mean_year = get_item_time_fea_by_weighted_mean(hist_click_df, ['event_type'], 'year')
hist_click_df = hist_click_df.merge(i_pos_weight_mean_year, how='outer', on='item_id')
item_info = item_info.merge(i_pos_weight_mean_year, how='outer', on='item_id')

# 1.2 Month
i_pos_weight_mean_month = get_item_time_fea_by_weighted_mean(hist_click_df, ['event_type'], 'month')
hist_click_df = hist_click_df.merge(i_pos_weight_mean_month, how='outer', on='item_id')
item_info = item_info.merge(i_pos_weight_mean_month, how='outer', on='item_id')

# # 1.3 Day (disabled)
# i_pos_weight_mean_day = get_item_time_fea_by_weighted_mean(hist_click_df, ['event_type'], 'day')
# hist_click_df = pd.merge(hist_click_df, i_pos_weight_mean_day, how='outer', on='item_id')
# item_info = pd.merge(item_info, i_pos_weight_mean_day, how='outer', on='item_id')

# # 1.4 Hour (disabled)
# i_pos_weight_mean_hour = get_item_time_fea_by_weighted_mean(hist_click_df, ['event_type'], 'hour')
# hist_click_df = pd.merge(hist_click_df, i_pos_weight_mean_hour, how='outer', on='item_id')
# item_info = pd.merge(item_info, i_pos_weight_mean_hour, how='outer', on='item_id')

# # 1.5 Minute (disabled)
# i_pos_weight_mean_minute = get_item_time_fea_by_weighted_mean(hist_click_df, ['event_type'], 'minute')
# hist_click_df = pd.merge(hist_click_df, i_pos_weight_mean_minute, how='outer', on='item_id')
# item_info = pd.merge(item_info, i_pos_weight_mean_minute, how='outer', on='item_id')

# # 1.6 Second (disabled)
# i_pos_weight_mean_second = get_item_time_fea_by_weighted_mean(hist_click_df, ['event_type'], 'second')
# hist_click_df = pd.merge(hist_click_df, i_pos_weight_mean_second, how='outer', on='item_id')
# item_info = pd.merge(item_info, i_pos_weight_mean_second, how='outer', on='item_id')

# 2. Season feature
i_pos_weight_mean_season = get_item_time_fea_by_weighted_mean(hist_click_df, ['event_type'], 'season')
hist_click_df = hist_click_df.merge(i_pos_weight_mean_season, how='outer', on='item_id')
item_info = item_info.merge(i_pos_weight_mean_season, how='outer', on='item_id')

# # 3. Time-bucket feature (disabled)
# i_pos_weight_mean_time_bucket = get_item_time_fea_by_weighted_mean(hist_click_df, ['event_type'], 'time_bucket')
# hist_click_df = pd.merge(hist_click_df, i_pos_weight_mean_time_bucket, how='outer', on='item_id')
# item_info = pd.merge(item_info, i_pos_weight_mean_time_bucket, how='outer', on='item_id')

# # 4. Weekday feature (disabled)
# i_pos_weight_mean_weekday = get_item_time_fea_by_weighted_mean(hist_click_df, ['event_type'], 'weekday')
# hist_click_df = pd.merge(hist_click_df, i_pos_weight_mean_weekday, how='outer', on='item_id')
# item_info = pd.merge(item_info, i_pos_weight_mean_weekday, how='outer', on='item_id')

# 5. Day-of-year feature
i_pos_weight_mean_yearday = get_item_time_fea_by_weighted_mean(hist_click_df, ['event_type'], 'yearday')
hist_click_df = hist_click_df.merge(i_pos_weight_mean_yearday, how='outer', on='item_id')
item_info = item_info.merge(i_pos_weight_mean_yearday, how='outer', on='item_id')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=153151), Label(value='0 / 153151')…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=15935), Label(value='0 / 15935')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=153151), Label(value='0 / 153151')…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=15935), Label(value='0 / 15935')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=153151), Label(value='0 / 153151')…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=15935), Label(value='0 / 15935')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=153151), Label(value='0 / 153151')…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=15935), Label(value='0 / 15935')))…

## 4.2 物品被购买的地区偏好

### 4.2.1 成交的地区偏好

In [None]:
def get_item_time_fea_by_deal_mode(data, in_cols, fea_name):
  """
  Build a per-item feature by aggregating ``fea_name`` over 'deal' events.

  Only rows with ``event_type == 'deal'`` are kept; ``agg_mode`` (bound to
  ``key=fea_name``) is applied to every ``item_id`` group.
  NOTE(review): ``agg_mode`` is defined elsewhere; presumably it returns the
  most frequent value under a key named ``fea_name`` -- confirm.

  :param data: event DataFrame containing 'event_type', 'item_id' and ``fea_name``
  :param in_cols: extra columns to select besides 'item_id' and ``fea_name``;
                  must include 'event_type'. The list is not modified.
  :param fea_name: name of the column to aggregate
  :return: DataFrame with columns 'item_id' and 'i_deal_mode_<fea_name>'
  """
  # Build a fresh, de-duplicated column list instead of `in_cols += ...`,
  # which would mutate the caller's list in place.
  cols = list(dict.fromkeys([*in_cols, 'item_id', fea_name]))
  data = data[cols]
  data = data[data['event_type'] == 'deal']
  agg_func = partial(agg_mode, key=fea_name)
  data = data.groupby('item_id').parallel_apply(agg_func).reset_index()

  out_name = 'i_deal_mode_' + fea_name
  data = data.rename(columns={fea_name: out_name})

  return data[['item_id', out_name]]

In [None]:
# Per-item 'region' feature from deal events, outer-merged on item_id into both tables.
i_deal_mode_region = get_item_time_fea_by_deal_mode(hist_click_df, ['event_type'], 'region')
hist_click_df = hist_click_df.merge(i_deal_mode_region, how='outer', on='item_id')
item_info = item_info.merge(i_deal_mode_region, how='outer', on='item_id')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4499), Label(value='0 / 4499'))), …

### 4.2.2 正反馈加权的地区偏好

In [None]:
def get_item_time_fea_by_weighted_mode(data, in_cols, fea_name):
  """
  Build a per-item feature by weighted aggregation of ``fea_name`` over positive events.

  Rows whose ``event_type`` is 'deal', 'intent' or 'view' are kept, each row
  gets a ``weight`` computed by ``ua2weight(event_type)``, and
  ``agg_weight_mode`` (bound to ``key=fea_name``) is applied to every
  ``item_id`` group.
  NOTE(review): ``ua2weight`` and ``agg_weight_mode`` are defined elsewhere;
  presumably the latter returns the value with the largest total weight under
  a key named ``fea_name`` -- confirm.

  :param data: event DataFrame containing 'event_type', 'item_id' and ``fea_name``
  :param in_cols: extra columns to select besides 'item_id' and ``fea_name``;
                  must include 'event_type'. The list is not modified.
  :param fea_name: name of the column to aggregate
  :return: DataFrame with columns 'item_id' and 'i_pos_weight_mode_<fea_name>'
  """
  # Build a fresh, de-duplicated column list instead of `in_cols += ...`,
  # which would mutate the caller's list in place.
  cols = list(dict.fromkeys([*in_cols, 'item_id', fea_name]))
  data = data[cols]
  # .copy() makes the filtered frame independent, so adding 'weight' below
  # does not trigger pandas' SettingWithCopyWarning.
  data = data[data['event_type'].isin(['deal', 'intent', 'view'])].copy()

  data['weight'] = data['event_type'].parallel_apply(ua2weight)

  agg_func = partial(agg_weight_mode, key=fea_name)
  data = data.groupby('item_id', dropna=True).parallel_apply(agg_func).reset_index()
  out_name = 'i_pos_weight_mode_' + fea_name
  data = data.rename(columns={fea_name: out_name})

  return data[['item_id', out_name]]

In [None]:
# Per-item weighted 'region' feature from positive events, outer-merged on item_id.
i_weighted_mode_region = get_item_time_fea_by_weighted_mode(hist_click_df, ['event_type'], 'region')
hist_click_df = hist_click_df.merge(i_weighted_mode_region, how='outer', on='item_id')
item_info = item_info.merge(i_weighted_mode_region, how='outer', on='item_id')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=153151), Label(value='0 / 153151')…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=15935), Label(value='0 / 15935')))…

## 4.3 物品的热度特征

### 4.3.1 购买行为的活跃度

In [None]:
def get_item_active_by_deal_nums(data, in_cols):
  """
  Build the per-item 'i_active_deal_nums' feature from 'deal' events.

  Only rows with ``event_type == 'deal'`` are kept, ``event_time`` is converted
  with ``str2timestamp``, and ``get_ua_nums`` is applied to every ``item_id``
  group; its 'ua_nums' output column is renamed to 'i_active_deal_nums'.
  NOTE(review): ``str2timestamp`` and ``get_ua_nums`` are defined elsewhere;
  presumably the latter counts events per group -- confirm.
  NOTE: a later cell of this notebook (section 4.3.3) defines another function
  with the same name; whichever cell ran last is the one in effect.

  :param data: event DataFrame containing 'item_id' and the columns in ``in_cols``
  :param in_cols: columns to select besides 'item_id'; must include
                  'event_type' and 'event_time'. The list is not modified.
  :return: DataFrame with columns 'item_id' and 'i_active_deal_nums'
  """
  # Build a fresh, de-duplicated column list instead of `in_cols += ...`,
  # which would mutate the caller's list in place.
  cols = list(dict.fromkeys([*in_cols, 'item_id']))
  data = data[cols]
  # .copy() makes the filtered frame independent, so overwriting 'event_time'
  # below does not trigger pandas' SettingWithCopyWarning.
  data = data[data['event_type'] == 'deal'].copy()
  agg_func = get_ua_nums

  data['event_time'] = data['event_time'].parallel_apply(str2timestamp)
  data = data.groupby('item_id').parallel_apply(agg_func).reset_index()

  data = data.rename(columns={'ua_nums': 'i_active_deal_nums'})

  return data[['item_id', 'i_active_deal_nums']]

In [None]:
# Per-item deal activity feature, outer-merged on item_id into both tables.
i_active_deal_nums = get_item_active_by_deal_nums(hist_click_df, ['event_type', 'event_time'])
hist_click_df = hist_click_df.merge(i_active_deal_nums, how='outer', on='item_id')
item_info = item_info.merge(i_active_deal_nums, how='outer', on='item_id')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=11110), Label(value='0 / 11110')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4499), Label(value='0 / 4499'))), …

### 4.3.2 所有行为的加权活跃度

In [None]:
def get_item_active_by_weighted_mean(data, in_cols, fea_name=None):
  """
  Build the per-item 'i_active_score' feature over all event types.

  Every row gets a ``weight`` computed by ``ua_active_weight(event_type)`` and
  ``agg_weight_sum`` is applied to every ``item_id`` group; its 'ua_score'
  output column is renamed to 'i_active_score'. No event-type filter is applied.
  NOTE(review): ``ua_active_weight`` and ``agg_weight_sum`` are defined
  elsewhere; presumably the latter sums the weights per group -- confirm.
  NOTE: a later cell of this notebook (section 4.3.4) defines another function
  with the same name; whichever cell ran last is the one in effect.

  :param data: event DataFrame containing 'item_id' and the columns in ``in_cols``
  :param in_cols: columns to select besides 'item_id'; must include
                  'event_type'. The list is not modified.
  :param fea_name: unused; kept for signature compatibility
  :return: DataFrame with columns 'item_id' and 'i_active_score'
  """
  # Build a fresh, de-duplicated column list instead of `in_cols += ...`,
  # which would mutate the caller's list in place.
  cols = list(dict.fromkeys([*in_cols, 'item_id']))
  # .copy() makes the selection independent of the caller's frame, so adding
  # 'weight' below does not trigger pandas' SettingWithCopyWarning.
  data = data[cols].copy()

  data['weight'] = data['event_type'].parallel_apply(ua_active_weight)

  agg_func = agg_weight_sum
  data = data.groupby('item_id', dropna=True).parallel_apply(agg_func).reset_index()
  data = data.rename(columns={'ua_score': 'i_active_score'})

  return data[['item_id', 'i_active_score']]

In [None]:
# Per-item weighted activity score, outer-merged on item_id into both tables.
i_active_pos_score = get_item_active_by_weighted_mean(hist_click_df, ['event_type', 'event_time'])
hist_click_df = hist_click_df.merge(i_active_pos_score, how='outer', on='item_id')
item_info = item_info.merge(i_active_pos_score, how='outer', on='item_id')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=171242), Label(value='0 / 171242')…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['weight'] = data['event_type'].parallel_apply(ua_active_weight)


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=16158), Label(value='0 / 16158')))…

### 4.3.3 购买行为的活跃天数

In [None]:
def get_item_active_by_deal_nums(data, in_cols):
  """
  Build the per-item 'i_active_deal_day_nums' feature from 'deal' events.

  Only rows with ``event_type == 'deal'`` are kept, ``event_time`` is converted
  with ``str2ymd``, and ``get_deal_day_num`` is applied to every ``item_id``
  group; its 'deal_day_num' output column is renamed to 'i_active_deal_day_nums'.
  NOTE(review): ``str2ymd`` and ``get_deal_day_num`` are defined elsewhere;
  presumably they reduce timestamps to dates and count distinct deal days --
  confirm.
  NOTE: this redefines a function of the same name from an earlier cell of
  this notebook (section 4.3.1); after this cell runs the earlier one is shadowed.

  :param data: event DataFrame containing 'item_id' and the columns in ``in_cols``
  :param in_cols: columns to select besides 'item_id'; must include
                  'event_type' and 'event_time'. The list is not modified.
  :return: DataFrame with columns 'item_id' and 'i_active_deal_day_nums'
  """
  # Build a fresh, de-duplicated column list instead of `in_cols += ...`,
  # which would mutate the caller's list in place.
  cols = list(dict.fromkeys([*in_cols, 'item_id']))
  data = data[cols]
  # .copy() makes the filtered frame independent, so overwriting 'event_time'
  # below does not trigger pandas' SettingWithCopyWarning.
  data = data[data['event_type'] == 'deal'].copy()
  agg_func = get_deal_day_num

  data['event_time'] = data['event_time'].parallel_apply(str2ymd)
  data = data.groupby('item_id').parallel_apply(agg_func).reset_index()

  data = data.rename(columns={'deal_day_num': 'i_active_deal_day_nums'})

  return data[['item_id', 'i_active_deal_day_nums']]

In [None]:
# Per-item deal-day feature, outer-merged on item_id into both tables.
i_active_deal_day_nums = get_item_active_by_deal_nums(hist_click_df, ['event_type', 'event_time'])
hist_click_df = hist_click_df.merge(i_active_deal_day_nums, how='outer', on='item_id')
item_info = item_info.merge(i_active_deal_day_nums, how='outer', on='item_id')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=11110), Label(value='0 / 11110')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4499), Label(value='0 / 4499'))), …

### 4.3.4 所有行为的加权活跃天数

In [None]:
def get_item_active_by_weighted_mean(data, in_cols, fea_name=None):
  """
  Build the per-item 'i_active_weight_day_nums' feature over all event types.

  ``event_time`` is converted with ``str2ymd`` and ``get_pos_day_num`` is
  applied to every ``item_id`` group; the 'ua_score' output column is renamed
  to 'i_active_weight_day_nums'. No event-type filter is applied.
  NOTE(review): ``str2ymd`` and ``get_pos_day_num`` are defined elsewhere. The
  rename expects the aggregation output to be called 'ua_score'; confirm that
  ``get_pos_day_num`` really emits that name, otherwise the final column
  selection raises KeyError.
  NOTE: this redefines a function of the same name from an earlier cell of
  this notebook (section 4.3.2); after this cell runs the earlier one is shadowed.

  :param data: event DataFrame containing 'item_id' and the columns in ``in_cols``
  :param in_cols: columns to select besides 'item_id'; must include
                  'event_time'. The list is not modified.
  :param fea_name: unused; kept for signature compatibility
  :return: DataFrame with columns 'item_id' and 'i_active_weight_day_nums'
  """
  # Build a fresh, de-duplicated column list instead of `in_cols += ...`,
  # which would mutate the caller's list in place.
  cols = list(dict.fromkeys([*in_cols, 'item_id']))
  # .copy() makes the selection independent of the caller's frame, so
  # overwriting 'event_time' below does not trigger SettingWithCopyWarning.
  data = data[cols].copy()

  data['event_time'] = data['event_time'].parallel_apply(str2ymd)

  agg_func = get_pos_day_num
  data = data.groupby('item_id', dropna=True).parallel_apply(agg_func).reset_index()
  data = data.rename(columns={'ua_score': 'i_active_weight_day_nums'})

  return data[['item_id', 'i_active_weight_day_nums']]

In [None]:
# Per-item active-day feature over all events, outer-merged on item_id into both tables.
i_active_weight_day_nums = get_item_active_by_weighted_mean(hist_click_df, ['event_type', 'event_time'])
hist_click_df = hist_click_df.merge(i_active_weight_day_nums, how='outer', on='item_id')
item_info = item_info.merge(i_active_weight_day_nums, how='outer', on='item_id')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=171242), Label(value='0 / 171242')…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['event_time'] = data['event_time'].parallel_apply(str2ymd)


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=16158), Label(value='0 / 16158')))…

## 4.4 物品本身的特征
其实真正的物品侧特征只有category_id和power；price和region都有小幅变化，算是上下文特征。但为了i2i召回方便，我们还是对它们做些统计，得到i侧特征。

### 4.4.1 category_id和power

In [None]:
# Number of distinct (non-null) item ids in the click history.
hist_click_df['item_id'].nunique(dropna=True)

32316

In [None]:
# Distinct (item_id, category_id) pairs, outer-merged into item_info on item_id.
category_id_table = hist_click_df[['item_id', 'category_id']].drop_duplicates()
item_info = item_info.merge(category_id_table, how='outer', on='item_id')

In [None]:
# Distinct (item_id, power) pairs, outer-merged into item_info on item_id.
power_table = hist_click_df[['item_id', 'power']].drop_duplicates()
item_info = item_info.merge(power_table, how='outer', on='item_id')

### 4.4.2 price
按照加权来计算price。正向反馈对应的price的置信度更高，所以会有更高的权重。

In [None]:
def get_weighted_mean_price(data, in_cols, fea_name='price'):
  """
  Build the per-item 'i_weighted_mean_price' feature over all event types.

  Every row gets a ``weight`` computed by ``ua_active_weight(event_type)`` and
  ``agg_weight_mean`` (bound to ``key=fea_name``) is applied to every
  ``item_id`` group. No event-type filter is applied.
  NOTE(review): ``ua_active_weight`` and ``agg_weight_mean`` are defined
  elsewhere; presumably the latter returns the weighted mean under a key
  named ``fea_name`` -- confirm.

  :param data: event DataFrame containing 'event_type', 'item_id' and ``fea_name``
  :param in_cols: extra columns to select besides 'item_id' and ``fea_name``;
                  must include 'event_type'. The list is not modified.
  :param fea_name: name of the column to aggregate (default 'price')
  :return: DataFrame with columns 'item_id' and 'i_weighted_mean_price'
  """
  # Build a fresh, de-duplicated column list instead of `in_cols += ...`,
  # which would mutate the caller's list in place.
  cols = list(dict.fromkeys([*in_cols, 'item_id', fea_name]))
  # .copy() makes the selection independent of the caller's frame, so adding
  # 'weight' below does not trigger pandas' SettingWithCopyWarning.
  data = data[cols].copy()

  data['weight'] = data['event_type'].parallel_apply(ua_active_weight)
  agg_func = partial(agg_weight_mean, key=fea_name)
  data = data.groupby('item_id', dropna=True).parallel_apply(agg_func).reset_index()
  data = data.rename(columns={fea_name: 'i_weighted_mean_price'})

  return data[['item_id', 'i_weighted_mean_price']]


In [None]:
# Per-item weighted price feature, outer-merged into item_info on item_id.
i_weighted_mean_price = get_weighted_mean_price(hist_click_df, ['event_type'])
item_info = item_info.merge(i_weighted_mean_price, how='outer', on='item_id')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=171242), Label(value='0 / 171242')…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['weight'] = data['event_type'].parallel_apply(ua_active_weight)


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=16158), Label(value='0 / 16158')))…

### 4.4.3 region
i侧的region使用众数统计。但其实region更多当作上下文特征

In [None]:
def get_weight_mode_region(data, in_cols, fea_name='region'):
  """
  Build the per-item 'i_weight_mode_region' feature over all event types.

  Every row gets a ``weight`` computed by ``ua_active_weight(event_type)`` and
  ``agg_weight_mode`` (bound to ``key=fea_name``) is applied to every
  ``item_id`` group. No event-type filter is applied.
  NOTE(review): ``ua_active_weight`` and ``agg_weight_mode`` are defined
  elsewhere; presumably the latter returns the value with the largest total
  weight under a key named ``fea_name`` -- confirm.

  :param data: event DataFrame containing 'event_type', 'item_id' and ``fea_name``
  :param in_cols: extra columns to select besides 'item_id' and ``fea_name``;
                  must include 'event_type'. The list is not modified.
  :param fea_name: name of the column to aggregate (default 'region')
  :return: DataFrame with columns 'item_id' and 'i_weight_mode_region'
  """
  # Build a fresh, de-duplicated column list instead of `in_cols += ...`,
  # which would mutate the caller's list in place.
  cols = list(dict.fromkeys([*in_cols, 'item_id', fea_name]))
  # .copy() makes the selection independent of the caller's frame, so adding
  # 'weight' below does not trigger pandas' SettingWithCopyWarning.
  data = data[cols].copy()

  data['weight'] = data['event_type'].parallel_apply(ua_active_weight)

  agg_func = partial(agg_weight_mode, key=fea_name)
  data = data.groupby('item_id', dropna=True).parallel_apply(agg_func).reset_index()
  data = data.rename(columns={fea_name: 'i_weight_mode_region'})

  return data[['item_id', 'i_weight_mode_region']]

In [None]:
# Per-item weighted region feature, outer-merged into item_info on item_id.
i_weight_mode_region = get_weight_mode_region(hist_click_df, ['event_type'])
item_info = item_info.merge(i_weight_mode_region, how='outer', on='item_id')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=171242), Label(value='0 / 171242')…

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['weight'] = data['event_type'].parallel_apply(ua_active_weight)


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=16158), Label(value='0 / 16158')))…

# 5.物品消费者侧特征

In [None]:
# One row per distinct item_id; consumer-side features are merged onto this frame.
consumer_info = hist_click_df[['item_id']].drop_duplicates()

## 5.1 物品消费者的时间偏好

### 5.1.1 成交行为均值

In [None]:
# hist_click_df.describe()

In [None]:
def get_consumer_time_fea_by_deal_mean(data, in_cols, fea_name):
  """
  Build a per-item consumer feature by aggregating ``fea_name`` over 'deal' events.

  Only rows with ``event_type == 'deal'`` are kept; ``agg_mean`` (bound to
  ``key=fea_name``) is applied to every ``item_id`` group.
  NOTE(review): ``agg_mean`` is defined elsewhere; presumably it returns the
  mean under a key named ``fea_name`` -- confirm.

  :param data: event DataFrame containing 'event_type', 'item_id' and ``fea_name``
  :param in_cols: extra columns to select besides 'item_id' and ``fea_name``;
                  must include 'event_type'. The list is not modified.
  :param fea_name: name of the column to aggregate
  :return: DataFrame with columns 'item_id' and 'c_deal_mean_<fea_name>'
  """
  # Build a fresh, de-duplicated column list instead of `in_cols += ...`,
  # which would mutate the caller's list in place.
  cols = list(dict.fromkeys([*in_cols, 'item_id', fea_name]))
  data = data[cols]
  data = data[data['event_type'] == 'deal']
  agg_func = partial(agg_mean, key=fea_name)
  data = data.groupby('item_id').parallel_apply(agg_func).reset_index()

  out_name = 'c_deal_mean_' + fea_name
  data = data.rename(columns={fea_name: out_name})

  return data[['item_id', out_name]]

In [None]:
## Extract features
# Each enabled step computes one per-item table and outer-merges it on item_id
# into both hist_click_df and consumer_info.
# NOTE(review): the 'u_deal_mean_*' inputs are presumably user-side feature
# columns already present in hist_click_df -- confirm.
# 1. Basic time features
# 1.1 Year
c_deal_mean_year = get_consumer_time_fea_by_deal_mean(hist_click_df, ['event_type'], 'u_deal_mean_year')
hist_click_df = hist_click_df.merge(c_deal_mean_year, how='outer', on='item_id')
consumer_info = consumer_info.merge(c_deal_mean_year, how='outer', on='item_id')

# 1.2 Month
c_deal_mean_month = get_consumer_time_fea_by_deal_mean(hist_click_df, ['event_type'], 'u_deal_mean_month')
hist_click_df = hist_click_df.merge(c_deal_mean_month, how='outer', on='item_id')
consumer_info = consumer_info.merge(c_deal_mean_month, how='outer', on='item_id')

# # 1.3 Day (disabled)
# c_deal_mean_day = get_consumer_time_fea_by_deal_mean(hist_click_df, ['event_type'], 'u_deal_mean_day')
# hist_click_df = pd.merge(hist_click_df, c_deal_mean_day, how='outer', on='item_id')
# consumer_info = pd.merge(consumer_info, c_deal_mean_day, how='outer', on='item_id')

# # 1.4 Hour (disabled)
# c_deal_mean_hour = get_consumer_time_fea_by_deal_mean(hist_click_df, ['event_type'], 'u_deal_mean_hour')
# hist_click_df = pd.merge(hist_click_df, c_deal_mean_hour, how='outer', on='item_id')
# consumer_info = pd.merge(consumer_info, c_deal_mean_hour, how='outer', on='item_id')

# # 1.5 Minute (disabled)
# c_deal_mean_minute = get_consumer_time_fea_by_deal_mean(hist_click_df, ['event_type'], 'u_deal_mean_minute')
# hist_click_df = pd.merge(hist_click_df, c_deal_mean_minute, how='outer', on='item_id')
# consumer_info = pd.merge(consumer_info, c_deal_mean_minute, how='outer', on='item_id')

# # 1.6 Second (disabled)
# c_deal_mean_second = get_consumer_time_fea_by_deal_mean(hist_click_df, ['event_type'], 'u_deal_mean_second')
# hist_click_df = pd.merge(hist_click_df, c_deal_mean_second, how='outer', on='item_id')
# consumer_info = pd.merge(consumer_info, c_deal_mean_second, how='outer', on='item_id')

# 2. Season feature
c_deal_mean_season = get_consumer_time_fea_by_deal_mean(hist_click_df, ['event_type'], 'u_deal_mean_season')
hist_click_df = hist_click_df.merge(c_deal_mean_season, how='outer', on='item_id')
consumer_info = consumer_info.merge(c_deal_mean_season, how='outer', on='item_id')

# # 3. Time-bucket feature (disabled)
# c_deal_mean_time_bucket = get_consumer_time_fea_by_deal_mean(hist_click_df, ['event_type'], 'u_deal_mean_time_bucket')
# hist_click_df = pd.merge(hist_click_df, c_deal_mean_time_bucket, how='outer', on='item_id')
# consumer_info = pd.merge(consumer_info, c_deal_mean_time_bucket, how='outer', on='item_id')

# # 4. Weekday feature (disabled)
# c_deal_mean_weekday = get_consumer_time_fea_by_deal_mean(hist_click_df, ['event_type'], 'u_deal_mean_weekday')
# hist_click_df = pd.merge(hist_click_df, c_deal_mean_weekday, how='outer', on='item_id')
# consumer_info = pd.merge(consumer_info, c_deal_mean_weekday, how='outer', on='item_id')

# 5. Day-of-year feature
c_deal_mean_yearday = get_consumer_time_fea_by_deal_mean(hist_click_df, ['event_type'], 'u_deal_mean_yearday')
hist_click_df = hist_click_df.merge(c_deal_mean_yearday, how='outer', on='item_id')
consumer_info = consumer_info.merge(c_deal_mean_yearday, how='outer', on='item_id')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4499), Label(value='0 / 4499'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4499), Label(value='0 / 4499'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4499), Label(value='0 / 4499'))), …

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4499), Label(value='0 / 4499'))), …

### 5.1.2 正向行为加权均值

In [None]:
def get_consumer_time_fea_by_weighted_mean(data, in_cols, fea_name):
  """
  Build a per-item consumer feature by weighted aggregation of ``fea_name`` over positive events.

  Rows whose ``event_type`` is 'deal', 'intent' or 'view' are kept, each row
  gets a ``weight`` computed by ``ua2weight(event_type)``, and
  ``agg_weight_mean`` (bound to ``key=fea_name``) is applied to every
  ``item_id`` group.
  NOTE(review): ``ua2weight`` and ``agg_weight_mean`` are defined elsewhere;
  presumably the latter returns the weighted mean under a key named
  ``fea_name`` -- confirm.

  :param data: event DataFrame containing 'event_type', 'item_id' and ``fea_name``
  :param in_cols: extra columns to select besides 'item_id' and ``fea_name``;
                  must include 'event_type'. The list is not modified.
  :param fea_name: name of the column to aggregate
  :return: DataFrame with columns 'item_id' and 'c_pos_weight_mean_<fea_name>'
  """
  # Build a fresh, de-duplicated column list instead of `in_cols += ...`,
  # which would mutate the caller's list in place.
  cols = list(dict.fromkeys([*in_cols, 'item_id', fea_name]))
  data = data[cols]
  # .copy() makes the filtered frame independent, so adding 'weight' below
  # does not trigger pandas' SettingWithCopyWarning.
  data = data[data['event_type'].isin(['deal', 'intent', 'view'])].copy()

  data['weight'] = data['event_type'].parallel_apply(ua2weight)

  agg_func = partial(agg_weight_mean, key=fea_name)
  data = data.groupby('item_id', dropna=True).parallel_apply(agg_func).reset_index()
  out_name = 'c_pos_weight_mean_' + fea_name
  data = data.rename(columns={fea_name: out_name})

  return data[['item_id', out_name]]

In [None]:
## Extract features
# Each enabled step computes one per-item table and outer-merges it on item_id
# into both hist_click_df and consumer_info.
# NOTE(review): the 'u_pos_weight_mean_*' inputs are presumably user-side
# feature columns already present in hist_click_df -- confirm.
# 1. Basic time features
# 1.1 Year
c_pos_weight_mean_year = get_consumer_time_fea_by_weighted_mean(hist_click_df, ['event_type'], 'u_pos_weight_mean_year')
hist_click_df = hist_click_df.merge(c_pos_weight_mean_year, how='outer', on='item_id')
consumer_info = consumer_info.merge(c_pos_weight_mean_year, how='outer', on='item_id')

# 1.2 Month
c_pos_weight_mean_month = get_consumer_time_fea_by_weighted_mean(hist_click_df, ['event_type'], 'u_pos_weight_mean_month')
hist_click_df = hist_click_df.merge(c_pos_weight_mean_month, how='outer', on='item_id')
consumer_info = consumer_info.merge(c_pos_weight_mean_month, how='outer', on='item_id')

# # 1.3 Day (disabled)
# c_pos_weight_mean_day = get_consumer_time_fea_by_weighted_mean(hist_click_df, ['event_type'], 'u_pos_weight_mean_day')
# hist_click_df = pd.merge(hist_click_df, c_pos_weight_mean_day, how='outer', on='item_id')
# consumer_info = pd.merge(consumer_info, c_pos_weight_mean_day, how='outer', on='item_id')

# # 1.4 Hour (disabled)
# c_pos_weight_mean_hour = get_consumer_time_fea_by_weighted_mean(hist_click_df, ['event_type'], 'u_pos_weight_mean_hour')
# hist_click_df = pd.merge(hist_click_df, c_pos_weight_mean_hour, how='outer', on='item_id')
# consumer_info = pd.merge(consumer_info, c_pos_weight_mean_hour, how='outer', on='item_id')

# # 1.5 Minute (disabled)
# c_pos_weight_mean_minute = get_consumer_time_fea_by_weighted_mean(hist_click_df, ['event_type'], 'u_pos_weight_mean_minute')
# hist_click_df = pd.merge(hist_click_df, c_pos_weight_mean_minute, how='outer', on='item_id')
# consumer_info = pd.merge(consumer_info, c_pos_weight_mean_minute, how='outer', on='item_id')

# # 1.6 Second (disabled)
# c_pos_weight_mean_second = get_consumer_time_fea_by_weighted_mean(hist_click_df, ['event_type'], 'u_pos_weight_mean_second')
# hist_click_df = pd.merge(hist_click_df, c_pos_weight_mean_second, how='outer', on='item_id')
# consumer_info = pd.merge(consumer_info, c_pos_weight_mean_second, how='outer', on='item_id')

# 2. Season feature
c_pos_weight_mean_season = get_consumer_time_fea_by_weighted_mean(hist_click_df, ['event_type'], 'u_pos_weight_mean_season')
hist_click_df = hist_click_df.merge(c_pos_weight_mean_season, how='outer', on='item_id')
consumer_info = consumer_info.merge(c_pos_weight_mean_season, how='outer', on='item_id')

# # 3. Time-bucket feature (disabled)
# c_pos_weight_mean_time_bucket = get_consumer_time_fea_by_weighted_mean(hist_click_df, ['event_type'], 'u_pos_weight_mean_time_bucket')
# hist_click_df = pd.merge(hist_click_df, c_pos_weight_mean_time_bucket, how='outer', on='item_id')
# consumer_info = pd.merge(consumer_info, c_pos_weight_mean_time_bucket, how='outer', on='item_id')

# # 4. Weekday feature (disabled)
# c_pos_weight_mean_weekday = get_consumer_time_fea_by_weighted_mean(hist_click_df, ['event_type'], 'u_pos_weight_mean_weekday')
# hist_click_df = pd.merge(hist_click_df, c_pos_weight_mean_weekday, how='outer', on='item_id')
# consumer_info = pd.merge(consumer_info, c_pos_weight_mean_weekday, how='outer', on='item_id')

# 5. Day-of-year feature
c_pos_weight_mean_yearday = get_consumer_time_fea_by_weighted_mean(hist_click_df, ['event_type'], 'u_pos_weight_mean_yearday')
hist_click_df = hist_click_df.merge(c_pos_weight_mean_yearday, how='outer', on='item_id')
consumer_info = consumer_info.merge(c_pos_weight_mean_yearday, how='outer', on='item_id')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=153151), Label(value='0 / 153151')…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=15935), Label(value='0 / 15935')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=153151), Label(value='0 / 153151')…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=15935), Label(value='0 / 15935')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=153151), Label(value='0 / 153151')…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=15935), Label(value='0 / 15935')))…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=153151), Label(value='0 / 153151')…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=15935), Label(value='0 / 15935')))…

## 5.2 物品消费者的动力偏好

### 5.2.1 成交的动力偏好

In [None]:
# Per-item consumer power feature from deal events, outer-merged on item_id.
c_deal_mean_power = get_consumer_time_fea_by_deal_mean(hist_click_df, ['event_type'], 'u_deal_mean_power')
hist_click_df = hist_click_df.merge(c_deal_mean_power, how='outer', on='item_id')
consumer_info = consumer_info.merge(c_deal_mean_power, how='outer', on='item_id')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4499), Label(value='0 / 4499'))), …

### 5.2.2 正反馈加权的动力偏好

In [None]:
# Per-item consumer power feature from weighted positive events, outer-merged on item_id.
c_pos_weight_mean_power = get_consumer_time_fea_by_weighted_mean(hist_click_df, ['event_type'], 'u_pos_weight_mean_power')
hist_click_df = hist_click_df.merge(c_pos_weight_mean_power, how='outer', on='item_id')
consumer_info = consumer_info.merge(c_pos_weight_mean_power, how='outer', on='item_id')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=153151), Label(value='0 / 153151')…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=15935), Label(value='0 / 15935')))…

## 5.3 物品消费者的类别偏好

### 5.3.1 成交最多的类别

In [None]:
def get_consumer_time_fea_by_deal_mode(data, in_cols, fea_name):
  """
  Build a per-item consumer feature by aggregating ``fea_name`` over 'deal' events.

  Only rows with ``event_type == 'deal'`` are kept; ``agg_mode`` (bound to
  ``key=fea_name``) is applied to every ``item_id`` group.
  NOTE(review): ``agg_mode`` is defined elsewhere; presumably it returns the
  most frequent value under a key named ``fea_name`` -- confirm.

  :param data: event DataFrame containing 'event_type', 'item_id' and ``fea_name``
  :param in_cols: extra columns to select besides 'item_id' and ``fea_name``;
                  must include 'event_type'. The list is not modified.
  :param fea_name: name of the column to aggregate
  :return: DataFrame with columns 'item_id' and 'c_deal_mode_<fea_name>'
  """
  # Build a fresh, de-duplicated column list instead of `in_cols += ...`,
  # which would mutate the caller's list in place.
  cols = list(dict.fromkeys([*in_cols, 'item_id', fea_name]))
  data = data[cols]
  data = data[data['event_type'] == 'deal']
  agg_func = partial(agg_mode, key=fea_name)
  data = data.groupby('item_id').parallel_apply(agg_func).reset_index()

  out_name = 'c_deal_mode_' + fea_name
  data = data.rename(columns={fea_name: out_name})

  return data[['item_id', out_name]]

In [None]:
# Per-item consumer category feature from deal events, outer-merged on item_id.
c_deal_mode_category_id = get_consumer_time_fea_by_deal_mode(hist_click_df, ['event_type'], 'u_deal_mode_category_id')
hist_click_df = hist_click_df.merge(c_deal_mode_category_id, how='outer', on='item_id')
consumer_info = consumer_info.merge(c_deal_mode_category_id, how='outer', on='item_id')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4499), Label(value='0 / 4499'))), …

### 5.3.2 正向行为评分最高的类别

In [None]:
def get_consumer_time_fea_by_weighted_mode(data, in_cols, fea_name):
  """
  Build an item-level consumer feature from the positive-feedback events
  ('deal', 'intent', 'view') of each item.

  Every kept row gets a 'weight' column computed by ``ua2weight(event_type)``;
  rows are then grouped by item_id and each group is reduced with
  ``agg_weight_mode(group, key=fea_name)``. Both helpers are defined elsewhere
  in this notebook (presumably a weighted mode over ``fea_name``).

  :param data: interaction log containing 'item_id', 'event_type' and the
               ``fea_name`` column
  :param in_cols: extra columns to keep besides 'item_id' and ``fea_name``;
                  must include 'event_type', which is used for filtering and
                  weighting. The passed list is left unmodified.
  :param fea_name: name of the column to aggregate per item
  :return: DataFrame with columns ['item_id', 'c_pos_weight_mode_' + fea_name]
  """
  # Build a fresh list: `in_cols += [...]` would extend the caller's list in
  # place and leak 'item_id'/fea_name into later calls that reuse it.
  cols = list(in_cols) + ['item_id', fea_name]
  data = data[cols]
  # Take an explicit copy of the filtered rows so that adding the 'weight'
  # column below does not raise SettingWithCopyWarning.
  data = data[data['event_type'].isin(['deal', 'intent', 'view'])].copy()

  data['weight'] = data['event_type'].parallel_apply(ua2weight)

  agg_func = partial(agg_weight_mode, key=fea_name)
  data = data.groupby('item_id', dropna=True).parallel_apply(agg_func).reset_index()
  data.rename(columns={fea_name: 'c_pos_weight_mode_'+fea_name,}, inplace=True)

  return data[['item_id', 'c_pos_weight_mode_'+fea_name]]

In [None]:
# Per-item consumer feature built from the users'
# `u_pos_weight_mode_category_id` over positive-feedback events, then
# outer-merged on item_id into both tables.
c_weighted_mode_category_id = get_consumer_time_fea_by_weighted_mode(
  hist_click_df, ['event_type'], 'u_pos_weight_mode_category_id')
hist_click_df = hist_click_df.merge(c_weighted_mode_category_id, on='item_id', how='outer')
consumer_info = consumer_info.merge(c_weighted_mode_category_id, on='item_id', how='outer')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=153151), Label(value='0 / 153151')…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=15935), Label(value='0 / 15935')))…

## 5.4 物品消费者的地区偏好

### 5.4.1 成交的地区偏好

In [None]:
# Per-item consumer feature built from the users' `u_deal_mode_region` over
# deal events, then outer-merged on item_id into both tables.
c_deal_mode_region = get_consumer_time_fea_by_deal_mode(
  hist_click_df, ['event_type'], 'u_deal_mode_region')
hist_click_df = hist_click_df.merge(c_deal_mode_region, on='item_id', how='outer')
consumer_info = consumer_info.merge(c_deal_mode_region, on='item_id', how='outer')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4499), Label(value='0 / 4499'))), …

### 5.4.2 正反馈加权的地区偏好

In [None]:
# Per-item consumer feature built from the users' `u_pos_weight_mode_region`
# over positive-feedback events, then outer-merged on item_id into both tables.
c_weighted_mode_region = get_consumer_time_fea_by_weighted_mode(
  hist_click_df, ['event_type'], 'u_pos_weight_mode_region')
hist_click_df = hist_click_df.merge(c_weighted_mode_region, on='item_id', how='outer')
consumer_info = consumer_info.merge(c_weighted_mode_region, on='item_id', how='outer')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=153151), Label(value='0 / 153151')…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=15935), Label(value='0 / 15935')))…

## 5.5 物品消费者的价格偏好

### 5.5.1 成交的价格偏好

In [None]:
# Per-item consumer feature built from the users' `u_deal_mean_price`
# (aggregated by get_consumer_time_fea_by_deal_mean), then outer-merged on
# item_id into both tables.
c_deal_mean_price = get_consumer_time_fea_by_deal_mean(
  hist_click_df, ['event_type'], 'u_deal_mean_price')
hist_click_df = hist_click_df.merge(c_deal_mean_price, on='item_id', how='outer')
consumer_info = consumer_info.merge(c_deal_mean_price, on='item_id', how='outer')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4499), Label(value='0 / 4499'))), …

### 5.5.2 正反馈加权的价格偏好

In [None]:
# Per-item consumer feature built from the users' `u_pos_weight_mean_price`
# (aggregated by get_consumer_time_fea_by_weighted_mean), then outer-merged
# on item_id into both tables.
c_pos_weight_mean_price = get_consumer_time_fea_by_weighted_mean(
  hist_click_df, ['event_type'], 'u_pos_weight_mean_price')
hist_click_df = hist_click_df.merge(c_pos_weight_mean_price, on='item_id', how='outer')
consumer_info = consumer_info.merge(c_pos_weight_mean_price, on='item_id', how='outer')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=153151), Label(value='0 / 153151')…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=15935), Label(value='0 / 15935')))…

## 5.6 物品消费者的活跃度特征

### 5.6.1 购买行为的活跃度

In [None]:
# Per-item consumer feature built from the users' `u_active_deal_nums`
# (aggregated by get_consumer_time_fea_by_deal_mean), then outer-merged on
# item_id into both tables.
c_active_deal_nums = get_consumer_time_fea_by_deal_mean(
  hist_click_df, ['event_type'], 'u_active_deal_nums')
hist_click_df = hist_click_df.merge(c_active_deal_nums, on='item_id', how='outer')
consumer_info = consumer_info.merge(c_active_deal_nums, on='item_id', how='outer')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4499), Label(value='0 / 4499'))), …

### 5.6.2 所有行为的加权活跃度

In [None]:
# Per-item consumer feature built from the users' `u_active_score`
# (aggregated by get_consumer_time_fea_by_weighted_mean), then outer-merged
# on item_id into both tables.
c_active_pos_score = get_consumer_time_fea_by_weighted_mean(
  hist_click_df, ['event_type'], 'u_active_score')
hist_click_df = hist_click_df.merge(c_active_pos_score, on='item_id', how='outer')
consumer_info = consumer_info.merge(c_active_pos_score, on='item_id', how='outer')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=153151), Label(value='0 / 153151')…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=15935), Label(value='0 / 15935')))…

### 5.6.3 购买行为的活跃天数

In [None]:
# Per-item consumer feature built from the users' `u_active_deal_day_nums`
# (aggregated by get_consumer_time_fea_by_deal_mean), then outer-merged on
# item_id into both tables.
c_active_deal_day_nums = get_consumer_time_fea_by_deal_mean(
  hist_click_df, ['event_type'], 'u_active_deal_day_nums')
hist_click_df = hist_click_df.merge(c_active_deal_day_nums, on='item_id', how='outer')
consumer_info = consumer_info.merge(c_active_deal_day_nums, on='item_id', how='outer')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4499), Label(value='0 / 4499'))), …

### 5.6.4 所有行为的加权活跃天数

In [None]:
# Per-item consumer feature built from the users' `u_active_weight_day_nums`
# (aggregated by get_consumer_time_fea_by_weighted_mean), then outer-merged
# on item_id into both tables.
c_active_weight_day_nums = get_consumer_time_fea_by_weighted_mean(
  hist_click_df, ['event_type'], 'u_active_weight_day_nums')
hist_click_df = hist_click_df.merge(c_active_weight_day_nums, on='item_id', how='outer')
consumer_info = consumer_info.merge(c_active_weight_day_nums, on='item_id', how='outer')

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=153151), Label(value='0 / 153151')…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=15935), Label(value='0 / 15935')))…

# 6.预览

In [None]:
# Widen the notebook display: no limit on columns, up to 200 rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = 200

## 6.1 样本预览

In [None]:
# Preview the sample table with all u_/i_/c_ feature columns merged in.
hist_click_df

Unnamed: 0,event_time,event_type,item_id,power,category_id,region,price,user_id,user_session,is_nan_region,year,month,season,yearday,u_deal_mean_year,u_deal_mean_month,u_deal_mean_season,u_deal_mean_yearday,u_pos_weight_mean_year,u_pos_weight_mean_month,u_pos_weight_mean_season,u_pos_weight_mean_yearday,u_deal_mean_power,u_pos_weight_mean_power,u_deal_mode_category_id,u_pos_weight_mode_category_id,u_deal_mode_region,u_pos_weight_mode_region,u_deal_mean_price,u_pos_weight_mean_price,u_active_deal_nums,u_active_score,u_active_deal_day_nums,u_active_weight_day_nums,i_deal_mean_year,i_deal_mean_month,i_deal_mean_season,i_deal_mean_yearday,i_pos_weight_mean_year,i_pos_weight_mean_month,i_pos_weight_mean_season,i_pos_weight_mean_yearday,i_deal_mode_region,i_pos_weight_mode_region,i_active_deal_nums,i_active_score,i_active_deal_day_nums,i_active_weight_day_nums,c_deal_mean_u_deal_mean_year,c_deal_mean_u_deal_mean_month,c_deal_mean_u_deal_mean_season,c_deal_mean_u_deal_mean_yearday,c_pos_weight_mean_u_pos_weight_mean_year,c_pos_weight_mean_u_pos_weight_mean_month,c_pos_weight_mean_u_pos_weight_mean_season,c_pos_weight_mean_u_pos_weight_mean_yearday,c_deal_mean_u_deal_mean_power,c_pos_weight_mean_u_pos_weight_mean_power,c_deal_mode_u_deal_mode_category_id,c_pos_weight_mode_u_pos_weight_mode_category_id,c_deal_mode_u_deal_mode_region,c_pos_weight_mode_u_pos_weight_mode_region,c_deal_mean_u_deal_mean_price,c_pos_weight_mean_u_pos_weight_mean_price,c_deal_mean_u_active_deal_nums,c_pos_weight_mean_u_active_score,c_deal_mean_u_active_deal_day_nums,c_pos_weight_mean_u_active_weight_day_nums
0,2018-01-01 00:00:00,intent,0,0.073838,0,0.0,0.081670,0,0,0,2018,1,3,1,2018.0,1.0,3.0,3.0,2018.000000,1.000000,3.00000,2.617647,0.078987,0.078380,0.0,0.0,0.0,0.0,0.079637,0.079904,18.0,119,1.0,13,2018.375,4.5,1.25,124.5,2018.587156,5.642202,1.137615,157.275229,0.0,0.0,8.0,249,8.0,219,2018.375,4.5,1.25,124.375,2018.624491,5.553513,1.091879,153.988276,0.084268,0.083781,0.0,0.0,0.0,0.0,0.084536,0.084716,16.875,228.220183,1.25,46.211009
1,2018-01-03 21:41:24,deal,0,0.073838,0,0.0,0.081670,0,0,0,2018,1,3,3,2018.0,1.0,3.0,3.0,2018.000000,1.000000,3.00000,2.617647,0.078987,0.078380,0.0,0.0,0.0,0.0,0.079637,0.079904,18.0,119,1.0,13,2018.375,4.5,1.25,124.5,2018.587156,5.642202,1.137615,157.275229,0.0,0.0,8.0,249,8.0,219,2018.375,4.5,1.25,124.375,2018.624491,5.553513,1.091879,153.988276,0.084268,0.083781,0.0,0.0,0.0,0.0,0.084536,0.084716,16.875,228.220183,1.25,46.211009
2,2018-03-16 05:15:31,view,0,0.073838,0,0.0,0.081670,290,644,0,2018,3,0,75,,,,,2018.309859,4.211268,0.43662,113.267606,,0.088999,,0.0,,0.0,,0.088679,,207,,76,2018.375,4.5,1.25,124.5,2018.587156,5.642202,1.137615,157.275229,0.0,0.0,8.0,249,8.0,219,2018.375,4.5,1.25,124.375,2018.624491,5.553513,1.091879,153.988276,0.084268,0.083781,0.0,0.0,0.0,0.0,0.084536,0.084716,16.875,228.220183,1.25,46.211009
3,2018-03-16 07:09:33,intent,0,0.073838,0,0.0,0.081670,290,644,0,2018,3,0,75,,,,,2018.309859,4.211268,0.43662,113.267606,,0.088999,,0.0,,0.0,,0.088679,,207,,76,2018.375,4.5,1.25,124.5,2018.587156,5.642202,1.137615,157.275229,0.0,0.0,8.0,249,8.0,219,2018.375,4.5,1.25,124.375,2018.624491,5.553513,1.091879,153.988276,0.084268,0.083781,0.0,0.0,0.0,0.0,0.084536,0.084716,16.875,228.220183,1.25,46.211009
4,2018-03-17 07:35:46,view,0,0.073838,0,0.0,0.081670,290,644,0,2018,3,0,76,,,,,2018.309859,4.211268,0.43662,113.267606,,0.088999,,0.0,,0.0,,0.088679,,207,,76,2018.375,4.5,1.25,124.5,2018.587156,5.642202,1.137615,157.275229,0.0,0.0,8.0,249,8.0,219,2018.375,4.5,1.25,124.375,2018.624491,5.553513,1.091879,153.988276,0.084268,0.083781,0.0,0.0,0.0,0.0,0.084536,0.084716,16.875,228.220183,1.25,46.211009
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
342478,2020-12-29 10:27:20,view,32779,0.123667,0,9.0,0.124189,54481,86170,0,2020,12,3,364,,,,,2020.000000,12.000000,3.00000,364.000000,,0.123667,,0.0,,9.0,,0.124189,,3,,3,,,,,2020.000000,12.000000,3.000000,364.000000,,9.0,,3,,3,,,,,2020.000000,12.000000,3.000000,364.000000,,0.123667,,0.0,,9.0,,0.124189,,3.000000,,3.000000
342479,2020-12-31 08:42:15,remove_intent,32780,0.093869,0,26.0,0.089766,54506,86208,0,2020,12,3,366,,,,,2020.000000,12.000000,3.00000,365.500000,,0.074279,,0.0,,,,0.081670,,24,,8,,,,,,,,,,,,2,,1,,,,,,,,,,,,,,,,,,,,
342480,2020-12-31 08:47:41,remove_intent,32780,0.093869,0,26.0,0.089766,54506,86208,0,2020,12,3,366,,,,,2020.000000,12.000000,3.00000,365.500000,,0.074279,,0.0,,,,0.081670,,24,,8,,,,,,,,,,,,2,,1,,,,,,,,,,,,,,,,,,,,
342481,2020-12-30 18:13:24,view,32781,0.082121,0,17.0,0.084791,54510,86212,0,2020,12,3,365,,,,,2020.000000,12.000000,3.00000,365.000000,,0.082121,,0.0,,17.0,,0.084791,,3,,3,,,,,2020.000000,12.000000,3.000000,365.000000,,17.0,,3,,3,,,,,2020.000000,12.000000,3.000000,365.000000,,0.082121,,0.0,,17.0,,0.084791,,3.000000,,3.000000


## 6.2 特征统计信息

In [None]:
# Summary statistics (count/mean/std/min/quartiles/max) of the numeric columns.
hist_click_df.describe()

Unnamed: 0,item_id,power,category_id,region,price,user_id,user_session,is_nan_region,year,month,season,yearday,u_deal_mean_year,u_deal_mean_month,u_deal_mean_season,u_deal_mean_yearday,u_pos_weight_mean_year,u_pos_weight_mean_month,u_pos_weight_mean_season,u_pos_weight_mean_yearday,u_deal_mean_power,u_pos_weight_mean_power,u_deal_mode_category_id,u_pos_weight_mode_category_id,u_deal_mode_region,u_pos_weight_mode_region,u_deal_mean_price,u_pos_weight_mean_price,u_active_deal_nums,u_active_score,u_active_deal_day_nums,u_active_weight_day_nums,i_deal_mean_year,i_deal_mean_month,i_deal_mean_season,i_deal_mean_yearday,i_pos_weight_mean_year,i_pos_weight_mean_month,i_pos_weight_mean_season,i_pos_weight_mean_yearday,i_deal_mode_region,i_pos_weight_mode_region,i_active_deal_nums,i_active_score,i_active_deal_day_nums,i_active_weight_day_nums,c_deal_mean_u_deal_mean_year,c_deal_mean_u_deal_mean_month,c_deal_mean_u_deal_mean_season,c_deal_mean_u_deal_mean_yearday,c_pos_weight_mean_u_pos_weight_mean_year,c_pos_weight_mean_u_pos_weight_mean_month,c_pos_weight_mean_u_pos_weight_mean_season,c_pos_weight_mean_u_pos_weight_mean_yearday,c_deal_mean_u_deal_mean_power,c_pos_weight_mean_u_pos_weight_mean_power,c_deal_mode_u_deal_mode_category_id,c_pos_weight_mode_u_pos_weight_mode_category_id,c_deal_mode_u_deal_mode_region,c_pos_weight_mode_u_pos_weight_mode_region,c_deal_mean_u_deal_mean_price,c_pos_weight_mean_u_pos_weight_mean_price,c_deal_mean_u_active_deal_nums,c_pos_weight_mean_u_active_score,c_deal_mean_u_active_deal_day_nums,c_pos_weight_mean_u_active_weight_day_nums
count,342483.0,342483.0,342483.0,207592.0,342483.0,342483.0,342483.0,342483.0,342483.0,342483.0,342483.0,342483.0,107144.0,107144.0,107144.0,107144.0,341311.0,341311.0,341311.0,341311.0,107144.0,341311.0,107144.0,341311.0,101411.0,291520.0,107144.0,341311.0,107144.0,342483.0,107144.0,342483.0,221578.0,221578.0,221578.0,221578.0,341857.0,341857.0,341857.0,341857.0,138156.0,207285.0,221578.0,342483.0,221578.0,342483.0,221578.0,221578.0,221578.0,221578.0,341857.0,341857.0,341857.0,341857.0,221578.0,341857.0,221578.0,341857.0,218002.0,322158.0,221578.0,341857.0,221578.0,341857.0,221578.0,341857.0
mean,8793.873165,0.099339,3.253537,28.91635,0.099311,23442.045129,41130.202074,0.393862,2018.785438,5.978957,1.239466,166.440904,2018.976453,5.680846,1.211414,157.485496,2018.791806,5.973122,1.240863,166.269043,0.089991,0.098811,1.047329,2.075242,20.424707,25.120098,0.089828,0.09878,15.443021,184.838745,1.080406,41.639807,2018.887503,5.66557,1.218726,157.226198,2018.785778,5.96954,1.251214,166.174547,21.218948,28.916656,7.689929,206.606699,7.540203,150.549026,2018.887255,5.664558,1.219193,157.184637,2018.787581,5.96961,1.251968,166.177468,0.089666,0.098492,0.712381,1.863566,16.235025,22.461314,0.089634,0.098459,18.96224,183.442729,1.087471,40.282676
std,8265.276927,0.056436,8.929575,34.870721,0.055557,17012.493296,25522.707245,0.488606,0.697252,3.030083,1.109542,92.674133,0.771212,2.907526,1.116308,88.625692,0.62108,2.67011,0.97595,81.725632,0.020895,0.045668,5.046571,7.548185,27.930239,32.672299,0.020305,0.045384,15.374345,286.126602,0.303702,52.627728,0.516485,1.988902,0.754214,60.750943,0.297871,1.4761,0.530593,45.180486,26.588221,34.881936,11.775034,386.3579,11.299039,244.427912,0.513613,1.968972,0.750297,60.188587,0.275299,1.409628,0.504613,43.158431,0.01658,0.037878,4.371308,7.313835,24.742732,31.91468,0.016226,0.037702,12.81456,141.426779,0.22795,24.546637
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2018.0,1.0,0.0,1.0,2018.0,1.0,0.0,1.0,2018.0,1.0,0.0,1.0,0.061986,0.035411,0.0,0.0,0.0,0.0,0.073976,0.073574,1.0,1.0,1.0,1.0,2018.0,1.0,0.0,1.0,2018.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,2018.0,1.0,0.0,1.0,2018.0,1.0,0.0,1.0,0.061986,0.035411,0.0,0.0,0.0,0.0,0.074744,0.073574,1.0,3.0,1.0,3.0
25%,2083.0,0.078276,0.0,6.0,0.080897,7575.0,18665.0,0.0,2018.0,4.0,0.0,93.0,2018.0,3.0,0.0,88.0,2018.0,4.0,0.419355,106.9,0.081626,0.082975,0.0,0.0,0.0,4.0,0.081889,0.083277,5.0,11.0,1.0,6.0,2018.6,4.645161,0.8,125.0,2018.636364,5.297101,1.0,145.514286,4.0,6.0,1.0,27.0,1.0,20.0,2018.6,4.606742,0.8,125.0,2018.660938,5.364499,1.0,147.653273,0.083032,0.085162,0.0,0.0,0.0,0.0,0.08316,0.085485,10.5,89.333333,1.0,24.289474
50%,6008.0,0.088245,0.0,17.0,0.086089,20940.0,39493.0,0.0,2019.0,6.0,1.0,160.0,2019.0,5.0,1.0,148.0,2019.0,5.967742,1.0,161.238095,0.085767,0.088705,0.0,0.0,8.0,14.0,0.085291,0.088283,11.0,91.0,1.0,24.0,2019.0,5.615385,1.153846,156.157895,2018.8,5.904762,1.210526,164.0,13.0,17.0,3.0,70.0,3.0,57.0,2018.999024,5.617425,1.153846,156.0,2018.799009,5.915731,1.212103,164.520331,0.086161,0.089846,0.0,0.0,8.0,9.0,0.086001,0.0896,16.5,167.678571,1.0,39.24359
75%,13470.0,0.100626,0.0,38.0,0.096039,38164.0,63447.0,1.0,2019.0,8.0,2.0,237.0,2020.0,8.0,2.0,219.0,2019.0,8.0,2.0,224.322581,0.093363,0.098912,0.0,0.0,27.0,34.0,0.092454,0.097166,19.0,232.0,1.0,56.0,2019.032258,6.666667,1.5,187.0,2018.977778,6.590909,1.466667,184.968504,31.0,38.0,8.0,200.0,8.0,167.0,2019.045455,6.666667,1.5,186.381026,2018.953402,6.532225,1.450124,183.257838,0.091411,0.097844,0.0,0.0,18.0,31.0,0.091308,0.096859,24.090909,247.820513,1.076923,52.421053
max,32783.0,1.0,54.0,221.0,1.0,54532.0,86246.0,1.0,2021.0,12.0,3.0,366.0,2020.0,12.0,3.0,366.0,2020.041667,12.0,3.0,366.0,0.671206,1.0,50.0,54.0,196.0,221.0,0.674402,1.0,96.0,3141.0,4.0,519.0,2020.0,12.0,3.0,366.0,2020.0,12.0,3.0,366.0,220.0,221.0,80.0,2803.0,73.0,1594.0,2020.0,12.0,3.0,366.0,2020.0,12.0,3.0,365.0,0.469566,0.737388,50.0,54.0,196.0,221.0,0.466983,0.745535,96.0,3141.0,4.0,519.0


## 6.3 填充信息统计 

In [None]:
# Number of missing values per column. DataFrame.sum is vectorised, whereas
# apply(sum, axis=0) runs the Python builtin `sum` over every column.
hist_click_df.isna().sum()

event_time                                              0
event_type                                              0
item_id                                                 0
power                                                   0
category_id                                             0
region                                             134891
price                                                   0
user_id                                                 0
user_session                                            0
is_nan_region                                           0
year                                                    0
month                                                   0
season                                                  0
yearday                                                 0
u_deal_mean_year                                   235339
u_deal_mean_month                                  235339
u_deal_mean_season                                 235339
u_deal_mean_ye

## 6.4 u侧特征

In [None]:
# Preview the user-side (u_*) feature table.
user_info

Unnamed: 0,user_id,u_deal_mean_year,u_deal_mean_month,u_deal_mean_season,u_deal_mean_yearday,u_pos_weight_mean_year,u_pos_weight_mean_month,u_pos_weight_mean_season,u_pos_weight_mean_yearday,u_deal_mean_power,u_pos_weight_mean_power,u_deal_mode_category_id,u_pos_weight_mode_category_id,u_deal_mode_region,u_pos_weight_mode_region,u_deal_mean_price,u_pos_weight_mean_price,u_active_deal_nums,u_active_score,u_active_deal_day_nums,u_active_weight_day_nums
0,0,2018.0,1.0,3.0,3.0,2018.000000,1.000000,3.000000,2.617647,0.078987,0.078380,0.0,0.0,0.0,0.0,0.079637,0.079904,18.0,119,1.0,13
1,1,2019.0,4.0,0.0,106.0,2018.257426,2.405941,2.227723,59.415842,0.091322,0.085542,0.0,0.0,5.0,7.0,0.092490,0.086284,6.0,247,1.0,40
2,2,,,,,2018.000000,1.000000,3.000000,1.400000,,0.087910,,0.0,,0.0,,0.081509,,55,,7
3,3,,,,,2018.000000,1.000000,3.000000,2.000000,,0.098819,,0.0,,4.0,,0.095694,,30,,15
4,4,2018.0,6.0,1.0,155.0,2018.029630,5.622222,1.103704,144.637037,0.086750,0.100066,0.0,0.0,0.0,0.0,0.088292,0.101859,35.0,249,1.0,37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54528,54528,,,,,2020.000000,12.000000,3.000000,366.000000,,0.069354,,0.0,,6.0,,0.077004,,3,,3
54529,54529,,,,,2020.041667,11.541667,3.000000,350.791667,,0.108355,,0.0,,0.0,,0.106126,,96,,8
54530,54530,,,,,,,,,,,,,,,,,,9,,1
54531,54531,,,,,,,,,,,,,,,,,,1,,1


In [None]:
# Persist the user-side feature table without the index column.
_user_info_path = os.path.join(save_dir, 'user_info.csv')
user_info.to_csv(_user_info_path, index=False)

## 6.5 i侧特征

In [None]:
# Preview the item-side (i_*) feature table.
item_info

Unnamed: 0,item_id,i_deal_mean_year,i_deal_mean_month,i_deal_mean_season,i_deal_mean_yearday,i_pos_weight_mean_year,i_pos_weight_mean_month,i_pos_weight_mean_season,i_pos_weight_mean_yearday,i_deal_mode_region,i_pos_weight_mode_region,i_active_deal_nums,i_active_score,i_active_deal_day_nums,i_active_weight_day_nums,category_id,power,i_weighted_mean_price,i_weight_mode_region
0,0,2018.375000,4.500000,1.250000,124.50000,2018.587156,5.642202,1.137615,157.275229,0.0,0.0,8.0,249,8.0,219,0,0.073838,0.081670,0.0
1,1,2019.000000,8.000000,2.500000,232.00000,2018.895833,7.020833,1.729167,200.104167,0.0,0.0,4.0,106,4.0,83,0,0.087144,0.081670,0.0
2,2,2019.030303,5.727273,1.484848,159.69697,2018.890511,5.844282,1.272506,162.627737,0.0,0.0,33.0,933,31.0,762,0,0.070977,0.081670,0.0
3,3,2018.000000,1.000000,3.000000,3.00000,2018.826087,4.304348,1.347826,112.652174,0.0,0.0,1.0,56,1.0,50,0,0.053446,0.081670,0.0
4,4,2018.843750,4.250000,1.281250,113.50000,2018.834975,4.889163,1.337438,132.591133,0.0,0.0,32.0,907,30.0,719,0,0.064256,0.081670,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32311,32778,,,,,2020.000000,12.000000,3.000000,363.000000,,88.0,,4,,4,0,0.102593,0.107873,88.0
32312,32779,,,,,2020.000000,12.000000,3.000000,364.000000,,9.0,,3,,3,0,0.123667,0.124189,9.0
32313,32780,,,,,,,,,,,,2,,1,0,0.093869,0.089766,26.0
32314,32781,,,,,2020.000000,12.000000,3.000000,365.000000,,17.0,,3,,3,0,0.082121,0.084791,17.0


In [None]:
# Persist the item-side feature table without the index column.
_item_info_path = os.path.join(save_dir, 'item_info.csv')
item_info.to_csv(_item_info_path, index=False)

## 6.6 consumer侧特征

In [None]:
# Preview the consumer-side (c_*) feature table, keyed by item_id.
consumer_info

Unnamed: 0,item_id,c_deal_mean_u_deal_mean_year,c_deal_mean_u_deal_mean_month,c_deal_mean_u_deal_mean_season,c_deal_mean_u_deal_mean_yearday,c_pos_weight_mean_u_pos_weight_mean_year,c_pos_weight_mean_u_pos_weight_mean_month,c_pos_weight_mean_u_pos_weight_mean_season,c_pos_weight_mean_u_pos_weight_mean_yearday,c_deal_mean_u_deal_mean_power,c_pos_weight_mean_u_pos_weight_mean_power,c_deal_mode_u_deal_mode_category_id,c_pos_weight_mode_u_pos_weight_mode_category_id,c_deal_mode_u_deal_mode_region,c_pos_weight_mode_u_pos_weight_mode_region,c_deal_mean_u_deal_mean_price,c_pos_weight_mean_u_pos_weight_mean_price,c_deal_mean_u_active_deal_nums,c_pos_weight_mean_u_active_score,c_deal_mean_u_active_deal_day_nums,c_pos_weight_mean_u_active_weight_day_nums
0,0,2018.375000,4.500000,1.250000,124.375000,2018.624491,5.553513,1.091879,153.988276,0.084268,0.083781,0.0,0.0,0.0,0.0,0.084536,0.084716,16.875000,228.220183,1.250000,46.211009
1,1,2019.000000,8.000000,2.500000,231.746154,2018.822025,6.917411,1.633370,197.490583,0.083506,0.085330,0.0,0.0,0.0,0.0,0.081776,0.083593,34.500000,280.458333,1.250000,42.333333
2,2,2019.030303,5.699394,1.484848,159.306667,2018.881431,5.843754,1.260233,162.719371,0.083643,0.085233,0.0,0.0,0.0,0.0,0.083726,0.085518,17.606061,251.635036,1.090909,62.430657
3,3,2018.000000,1.000000,3.000000,3.000000,2018.788999,4.614085,1.471202,123.704942,0.078987,0.078232,0.0,0.0,0.0,0.0,0.079637,0.081992,18.000000,181.434783,1.000000,30.173913
4,4,2018.843750,4.281250,1.262401,114.539683,2018.805931,5.202528,1.336897,142.733769,0.082153,0.083696,0.0,0.0,0.0,0.0,0.083640,0.084767,21.843750,246.770936,1.125000,46.472906
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32311,32778,,,,,2020.000000,12.000000,3.000000,362.944444,,0.086906,,0.0,,88.0,,0.087435,,43.000000,,8.000000
32312,32779,,,,,2020.000000,12.000000,3.000000,364.000000,,0.123667,,0.0,,9.0,,0.124189,,3.000000,,3.000000
32313,32780,,,,,,,,,,,,,,,,,,,,
32314,32781,,,,,2020.000000,12.000000,3.000000,365.000000,,0.082121,,0.0,,17.0,,0.084791,,3.000000,,3.000000


In [None]:
# Persist the consumer-side feature table without the index column.
_consumer_info_path = os.path.join(save_dir, 'consumer_info.csv')
consumer_info.to_csv(_consumer_info_path, index=False)

## 6.7 ctx info

In [None]:
# Context columns kept for every interaction record.
ctx_fea_list = ['event_time', 'event_type', 'item_id', 'region', 
         'price', 'user_id', 'user_session', 'is_nan_region', 
         'year', 'month', 'season', 'yearday']
# Take explicit copies: 'event_time' is overwritten on these frames later,
# and assigning into a plain column slice raises SettingWithCopyWarning.
ctx_info = log_table[ctx_fea_list].copy()
hist_click_df = hist_click_df[ctx_fea_list].copy()
last_click_df = last_click_df[ctx_fea_list].copy()

In [None]:
# Preview the context table selected from log_table.
ctx_info

Unnamed: 0,event_time,event_type,item_id,region,price,user_id,user_session,is_nan_region,year,month,season,yearday
0,2018-01-01 00:00:00,intent,0,0.0,0.081670,0,0,0,2018,1,3,1
1,2018-01-01 00:16:17,intent,1,0.0,0.081670,0,0,0,2018,1,3,1
2,2018-01-01 00:38:00,intent,2,0.0,0.081670,0,0,0,2018,1,3,1
3,2018-01-01 13:45:25,intent,3,0.0,0.081670,0,0,0,2018,1,3,1
4,2018-01-01 14:28:51,intent,4,0.0,0.081670,0,0,0,2018,1,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...
422121,2020-12-31 21:00:47,view,10356,15.0,0.085841,54532,86243,0,2020,12,3,366
422122,2020-12-31 21:06:13,view,25183,15.0,0.090755,54532,86244,0,2020,12,3,366
422123,2020-12-31 21:17:05,view,2889,15.0,0.102280,54532,86245,0,2020,12,3,366
422124,2020-12-31 21:17:05,view,2886,15.0,0.085841,54532,86246,0,2020,12,3,366


In [None]:
def get_timestamp(time_str):
  """
  Convert a 'YYYY-MM-DD HH:MM:SS' string into integer epoch seconds.

  The string is interpreted as local time, because the conversion goes
  through time.mktime.

  :param time_str: timestamp text in the format "%Y-%m-%d %H:%M:%S"
  :return: seconds since the epoch as an int
  """
  parsed = time.strptime(time_str, "%Y-%m-%d %H:%M:%S")
  return int(time.mktime(parsed))

In [None]:
# Replace the textual event_time with integer epoch seconds in all three tables.
for _frame in (ctx_info, hist_click_df, last_click_df):
  _frame['event_time'] = _frame['event_time'].map(get_timestamp)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ctx_info['event_time'] = ctx_info['event_time'].map(get_timestamp)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  hist_click_df['event_time'] = hist_click_df['event_time'].map(get_timestamp)


In [None]:
# Persist the context table without the index column.
_ctx_info_path = os.path.join(save_dir, 'ctx_info.csv')
ctx_info.to_csv(_ctx_info_path, index=False)

In [None]:
# Persist the historical and last-click sample tables without the index column.
for _frame, _file_name in ((hist_click_df, 'hist_click_df.csv'),
                           (last_click_df, 'last_click_df.csv')):
  _frame.to_csv(os.path.join(save_dir, _file_name), index=False)