In [4]:
import numpy as np
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns 

import plotly.express as px
import plotly.graph_objects as go

plt.style.use('ggplot')

In [5]:
# reference: https://www.icode9.com/content-4-1103054.html

In [6]:
# Read the raw inputs; time_stamp is forced to a string dtype on load.
user_log = pd.read_csv(
    '../input/tianchitemp/user_log_format1.csv',
    dtype={'time_stamp': 'str'},
)
user_info = pd.read_csv('../input/tianchitemp/user_info_format1.csv')
train_data1 = pd.read_csv('../input/tianchitemp/train_format1.csv')
submission = pd.read_csv('../input/repurchase/test_format1.csv')
train_data = pd.read_csv('../input/repurchase/train_format2.csv')

# Tag every row with the file it came from, then stack both sets into one
# frame so the feature engineering below is applied to them together.
train_data1['origin'] = 'train'
submission['origin'] = 'test'
matrix = pd.concat((train_data1, submission), ignore_index=True, sort=False)
#print(matrix)


In [7]:
# Show the last five rows of the combined train + test frame.
matrix.tail(5)

In [8]:
# Discard the 'prob' column.
matrix = matrix.drop(columns=['prob'])
# Attach the user_info columns to every row, joined on user_id.
matrix = pd.merge(matrix, user_info, how='left', on='user_id')
# The log names the merchant key seller_id; rename it to merchant_id so it
# matches the key used in matrix.
user_log = user_log.rename(columns={'seller_id': 'merchant_id'})

In [9]:
# Downcast the id columns of the log to int32.
for id_col in ('user_id', 'merchant_id', 'item_id', 'cat_id'):
    user_log[id_col] = user_log[id_col].astype('int32')
# Missing brand ids become 0. The filled column is assigned back rather than
# calling fillna(inplace=True) on the selected column: that chained form may
# operate on a temporary and leave the NaNs in place, which would make the
# int32 cast below fail.
user_log['brand_id'] = user_log['brand_id'].fillna(0).astype('int32')
# NOTE(review): '%H%M' reads the four-digit stamp as hour + minute. The Tianchi
# data description gives time_stamp as month + day (mmdd), in which case '%m%d'
# would be the right pattern -- confirm, and re-check the time-span features
# derived from this column (u11, um9) before changing it.
user_log['time_stamp'] = pd.to_datetime(user_log['time_stamp'], format='%H%M')
# age_range codes: 1 for <18; 2 for [18,24]; 3 for [25,29]; 4 for [30,34]; 5 for [35,39]; 6 for [40,49]; 7 and 8 for >= 50; 0 and NULL for unknown

In [10]:
# Unknown age_range is coded 0. The result is assigned back instead of using
# fillna(inplace=True) on the selected column, which may act on a temporary
# and leave NaNs that break the int8 cast.
matrix['age_range'] = matrix['age_range'].fillna(0).astype('int8')
# gender codes: 0 female, 1 male, 2 unknown (also used for missing values).
matrix['gender'] = matrix['gender'].fillna(2).astype('int8')
# label is stored as a string; rows without a label become the text 'nan'.
matrix['label'] = matrix['label'].astype('str')
matrix['user_id'] = matrix['user_id'].astype('int32')
matrix['merchant_id'] = matrix['merchant_id'].astype('int32')
# These frames have been folded into matrix and are no longer needed.
del user_info, train_data1

In [11]:
# Preview the first rows of matrix after the dtype clean-up.
matrix.head()

# User

In [12]:
# User-level features: group the interaction log by user_id.
groups = user_log.groupby(by=['user_id'])

In [13]:
# Preview the first rows of the interaction log.
user_log.head()

In [14]:
# u1: number of log rows (interactions) per user.
temp = groups.size().reset_index(name='u1')
matrix = pd.merge(matrix, temp, how='left', on='user_id')

In [15]:
# u2: number of distinct item_id values per user.
temp = groups['item_id'].nunique().rename('u2').reset_index()
matrix = pd.merge(matrix, temp, how='left', on='user_id')

In [16]:
# u3: number of distinct merchant_id values per user.
temp = groups['merchant_id'].nunique().rename('u3').reset_index()
matrix = pd.merge(matrix, temp, how='left', on='user_id')

In [17]:
# u4: number of distinct cat_id values per user.
temp = groups['cat_id'].nunique().rename('u4').reset_index()
matrix = pd.merge(matrix, temp, how='left', on='user_id')

In [18]:
# u5: number of distinct brand_id values per user.
temp = groups['brand_id'].nunique().rename('u5').reset_index()
matrix = pd.merge(matrix, temp, how='left', on='user_id')

In [19]:
# u6..u9: per-user row count for action_type 0, 1, 2 and 3 respectively.
# A user with no rows of a given type gets NaN in that column.
temp = (
    groups['action_type']
    .value_counts()
    .unstack()
    .reset_index()
    .rename(columns={0: 'u6', 1: 'u7', 2: 'u8', 3: 'u9'})
)
matrix = pd.merge(matrix, temp, how='left', on='user_id')

In [20]:
# u10: number of distinct time_stamp values on which each user was active.
temp = groups['time_stamp'].nunique().rename('u10').reset_index()
matrix = pd.merge(matrix, temp, how='left', on='user_id')

In [21]:
# u11: time span per user, in hours -- the gap between the user's earliest
# (F_time) and latest (L_time) time_stamp.
temp = groups['time_stamp'].agg([('F_time', 'min'), ('L_time', 'max')]).reset_index()
# total_seconds() measures the whole gap; .dt.seconds is only the sub-day
# remainder and silently drops any whole days in the difference.
temp['u11'] = (temp['L_time'] - temp['F_time']).dt.total_seconds() / 3600
matrix = matrix.merge(temp[['user_id', 'u11']], on='user_id', how='left')

# Merchant

In [22]:
# Merchant-level features: group the interaction log by merchant_id.
groups = user_log.groupby(by=['merchant_id'])

In [23]:
# m1: number of log rows (interactions) per merchant.
temp = groups.size().reset_index(name='m1')
matrix = pd.merge(matrix, temp, how='left', on='merchant_id')

In [24]:
# m2..m5: per-merchant number of distinct users, items, categories and brands.
# The columns are selected with a list; the tuple-style groups['a', 'b', ...]
# form is rejected by pandas 2.x.
temp = (
    groups[['user_id', 'item_id', 'cat_id', 'brand_id']]
    .nunique()
    .reset_index()
    .rename(columns={'user_id': 'm2', 'item_id': 'm3', 'cat_id': 'm4', 'brand_id': 'm5'})
)
matrix = matrix.merge(temp, on='merchant_id', how='left')

In [25]:
# m6..m9: per-merchant row count for action_type 0, 1, 2 and 3 respectively.
# A merchant with no rows of a given type gets NaN in that column.
temp = (
    groups['action_type']
    .value_counts()
    .unstack()
    .reset_index()
    .rename(columns={0: 'm6', 1: 'm7', 2: 'm8', 3: 'm9'})
)
matrix = pd.merge(matrix, temp, how='left', on='merchant_id')

In [26]:
# m10: per-merchant number of train_data rows whose label is -1
# (described by the original author as randomly sampled negatives).
negatives = train_data[train_data['label'] == -1]
temp = negatives.groupby(['merchant_id']).size().reset_index(name='m10')
matrix = pd.merge(matrix, temp, how='left', on='merchant_id')

In [27]:
# 统计商家被交互的年龄群体 占比
# temp = groups['age_range'].value_counts().unstack().reset_index().rename(columns={'0':'m_a_0','1':'m_a_1','2':'m_a_2','3':'m_a_3','4':'m_a_4','5':'m_a_5','6':'m_a_6','7':'m_a_7','8':'m_a_8'})
# temp

# User+Merchant

In [28]:
# User x merchant features: group the log by the (user_id, merchant_id) pair.
groups = user_log.groupby(['user_id', 'merchant_id'])

# um1: number of log rows per pair.
temp = groups.size().reset_index().rename(columns={0: 'um1'})
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')

# um2..um4: distinct items, categories and brands per pair. The columns are
# selected with a list; the tuple-style groups['a', 'b', ...] form is rejected
# by pandas 2.x.
temp = (
    groups[['item_id', 'cat_id', 'brand_id']]
    .nunique()
    .reset_index()
    .rename(columns={'item_id': 'um2', 'cat_id': 'um3', 'brand_id': 'um4'})
)
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')

# um5..um8: row count per pair for action_type 0, 1, 2 and 3 respectively.
temp = groups['action_type'].value_counts().unstack().reset_index().rename(columns={0: 'um5', 1: 'um6', 2: 'um7', 3: 'um8'})
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')

# um9: time span per pair, in hours, between first and last time_stamp.
temp = groups['time_stamp'].agg([('first', 'min'), ('last', 'max')]).reset_index()
# total_seconds() measures the whole gap; .dt.seconds is only the sub-day
# remainder and silently drops any whole days in the difference.
temp['um9'] = (temp['last'] - temp['first']).dt.total_seconds() / 3600
temp.drop(['first', 'last'], axis=1, inplace=True)
matrix = matrix.merge(temp, on=['user_id', 'merchant_id'], how='left')
#print(matrix)

# Derivative Features

In [29]:
# Purchase / click ratios. The action_type count columns were numbered in
# order (0, 1, 2, 3 -> u6..u9, m6..m9, um5..um8), so "action 2 over action 0"
# is u8/u6, m8/m6 and um7/um5. r1 previously divided u9 by u7 (action 3 over
# action 1), which did not match its own description or r2 and r3.
# NOTE(review): assumes action_type 0 = click and 2 = purchase, as the
# original comments imply -- confirm against the data description.

# r1: per-user purchase / click ratio.
matrix['r1'] = matrix['u8'] / matrix['u6']
# r2: per-merchant purchase / click ratio.
matrix['r2'] = matrix['m8'] / matrix['m6']
# r3: per (user, merchant) pair purchase / click ratio.
matrix['r3'] = matrix['um7'] / matrix['um5']

In [30]:
# Every feature still missing after the left merges is set to 0.
matrix = matrix.fillna(0)

In [31]:
# One-hot encode age_range (columns prefixed 'age_') and gender (columns
# prefixed 'g_'), append them to matrix, then drop the two source columns.
age_dummies = pd.get_dummies(matrix['age_range'], prefix='age')
gender_dummies = pd.get_dummies(matrix['gender'], prefix='g')
matrix = pd.concat([matrix, age_dummies, gender_dummies], axis=1)
matrix = matrix.drop(columns=['age_range', 'gender'])
print(matrix)

In [52]:
# Load user_info again (the earlier copy was deleted) and attach its columns
# to every row of the interaction log.
user_info = pd.read_csv('../input/tianchitemp/user_info_format1.csv')
merchant_user_log = pd.merge(user_log, user_info, how='left', on='user_id')
del user_info

In [53]:
# Merchant audience profile: collapse the log to one row per
# (merchant_id, user_id) pair carrying the mean age_range and gender for that
# pair, then group those rows by merchant.
# The columns are selected with a list; the tuple-style ['age_range', 'gender']
# form is rejected by pandas 2.x. The former rename of integer column keys was
# dropped because no such columns exist after reset_index().
# NOTE(review): age_range / gender are not filled here, so users whose value is
# missing are left out of the per-merchant value_counts that follow -- confirm
# that this is intended.
tempp = (
    merchant_user_log
    .groupby(['merchant_id', 'user_id'])[['age_range', 'gender']]
    .mean()
    .reset_index()
)
groups = tempp.groupby('merchant_id')
del merchant_user_log, tempp

In [59]:
# m_a_0 .. m_a_8: number of users per merchant in each age_range bucket.
age_columns = {bucket: 'm_a_' + str(bucket) for bucket in range(9)}
temp = groups['age_range'].value_counts().unstack().reset_index().rename(columns=age_columns)
matrix = pd.merge(matrix, temp, how='left', on='merchant_id')

In [55]:
# m_g_0 .. m_g_2: number of users per merchant for each gender code.
gender_columns = {code: 'm_g_' + str(code) for code in range(3)}
temp = groups['gender'].value_counts().unstack().reset_index().rename(columns=gender_columns)
matrix = pd.merge(matrix, temp, how='left', on='merchant_id')

In [61]:
# Replace the NaNs left by the audience-count merges with 0. fillna returns a
# new frame, so the result must be assigned back; the bare call only displayed
# a filled copy and left matrix itself unchanged.
matrix = matrix.fillna(0)
matrix.head()

In [64]:
# r_m_a_0 .. r_m_a_8: each age-bucket count divided by the merchant's number
# of distinct users (m2).
for bucket in range(9):
    matrix[f'r_m_a_{bucket}'] = matrix[f'm_a_{bucket}'] / matrix['m2']

In [65]:
# r_m_g_0 .. r_m_g_2: each gender count divided by the merchant's number of
# distinct users (m2).
for code in range(3):
    matrix[f'r_m_g_{code}'] = matrix[f'm_g_{code}'] / matrix['m2']

In [66]:
# Display the finished feature matrix.
matrix

In [67]:
# Persist the feature matrix twice: as a pickle and as a CSV file.
for writer, target in ((matrix.to_pickle, './matrix.pkl'), (matrix.to_csv, './matrix.csv')):
    writer(target)

# Training 

In [2]:
import pandas as pd

# Small scratch frame: an integer column and a dotted-string column.
a = pd.DataFrame(dict(
    a1=[1, 2, 3],
    a2=['1.2', '2.2.2', '2.1.2'],
))

In [2]:
# Display the scratch frame.
a

Unnamed: 0,a1,a2
0,1,1.2
1,2,2.2.2
2,3,2.1.2


In [10]:
# dummy1 .. dummy9: 1 when the text before the first '.' in a2 equals that
# digit, else 0. The comparison is vectorised; the earlier per-row apply also
# passed a stray positional 1, which pandas takes as the deprecated
# convert_dtype argument rather than an axis.
leading_part = a['a2'].str.split('.').str[0]
for digit in range(1, 10):
    a['dummy' + str(digit)] = (leading_part == str(digit)).astype(int)

In [11]:
# Display the scratch frame with the new dummy columns.
a

Unnamed: 0,a1,a2,dummy1,dummy2,dummy3,dummy4,dummy5,dummy6,dummy7,dummy8,dummy9
0,1,1.2,1,0,0,0,0,0,0,0,0
1,2,2.2.2,0,1,0,0,0,0,0,0,0
2,3,2.1.2,0,1,0,0,0,0,0,0,0


In [3]:
# Strings are compared character by character, so '2' > '1' decides this
# comparison and the result is True even though 2.2 < 10.2 numerically.
'2.2'>'10.2'

True

In [64]:
# Fallback 'Area Income' value per country.
dict_ = dict(a=1, b=3)

# Two small frames with the same columns; `test` contains a country ('c')
# that does not occur in `data`.
data = pd.DataFrame({
    'Country': ['a', 'a', 'b'],
    'a2': ['1.2', '2.2.2', '2.1.2'],
    'Area Income': [None, 1, None],
    'label': [1, 1, 0],
})
test = pd.DataFrame({
    'Country': ['a', 'c', 'c'],
    'a2': ['1.2', '2.2.2', '2.1.2'],
    'Area Income': [None, 1, None],
    'label': [1, 1, 0],
})

In [51]:
import numpy as np

# Fill missing 'Area Income' values with the per-country fallback from dict_.
# This is done column-wise with map + fillna, so no scratch 'temp' column or
# row-by-row apply is needed. A missing value whose country is not in dict_
# stays NaN.
country_fallback = data['Country'].map(dict_)
data['Area Income'] = data['Area Income'].fillna(country_fallback)

In [52]:
# Display data after the 'Area Income' imputation.
data

Unnamed: 0,Country,a2,Area Income,label
0,a,1.2,1.0,1
1,a,2.2.2,1.0,1
2,b,2.1.2,3.0,0


In [62]:
# Smoothed target encoding of Country: each country's mean label is pulled
# towards the overall mean label, weighted by a sigmoid of the country's row
# count (steepness c = 4, midpoint nmid = 10).
c = 4
nmid = 10
y_avg = data['label'].mean()

by_country = data.groupby('Country')
y_catvar = by_country['label'].mean()
num_instances_catvar = by_country.size()

sigmoid_denominator = 1 + np.exp(-(num_instances_catvar - nmid) / c)
y_catvar_smooth = y_avg + (y_catvar - y_avg) / sigmoid_denominator
data['Country_target_encoded'] = data['Country'].map(y_catvar_smooth)

In [63]:
# Display data with the new Country_target_encoded column.
data

Unnamed: 0,Country,a2,Area Income,label,Country_target_encoded
0,a,1.2,1.0,1,0.706401
1,a,2.2.2,1.0,1,0.706401
2,b,2.1.2,3.0,0,0.6031


In [65]:
# Apply the encoding fitted on `data` to `test`. Countries that are absent
# from the fitted mapping come out as NaN.
encoded_country = test['Country'].map(y_catvar_smooth)
test['Country_target_encoded'] = encoded_country

In [66]:
# Display test with the mapped Country_target_encoded column.
test

Unnamed: 0,Country,a2,Area Income,label,Country_target_encoded
0,a,1.2,,1,0.706401
1,c,2.2.2,1.0,1,
2,c,2.1.2,,0,


In [67]:
# Count missing values per column of test (closing parenthesis restored; the
# unbalanced call was a syntax error).
test.isnull().sum()

Country                   0
a2                        0
Area Income               2
label                     0
Country_target_encoded    2
dtype: int64

In [None]:
# log_Area: natural logarithm of the 'Area Income' column.
area_income = data['Area Income']
data['log_Area'] = np.log(area_income)
