# 1.导包

In [83]:
# OS
import os
import datetime
import pytz
from multiprocessing import Pool
import multiprocessing

#数据处理
import pandas as pd
# import ray.dataframe as pd
import numpy as np
import random
import sklearn.preprocessing as preprocessing
from scipy.special import boxcox1p

#可视化
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

print(multiprocessing.cpu_count())

12


# 2.数据载入、对齐、排序

In [108]:
data = pd.read_pickle('data/round1_train1')
data_test = pd.read_pickle('data/round1_test1')

In [85]:
# 给test数据拼接一个f26特征
data_test = pd.concat([
    data_test,
    pd.Series([0] * len(data_test),name = 'is_trade')
], axis = 1)

# 拼接    
data = data.append(data_test)

# 按时间排序，重置索引
data.reset_index(drop = True,inplace = True)
data.reset_index(inplace = True)
data.set_index('context_timestamp', drop = False,inplace=True)# f16转换为索引
data.sort_values(['context_timestamp'],inplace = True)

# 3.特征转换、衍生
## 3.1离散数据
### 3.1.1将出现次数少的值合并到统一类别中
- f6：将10（456）、2（347）、1（85）、11（21）、0（12）、17（1）、16（1）这几个取值单独拉出一个类。
- f9：将8（449）、7（245）、0（123）、6（116）、5（63）、4（33）3（11）、2（5）、1（1）这几个取值单独拉出一个类。
- f20：将23（353）、4（266）、2（87）、3（80）、1（20）、0（7）、25（4）这几个取值单独拉出一个类。
- f22：将5002（477）、5020（357）、5000（81）、5019（70）、5001（60）、4999（7）这几个取值单独拉出一个类。

In [86]:
# TODO

### 3.1.2onehot/dummy-trap
- f6广告商品的价格等级、f9广告商品被展示次数的等级、f11用户的预测性别编号、f12用户的预测年龄等级、f13用户的预测职业编号、f14用户的星级编号
- f7、f8、f17、f20、f22不确定

#### 3.1.2.1f2

In [87]:
def split_f2(data):
    # 切分
    data = pd.concat([
        data.loc[:,:'item_category_list'],
        data['item_category_list'].astype(np.str).str.split(';', expand=True),
        data.loc[:,'item_property_list':]
    ], axis = 1)
    data.rename({
        0:'item_category_list:1',
        1:'item_category_list:2',
        2:'item_category_list:3'
    }, axis='columns',inplace = True)

    # 类型转换
    data['item_category_list:3'].fillna('-1',inplace = True)
    data['item_category_list:1'] = data['item_category_list:1'].astype('int')
    data['item_category_list:2'] =data['item_category_list:2'].astype('int')
    data['item_category_list:3'] = data['item_category_list:3'].astype('int')

    # 对f2:2、f2:3进行onehot编码
    temp = pd.get_dummies(data['item_category_list:2'],prefix = 'item_category_list:2')  
    data = pd.concat([
        data.loc[:,:'item_category_list:2'],
        temp,
        data.loc[:,'item_category_list:3':]
    ], axis = 1)
    
    temp = pd.get_dummies(data['item_category_list:3'],prefix = 'item_category_list:3')  
    return pd.concat([
        data.loc[:,:'item_category_list:3'],
        temp,
        data.loc[:,'item_property_list':]
    ], axis = 1)

data = split_f2(data)

#### 3.1.2.2f11

In [88]:
def onehot_f11(data):
    temp = pd.get_dummies(data['user_gender_id'])  
    temp = temp.rename({
        0.0:'user_gender_id:woman',
        1.0:'user_gender_id:man',
        2.0:'user_gender_id:family',
        -1.0:'user_gender_id:other'
    },axis = 'columns')
    return pd.concat([
        data.loc[:,:'user_gender_id'],
        temp,
        data.loc[:,'user_age_level':]
    ], axis = 1)

data = onehot_f11(data)

#### 3.1.2.3f13

In [89]:
def onehot_f13(data):
    temp = pd.get_dummies(data['user_occupation_id']).rename({
        2002.0:'user_occupation_id:2002.0',
        2003.0:'user_occupation_id:2003.0',
        2004.0:'user_occupation_id:2004.0',
        2005.0:'user_occupation_id:2005.0'
    },axis = 'columns')
    return pd.concat([
        data.loc[:,:'user_occupation_id'],
        temp,
        data.loc[:,'user_star_level':]
    ], axis = 1)

data = onehot_f13(data)

- 将f11、f13进行onehot编码后，xgboost的验证集loss由0.084306减小到0.084273
- 将f2;2用onehot表示后，xgboost的验证集loss由0.0775减小到0.07373
- 将f5用onehot表示后，xgboost的验证集loss由0.086977提高到0.087148，没用

### 3.1.3自然数编码
- 消耗内存小，训练时间快，但是相比one-hot特征的质量不高，含了一个假设：不同的类别之间，存在一种顺序关系。

### 3.1.4聚类编码
- 和独热编码相比，聚类编码试图充分利用每一列0与1的信息表达能力。聚类编码时一般需要特定的专业知识（domain knowledge），例如ZIP码可以根据精确度分层为ZIP3、ZIP4、ZIP5、ZIP6，然后按层次进行编码。

## 3.2连续数据
### 3.2.1标准化、归一化：分布太宽，做一下scaling

In [90]:
# scaling
# def scaling(data):
#     for i in range(0,8):
#         data['f12'] = data['f12'].replace(1000. + i,0 + i)
#     for i in range(0,11):
#         data['f14'] = data['f14'].replace(3000. + i,0 + i)
#     for i in range(0,20):
#         data['f17'] = data['f17'].replace(4001. + i,0 + i)
#     for i in range(0,22):
#         data['f22'] = data['f22'].replace(4999. + i,0 + i)
#         return data

# data = scaling(data)
# if(flag is False):
#     data_test = scaling(data_test)

- 将f112、f14、f17、f22进行范围缩放后，xgboost的验证集loss由0.0775没变，没用。

### 3.2.2正态化：对偏度大于0.75的数值特征（长尾分布）
- 用log1p函数进行转化使其更加服从高斯分布
np.log1p(train.SalePrice)
- Box-Cox变换

In [91]:
# f7:2 f8:2.5 f9:5 f20:1.5
# def boxcox(data):
#     data['f7'] = boxcox1p(data['f7'],2)
#     data['f8'] = boxcox1p(data['f8'],2.5)
#     data['f9'] = boxcox1p(data['f9'],5)
#     data['f20'] = boxcox1p(data['f20'],1.5)
#     return data

# data = boxcox(data)
# if(flag is False):
#     data_test = boxcox(data_test)

### 3.2.3离散化：Binning
- 只有在了解属性的领域知识的基础，确定属性能够划分成简洁的范围时分箱才有意义，即所有的数值落入一个分区时能够呈现出共同的特征。
- 当不想让模型总是尝试区分值之间是否太近时，分区可以避免出现过拟合。



- 正态化后，xgboost的验证集loss不变，没用

### 3.2.4时间数据的转换

In [92]:
def transform_f16(data):
    now = pd.to_datetime(data['context_timestamp'],unit='s',utc = True)
    now = now.apply(lambda x:x.astimezone(pytz.timezone('Asia/Shanghai')))
    
    # year = pd.Series([-1] * 477303)
    # month = pd.Series([-1] * 477303)
    day = now.apply(lambda x:x.day).rename('context_timestamp:day')
    hour = now.apply(lambda x:x.hour).rename('context_timestamp:hour')
    minute = now.apply(lambda x:x.minute).rename('context_timestamp:minute')
    second = now.apply(lambda x:x.second).rename('context_timestamp:second')
    dayofweek = now.apply(lambda x:x.dayofweek).rename('context_timestamp:dayofweek')

    data = pd.concat([
        data.loc[:,:'context_timestamp'],
        day,
        hour,
        minute,
        second,
        dayofweek,
        data.loc[:,'context_page_id':]
    ], axis = 1)
    data.rename({'is_trade':'label'}, axis='columns',inplace = True)
    return data

data = transform_f16(data)

- 转换时间f16为多个特征并删除特征f16后，xgboost的验证集loss较明显的降低
- 加入dayofweek特征后，xgboost的验证集loss没变，没用

## 3.3高势集数据（High Categorical）（f1、f3、f4、f10、f18、f19）
### 3.3.1高势集类别进行经验贝叶斯转换成数值feature
### 3.3.2平均数编码
- 平均数编码（mean encoding），针对高基数类别特征的有监督编码。当一个类别特征列包括了极多不同类别时（如家庭地址，动辄上万）时，可以采用。优点：和独热编码相比，节省内存、减少算法计算时间、有效增强模型表现。


- 将f2属性拆分成三个子属性后，xgboost的验证集loss不变，没用

### 3.3.3word embedding
#### 3.3.3.1抽取f3数据

In [93]:
def func(x):
    res = []
    for now in x.split(';'):
        if(len(now.split(':')) != 1):
            res += now.split(':')[1].split(',')
    if('-1' in res):
        # print(1)
        here = set(res)
        here.remove('-1')
        return list(here)
    else:
        return list(set(res))
predict_category_property = data['predict_category_property'].apply(func).tolist()

item_property_list = data['item_property_list'].apply(lambda x:x.split(';')).tolist()

In [94]:
predict_category_property_file = open('data/predict_category_property','w')
assert len(predict_category_property) == len(item_property_list)
for i in range(len(predict_category_property)):
    if(len(predict_category_property[i]) == 0):
        predict_category_property_file.write('\n')
    for j in range(len(predict_category_property[i])):
        if(j == 0 and j == len(predict_category_property[i]) - 1):
            predict_category_property_file.write(predict_category_property[i][j] + '\n')
        elif(j == 0):
            predict_category_property_file.write(predict_category_property[i][j])
        elif(j == len(predict_category_property[i]) - 1):
            predict_category_property_file.write(' ' + predict_category_property[i][j] + '\n')
        else:
            predict_category_property_file.write(' ' + predict_category_property[i][j])

    if(len(item_property_list[i]) == 0):
        predict_category_property_file.write('\n')
    for j in range(len(item_property_list[i])):
        if(j == 0 and j == len(item_property_list[i]) - 1):
            predict_category_property_file.write(item_property_list[i][j] + '\n')
        elif(j == 0):
            predict_category_property_file.write(item_property_list[i][j])
        elif(j == len(item_property_list[i]) - 1):
            predict_category_property_file.write(' ' + item_property_list[i][j] + '\n')
        else:
            predict_category_property_file.write(' ' + item_property_list[i][j])

#### 3.3.5.3doc2vec

In [95]:
# 模型训练
from gensim.models.doc2vec import Doc2Vec,TaggedLineDocument
if not os.path.exists('data/item_property_list_doc_model'):
    sentences = TaggedLineDocument('data/predict_category_property')
    model = Doc2Vec(sentences,size=10, window=7, min_count=1, negative=3, hs=0)
    model.save('data/item_property_list_doc_model')  
else:
    model = Doc2Vec.load('data/item_property_list_doc_model')  

In [96]:
from numpy import linalg

def cos(vector1,vector2):  
    dot_product = 0.0;  
    normA = 0.0;  
    normB = 0.0;  
    for a,b in zip(vector1,vector2):  
        dot_product += a*b  
        normA += a**2  
        normB += b**2  
    if normA == 0.0 or normB==0.0:  
        return None  
    else:  
        return dot_product / ((normA*normB)**0.5)  

In [97]:
temp = data.reset_index(drop = True).apply(
    lambda x:
    cos(
        model.infer_vector(item_property_list[x.name]),
        model.infer_vector(predict_category_property[x.name])
       ),axis = 1)

In [98]:
temp.index = data.index
data = pd.concat([
    data.loc[:,:'shop_score_description'],
    temp.rename('embedding-similarity'),
    data.loc[:,'label']
],axis = 1)
del temp

### 3.3.4计算Jaccard相似度

In [99]:
def func(x):
    res = []
    for cat in x['predict_category_property'].split(';'):
        for pro in cat.split(':')[1].split(','):
            if(pro != '-1'):
                res.append(pro)
    return set(res)

def get_jaccard(real,predict):
    temp = pd.Series([-1.0] * len(real))
    for i in range(len(temp)):
        temp[i] = len(real.iloc[i] & predict.iloc[i]) * 1.0 / len(real.iloc[i] | predict.iloc[i])
    return temp

In [100]:
predict = data.apply(func,axis = 1)
real = data.apply(lambda x:set(x['item_property_list'].split(';')),axis = 1)
jaccard = get_jaccard(real,predict)

jaccard.index = data.index

data = pd.concat([
    data.loc[:,:'embedding-similarity'],
    jaccard.rename('f3-f18-jaccard',inplace = True),
    data.loc[:,'label']
],axis = 1)

## 3.4保存特征转换、衍生的结果

In [101]:
data.to_pickle('data/data')

- 用了embedding处理f3后，验证集的logloss由0.081953下降到了0.081787

# 4.特征组合

In [90]:
days = 5 # 滑动窗口大小
day_num = 8

## 4.1当前点击前若干天购买次数

In [91]:
def get_count_temp(i,name):
    temp = data[
        (data['context_timestamp:day'] >= 18 + i - days) & 
        (data['context_timestamp:day'] <= 17 + i)]\
    .groupby(name).apply(lambda x:x['label'].sum())  # 前n天出现商品的转化率=
    
    if(len(name) == 1):
        name = name[0]
        res = data[data['context_timestamp:day'] == i + 18][name].apply(lambda x:temp[x] if x in temp else -1)
        del temp
        return res
    else:
        res = data[data['context_timestamp:day'] == i + 18].apply(lambda x:temp[(x[name[0]],x[name[1]])] if (x[name[0]],x[name[1]]) in temp else -1,axis = 1)
        del temp
        return res
    
def get_count(name):
    temp = pd.Series([])
    for i in range(day_num):
        now = get_count_temp(i,name)
        temp = temp.append(now)
    return temp

In [93]:
# 5 min
data_combination = pd.DataFrame()

start = datetime.datetime.now()
if(not os.path.exists('data/myfeature/buy_count_' + str(days))):
    features = ['item_id','item_brand_id','item_city_id','user_id','shop_id']
    res = {}
    
    # 并发执行
    pros = Pool()
    for feature in features:
        res[feature] = pros.apply_async(get_count,(list((feature,)),))
            
    for i in range(len(features)):
        for j in range(i + 1,len(features)):
            feature = features[i] + '-' + features[j]
            res[feature] = pros.apply_async(get_count,(list((features[i],features[j])),))
    pros.close()
    pros.join()
    
    # 组合
    for now in res:
        data_combination = pd.concat([
            data_combination,
            res[now].get().rename(now + '-buy-count-' + str(days))
        ],axis = 1)

        # 保存
    data_combination.to_pickle('data/myfeature/buy_count_' + str(days))

else:
    data_combination = pd.read_pickle(
        'data/myfeature/buy_count_' + str(days))
print(datetime.datetime.now() - start)

0:05:06.909757


## 4.2当前点击前若干天浏览广告次数

In [94]:
def get_browse_count_temp(i,name):
    temp = data[
        (data['context_timestamp:day'] >= 18 + i - days) & 
        (data['context_timestamp:day'] <= 17 + i)
    ].groupby(name).apply(lambda x:len(x))  # 前n天出现商品的转化率=
    
    if(len(name) == 1):
        name = name[0]
        res = data[data['context_timestamp:day'] == i + 18][name]\
        .apply(lambda x:temp[x] if x in temp else -1)
        del temp
        return res
    else:
        res = data[data['context_timestamp:day'] == i + 18]\
    .apply(lambda x:temp[(x[name[0]],x[name[1]])] if (x[name[0]],x[name[1]]) in temp else -1,axis = 1)
        del temp
        return res
    
def get_browse_count(name):
    temp = pd.Series([])
    for i in range(day_num):
        now = get_browse_count_temp(i,name)
        temp = temp.append(now)
        del now
    return temp

In [95]:
# 2.5min
start = datetime.datetime.now()
data_combination = pd.DataFrame()

if(not os.path.exists('data/myfeature/browse_count_' + str(days))):
    # 并发执行
    pros = Pool()
    res = {}
    
    for feature in features:
        res[feature] = pros.apply_async(get_browse_count,(list((feature,)),))
            
    for i in range(len(features)):
        for j in range(i + 1,len(features)):
            feature = features[i] + '-' + features[j]
            res[feature] = pros.apply_async(get_browse_count,(list((features[i],features[j])),))
    pros.close()
    pros.join()
    
    # 组合
    for now in res:
        data_combination = pd.concat([
            data_combination,
            res[now].get().rename(now + '-browse-count-' + str(days))
        ],axis = 1)

    # 保存
    data_combination.to_pickle('data/myfeature/browse_count_' + str(days))
else:
    data_combination = pd.read_pickle('data/myfeature/browse_count_' + str(days))
print(datetime.datetime.now() - start)

0:02:17.534638


## 4.3当前点击前若干天转化率

In [111]:
data_combination = pd.DataFrame()
if(not os.path.exists('data/myfeature/ratio_' + str(days))):
    temp = pd.concat([
        pd.read_pickle('data/myfeature/buy_count_' + str(days)),
        pd.read_pickle('data/myfeature/browse_count_' + str(days)),
    ],axis = 1)
    temp[temp == -1] = np.nan
    data_combination = pd.DataFrame()
    
    for feature in features:
        data_combination = pd.concat([
            data_combination,
            (temp[feature + '-buy-count-' + str(days)]/temp[feature + '-browse-count-' + str(days)]).rename(feature + '-ratio-' + str(days))
        ],axis = 1)
            
    for i in range(len(features)):
        for j in range(i + 1,len(features)):
            feature = features[i] + '-' + features[j]
            data_combination = pd.concat([
                data_combination,
                (temp[feature + '-buy-count-' + str(days)]/temp[feature + '-browse-count-' + str(days)]).rename(feature + '-ratio-' + str(days))
            ],axis = 1)
    
    data_combination.to_pickle('data/myfeature/ratio_' + str(days))

else:
    data_combination = pd.read_pickle('data/myfeature/ratio_' + str(days))

- 加入浏览次数特征后，xgb的logloss由0.08201下降到0.081756
- 将f19进行转换为点击概率后，xgboost的验证集loss由0.084273减小到0.080542
- 将f1进行转换为点击概率后，xgboost的验证集loss由0.080542减小到0.0775
- 将f5进行转换为点击概率后，lgb的验证集loss由0.08287减小到0.08221 
- 将f10进行转换为点击概率后，lgb的验证集loss由0.08221减小到0.0821587

- 加上f1-f10-label、f1-f4-label、f1-f5-label、f1-f19-label后，lbg的logloss从0.082215下降到0.0820568
- 加上f4-f5-label、f4-f10-label、f4-f19-label、f5-f10-label、f5-f19-label、f10-f19后，lbg的logloss从0.0820568下降到0.0819415

## 4.4当前点击前若干小时的浏览次数

In [100]:
length = 1 * 60 * 60 # 取前1个小时

In [103]:
def get_browse_count_hour(name):
    now = data.groupby(name)
    
    def func1(x,name):
        if(len(name) == 1):
            name = name[0]
            return len(now.get_group(x[name]).loc[x.name - length:x.name - 1])
        else:
            return len(now.get_group((x[name[0]],x[name[1]])).loc[x.name - length:x.name - 1])
    
    res =  data.apply(func1,axis = 1,args = (name,))
    del now

    return res

In [104]:
# 3.5小时
start = datetime.datetime.now()

if(not os.path.exists('data/myfeature/browse_count_hour_ago_' + str(length))):
    res = {}
    features = ['item_id','item_brand_id','item_city_id','user_id','shop_id']
    data_combination = pd.DataFrame()
    
    # 并发执行
    pros = Pool()
    for feature in features:
        res[feature] = pros.apply_async(get_browse_count_hour,(list((feature,)),))
            
    for i in range(len(features)):
        for j in range(i + 1,len(features)):
            feature = features[i] + '-' + features[j]
            res[feature] = pros.apply_async(get_browse_count_hour,(list((features[i],features[j])),))

    pros.close()
    pros.join()
    
    # 组合
    for now in res:
        data_combination = pd.concat([
            data_combination,
            res[now].get().rename(now + '-browse-count-hour-ago-' + str(length))
        ],axis = 1)

    del res

    # 保存
    data_combination.to_pickle('data/myfeature/browse_count_hour_ago_' + str(length))
else:
    data_combination = pd.read_pickle('data/myfeature/browse_count_hour_ago_' + str(length))
print(datetime.datetime.now() - start)

3:32:48.059314


## 4.5展示、收藏、销量之间的比例

In [135]:
if(not os.path.exists('data/myfeature/proportion')):
    data_combination = pd.DataFrame()
    
    # 展示（f9）-收藏（f8）
    data_combination = pd.concat([
        data_combination,
        (data['item_collected_level']/data['item_pv_level']).rename('item_collected_level-item_pv_level-proportion')
    ],axis = 1)
    
    # 收藏（f8）-销量（f7）
    data_combination = pd.concat([
        data_combination,
        (data['item_sales_level']/data['item_collected_level']).rename('item_sales_level-item_collected_level-proportion')
    ],axis = 1)

    # 展示（f9）-销量（f7）
    data_combination = pd.concat([
        data_combination,
        (data['item_sales_level']/data['item_pv_level']).rename('item_sales_level-item_pv_level-proportion')
    ],axis = 1)

    data_combination.to_pickle('data/myfeature/proportion')

else:
    data_combination = pd.read_pickle('data/myfeature/proportion')

## 4.6上次到这次浏览
### 4.6.1到这次浏览的时间间隔

In [137]:
def get_last_browse_time_interval(feature):
    now = data.groupby(feature)
    
    def func(x,feature):
        if(len(feature) == 1):
            feature = x[feature[0]]
        else:
            feature = tuple((x[feature[0]],x[feature[1]]))
            
        here = now.get_group(feature).loc[:x.name]
        if(len(here) >= 2):
            return x['context_timestamp'] - here.iloc[-2]['context_timestamp']
        return np.nan

    res = data.apply(func,args = (feature,),axis = 1)
    del now
    return res

In [None]:
# 3小时
start = datetime.datetime.now()

if(not os.path.exists('data/myfeature/last_browse_time_interval')):
    res = {}
    features = ['item_id','item_brand_id','item_city_id','user_id','shop_id']
    data_combination = pd.DataFrame()
    
    # 并发执行
    pros = Pool()
    for feature in features:
        res[feature] = pros.apply_async(get_last_browse_time_interval,args = (list((feature,)),))
        
    for i in range(len(features)):
        for j in range(i + 1,len(features)):
            feature = features[i] + '-' + features[j]
            res[feature] = pros.apply_async(get_last_browse_time_interval,args = (list((features[i],features[j])),))
    pros.close()
    pros.join()
    
    # 组合
    for now in res:
        data_combination = pd.concat([
            data_combination,
            res[now].get().rename(now + '-last-browse-time-interval')
        ],axis = 1)
        
    # 保存
    data_combination.to_pickle('data/myfeature/last_browse_time_interval')
else:
    data_combination = pd.read_pickle('data/myfeature/last_browse_time_interval')

print(datetime.datetime.now() - start)

### 4.6.2上次浏览时间

In [140]:
if(not os.path.exists('data/myfeature/last_browse_time')):
    data_combination = pd.DataFrame()
    temp = pd.read_pickle('data/myfeature/last_browse_time_interval')
    
    for feature in features:
        data_combination = pd.concat([
            data_combination,
            (data['context_timestamp'] - temp[feature + '-last-browse-time-interval']).rename(feature + '-last-browse-time')
        ],axis = 1)
    
    for i in range(len(features)):
        for j in range(i + 1,len(features)):
            feature = features[i] + '-' + features[j]
            data_combination = pd.concat([
                data_combination,
                (data['context_timestamp'] - temp[feature + '-last-browse-time-interval']).rename(feature + '-last-browse-time')
            ],axis = 1)
    
    data_combination.to_pickle('data/myfeature/last_browse_time')
    
else:
    data_combination = pd.read_pickle('data/myfeature/last_browse_time')

## 4.7这次浏览到下次浏览
### 4.7.1到下次浏览时间间隔

In [8]:
def get_next_browse_time_interval(feature):
    now = data.groupby(feature)

    def func(x,feature):
        if(len(feature) == 1):
            feature = x[feature[0]]
        else:
            feature = tuple((x[feature[0]],x[feature[1]]))
            
        here = now.get_group(feature).loc[x.name:]
        if(len(here) >= 2):
            return here.iloc[1]['context_timestamp'] - x['context_timestamp']
        return np.nan

    res = data.apply(func,args = (feature,),axis = 1)
    del now
    return res

In [None]:
# 3小时
start = datetime.datetime.now()

if(not os.path.exists('data/myfeature/next_browse_time_interval')):
    data_combination = pd.DataFrame()
    res = {}
    features = ['item_id','item_brand_id','item_city_id','user_id','shop_id']
    
    # 并发执行
    pros = Pool()
    for feature in features:
        res[feature] = pros.apply_async(get_next_browse_time_interval,args = (list((feature,)),))
 
    for i in range(len(features)):
        for j in range(i + 1,len(features)):
            feature = features[i] + '-' + features[j]
            res[feature] = pros.apply_async(get_next_browse_time_interval,args = (list((features[i],features[j])),))
    pros.close()
    pros.join()
    
    # 组合
    for now in res:
        data_combination = pd.concat([
            data_combination,
            res[now].get().rename(now + '-next-browse-time-interval')
        ],axis = 1)
    
    del res
    
    # 保存
    data_combination.to_pickle('data/myfeature/next_browse_time_interval')
else:
    data_combination = pd.read_pickle('data/myfeature/next_browse_time_interval')

print(datetime.datetime.now() - start)

### 4.7.2下次浏览时间

In [None]:
if(not os.path.exists('data/myfeature/next_browse_time')):
    data_combination = pd.DataFrame()
    temp = pd.read_pickle('data/myfeature/next_browse_time_interval')
    
    for feature in features:
        data_combination = pd.concat([
            data_combination,
            (data['context_timestamp'] + temp[feature + '-next-browse-time-interval']).rename(feature + '-next-browse-time')
        ],axis = 1)
    
    for i in range(len(features)):
        for j in range(i + 1,len(features)):
            feature = features[i] + '-' + features[j]
            data_combination = pd.concat([
                data_combination,
                (data['context_timestamp'] + temp[feature + '-next-browse-time-interval']).rename(feature + '-next-browse-time')
            ],axis = 1)
    
    data_combination.to_pickle('data/myfeature/next_browse_time')
else:
    data_combination = pd.read_pickle('data/myfeature/next_browse_time')

## 4.8动作前后浏览量（leak）
### 4.8.1当日

In [None]:
start = datetime.datetime.now()

if(not os.path.exists('data/myfeature/browse_count_today')):
    res = []
    features = ['item_id','item_brand_id','item_city_id','user_id','shop_id']
    data_combination = pd.DataFrame()
    
    for feature in features:
        temp = data.groupby([feature,'context_timestamp:day']).apply(lambda x:len(x)).to_dict()
        data_combination = pd.concat([
            data_combination,
            data.apply(lambda x,dic:dic[(x[feature],x['context_timestamp:day'])],axis = 1,args = (temp,)).rename(feature + '-browse-count-today')
        ],axis = 1)
    
    for i in range(len(features)):
        for j in range(i + 1,len(features)):
            feature = features[i] + '-' + features[j]
            temp = data.groupby([features[i],features[j],'context_timestamp:day']).apply(lambda x:len(x)).to_dict()
            data_combination = pd.concat([
                data_combination,
                data.apply(lambda x,dic:dic[(x[features[i]],x[features[j]],x['context_timestamp:day'])],axis = 1,args = (temp,)).rename(feature + '-browse-count-today')
            ],axis = 1)
    
    # 保存
    data_combination.to_pickle('data/myfeature/browse_count_today')
else:
    data_combination = pd.read_pickle('data/myfeature/browse_count_today')
print(datetime.datetime.now() - start)

### 4.8.2当小时

In [None]:
start = datetime.datetime.now()
if(not os.path.exists('data/myfeature/browse_count_tohour')):
    res = []
    data_combination = pd.DataFrame()
    
    for feature in features:
        temp = data.groupby([feature,'context_timestamp:day','context_timestamp:hour']).apply(lambda x:len(x)).to_dict()
        data_combination = pd.concat([
            data_combination,
            data.apply(lambda x,dic:dic[(x[feature],x['context_timestamp:day'],x['context_timestamp:hour'])],axis = 1,args = (temp,)).rename(feature + '-browse-count-tohour')
        ],axis = 1)
    
    for i in range(len(features)):
        for j in range(i + 1,len(features)):
            feature = features[i] + '-' + features[j]
            temp = data.groupby([features[i],features[j],'context_timestamp:day','context_timestamp:hour']).apply(lambda x:len(x)).to_dict()
            data_combination = pd.concat([
                data_combination,
                data.apply(lambda x,dic:dic[(x[features[i]],x[features[j]],x['context_timestamp:day'],x['context_timestamp:hour'])],axis = 1,args = (temp,)).rename(feature + '-browse-count-tohour')
            ],axis = 1)
    
    # 保存
    data_combination.to_pickle('data/myfeature/browse_count_tohour')
else:
    data_combination = pd.read_pickle('data/myfeature/browse_count_tohour')
print(datetime.datetime.now() - start)

## 4.9冷启动特征（leak）
### 4.9.1当天

In [None]:
# 2min
start = datetime.datetime.now()
if(not os.path.exists('data/myfeature/day_browse_is_last')):
    res = []
    data_combination = pd.DataFrame()
    
    # 执行
    # 是否第一次
    dic = data.groupby(['user_id','context_timestamp:day']).apply(lambda x:x.iloc[0].name).to_dict() # index？
    temp = data.apply(lambda x,dic:1 if x.name == dic[(x['user_id'],x['context_timestamp:day'])] else 0,axis = 1,args = (dic,))
    data_combination = pd.concat([
        data_combination,
        temp.rename('user_id-day-browse-is-first')
    ],axis = 1)
       
    # 是否最后一次
    dic = data.groupby(['user_id','context_timestamp:day']).apply(lambda x:x.iloc[-1].name).to_dict() # index？
    temp = data.apply(lambda x,dic:1 if x.name == dic[(x['user_id'],x['context_timestamp:day'])] else 0,axis = 1,args = (dic,))
    data_combination = pd.concat([
        data_combination,
        temp.rename('user_id-day-browse-is-last')
    ],axis = 1)

    # 保存
    data_combination.to_pickle('data/myfeature/day_browse_is_last')

else:
    data_combination = pd.read_pickle('data/myfeature/day_browse_is_last')
print(datetime.datetime.now() - start)

### 4.9.2当小时

In [None]:
# 2min
start = datetime.datetime.now()
if(not os.path.exists('data/myfeature/hour_browse_is_last')):
    res = []
    data_combination = pd.DataFrame()

    # 执行
    # 是否第一次
    dic = data.groupby(['user_id','context_timestamp:day','context_timestamp:hour']).apply(lambda x:x.iloc[0].name).to_dict() # index？
    temp = data.apply(lambda x,dic:1 if x.name == dic[(x['user_id'],x['context_timestamp:day'],x['context_timestamp:hour'])] else 0,axis = 1,args = (dic,))

    data_combination = pd.concat([
        data_combination,
        temp.rename('user_id-hour-browse-is-first')
    ],axis = 1)

    # 是否最后一次
    dic = data.groupby(['user_id','context_timestamp:day','context_timestamp:hour']).apply(lambda x:x.iloc[-1].name).to_dict() # index？
    temp = data.apply(lambda x,dic:1 if x.name == dic[(x['user_id'],x['context_timestamp:day'],x['context_timestamp:hour'])] else 0,axis = 1,args = (dic,))

    data_combination = pd.concat([
        data_combination,
        temp.rename('user_id-hour-browse-is-last')
    ],axis = 1)
    
    # 保存
    data_combination.to_pickle('data/myfeature/hour_browse_is_last')
else:
    data_combination = pd.read_pickle('data/myfeature/hour_browse_is_last')
print(datetime.datetime.now() - start)

## 4.10排序特征

In [None]:
# def get_sort(feature_now,temp):
#     if(len(feature_now) == 2):
#         return data.apply(lambda x:temp[x.loc[feature_now[0]], x.loc[feature_now[1]], x.name]['order'],axis = 1)
#     else:
#         return data.apply(lambda x:temp[x.loc[feature_now[0]], x.loc[feature_now[1]], x.loc[feature_now[2]], x.name]['order'],axis = 1)

# def get_sort_test(feature_now,temp):
#     if(len(feature_now) == 2):
#         return data_test.apply(lambda x:temp[x.loc[feature_now[0]], x.loc[feature_now[1]], x.name  + length_data]['order'],axis = 1)
#     else:
#         return data_test.apply(lambda x:temp[x.loc[feature_now[0]], x.loc[feature_now[1]], x.loc[feature_now[2]], x.name + length_data]['order'],axis = 1)

In [None]:
# start = datetime.datetime.now()
# if(not os.path.exists('data/myfeature/order') or not os.path.exists('data/myfeature/order_test')):
#     res = {}
#     res_test = {}
#     features = ['f1','f4','f5','f10','f19']
#     length_data = len(data)
#     data_test['f5'].fillna(-1,inplace = True)
    
#     # 执行
#     pros = Pool()
#     for feature in features:
#         feature_now = (feature,'f16:day') # 按天排序
#         print(feature_now)
#         temp = data.append(data_test).reset_index(drop = True).groupby(feature_now) \
#             .apply(lambda x:pd.concat([pd.Series(range(len(x)),index = x.index).rename('order'),x],axis = 1)).to_dict('index')
        
#         res[feature] = pros.apply_async(get_sort,args = (feature_now,temp))
#         if(flag is False):
#             res_test[feature] = pros.apply_async(get_sort_test,args = (feature_now,temp))
            
#     for i in range(len(features)):
#         for j in range(i + 1,len(features)):
#             feature_now = (features[i],features[j],'f16:day') # 按天排序
#             print(feature_now)
#             temp = data.append(data_test).reset_index(drop = True).groupby(feature_now) \
#                 .apply(lambda x:pd.concat([pd.Series(range(len(x)),index = x.index).rename('order'),x],axis = 1)).to_dict('index')
                
#             res[feature] = pros.apply_async(get_sort,args = (feature_now,temp))
#             if(flag is False):
#                 res_test[feature] = pros.apply_async(get_sort_test,args = (feature_now,temp))
            
#     pros.close()
#     pros.join()
    
#     for feature in res:
#         data_combination = pd.concat([data_combination,res[feature].get().rename(feature + '-sort-by-day')],axis = 1)
#         if(flag is False):
#             data_combination_test = pd.concat([data_combination_test,res_test[feature].get().rename(feature + '-sort-by-day')], axis = 1)
            
#     # 保存
#     data_combination.to_pickle('data/myfeature/order')
#     if(flag is False):
#         data_combination_test.to_pickle('data/myfeature/order_test')
# else:
#     data_combination = pd.read_pickle('data/myfeature/order')
#     data_combination_test = pd.read_pickle('data/myfeature/order_test')
# print(datetime.datetime.now() - start)

## 4.11贝叶斯平滑后的转化率

In [7]:
# import numpy
# import random
# import scipy.special as special


# class BayesianSmoothing(object):
#     def __init__(self, alpha, beta):
#         self.alpha = alpha
#         self.beta = beta

#     def sample(self, alpha, beta, num, imp_upperbound):
#         sample = numpy.random.beta(alpha, beta, num)
#         I = []
#         C = []
#         for clk_rt in sample:
#             imp = random.random() * imp_upperbound
#             imp = imp_upperbound
#             clk = imp * clk_rt
#             I.append(imp)
#             C.append(clk)
#         return I, C

#     def update(self, imps, clks, iter_num, epsilon):
#         for i in range(iter_num):
#             new_alpha, new_beta = self.__fixed_point_iteration(imps, clks, self.alpha, self.beta)
#             if abs(new_alpha-self.alpha) < epsilon and abs(new_beta - self.beta) < epsilon:
#                 break
#             self.alpha = new_alpha
#             self.beta = new_beta

#     def __fixed_point_iteration(self, imps, clks, alpha, beta):
#         numerator_alpha = 0.0
#         numerator_beta = 0.0
#         denominator = 0.0

#         for i in range(len(imps)):
#             numerator_alpha += (special.digamma(clks[i]+alpha) - special.digamma(alpha))
#             numerator_beta += (special.digamma(imps[i]-clks[i]+beta) - special.digamma(beta))
#             denominator += (special.digamma(imps[i]+alpha+beta) - special.digamma(alpha+beta))

#         return alpha*(numerator_alpha/denominator), beta*(numerator_beta/denominator)

In [None]:
# def smooth(feature,data,data_type):
#     print(feature)
#     bs = BayesianSmoothing(1, 1)
#     bs.update(data[feature + 'browse-count-5'].values, data[feature + 'count-5'].values, 1000, 0.001)
#     print(feature + 'update成功')
#     temp = (data[feature + 'count-5'] + bs.alpha) / (data[feature + 'browse-count-5'] + bs.alpha + bs.beta)
#     temp.to_pickle('data/myfeature/' + feature + data_type)
#     return True

# features_to_smooth = [
#     'f1-','f4-','f5-','f10-','f19-',
#     'f1-f4-','f1-f5-','f4-f5-','f4-f19-','f5-f19-'
# ]

# for feature in features_to_smooth:
#     pros.apply_async(smooth, (feature,data,''))  #增加新的进程
    
# for feature in features_to_smooth:
#     pros.apply_async(smooth, (feature,data_test,'test'))  #增加新的进程
    
# pros.close() # 禁止在增加新的进程
# pros.join()
# print("pool process done")

f1-
f4-
f5-
f10-
f19-
f1-f4-
f1-f5-
f4-f5-


# 5.特征合并
## 5.1读取先前处理的特征

In [102]:
# 读取3.特征变化、衍生产生的data
data = pd.read_pickle('data/data')

# 读取4.特征组合产生的data_combination
data_combination = pd.DataFrame()
for file in os.listdir('data/myfeature'):
    if('.' not in file):
        temp = pd.read_pickle('data/myfeature/' + file)
        data_combination = pd.concat([data_combination,temp],axis = 1)
        
# 删除特征取值少的冗余特征
column_to_del = []
for column in data_combination.columns:
    if(len(data_combination[column].value_counts()) <= 2):
        column_to_del.append(column)

    # column_to_del += ['f1-f19-browse-count-5','f5-f10-browse-count-5']
print(column_to_del)
data_combination.drop(column_to_del[:-4],axis = 1,inplace = True)

['user_id-day-browse-is-first', 'user_id-day-browse-is-last', 'user_id-hour-browse-is-first', 'user_id-hour-browse-is-last']


In [103]:
# 拼接data_f3、data_test_f3、data_combination、data_combination_test
data = pd.concat([data.loc[:,:'f3-f18-jaccard'],data_combination,data.loc[:,'label']],axis = 1)

In [104]:
data.sort_values(['index'],inplace = True)
data.set_index('index', inplace=True)# f16转换为索引

# 6.特征选择
- 除非万不得已，不要用PCA或者LDA降维，直接减原始特征就行了。

## 6.1质量不好的特征
- 缺失的行特别多，弃用该列，超过15%缺失的特征应该予以删除！
- 质量都不错，最多的f12（0.027）

## 6.2冗余特征（相关性强的保留一个）
- 有些 Feature 之间可能存在线性关系，影响 Model 的性能。
- Feature越少，训练越快。

## 6.3无关特征
- f0样本编号：近似唯一
- f1广告商品编号
- f10用户编号
- f15上下文信息编号：完全唯一
- f19店铺编号

## 6.4无法直接用的特征

In [105]:
# data.drop('f16',axis = 1)
data.drop([
    'item_category_list',
    'item_category_list:1',
    'item_category_list:2',
    'item_category_list:3',
    'item_property_list',
    'user_gender_id',
    'user_occupation_id',
    'predict_category_property'], axis=1,inplace = True)

# 7.标签处理
- 上采样、下采样、分层采样。

# 8.保存结果

In [106]:
data.to_pickle('data/round1_train2_5')