In [None]:
import numpy as np
import pandas as pd
import catboost as cbt
import lightgbm as lgb
import time
import gc
from tqdm import tqdm
from sklearn.metrics import roc_auc_score,log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder,MinMaxScaler

from itertools import combinations,permutations

import warnings
warnings.filterwarnings('ignore')
from gensim.models import Word2Vec

from sklearn.decomposition import PCA

def reduce_mem_usage(data):
    '''
    Downcast the numeric columns of ``data`` to the smallest dtype that can
    hold each column's observed value range.

    Integer columns become the narrowest of int8/int16/int32/int64 whose
    range contains [min, max] (bounds are inclusive, so a column whose
    maximum is exactly 127 still fits int8).
    Float columns become float32 when their range fits, otherwise float64.
    float16 is never produced because feather cannot store that type.
    Note that float32 keeps roughly 7 significant decimal digits.

    data: input DataFrame; it is modified in place
    return: the same DataFrame object, after downcasting
    '''
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = data.memory_usage().sum() / 1024**2
    for col in tqdm(data.columns):
        col_type = data[col].dtypes
        if col_type not in numerics:
            # Non-numeric columns (object, datetime, ...) are left untouched.
            continue
        c_min = data[col].min()
        c_max = data[col].max()
        if str(col_type)[:3] == 'int':
            # Try the integer types from narrowest to widest.
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    data[col] = data[col].astype(candidate)
                    break
        else:
            limits = np.finfo(np.float32)
            # A NaN min/max (all-missing column) fails both comparisons and
            # therefore falls through to float64.
            if limits.min <= c_min and c_max <= limits.max:
                data[col] = data[col].astype(np.float32)
            else:
                data[col] = data[col].astype(np.float64)
    end_mem = data.memory_usage().sum() / 1024**2
    # Guard the percentage against a zero starting footprint.
    reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return data

In [None]:
# Read the three raw CSV files; the training target lives in its own file.
train, test, label = [pd.read_csv(path) for path in ('train.csv', 'test.csv', 'train_label.csv')]
# The target is attached by index alignment, not by a key join.
train['label'] = label['label']
# Stack train above test so that feature engineering sees both splits at once.
data = pd.concat([train, test], axis=0, sort=False, ignore_index=True)

In [None]:
# Parse the timestamp and derive hour / day-of-month columns from it.
data['date'] = pd.to_datetime(data['date'])
data['hour'] = data['date'].dt.hour
data['day'] = data['date'].dt.day

# Column groups by name prefix.
A_ft = ['A%d' % i for i in (1, 2, 3)]
B_ft = ['B%d' % i for i in (1, 2, 3)]
C_ft = ['C%d' % i for i in (1, 2, 3)]
D_ft = ['D1', 'D2']

# Categorical features: the A/B/C groups, the time parts and selected E columns.
categorical_e_ids = [1, 6, 14, 20, 27,
                     4, 11, 12, 24, 26, 28,
                     8, 15, 18, 25,
                     23, 29]
cat_list = A_ft + B_ft + C_ft + ['hour', 'day'] + ['E%d' % i for i in categorical_e_ids]

# Numerical features: everything that is not an id, the target, the date,
# a categorical column or one of the D columns.
non_numerical = set(['ID', 'label', 'date', 'D1', 'D2'] + cat_list)
numerical_cols = [col for col in data.columns if col not in non_numerical]

# Rescaling. Original note: no effect on tree models, since exp is monotone
# and so keeps the ordering of each column. Statistics derived from these
# columns later are computed on the transformed values.
data[numerical_cols] = np.exp(data[numerical_cols])

# Concatenate two categorical columns as strings and label-encode the result
# into a single integer "cross" feature.
for left, right in [['E4','E11'],['E12','E24']]:
    cross_name = left + right + '_cross'
    joined = data[left].astype('str') + data[right].astype('str')
    lbl = LabelEncoder()
    data[cross_name] = lbl.fit_transform(list(joined.values))

# Row-wise summary statistics over the numerical columns.
for stat_name, stat_fn in [('mean', np.mean), ('std', np.std), ('min', np.min), ('max', np.max)]:
    data[stat_name + '_numerical1'] = stat_fn(data[numerical_cols], axis=1)

# Pairwise arithmetic crosses between selected numerical columns.

# Sums.
for cb in [('E2','E7'),('E9','E17'),('E5','E9')]:
    data[cb[0]+'_plus_'+cb[1]] = data[cb[0]] + data[cb[1]]

# Products. These must use `*`: with `+` every `_mul_` column would be an
# exact copy of the matching sum feature.
for cb in [('E2','E7'),('E9','E17'),('E19','E9'),('E7','E9')]:
    data[cb[0]+'_mul_'+cb[1]] = data[cb[0]] * data[cb[1]]

# Quotients. The "devide" spelling is kept because other cells refer to these
# column names. A zero denominator would produce +/-inf, which is mapped to
# NaN so it is treated as a missing value instead of distorting later
# row and group statistics.
for cb in [('E17','E9'),('E9','E17'),('E5','E9'),('E7','E2'),('E9','E2')]:
    ratio = data[cb[0]] / data[cb[1]]
    data[cb[0]+'_devide_'+cb[1]] = ratio.replace([np.inf, -np.inf], np.nan)
    
# Count (frequency) features: how many rows share each value of the column.
count_source_cols = A_ft + B_ft + C_ft + ['E%d' % i for i in [1,6,14,20,27]]
count_feature = []
for col in tqdm(count_source_cols):
    count_name = col + "_count"
    data[count_name] = data.groupby([col])[col].transform('count')
    count_feature.append(count_name)
    
# Embed categorical features as word vectors,
# then compute the similarity between the two columns.
def vec(data,col1,col2):
    '''
    Add a Word2Vec similarity feature between two categorical columns.

    Every row is treated as a two-token "sentence" made of the string forms
    of ``data[col1]`` and ``data[col2]``. A Word2Vec model is trained on
    those sentences and, for each row, the similarity between the row's two
    tokens is stored in the new column ``col1 + col2 + '_vec'``.

    data: DataFrame; modified in place
    col1, col2: names of the two columns to compare
    return: None
    '''
    tokens = pd.concat((data[col1],data[col2]), axis=1).astype(str)
    sentences = tokens.values.tolist()  # Word2Vec needs a list of token lists

    w2v_options = dict(hs=1, min_count=1, window=5, workers=6)
    try:
        # gensim >= 4 keyword names
        model = Word2Vec(sentences, vector_size=200, epochs=15, **w2v_options)
    except TypeError:
        # gensim < 4 used `size` / `iter` for the same settings
        model = Word2Vec(sentences, size=200, iter=15, **w2v_options)

    first_tokens = tokens.iloc[:, 0].tolist()
    second_tokens = tokens.iloc[:, 1].tolist()

    # The similarity depends only on the token pair, so compute it once per
    # distinct pair instead of once per row.
    pair_similarity = {}
    similarities = []
    for i in tqdm(range(len(data))):
        pair = (first_tokens[i], second_tokens[i])
        if pair not in pair_similarity:
            pair_similarity[pair] = model.wv.n_similarity([pair[0]], [pair[1]])
        similarities.append(pair_similarity[pair])
    data[col1+col2+'_vec'] = np.array(similarities)
    
# Similarity features for every column pair inside the A, B and C groups.
for group in (A_ft, B_ft, C_ft):
    for first_col, second_col in combinations(group, 2):
        vec(data, first_col, second_col)

In [None]:
# Rebuild the numerical column list from the current columns of `data`,
# so it also includes the features added since the first definition.
excluded_cols = set(['ID', 'label', 'date', 'D1', 'D2'] + cat_list)
numerical_cols = [col for col in data.columns if col not in excluded_cols]
numerical_cols

In [None]:
# Row-wise summary statistics over the refreshed numerical column list.
for stat_name, stat_fn in [('mean', np.mean), ('std', np.std), ('min', np.min), ('max', np.max)]:
    data[stat_name + '_numerical2'] = stat_fn(data[numerical_cols], axis=1)

In [None]:
# Group statistics: for each grouping key, broadcast the per-group
# mean / std / sum of every numerical column back onto the rows.
# All means are created first, then all stds, then all sums.
group_keys = ['A2','A3','B2','C1','C3','E1','E14','E20']
for agg in ('mean', 'std', 'sum'):
    for i in tqdm(group_keys):
        for j in numerical_cols:
            data[i + "_" + agg + "_" + j] = data.groupby([i])[j].transform(agg)

In [None]:
# Features that are NOT used for training.
# The list is the concatenation of three literals and contains many repeated
# names; it is used for membership tests when the final feature list is built.
# Some entries (e.g. 'A2_std_A2C2_vec') name columns that the visible
# feature-engineering cells never create.
# NOTE(review): presumably these came from an earlier feature set; confirm.
useless_ft = ['E22','E3','E19']+['A3_std_A3B2_vec', 'B2_std_A1B2_vec', 'A2_std_B2B3_vec',
       'B2_mean_A3_count', 'B2_std_A3_count', 'B2_std_B2_count',
       'B2_std_A1B1_vec', 'E14_std_E21', 'A2_mean_B3_count', 'E14_std_E5',
       'B2_std_A1A2_vec', 'A2_std_A2B1_vec', 'B2_std_E16',
       'B2_std_A2B1_vec', 'A2_mean_B2_count', 'B2_std_A1B3_vec',
       'A3_std_B1B3_vec', 'B2_std_B3_count', 'C1_std_C1_count',
       'B2_mean_A2A3_vec', 'B2_std_B1B3_vec', 'A3_mean_B3_count',
       'B2_mean_A1_count', 'E14_std_E3', 'A2_mean_B1B2_vec',
       'A3_std_B1_count', 'A3_std_B1B2_vec', 'A2_std_A2B2_vec',
       'B2_std_A1A3_vec', 'B2_std_A1B1_vec', 'E20_std_E20_count', 'E25',
       'B2_std_B2B3_vec', 'A3_std_A3B1_vec', 'B2_std_A3B1_vec',
       'B2_std_A2B2_vec', 'E1_std_E1_count', 'B2_std_A3B3_vec',
       'E14_std_E5_plus_E9', 'B2_std_E10', 'A2_std_A1A2_vec',
       'B2_std_E22', 'B2_std_A2A3_vec', 'A2_std_A2_count',
       'B2_std_A2B3_vec', 'A3_std_A3B3_vec', 'C3_std_C3_count',
       'C3_std_C3_count', 'E14_std_E3', 'A3_std_B3_count',
       'E1_std_E27_count', 'A2_std_E16', 'E14_std_E9_devide_E2',
       'E14_std_E14_count', 'D2', 'E14_std_E14_count', 'D2',
       'A3_std_B3_count', 'C3_std_C3_count', 'E1_std_E27_count',
       'A3_std_B3_count', 'E14_std_E14_count', 'C3_std_E20_count',
       'A3_std_A3B3_vec', 'A2_std_E16', 'E1_std_E27_count',
       'B2_std_A3B2_vec', 'A2_std_A1B3_vec', 'A2_std_E16',
       'B2_std_B1B3_vec', 'A2_std_A1B3_vec', 'A2_std_B1_count',
       'B2_mean_A1_count', 'A2_std_B1B2_vec', 'E1_std_E1_count',
       'B2_std_E10', 'C1_std_C1_count', 'B2_std_A3_count', 'E14_std_E21',
       'B2_std_A2B1_vec', 'A2_std_A2_count', 'A2_std_A2B1_vec',
       'B2_std_E16', 'A2_std_B1_count', 'E14_std_E13', 'A2_std_B1B2_vec',
       'E14_std_E13', 'E14_std_E27_count', 'C3_std_E20_count',
       'E14_std_E27_count', 'A3_mean_B1B3_vec', 'C3_std_E20_count',
       'B2_std_A2B3_vec', 'E14_std_E5_devide_E9', 'B2_mean_A2_count',
       'B2_std_B3_count', 'A2_mean_B1B3_vec', 'A2_std_A1B1_vec',
       'B2_std_A2_count', 'B2_std_B2B3_vec', 'A2_std_B1B3_vec',
       'A3_std_A3_count', 'B2_std_A2A3_vec', 'B2_std_A3B2_vec',
       'B2_std_A3B1_vec', 'B2_mean_A1A3_vec', 'A2_mean_B1_count',
       'A2_std_B2_count', 'B2_std_A2B2_vec', 'A2_std_A1A2_vec',
       'A3_std_B2_count', 'A2_mean_B2B3_vec', 'B2_std_E22', 'E21',
       'A2_std_A2C2_vec', 'A2_std_A1B2_vec', 'B2_mean_A2A3_vec',
       'A3_mean_B2B3_vec', 'E8', 'E20_std_E20_count', 'A3_mean_B1B2_vec',
       'A2_std_B3_count', 'B2_std_A1A3_vec', 'B2_std_A1_count',
       'E14_std_E1_count', 'A2_mean_B3_count', 'B2_mean_A2_count',
       'A3_mean_B3_count', 'B2_mean_A1A2_vec', 'E14_std_E3',
       'A2_mean_B1B2_vec', 'A2_std_A2B2_vec', 'E8', 'A3_mean_B1_count',
       'A3_mean_B2_count', 'B2_std_B1_count', 'A3_std_B1B2_vec',
       'B2_std_B1B2_vec', 'A2_std_B1B3_vec', 'A2_std_A2B3_vec',
       'A2_mean_B1B3_vec', 'A2_std_A1B2_vec', 'B2_std_A1B2_vec',
       'A2_mean_B3_count', 'A2_mean_B2B3_vec', 'A2_std_B1B3_vec',
       'E20_sum_max_numerical1', 'B2_std_B1_count', 'B2_std_A1A2_vec',
       'A2_std_A2B3_vec', 'A3_mean_B1_count', 'E14_std_E5',
       'A2_mean_B2_count', 'A3_mean_B1B2_vec', 'A3_mean_B2_count',
       'E14_std_E1_count', 'B2_mean_A1A2_vec', 'B2_std_A1A3_vec',
       'E14_std_E1_count', 'B2_std_A1B1_vec', 'B2_std_A1_count',
       'B2_std_A3B3_vec', 'A3_std_B1B3_vec', 'A2_std_B3_count',
       'B2_std_A1B3_vec', 'B2_std_A3B3_vec', 'A3_std_A3B2_vec',
       'B2_std_B2_count', 'A3_mean_B1B2_vec', 'A2_std_B2B3_vec',
       'B2_std_B1_count', 'A2_std_A1_count', 'A3_std_B2_count', 'E20',
       'A3_std_B2B3_vec', 'A3_mean_B1_count', 'A2_std_B2_count',
       'E14_std_E13', 'B2_mean_A1A3_vec', 'B2_std_A3B2_vec',
       'A2_mean_B1B3_vec', 'B2_std_A2A3_vec', 'A3_std_B1_count',
       'A3_std_A3_count', 'A2_mean_A1B2_vec', 'B2_std_A2_count',
       'A2_std_A1B1_vec', 'E8', 'B2_mean_A3_count', 'A3_mean_B2B3_vec',
       'A2_mean_B1_count', 'E20_std_E20_count', 'A2_std_A2_count',
       'A2_std_B1B2_vec', 'E14_std_E27_count', 'A2_mean_B1_count',
       'A3_mean_B1B3_vec', 'B2_std_A1A2_vec', 'A2_std_A1B3_vec',
       'A2_mean_B2_count', 'A3_std_B1B3_vec', 'B2_std_B1B3_vec',
       'A2_std_E16', 'B2_std_A2B3_vec', 'B2_mean_A1_count',
       'A3_std_B3_count', 'E1_std_E27_count', 'E14_std_E14_count',
       'B2_mean_A1A3_vec', 'B2_std_E10', 'B2_std_A2B1_vec',
       'C3_std_C3_count', 'A2_std_A2_count', 'E14_std_E13',
       'C3_std_E20_count', 'C3_std_E20_count', 'A2_std_E22',
       'E14_std_E13', 'B2_mean_A3_count', 'A3_std_A3B2_vec',
       'A2_std_B2_count', 'B2_mean_A3_count', 'A2_std_B1_count',
       'A2_std_B3_count', 'E14_std_E5_devide_E9', 'B2_mean_A2_count',
       'A3_mean_B3_count', 'A2_mean_B1B2_vec', 'A3_std_B1B2_vec',
       'B2_std_A2B2_vec', 'B2_std_A1B3_vec', 'A2_std_A1A2_vec',
       'B2_std_E22', 'A3_std_A3B1_vec', 'B2_mean_A2A3_vec',
       'B2_std_B3_count', 'E1_std_E1_count', 'E14_std_E5_plus_E9',
       'B2_std_A1B2_vec', 'A2_std_A1_count', 'A2_std_B2B3_vec',
       'C1_std_C1_count', 'B2_std_B2_count', 'A3_std_B2B3_vec',
       'B2_std_A3_count', 'E14_std_E21', 'A2_std_A2B1_vec', 'B2_std_E16',
       'A2_std_A2_count', 'B2_std_A2B1_vec', 'B2_std_E10',
       'C3_std_C3_count', 'A2_std_B1_count', 'B2_std_E16',
       'A2_std_A2B1_vec', 'E14_std_E21', 'A3_std_B1B2_vec',
       'A2_mean_B1B2_vec', 'B2_std_A3_count', 'A3_std_A3_count',
       'A3_mean_B3_count', 'C1_std_C1_count', 'B2_mean_A2_count',
       'E14_std_E5_plus_E9', 'E1_std_E1_count', 'A2_std_B3_count',
       'B2_std_A1_count', 'A3_std_A3B1_vec', 'B2_mean_A1A2_vec',
       'A3_mean_B2_count', 'B2_std_A1B3_vec', 'A2_mean_A1B2_vec',
       'B2_std_B1B2_vec', 'A2_std_A2B3_vec', 'E14_std_E5',
       'A3_mean_B2B3_vec', 'A2_std_A1B1_vec', 'A2_std_B1B2_vec',
       'B2_std_A2B2_vec', 'E14_std_E27_count', 'A3_std_B3_count',
       'B2_mean_A1_count', 'B2_std_A2B3_vec', 'B2_std_B1B3_vec', 'D2',
       'A3_std_B1B3_vec', 'A2_mean_B2_count', 'E14_std_E14_count',
       'B2_std_A1A2_vec', 'E14_std_E9_devide_E2', 'A3_std_A3B2_vec',
       'E1_std_E27_count', 'B2_std_B2_count', 'A2_std_A1A2_vec',
       'A2_mean_B2C2_vec', 'A3_std_A3B3_vec', 'A2_std_E16',
       'A2_std_B2B3_vec', 'A2_std_A1_count', 'B2_std_A1B2_vec',
       'B2_std_B3_count', 'B2_mean_A2A3_vec', 'A2_std_A1B3_vec',
       'A3_mean_B1B3_vec', 'B2_std_E22', 'B2_std_A1_count',
       'B2_mean_A1A2_vec', 'B2_std_A2B1_vec', 'A2_std_B2B3_vec',
       'B2_std_A3B1_vec', 'B2_std_A1B2_vec', 'B2_std_B3_count',
       'B2_mean_A2A3_vec', 'E14_std_E2', 'B2_std_E22', 'A2_std_A1A2_vec',
       'B2_std_A2B2_vec', 'A3_std_B1B2_vec', 'A2_mean_B1B2_vec',
       'B2_mean_A3_count', 'A3_mean_B3_count', 'B2_mean_A2_count',
       'B2_std_A1_count', 'B2_mean_A1A2_vec', 'A3_mean_B2_count',
       'B2_std_B1B2_vec', 'B2_std_A3B1_vec', 'B2_std_B2B3_vec',
       'E20_std_E20_count', 'A2_std_A2B2_vec', 'A3_mean_B2B3_vec',
       'E14_std_E3', 'A2_std_A1B1_vec', 'B2_std_A2_count',
       'A2_std_A1_count', 'B2_std_A1B3_vec', 'A3_mean_B2_count',
       'B2_std_B2_count', 'E18', 'B2_std_E10', 'A2_std_A1B3_vec',
       'A3_mean_B1B3_vec', 'A2_std_A2B2_vec', 'B2_mean_A1_count',
       'E14_std_E27_count', 'B2_std_A2B3_vec', 'A2_std_B1B2_vec',
       'B2_std_B1B3_vec', 'A2_std_B1_count', 'A3_std_B1B3_vec',
       'B2_std_E16', 'A2_mean_B2_count', 'A2_std_A2B1_vec', 'E14_std_E21',
       'B2_std_A3_count', 'C1_std_C1_count', 'B2_std_A1A2_vec',
       'B2_std_A2_count', 'E25', 'E1_std_E1_count', 'A3_std_A3B2_vec',
       'A3_std_A3B1_vec', 'B2_std_B2B3_vec', 'A2_mean_B3_count',
       'B2_std_A3B3_vec', 'A3_std_A3_count', 'B2_std_A1B1_vec',
       'A2_std_B1B3_vec', 'A3_std_B2B3_vec', 'A2_std_B2_count',
       'B2_std_B1_count', 'E14_std_E9', 'A2_mean_B1_count',
       'A3_mean_B1_count', 'B2_mean_A1A3_vec', 'A3_mean_B1B2_vec',
       'E14_std_E1_count', 'B2_std_A1A3_vec', 'A3_std_A3_count',
       'B2_std_A3B3_vec', 'A2_mean_B3_count', 'A3_std_B2_count',
       'B2_std_A2_count', 'E14_std_E3', 'A2_std_A1B1_vec',
       'A2_std_A2B2_vec', 'E20_std_E20_count', 'A3_mean_B2B3_vec',
       'B2_std_B2B3_vec', 'B2_std_A3B1_vec', 'A2_std_A2B3_vec',
       'B2_std_B1B2_vec', 'A2_mean_B2B3_vec', 'A2_mean_B1B3_vec', 'E8',
       'E8', 'B2_std_A1A3_vec', 'E14_std_E1_count', 'A3_mean_B1B2_vec',
       'B2_mean_A1A3_vec', 'A3_mean_B1_count', 'B2_std_B1_count',
       'A2_mean_B1_count', 'A2_std_B2_count', 'A3_std_B2B3_vec',
       'A2_std_B1B3_vec', 'A2_mean_B1B3_vec', 'A3_std_B2_count',
       'A3_std_B2_count', 'A2_mean_B2B3_vec', 'A2_std_A1B2_vec',
       'A3_std_B1_count', 'B2_std_A2A3_vec', 'B2_std_A3B2_vec',
       'B2_std_A3B2_vec', 'B2_std_A2A3_vec', 'A2_std_C2_count',
       'A2_std_A1B2_vec', 'A2_mean_B2B3_vec', 'A2_std_A1B2_vec',
       'A2_std_B3_count']+['C3_sum_E17', 'B2_sum_A1_count', 'E20_sum_E16', 'E20_sum_A1_count',
       'B2_mean_A2B3_vec', 'B2_sum_E2_mul_E7', 'C3_mean_E20_count',
       'C3_std_E17_devide_E9', 'A2_mean_B3C2_vec', 'A1', 'B2_std_E19',
       'day', 'E20_sum_C2_count', 'E2_mul_day_sin', 'A3_sum_A2_count',
       'E20_sum_E9_plus_E17', 'E26', 'E20_sum_C2C3_vec', 'B2_std_E13',
       'B2_sum_A3_count', 'B2_std_E21', 'C3_mean_E13', 'B2_std_E13',
       'E20_sum_mean_numerical1', 'min_numerical1', 'A3_sum_B1_count',
       'A3_sum_E2', 'A3_sum_E2', 'E20_sum_A3B2_vec', 'E13_mul_day_sin',
       'A2_mean_C2_count', 'E1_std_E9', 'A3_sum_A3_count',
       'E20_mean_A2C3_vec', 'C3_sum_E21', 'C3_std_E13', 'E1_sum_B1B2_vec',
       'B2_sum_mean_numerical1', 'A3_sum_E7_devide_E2', 'A3_sum_B2B3_vec',
       'C3_sum_C3_count', 'E2', 'E20_std_C1C2_vec', 'E2_plus_E7',
       'E20_sum_E9_devide_E17', 'A3_std_A1B2_vec', 'C3_mean_B3_count',
       'A3_std_A1B2_vec', 'C3_std_B1_count', 'A3_sum_A1C3_vec',
       'E20_sum_B2C1_vec', 'A3_std_A1B1_vec', 'day_cos',
       'A3_sum_E9_devide_E17', 'B2_sum_A2B1_vec', 'A3_sum_A2C1_vec',
       'A3_sum_B1B3_vec', 'C3_std_E22', 'E20_count', 'E20_sum_B3_count',
       'E20_sum_E7_mul_E9', 'C3_sum_C3_count', 'A3_sum_E20_count',
       'A3_sum_B1C1_vec', 'E20_std_B3_count', 'A3_sum_A3_count',
       'A3_sum_A1B3_vec', 'E20_std_E13', 'E20_sum_E19_mul_E9',
       'E20_sum_E20_count', 'C3_sum_E13', 'B1', 'C3_mean_E17',
       'A3_sum_mean_numerical1', 'A2_sum_B2B3_vec',
       'B2_sum_E17_devide_E9', 'A3_sum_A1A3_vec', 'E24',
       'E20_sum_E2_mul_E7', 'E20_sum_E5_plus_E9', 'E16_mul_day_cos',
       'B2_mean_A3C2_vec', 'A3_sum_E9_mul_E17', 'A3_sum_B1B2_vec',
       'B2_sum_A1_count', 'E11', 'A2_sum_E19_mul_E9', 'E20_sum_A1B2_vec',
       'A2_mean_B3C2_vec', 'A3_sum_mean_numerical1', 'C3_mean_C3_count',
       'E20_sum_A1B1_vec', 'A3_std_E16', 'C3_std_E3', 'C3_sum_E16',
       'A2_std_A3B2_vec', 'A3_sum_E9_devide_E17', 'A3_mean_B3C2_vec',
       'E20_mean_E5', 'A3_sum_B3_count', 'B3_count', 'min_numerical1',
       'E20_sum_E2', 'C3_sum_E16', 'D1', 'B2_std_E13', 'C3_mean_E10',
       'A3_sum_E2_mul_E7', 'A2_sum_B1B2_vec', 'A3_sum_A1A3_vec',
       'C3_sum_E9', 'B2_mean_A3B3_vec', 'C3_mean_E20_count',
       'A3_sum_B1C3_vec', 'E20_mean_C2_count', 'A3_sum_E5_plus_E9',
       'C3_mean_C3_count', 'B2_sum_E9_plus_E17', 'C3_sum_C3_count',
       'A3_sum_B2B3_vec', 'E9_mul_E17', 'E20_sum_E9_devide_E17',
       'A3_sum_A3_count', 'A3_sum_B2B3_vec', 'E20_sum_A3B2_vec',
       'A3_sum_B3C3_vec', 'A2_sum_A2B3_vec', 'E28', 'A2_std_B2C2_vec',
       'A3_sum_E16', 'B2_mean_A3B3_vec', 'A3_sum_B3C1_vec',
       'A3_sum_A3_count', 'C3_sum_B3_count', 'A2_std_A1A3_vec',
       'E20_mean_E9_devide_E17', 'E9_mul_E17', 'E20_sum_E9_mul_E17',
       'A3_std_A1B2_vec', 'A2_std_A3B3_vec', 'E20_sum_A1A3_vec',
       'B2_sum_E9_mul_E17', 'E26', 'A3_sum_E5', 'E20_std_std_numerical1',
       'A3_sum_B3_count', 'E20_mean_A1A2_vec', 'A3_sum_E9_devide_E17',
       'E20_sum_A3B3_vec', 'E11', 'C3_count', 'C3_sum_E3',
       'A2_mean_A1B1_vec', 'C3_std_E13', 'E13_mul_day_sin',
       'A3_sum_E14_count', 'A3_sum_A1_count', 'A3_sum_B3C3_vec',
       'E20_sum_A3B2_vec', 'C3_sum_E21', 'A3_sum_C2C3_vec', 'E23',
       'B2_sum_A1_count', 'E20_std_E7', 'A2_mean_A1B3_vec',
       'B2_mean_B3_count', 'E20_sum_C2C3_vec', 'E20_sum_A1A3_vec',
       'A3_sum_B1C2_vec', 'E17', 'A2_mean_C2_count', 'E20_sum_E9',
       'E20_sum_E2_plus_E7', 'E20_sum_A1B2_vec', 'E5', 'A2_mean_A1B1_vec',
       'A2_mean_B3C2_vec', 'C3_mean_C3_count', 'A2_std_A3B1_vec', 'E17',
       'C3_sum_E16', 'B2_sum_A1B3_vec', 'E20_sum_A1B3_vec', 'A3_count',
       'B2_std_E13', 'E1_std_E9', 'A3_std_A1B1_vec', 'E20_std_B2C2_vec',
       'E11', 'A2_std_E10', 'C3_std_B1_count', 'E20_sum_A1A2_vec',
       'E20_sum_B1C1_vec', 'C3_sum_E13', 'A3_sum_B1_count',
       'A2_std_A3B1_vec', 'E20_sum_B2C1_vec', 'E20_sum_C1C2_vec',
       'C3_mean_C3_count', 'A2_std_A2A3_vec', 'E20_sum_B3C1_vec',
       'A3_std_B1C2_vec', 'A2_std_A3B2_vec', 'C3_sum_E16',
       'A2_std_B2C2_vec', 'A3_sum_E3', 'B2_sum_A2_count',
       'E20_sum_A1A3_vec', 'A3_sum_E5_plus_E9', 'C3_sum_E13',
       'E20_sum_A1A2_vec', 'A3_mean_B2C2_vec', 'E20_sum_E17_devide_E9',
       'E13', 'C3_count', 'E20_sum_A1B1_vec', 'A2_std_A1A3_vec', 'E16',
       'A2_mean_A1B1_vec', 'E15', 'E13', 'A3_sum_B1B2_vec', 'E16',
       'A2_std_A3B1_vec', 'E1_std_E9', 'A2_std_A2A3_vec',
       'A2_mean_A1B3_vec', 'E13', 'A3_count', 'E16', 'A2_std_A3B2_vec',
       'A3_count', 'A2_mean_A1_count', 'E15', 'E5', 'A3_sum_B1B3_vec',
       'A3_mean_B3C2_vec', 'C3_count', 'E15', 'A3_mean_B2C2_vec',
       'A3_sum_B1B3_vec', 'E20_sum_A1A3_vec', 'E16', 'E16',
       'A3_mean_C2_count', 'C3_sum_C3_count', 'E1_std_E9', 'E17']

In [None]:
# Decompose time into cyclical sin/cos components,
# then cross them with the numerical features.
#
# This cell only adds columns to `data`. It does not touch `feature_name`:
# that list is first defined in the next cell, which rebuilds it from
# `data.columns`, so appending to it here would raise NameError on a fresh
# kernel and would be overwritten anyway.

# NOTE(review): divisors 23 and 11 are kept as-is. With 23, hour 0 and
# hour 23 map to the same point on the circle; confirm that is intended.
data['hour_sin'] = np.sin(2 * np.pi * data['hour']/23.0)
data['hour_cos'] = np.cos(2 * np.pi * data['hour']/23.0)

data['day_sin'] = np.sin(2 * np.pi * data['day']/11.0)
data['day_cos'] = np.cos(2 * np.pi * data['day']/11.0)

# Interaction of every numerical column with each cyclical time component.
for col in tqdm(numerical_cols):
    data[col+'_mul_hour_sin'] = data['hour_sin'] * data[col]
    data[col+'_mul_hour_cos'] = data['hour_cos'] * data[col]
    data[col+'_mul_day_sin'] = data['day_sin'] * data[col]
    data[col+'_mul_day_cos'] = data['day_cos'] * data[col]

In [None]:
# Final training feature list: every column of `data` except identifiers,
# the target, the raw time parts and the excluded features.
# Columns are kept in DataFrame order (duplicates dropped), so the feature
# order is the same on every run. A plain set() of strings would give a
# different order from one Python process to the next.
excluded_features = set(useless_ft) | set(['ID', 'label', 'date', 'hour', 'day'])
feature_name = []
already_added = set()
for col in data.columns:
    if col in excluded_features or col in already_added:
        continue
    already_added.add(col)
    feature_name.append(col)

print(feature_name)
print(len(feature_name))
print(cat_list)
print(len(cat_list))

In [None]:
# Cast every categorical column to plain integers.
# NOTE(review): assumes these columns hold no missing values; confirm.
data[cat_list] = data[cat_list].astype(int)

In [None]:
# Downcast numeric columns to reduce memory before training; %time reports how long it takes.
%time data = reduce_mem_usage(data)

In [None]:
# Rows that carry a label form the training set; rows without one are the test set.
tr_index = data['label'].notnull()
X_train = data[tr_index].reset_index(drop=True)
y = X_train['label'].astype(int)
X_test = data[~tr_index].reset_index(drop=True)
print(X_train.shape,X_test.shape)

In [None]:
def run_lgb_cv(train_X,train_Y,test_X,lgb_params,feature_name=None,split=5,iterations=1000,learning_rate=0.05,depth=7,reg_lambda=5,seed=20191031,cat_list=None,use_best=True):
    '''
    Train LightGBM with stratified K-fold cross validation.

    For every fold a model is trained on the training part and validated on
    the held-out part. Held-out predictions are collected as out-of-fold
    predictions, and predictions on ``test_X`` are averaged over the folds.
    Per-fold and overall AUC / log-loss are printed.

    train_X: training DataFrame
    train_Y: target Series, positionally aligned with train_X
    test_X: DataFrame to predict on
    lgb_params: dict of parameters passed to lgb.train; must contain the key
        'verbose', which is also used as verbose_eval. The dict is not
        modified: each fold works on a copy that receives 'random_state'.
    feature_name: columns to train on; by default every column of train_X
        except 'ID', 'label' and 'date'
    split: number of folds
    seed: the fold splitter uses ``seed``; fold k trains with random_state seed+k
    iterations, learning_rate, depth, reg_lambda, cat_list, use_best:
        accepted for backward compatibility; this function does not use them
    return: (train_pred, test_pred) - out-of-fold predictions for train_X
        and fold-averaged predictions for test_X, both numpy arrays
    '''
    val_results = []
    best_iterations = []
    train_pred = np.zeros(train_X.shape[0])
    test_pred = np.zeros(test_X.shape[0])
    seeds = range(seed, seed+split)

    if feature_name is None:
        feature_name = [col for col in train_X.columns if col not in ['ID','label','date']]
    print('Using features:',feature_name)

    train_val_spliter = StratifiedKFold(n_splits=split, random_state=seeds[0], shuffle=True)

    for index, (train_index, test_index) in enumerate(train_val_spliter.split(train_X, train_Y)):
        print('fold:',index+1)
        # Work on a copy so the caller's parameter dict is left untouched.
        fold_params = dict(lgb_params)
        fold_params['random_state'] = seeds[index]

        train_x = train_X[feature_name].iloc[train_index]
        val_x = train_X[feature_name].iloc[test_index]
        train_y = train_Y.iloc[train_index]
        val_y = train_Y.iloc[test_index]

        train_data = lgb.Dataset(train_x,label=train_y)
        val_data = lgb.Dataset(val_x,label=val_y)

        lgb_model = lgb.train(fold_params,train_data,valid_sets=[val_data],verbose_eval=fold_params['verbose'])
        gc.collect()

        train_pred[test_index] += lgb_model.predict(val_x)
        # Predict on the test_X argument, not on a notebook-level global.
        test_pred += lgb_model.predict(test_X[feature_name])/split

        fold_auc = roc_auc_score(val_y, train_pred[test_index])
        print('AUC: ',fold_auc)
        fold_log_loss = log_loss(val_y, train_pred[test_index])
        print('log_loss: ',fold_log_loss)
        val_results.append([fold_auc, fold_log_loss])

        # Report the early-stopped best iteration when the booster recorded
        # one; otherwise fall back to the number of trained iterations.
        best_iteration = getattr(lgb_model, 'best_iteration', None)
        if not best_iteration or best_iteration <= 0:
            best_iteration = lgb_model.current_iteration()
        best_iterations.append(best_iteration)

    val_results = np.array(val_results)
    print('cv completed')
    print('mean best iteration: ',np.mean(best_iterations))
    print('std best iteration: ',np.std(best_iterations))
    print('oof AUC: ',roc_auc_score(train_Y,train_pred))
    print('mean AUC: ',np.mean(val_results[:,0]))
    print('std AUC: ',np.std(val_results[:,0]))
    print('oof log_loss: ',log_loss(train_Y,train_pred))
    print('mean log_loss: ',np.mean(val_results[:,1]))
    print('std log_loss: ',np.std(val_results[:,1]))
    return train_pred,test_pred

In [None]:
# LightGBM parameters shared by all CV runs.
# 'verbose' is also read by run_lgb_cv and passed on as verbose_eval.
# The boosting-round and early-stopping limits travel inside this dict.
params1 = dict(
    boosting_type='gbdt',
    objective='binary',
    metric=['auc', 'binary_logloss'],
    first_metric_only=True,
    learning_rate=0.005,
    max_depth=9,
    num_leaves=31,
    feature_fraction=0.075,
    neg_bagging_fraction=0.95,
    bagging_freq=20,
    reg_alpha=3.77,
    reg_lambda=5.86,
    num_boost_round=10000,
    verbose=500,
    early_stopping_rounds=1000,
)

In [None]:
# 12 LightGBM models: one 5-fold CV run per base seed, each stored as one
# column of the out-of-fold frame and one column of the test prediction frame.
n_times = 12
prediction_df = pd.DataFrame()
oof_df = pd.DataFrame()
for r,i in enumerate([10*i+20191031 for i in range(n_times)]):
    print('round: ',r+1)
    oof,pred = run_lgb_cv(X_train,y,X_test,params1,feature_name,use_best=True,split=5,seed=i)
    gc.collect()

    column_name = 'lgb_'+str(i)
    prediction_df[column_name] = pred
    oof_df[column_name] = oof

In [None]:
# Persist the out-of-fold and test predictions (one column per seed), without the index.
for frame, path in ((oof_df, 'oof_lgb_12.csv'), (prediction_df, 'test_lgb_12.csv')):
    frame.to_csv(path, index=False)