In [None]:
import numpy as np
import pandas as pd
import catboost as cbt
import lightgbm as lgb
import time
import gc
from tqdm import tqdm
from sklearn.metrics import roc_auc_score,log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder,MinMaxScaler

from itertools import combinations,permutations

import warnings
warnings.filterwarnings('ignore')
from gensim.models import Word2Vec

from sklearn.decomposition import PCA

def reduce_mem_usage(data):
    '''
    Downcast numeric columns of ``data`` to the smallest dtype that can hold
    their observed value range, and report the memory saved.

    Integer columns are narrowed to int8 / int16 / int32 when their min and max
    fit (bounds are inclusive, so e.g. 127 still fits int8).  float64 columns
    are narrowed to float32 when their range fits; float16 is deliberately
    never used because feather cannot store it.  Columns that do not fit a
    narrower type, or whose min/max is NaN, are left unchanged.

    Note: float32 keeps roughly 7 significant decimal digits.

    data: input DataFrame (modified in place)
    return: the same DataFrame object, with narrowed dtypes
    '''
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = data.memory_usage().sum() / 1024**2
    for col in tqdm(data.columns):
        col_type = data[col].dtypes
        if col_type not in numerics:
            continue
        c_min = data[col].min()
        c_max = data[col].max()
        if str(col_type)[:3] == 'int':
            # Try the narrowest integer type first; stop at the first that fits.
            for candidate in (np.int8, np.int16, np.int32):
                limits = np.iinfo(candidate)
                if c_min >= limits.min and c_max <= limits.max:
                    data[col] = data[col].astype(candidate)
                    break
        elif col_type == 'float64':
            # float16 is skipped on purpose (unsupported by feather), so the
            # smallest float target is float32.  NaN min/max compares False
            # and therefore keeps the column as float64.
            limits = np.finfo(np.float32)
            if c_min >= limits.min and c_max <= limits.max:
                data[col] = data[col].astype(np.float32)
        # float16 / float32 inputs are already at or below the target width.
    end_mem = data.memory_usage().sum() / 1024**2
    print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return data

In [None]:
# Load the raw tables.  Labels ship in their own file and are attached through
# the default row index, i.e. by position -- assumes train.csv and
# train_label.csv list rows in the same order (TODO confirm).
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
label = pd.read_csv('train_label.csv')
train['label'] = label['label']

# Stack train on top of test so every feature is computed over both at once.
data = pd.concat([train,test],axis=0,sort=False,ignore_index=True)

# Calendar parts of the timestamp.
data['date'] = pd.to_datetime(data['date'])
date_parts = data['date'].dt
data['hour'] = date_parts.hour
data['day'] = date_parts.day

# Column groups by name prefix.
A_ft = ['A%d' % k for k in (1,2,3)]
B_ft = ['B%d' % k for k in (1,2,3)]
C_ft = ['C%d' % k for k in (1,2,3)]
D_ft = ['D1','D2']

# E-columns that are treated as categorical (order kept as originally listed).
categorical_E_ids = [1,6,14,20,27, 4,11,12,24,26,28, 8,15,18,25, 23,29]
cat_list = A_ft + B_ft + C_ft + ['hour','day'] + ['E%d' % k for k in categorical_E_ids]

# Everything that is neither an id/target/date, a categorical, nor a D-column.
non_numerical = set(['ID','label','date'] + cat_list + D_ft)
numerical_cols = [col for col in data.columns if col not in non_numerical]

# Exponentiate the numerical columns in place.
data[numerical_cols] = np.exp(data[numerical_cols])

# Row-wise summary statistics over the (exponentiated) numerical columns.
for stat_name, stat_fn in (('mean',np.mean),('std',np.std),('min',np.min),('max',np.max)):
    data[stat_name+'_numerical1'] = stat_fn(data[numerical_cols],axis=1)

# Pairwise arithmetic combinations of selected numerical E-columns.
# The product and ratio features previously used `+`, which made them exact
# duplicates of the sum features; they now use `*` and `/` as their names say.
# (The 'devide' spelling is kept because it is part of the column names.)
for left, right in [('E2','E7'),('E9','E17'),('E5','E9')]:
    data[left+'_plus_'+right] = data[left] + data[right]

for left, right in [('E2','E7'),('E9','E17'),('E19','E9'),('E7','E9')]:
    data[left+'_mul_'+right] = data[left] * data[right]

# None of these E-columns is in the exclusion list used to build
# numerical_cols, so they were passed through np.exp earlier in this cell.
for left, right in [('E17','E9'),('E9','E17'),('E5','E9'),('E7','E2'),('E9','E2')]:
    data[left+'_devide_'+right] = data[left] / data[right]
    
# Frequency encoding: for each listed column, add "<col>_count" holding how
# many rows share that row's value.
count_source_cols = A_ft+B_ft+C_ft+['E'+str(i) for i in [1,6,14,20,27]]
count_feature = []
for col in tqdm(count_source_cols):
    count_col = col + "_count"
    data[count_col] = data.groupby([col])[col].transform('count')
    count_feature.append(count_col)
    
def vec(data,col1,col2):
    '''
    Add a Word2Vec-based similarity feature between two columns, in place.

    Every row is turned into a two-token "sentence" [str(col1 value),
    str(col2 value)], a Word2Vec model is trained on all such sentences, and
    for each row the model's n_similarity between the row's two tokens is
    stored in a new column named col1+col2+'_vec'.

    data: DataFrame to read from and add the new column to
    col1, col2: names of the two columns to compare
    return: None (``data`` is modified in place)

    NOTE(review): `size=` / `iter=` look like the gensim 3.x keyword names --
    confirm the installed gensim version before upgrading.
    '''
    left_tokens = data[col1].astype(str).tolist()
    right_tokens = data[col2].astype(str).tolist()
    # Word2Vec needs a list of token lists as its training corpus.
    sentences = [[a, b] for a, b in zip(left_tokens, right_tokens)]
    model = Word2Vec(sentences, size=200,iter=15, hs=1, min_count=1, window=5,workers=6)

    similarities = []
    for row in tqdm(range(len(data))):
        # Similarity between this row's two tokens.
        score = model.wv.n_similarity([left_tokens[row]],[right_tokens[row]])
        similarities.append(score)
    data[col1+col2+'_vec'] = np.array(similarities)
    
# Build a similarity feature for every unordered pair of columns inside each
# of the A, B and C groups (pairs are never formed across groups).
for group in (A_ft, B_ft, C_ft):
    for cols in combinations(group,2):
        vec(data,*cols)

In [None]:
# Recompute the numerical column list so that it also picks up the columns
# added to `data` since it was first built.
non_feature_cols = set(['ID','label','date'] + cat_list + ['D1','D2'])
numerical_cols = [col for col in data.columns if col not in non_feature_cols]

In [None]:
# Second round of row-wise summary statistics, this time over the current
# numerical_cols list.
for stat_name, stat_fn in (('mean',np.mean),('std',np.std),('min',np.min),('max',np.max)):
    data[stat_name+'_numerical2'] = stat_fn(data[numerical_cols],axis=1)

In [None]:
# Raw columns excluded from the model's feature list.
useless_ft = ['E'+str(i) for i in (22, 3, 19)]

In [None]:
# Build the model's feature list in DataFrame column order.
# The previous list(set(...)) produced a hash-dependent order that can change
# between kernel restarts, so the column order fed to the model was not
# reproducible.  dict.fromkeys de-duplicates while preserving first-seen order.
excluded_cols = set(useless_ft + ['ID','label','date'])
feature_name = [col for col in dict.fromkeys(data.columns) if col not in excluded_cols]
print(feature_name)
print(len(feature_name))
print(cat_list)
print(len(cat_list))

In [None]:
# Cast every categorical column to integer before it is handed to the model as
# a categorical feature.  NOTE(review): presumably none of these columns
# contains NaN -- an integer cast would fail on missing values; confirm.
data[cat_list] = data[cat_list].astype(int)

In [None]:
# Shrink numeric dtypes of the full feature table via reduce_mem_usage;
# the IPython %time magic reports how long the call took.
%time data = reduce_mem_usage(data)

In [None]:
# Rows with a label are training rows; rows without one are test rows.
tr_index = data['label'].notnull()

X_train = data[tr_index].reset_index(drop=True)
y = data.loc[tr_index,'label'].reset_index(drop=True).astype(int)
X_test = data.loc[~tr_index].reset_index(drop=True)

print(X_train.shape,X_test.shape)

In [None]:
def run_cbt_cv(train_X,train_Y,test_X,params,feature_name=None,split=5,seed=20191031,cat_list=None,use_best=True,iterations=10000):
    '''
    Train a CatBoost classifier with stratified K-fold cross-validation and
    return out-of-fold and test predictions.

    train_X: training DataFrame (must contain the columns in feature_name)
    train_Y: training labels as a pandas Series aligned with train_X by position
    test_X: test DataFrame to predict on (same feature columns as train_X)
    params: dict providing 'learning_rate', 'max_depth' and 'reg_lambda'.
            Other keys (e.g. 'bagging_temperature', 'random_strength') are
            accepted but are NOT forwarded to the model.
    feature_name: list of feature columns; defaults to every column of train_X
                  except 'ID', 'label' and 'date'
    split: number of folds
    seed: base seed; the fold splitter uses `seed`, and the model of fold k
          (0-based) uses `seed + k`
    cat_list: names of categorical features passed to CatBoost
    use_best: forwarded as CatBoost's use_best_model
    iterations: maximum number of boosting iterations
    return: (train_pred, test_pred) -- out-of-fold positive-class probabilities
            for train_X, and positive-class probabilities for test_X averaged
            over the folds
    '''
    val_results = []
    best_iterations = []
    train_pred = np.zeros(train_X.shape[0])
    test_pred = np.zeros(test_X.shape[0])
    seeds = range(seed,seed+split)

    learning_rate = params['learning_rate']
    depth = params['max_depth']
    reg_lambda = params['reg_lambda']

    if feature_name is None:
        feature_name = [col for col in train_X.columns if col not in ['ID','label','date']]
    print('Using features:',feature_name)
    print(len(feature_name))

    train_val_spliter = StratifiedKFold(n_splits=split, random_state=seeds[0], shuffle=True)

    for index, (train_index, test_index) in enumerate(train_val_spliter.split(train_X, train_Y)):
        print('fold:',index+1)
        val_result = []

        train_x = train_X[feature_name].iloc[train_index]
        val_x = train_X[feature_name].iloc[test_index]
        train_y = train_Y.iloc[train_index]
        val_y = train_Y.iloc[test_index]

        cbt_model = cbt.CatBoostClassifier(iterations=iterations,learning_rate=learning_rate,max_depth=depth,verbose=100,
                                   early_stopping_rounds=700,task_type='GPU',eval_metric='AUC',loss_function='Logloss',
                                   cat_features=cat_list,random_state=seeds[index],reg_lambda=reg_lambda,use_best_model=use_best)
        cbt_model.fit(train_x, train_y,eval_set=(val_x,val_y))
        gc.collect()

        # Each validation row belongs to exactly one fold, so plain assignment
        # fills the out-of-fold vector.
        train_pred[test_index] = cbt_model.predict_proba(val_x)[:,1]
        # Predict on the test frame passed in as an argument (previously this
        # silently read the notebook-global X_test instead of test_X).
        fold_test_pred = cbt_model.predict_proba(test_X[feature_name])[:,1]
        test_pred += fold_test_pred/split

        val_result.append(roc_auc_score(val_y, train_pred[test_index]))
        print('AUC: ',val_result[-1])
        val_result.append(log_loss(val_y, train_pred[test_index]))
        print('log_loss: ',val_result[-1])
        best_iterations.append(cbt_model.get_best_iteration())
        val_results.append(val_result)

        # Release the model before the next fold is trained.
        del cbt_model
        gc.collect()

    val_results = np.array(val_results)
    print('cv completed')
    print('mean best iteration: ',np.mean(best_iterations))
    print('std best iteration: ',np.std(best_iterations))
    print('oof AUC: ',roc_auc_score(train_Y,train_pred))
    print('mean AUC: ',np.mean(val_results[:,0]))
    print('std AUC: ',np.std(val_results[:,0]))
    print('oof log_loss: ',log_loss(train_Y,train_pred))
    print('mean log_loss: ',np.mean(val_results[:,1]))
    print('std log_loss: ',np.std(val_results[:,1]))
    return train_pred,test_pred

In [None]:
# Hyper-parameters handed to run_cbt_cv.  Note that run_cbt_cv only forwards
# learning_rate, max_depth and reg_lambda to CatBoostClassifier;
# bagging_temperature and random_strength are not passed to the model.
default_params = dict(
    bagging_temperature=1.0,
    learning_rate=0.03,
    max_depth=7,
    random_strength=1.0,
    reg_lambda=6.0,
)

In [None]:
# Repeat the 5-fold CV with n_times different base seeds and collect one
# column of out-of-fold / test predictions per run ("cbt_<seed>").
# Base seeds are spaced 10 apart; run_cbt_cv derives its per-fold seeds as
# seed .. seed+split-1, so runs do not share model seeds.
n_times = 12
run_seeds = [20191091 + 10*k for k in range(n_times)]

prediction_df = pd.DataFrame()
oof_df = pd.DataFrame()
for run_seed in run_seeds:
    oof,pred = run_cbt_cv(X_train,y,X_test,params=default_params,feature_name=feature_name,seed=run_seed,iterations=10000,use_best=True,split=5,cat_list=cat_list)
    gc.collect()

    column = 'cbt_'+str(run_seed)
    # Append after every run so partial results survive an interrupted loop.
    prediction_df = pd.concat([prediction_df,pd.DataFrame({column: pred})],axis=1)
    oof_df = pd.concat([oof_df,pd.DataFrame({column: oof})],axis=1)

In [None]:
# Write the out-of-fold and test prediction tables to CSV (no index column).
for csv_path, frame in (('oof_cbt_12.csv', oof_df), ('test_cbt_12.csv', prediction_df)):
    frame.to_csv(csv_path, index=False)