In [1]:
# coding: utf-8
import multiprocessing
from collections import Counter
import xgboost as xgb
import pandas as pd
import numpy as np
import warnings

from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from tqdm import tqdm
from sklearn.model_selection import KFold
import gc
from sklearn import preprocessing
from scipy.stats import entropy
# from imblearn.over_sampling import SMOTE
# from imblearn.under_sampling import RandomUnderSampler
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.metrics import make_scorer, roc_auc_score
from sklearn.metrics import roc_auc_score, roc_curve
import datetime
import time
from itertools import product

nowtime = datetime.date.today()
nowtime = str(nowtime)[-5:]
print(nowtime)
warnings.filterwarnings('ignore')


def load_dataset(DATA_PATH):
    train_label = pd.read_csv(DATA_PATH + 'train.csv')['loan_default']
    train = pd.read_csv(DATA_PATH + 'train.csv')
    train = reduce_mem_usage(train)
    test = pd.read_csv(DATA_PATH + 'test.csv')
    test = reduce_mem_usage(test)
    feats = [f for f in train.columns if f not in ['customer_id', 'loan_default']]
    t_feats = [f for f in train.columns if f not in ['customer_id']]
    train = train[t_feats]
    test = test[feats]
    print('train.shape', train.shape)
    print('test.shape', test.shape)

    return train_label, train, test

def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.
    """
    start_mem = df.memory_usage().sum()
    print('内存占用{:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum()
    print('优化后内存为: {:.2f} MB'.format(end_mem))
    print('内存使用减少 {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

# 处理时间
def transform_time(x):
    day = int(x.split(' ')[0])
    hour = int(x.split(' ')[2].split('.')[0].split(':')[0])
    minute = int(x.split(' ')[2].split('.')[0].split(':')[1])
    second = int(x.split(' ')[2].split('.')[0].split(':')[2])
    return 86400 * day + 3600 * hour + 60 * minute + second


def transform_day(date1):
    date2 = "2020-01-01"
    date1 = time.strptime(date1, "%Y-%m-%d")
    date2 = time.strptime(date2, "%Y-%m-%d")

    # 根据上面需要计算日期还是日期时间，来确定需要几个数组段。下标0表示年，小标1表示月，依次类推...
    # date1=datetime.datetime(date1[0],date1[1],date1[2],date1[3],date1[4],date1[5])
    # date2=datetime.datetime(date2[0],date2[1],date2[2],date2[3],date2[4],date2[5])
    date1 = datetime.datetime(date1[0], date1[1], date1[2])
    date2 = datetime.datetime(date2[0], date2[1], date2[2])
    # 返回两个变量相差的值，就是相差天数
    # print((date2 - date1).days)  # 将天数转成int型
    return (date2 - date1).days


# transform_day('2007-09-01')

def labelEncoder_df(df, features):
    for i in features:
        encoder = preprocessing.LabelEncoder()
        df[i] = encoder.fit_transform(df[i])



class MeanEncoder:
    def __init__(self, categorical_features, n_splits=5, target_type='classification', prior_weight_func=None):
        """
        :param categorical_features: list of str, the name of the categorical columns to encode
        类别特征，列表或者字符串，用于编码的类别列名
        :param n_splits: the number of splits used in mean encoding
        用于均值编码的拆分次数
        :param target_type: str, 'regression' or 'classification'
        定义目标类型是回归变量还是分类变量
        :param prior_weight_func:
        a function that takes in the number of observations, and outputs prior weight
        when a dict is passed, the default exponential decay function will be used:
        k: the number of observations needed for the posterior to be weighted equally as the prior
        f: larger f --> smaller slope
        
         吸收观测值并输出先前权重的函数
         传递字典时，将使用默认的指数衰减函数：
         k：后验与前验平均加权所需的观察数
         f：较大的f->较小的斜率
        '''
        >>>example:
        mean_encoder = MeanEncoder(
                        categorical_features=['regionidcity',
                          'regionidneighborhood', 'regionidzip'],
                target_type='regression'
                )

        X = mean_encoder.fit_transform(X, pd.Series(y))
        X_test = mean_encoder.transform(X_test)


        """

        self.categorical_features = categorical_features
        self.n_splits = n_splits
        self.learned_stats = {}

        if target_type == 'classification':
            self.target_type = target_type
            self.target_values = []
        else:
            self.target_type = 'regression'
            self.target_values = None

        # 判断是否为字典类型
        if isinstance(prior_weight_func, dict):
            self.prior_weight_func = eval('lambda x: 1 / (1 + np.exp((x - k) / f))', dict(prior_weight_func, np=np))
        # 判断函数是否可以被调用
        elif callable(prior_weight_func):
            self.prior_weight_func = prior_weight_func
        else:
            self.prior_weight_func = lambda x: 1 / (1 + np.exp((x - 2) / 1))


    #  返回函数的静态方法，该方法不强制要求传递参数
    @staticmethod
    def mean_encode_subroutine(X_train, y_train, X_test, variable, target, prior_weight_func):
        X_train = X_train[[variable]].copy()
        X_test = X_test[[variable]].copy()

        # 定义回归和分类两种调用方法
        if target is not None:
            nf_name = '{}_pred_{}'.format(variable, target)
            X_train['pred_temp'] = (y_train == target).astype(int)  # classification 分类方法
        else:
            nf_name = '{}_pred'.format(variable)
            X_train['pred_temp'] = y_train  # regression 回归方法
        prior = X_train['pred_temp'].mean()
        
        col_avg_y = X_train.groupby(variable)['pred_temp'].agg(['mean','size']).rename(columns={'mean':'mean','size':'beta'})
        col_avg_y['beta'] = prior_weight_func(col_avg_y['beta'])
        col_avg_y[nf_name] = col_avg_y['beta'] * prior + (1 - col_avg_y['beta']) * col_avg_y['mean']
        col_avg_y.drop(['beta', 'mean'], axis=1, inplace=True)

        nf_train = X_train.join(col_avg_y, on=variable)[nf_name].values
        nf_test = X_test.join(col_avg_y, on=variable).fillna(prior, inplace=False)[nf_name].values

        return nf_train, nf_test, prior, col_avg_y

    def fit_transform(self, X, y):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :param y: pandas Series or numpy array, n_samples
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()
        if self.target_type == 'classification':
            skf = StratifiedKFold(self.n_splits)
        else:
            skf = KFold(self.n_splits)

        if self.target_type == 'classification':
            self.target_values = sorted(set(y))
            self.learned_stats = {'{}_pred_{}'.format(variable, target): [] for variable, target in
                                  product(self.categorical_features, self.target_values)}
            for variable, target in product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(y, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, target, self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        else:
            self.learned_stats = {'{}_pred'.format(variable): [] for variable in self.categorical_features}
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new.loc[:, nf_name] = np.nan
                for large_ind, small_ind in skf.split(y, y):
                    nf_large, nf_small, prior, col_avg_y = MeanEncoder.mean_encode_subroutine(
                        X_new.iloc[large_ind], y.iloc[large_ind], X_new.iloc[small_ind], variable, None, self.prior_weight_func)
                    X_new.iloc[small_ind, -1] = nf_small
                    self.learned_stats[nf_name].append((prior, col_avg_y))
        return X_new

    def transform(self, X):
        """
        :param X: pandas DataFrame, n_samples * n_features
        :return X_new: the transformed pandas DataFrame containing mean-encoded categorical features
        """
        X_new = X.copy()

        if self.target_type == 'classification':
            for variable, target in product(self.categorical_features, self.target_values):
                nf_name = '{}_pred_{}'.format(variable, target)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[
                        nf_name]
                X_new[nf_name] /= self.n_splits
        else:
            for variable in self.categorical_features:
                nf_name = '{}_pred'.format(variable)
                X_new[nf_name] = 0
                for prior, col_avg_y in self.learned_stats[nf_name]:
                    X_new[nf_name] += X_new[[variable]].join(col_avg_y, on=variable).fillna(prior, inplace=False)[
                        nf_name]
                X_new[nf_name] /= self.n_splits

        return X_new









def gradeTrans(x):
    dict = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7}
    result = dict[x]
    return result






def myEntro(x):
    """
        calculate shanno ent of x
    """
    x = np.array(x)
    x_value_list = set([x[i] for i in range(x.shape[0])])
    ent = 0.0
    for x_value in x_value_list:
        p = float(x[x == x_value].shape[0]) / x.shape[0]
        logp = np.log2(p)
        ent -= p * logp
    #     print(x_value,p,logp)
    # print(ent)
    return ent


def myRms(records):
    records = list(records)
    """
    均方根值 反映的是有效值而不是平均值
    """
    return np.math.sqrt(sum([x ** 2 for x in records]) / len(records))


def myMode(x):
    return np.mean(pd.Series.mode(x))


def myQ25(x):
    return x.quantile(0.25)


def myQ75(x):
    return x.quantile(0.75)

def myQ10(x):
    return x.quantile(0.1)
    
def myQ90(x):
    return x.quantile(0.9)


def myRange(x):
    return pd.Series.max(x) - pd.Series.min(x)


# 预处理
def data_preprocess(DATA_PATH):
    train_label, train, test = load_dataset(DATA_PATH=DATA_PATH)
    # 拼接数据

    data = pd.concat([train, test], axis=0, ignore_index=True)
    print('初始拼接后：', data.shape)
    
    n_feat = ['main_account_loan_no','main_account_active_loan_no','main_account_overdue_no','main_account_outstanding_loan',
              'main_account_sanction_loan','main_account_disbursed_loan','main_account_inactive_loan_no', 'main_account_tenure',
              'sub_account_loan_no', 'sub_account_active_loan_no','sub_account_outstanding_loan','sub_account_sanction_loan',
              'sub_account_disbursed_loan','main_account_monthly_payment','sub_account_monthly_payment', 'sub_account_inactive_loan_no',
              'sub_account_tenure', 
              'total_account_loan_no','total_inactive_loan_no','total_overdue_no','total_outstanding_loan','total_sanction_loan',
              'total_disbursed_loan','total_monthly_payment',
#               'disbursed_amount','asset_cost','branch_id','supplier_id','manufacturer_id','area_id','employee_code_id','mobileno_flag',
#               'idcard_flag','Driving_flag','passport_flag','credit_score','last_six_month_new_loan_no','last_six_month_defaulted_no',
#               'average_age','credit_history','enquirie_no','loan_to_asset_ratio','outstanding_disburse_ratio','disburse_to_sactioned_ratio',
#               'active_to_inactive_act_ratio','year_of_birth','disbursed_date','Credit_level','employment_type','age',
             ]
   
    nameList = ['min', 'max', 'sum', 'mean', 'median', 'skew', 'std', 'mode', 'range', 'Q25','Q75']
    statList = ['min', 'max', 'sum', 'mean', 'median', 'skew', 'std', myMode, myRange, myQ25, myQ75]

#     nameList = ['max', 'sum', 'mean', 'median', 'skew', 'std']
#     statList = ['max', 'sum', 'mean', 'median', 'skew', 'std']




    for i in range(len(nameList)):
        data['n_feat_{}'.format(nameList[i])] = data[n_feat].agg(statList[i], axis=1)
    print('n特征处理后：', data.shape)

    # count编码
    count_list = ['branch_id', 'supplier_id', 'manufacturer_id', 'year_of_birth','area_id','employee_code_id','age',
                  'Driving_flag','passport_flag','employment_type']
    data = count_coding(data, count_list)
    print('count编码后：', data.shape)
    ### 用数值特征对类别特征做统计刻画，随便挑了几个跟price相关性最高的匿名特征
    cross_cat = ['branch_id', 'supplier_id', 'manufacturer_id', 'year_of_birth','area_id','employee_code_id','age',
                  'Driving_flag','passport_flag','employment_type']
    cross_num = [ 'disbursed_amount','asset_cost','credit_score','last_six_month_new_loan_no','last_six_month_defaulted_no',
                  'average_age','credit_history','enquirie_no','loan_to_asset_ratio','outstanding_disburse_ratio',
                  'disburse_to_sactioned_ratio','active_to_inactive_act_ratio','Credit_level',
              
#               'main_account_loan_no','main_account_active_loan_no','main_account_overdue_no','main_account_outstanding_loan',
#               'main_account_sanction_loan','main_account_disbursed_loan','main_account_inactive_loan_no', 'main_account_tenure',
#               'sub_account_loan_no', 'sub_account_active_loan_no','sub_account_outstanding_loan','sub_account_sanction_loan',
#               'sub_account_disbursed_loan','main_account_monthly_payment','sub_account_monthly_payment', 'sub_account_inactive_loan_no',
#               'sub_account_tenure', 
#               'total_account_loan_no','total_inactive_loan_no','total_overdue_no','total_outstanding_loan','total_sanction_loan',
#               'total_disbursed_loan','total_monthly_payment',
                ]


    data = cross_cat_num(data, cross_num, cross_cat)  # 一阶交叉
    print('一阶特征处理后：', data.shape)
#     data = cross_qua_cat_num(data)  # 二阶交叉
#     print('二阶特征处理后：', data.shape)


#     data['grade'] = data['grade'].apply(lambda x: gradeTrans(x))
#     data['subGrade'] = data['subGrade'].apply(lambda x: subGradeTrans(x))


    print('预处理完毕', data.shape)

    return data, train_label





def kfold_stats_feature(train, test, feats, k):
    folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=2021)  # 这里最好和后面模型的K折交叉验证保持一致

    train['fold'] = None
    for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, train['loan_default'])):
        train.loc[val_idx, 'fold'] = fold_

    kfold_features = []
    for feat in feats:
        nums_columns = ['loan_default']
        for f in nums_columns:
            colname = feat + '_' + f + '_kfold_mean'
            kfold_features.append(colname)
            train[colname] = None
            for fold_, (trn_idx, val_idx) in enumerate(folds.split(train, train['loan_default'])):
                tmp_trn = train.iloc[trn_idx]
                order_label = tmp_trn.groupby([feat])[f].mean()
                tmp = train.loc[train.fold == fold_, [feat]]
                train.loc[train.fold == fold_, colname] = tmp[feat].map(order_label)
                # fillna
                global_mean = train[f].mean()
                train.loc[train.fold == fold_, colname] = train.loc[train.fold == fold_, colname].fillna(global_mean)
            train[colname] = train[colname].astype(float)

        for f in nums_columns:
            colname = feat + '_' + f + '_kfold_mean'
            test[colname] = None
            order_label = train.groupby([feat])[f].mean()
            test[colname] = test[feat].map(order_label)
            # fillna
            global_mean = train[f].mean()
            test[colname] = test[colname].fillna(global_mean)
            test[colname] = test[colname].astype(float)
    del train['fold']
    return train, test

def GridSearch(clf, params, X, y):
    cscv = GridSearchCV(clf, params, scoring='roc_auc', n_jobs=8, cv=10)
    cscv.fit(X, y)
    print(cscv.cv_results_)
    print(cscv.best_params_)
    print(cscv.best_score_)

### count编码
def count_coding(df, fea_col):
    for f in fea_col:
        df[f + '_count'] = df[f].map(df[f].value_counts())
    return (df)


# 定义交叉特征统计
def cross_cat_num(df, num_col, cat_col):
    for f1 in tqdm(cat_col):
        g = df.groupby(f1, as_index=False)
        for f2 in tqdm(num_col):
            feat = g[f2].agg({
                '{}_{}_max'.format(f1, f2): 'max', '{}_{}_min'.format(f1, f2): 'min',
                '{}_{}_median'.format(f1, f2): 'median',
            })
            df = df.merge(feat, on=f1, how='left')
    return (df)


def cross_qua_cat_num(df):
    for f_pair in tqdm([
        
        ['branch_id','area_id'],['branch_id','employee_code_id'],['branch_id','employment_type'],
        ['supplier_id','employment_type'],['supplier_id', 'Driving_flag'],['supplier_id','passport_flag'],
        ['manufacturer_id','employment_type'],['manufacturer_id','year_of_birth'],['manufacturer_id','age'],
        ['employee_code_id','area_id'],['employee_code_id','Driving_flag'],['employee_code_id','passport_flag'],
        ['employee_code_id','employment_type']                     
    ]):
        ### 共现次数
        df['_'.join(f_pair) + '_count'] = df.groupby(f_pair)['customer_id'].transform('count')
        ### n unique、熵
        df = df.merge(df.groupby(f_pair[0], as_index=False)[f_pair[1]].agg({
            '{}_{}_nunique'.format(f_pair[0], f_pair[1]): 'nunique',
            '{}_{}_ent'.format(f_pair[0], f_pair[1]): lambda x: entropy(x.value_counts() / x.shape[0])
        }), on=f_pair[0], how='left')
        df = df.merge(df.groupby(f_pair[1], as_index=False)[f_pair[0]].agg({
            '{}_{}_nunique'.format(f_pair[1], f_pair[0]): 'nunique',
            '{}_{}_ent'.format(f_pair[1], f_pair[0]): lambda x: entropy(x.value_counts() / x.shape[0])
        }), on=f_pair[1], how='left')
        ### 比例偏好
        df['{}_in_{}_prop'.format(f_pair[0], f_pair[1])] = df['_'.join(f_pair) + '_count'] / df[f_pair[1] + '_count']
        df['{}_in_{}_prop'.format(f_pair[1], f_pair[0])] = df['_'.join(f_pair) + '_count'] / df[f_pair[0] + '_count']
    return (df)


### count编码
def count_coding(df, fea_col):
    for f in fea_col:
        df[f + '_count'] = df[f].map(df[f].value_counts())
    return (df)

def gen_basicFea(data):
    
    data['new_loan_to_asset_ratio'] = data['disbursed_amount'] / (data['asset_cost'] * data['credit_score'])
#     data['new_credit_ratio'] = (data['main_account_outstanding_loan'] + data['sub_account_outstanding_loan'])/\
#     (data['credit_history'] * data['credit_score'])

#     data['new_payment_ration'] = (data['main_account_sanction_loan']+data['main_account_disbursed_loan']+data['sub_account_sanction_loan']+\
#     data['sub_account_disbursed_loan'])/(data['main_account_monthly_payment']+data['sub_account_monthly_payment'])

#     for col in ['branch_id', 'supplier_id', 'manufacturer_id', 'year_of_birth','area_id','employee_code_id','age']:
#         data['{}_count'.format(col)] = data.groupby([col])['customer_id'].transform('count')

    return data


def plotroc(train_y, train_pred, test_y, val_pred):
    lw = 2
    ##train
    fpr, tpr, thresholds = roc_curve(train_y.values, train_pred, pos_label=1.0)
    train_auc_value = roc_auc_score(train_y.values, train_pred)
    ##valid
    fpr, tpr, thresholds = roc_curve(test_y.values, val_pred, pos_label=1.0)
    valid_auc_value = roc_auc_score(test_y.values, val_pred)

    return train_auc_value, valid_auc_value


def lgb_model(train, target, test, k):

#     saveFeature_df = pd.read_csv('../feature/lgb_importance.csv')
#     saveFeature_list = list(saveFeature_df['Feature'][:290])
    saveFeature_list=list(train.columns)
    feats = [f for f in saveFeature_list if f not in ['customer_id', 'loan_default']]
    feaNum = len(feats)
    print('Current num of lgb features:', len(feats))

    seeds = [2020,6666]
    output_preds = 0
    lgb_oof_probs = np.zeros(train.shape[0])

    for seed in seeds:
        folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
        oof_probs = np.zeros(train.shape[0])

        offline_score = []
        feature_importance_df = pd.DataFrame()
        params = {
                    'boosting_type': 'gbdt','objective': 'binary','metric': 'auc','learning_rate': 0.01,
                    'bagging_fraction': 1.0, 'bagging_freq': 44, 'feature_fraction': 0.5, 
                    'max_depth': 3,
                    'min_child_weight': 10.0, 'min_data_in_leaf': 33, 'min_split_gain': 0.14174021024592806,
                    'num_leaves': 29, 'reg_alpha': 7.588866417707964, 'reg_lambda': 10.0,
                    'seed': seed,'n_jobs': -1,'verbose': -1,
                  }
        for i, (train_index, test_index) in enumerate(folds.split(train, target)):
            
            train_y, test_y = target[train_index], target[test_index]
            train_X, test_X = train[feats].iloc[train_index, :], train[feats].iloc[test_index, :]
            train_matrix = lgb.Dataset(train_X, label=train_y)
            valid_matrix = lgb.Dataset(test_X, label=test_y)
            test_matrix = test[feats]
            watchlist = [train_matrix, valid_matrix]
            
            model = lgb.train(params, train_matrix, num_boost_round=20000, valid_sets=watchlist,
                          verbose_eval=500, early_stopping_rounds=500)
            
            val_pred = model.predict(test_X, num_iteration=model.best_iteration)
            train_pred = model.predict(train_X, num_iteration=model.best_iteration)
                                 
            lgb_oof_probs[test_index] += val_pred / len(seeds)
            # oof_probs[test_index] += val_pred
            test_pred = model.predict(test_matrix, num_iteration=model.best_iteration,predict_disable_shape_check=True)
#             test_pred = model.predict(test_matrix, ntree_limit=model.best_ntree_limit)

            # 绘制roc曲线
            train_auc_value, valid_auc_value = plotroc(train_y, train_pred, test_y, val_pred)
            print('lgb_train_auc:{},valid_auc{}'.format(train_auc_value, valid_auc_value))
            offline_score.append(valid_auc_value)
            print(offline_score)
            output_preds += test_pred / k / len(seeds)
            
            fold_importance_df = pd.DataFrame()
#             booster = model.booster_
            fold_importance_df["Feature"] = model.feature_name()
            fold_importance_df["importance"] = model.feature_importance(importance_type='split')
            fold_importance_df["fold"] = i + 1

            feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)


        print('lgb_all_auc:', roc_auc_score(target.values, oof_probs))
        print('OOF-MEAN-AUC lgb :%.6f, OOF-STD-AUC:%.6f' % (np.mean(offline_score), np.std(offline_score)))
        feature_sorted = feature_importance_df.groupby(['Feature'])['importance'].mean().sort_values(ascending=False)
        feature_sorted.to_csv('../feature/lgb_importance.csv')
        top_features = feature_sorted.index
        print(feature_importance_df.groupby(['Feature'])['importance'].mean().sort_values(ascending=False).head(50))
    return output_preds, lgb_oof_probs, np.mean(offline_score), feaNum

def xgb_model(train, target, test, k):

    saveFeature_list=list(train.columns)
    feats = [f for f in saveFeature_list if f not in ['customer_id', 'loan_default']]
    feaNum = len(feats)
    print('Current num of xgb features:', len(feats))

    seeds = [2020,6666]
    output_preds = 0
    xgb_oof_probs = np.zeros(train.shape[0])

    for seed in seeds:
        folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
        oof_probs = np.zeros(train.shape[0])

        offline_score = []
        feature_importance_df = pd.DataFrame()
        params = {'booster': 'gbtree',
                  'objective': 'binary:logistic',
                  'eval_metric': 'auc',
                  'min_child_weight': 5,
                  'max_depth': 8,
                  'subsample': ss,
                  'colsample_bytree': fs,
                  'eta': 0.01,
                  'seed': seed,
                  'nthread': -1,
                  'tree_method': 'gpu_hist'
                  }
        for i, (train_index, test_index) in enumerate(folds.split(train, target)):
            
            train_y, test_y = target[train_index], target[test_index]
            train_X, test_X = train[feats].iloc[train_index, :], train[feats].iloc[test_index, :]
            train_matrix = xgb.DMatrix(train_X, label=train_y, missing=np.nan)
            valid_matrix = xgb.DMatrix(test_X, label=test_y, missing=np.nan)
            test_matrix = xgb.DMatrix(test[feats], missing=np.nan)
            watchlist = [(train_matrix, 'train'), (valid_matrix, 'eval')]
            model = xgb.train(params, train_matrix, num_boost_round=20000, evals=watchlist, verbose_eval=100,
                              early_stopping_rounds=500)
            val_pred = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            train_pred = model.predict(train_matrix, ntree_limit=model.best_ntree_limit)
            xgb_oof_probs[test_index] += val_pred / len(seeds)
            # oof_probs[test_index] += val_pred
            test_pred = model.predict(test_matrix, ntree_limit=model.best_ntree_limit)

            # 绘制roc曲线
            train_auc_value, valid_auc_value = plotroc(train_y, train_pred, test_y, val_pred)
            print('xgb_train_auc:{},valid_auc{}'.format(train_auc_value, valid_auc_value))
            offline_score.append(valid_auc_value)
            print(offline_score)
            output_preds += test_pred / k / len(seeds)
            
            fold_importance_df = pd.DataFrame()
            fold_importance_df["Feature"] = model.get_fscore().keys()
            fold_importance_df["importance"] = model.get_fscore().values()
            fold_importance_df["fold"] = i + 1

            feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)


        print('xgb_all_auc:', roc_auc_score(target.values, oof_probs))
        print('OOF-MEAN-AUC xgb :%.6f, OOF-STD-AUC:%.6f' % (np.mean(offline_score), np.std(offline_score)))
        feature_sorted = feature_importance_df.groupby(['Feature'])['importance'].mean().sort_values(ascending=False)
        feature_sorted.to_csv('../feature/xgb_importance.csv')
        top_features = feature_sorted.index
        print(feature_importance_df.groupby(['Feature'])['importance'].mean().sort_values(ascending=False).head(50))
    return output_preds, xgb_oof_probs, np.mean(offline_score), feaNum




08-04


In [2]:
DATA_PATH = '../data/'
print('读取数据...')
data, train_label = data_preprocess(DATA_PATH=DATA_PATH)

print('开始特征工程...')
data = gen_basicFea(data)


print('data.shape', data.shape)

读取数据...
内存占用63600128.00 MB
优化后内存为: 18900128.00 MB
内存使用减少 70.3%
内存占用12480128.00 MB
优化后内存为: 3720128.00 MB
内存使用减少 70.2%
train.shape (150000, 52)
test.shape (30000, 51)
初始拼接后： (180000, 52)


  0%|                                                                                           | 0/10 [00:00<?, ?it/s]
  0%|                                                                                           | 0/13 [00:00<?, ?it/s][A

n特征处理后： (180000, 63)
count编码后： (180000, 73)



  8%|██████▍                                                                            | 1/13 [00:00<00:02,  4.89it/s][A
 23%|███████████████████▏                                                               | 3/13 [00:00<00:01,  5.99it/s][A
 38%|███████████████████████████████▉                                                   | 5/13 [00:00<00:01,  7.05it/s][A
 54%|████████████████████████████████████████████▋                                      | 7/13 [00:00<00:00,  7.83it/s][A
 69%|█████████████████████████████████████████████████████████▍                         | 9/13 [00:00<00:00,  8.55it/s][A
 85%|█████████████████████████████████████████████████████████████████████▍            | 11/13 [00:01<00:00,  9.18it/s][A
100%|██████████████████████████████████████████████████████████████████████████████████| 13/13 [00:01<00:00, 10.18it/s][A
 10%|████████▎                                                                          | 1/10 [00:01<00:11,  1.28s/it]
  0%|             

100%|██████████████████████████████████████████████████████████████████████████████████| 13/13 [00:02<00:00,  5.47it/s][A
 50%|█████████████████████████████████████████▌                                         | 5/10 [00:09<00:09,  1.90s/it]
  0%|                                                                                           | 0/13 [00:00<?, ?it/s][A
  8%|██████▍                                                                            | 1/13 [00:00<00:02,  5.22it/s][A
 15%|████████████▊                                                                      | 2/13 [00:00<00:02,  5.15it/s][A
 23%|███████████████████▏                                                               | 3/13 [00:00<00:01,  5.10it/s][A
 31%|█████████████████████████▌                                                         | 4/13 [00:00<00:01,  4.95it/s][A
 38%|███████████████████████████████▉                                                   | 5/13 [00:01<00:01,  4.93it/s][A
 46%|██████████████

 31%|█████████████████████████▌                                                         | 4/13 [00:01<00:02,  3.47it/s][A
 38%|███████████████████████████████▉                                                   | 5/13 [00:01<00:02,  3.45it/s][A
 46%|██████████████████████████████████████▎                                            | 6/13 [00:01<00:02,  3.41it/s][A
 54%|████████████████████████████████████████████▋                                      | 7/13 [00:02<00:01,  3.46it/s][A
 62%|███████████████████████████████████████████████████                                | 8/13 [00:02<00:01,  3.38it/s][A
 69%|█████████████████████████████████████████████████████████▍                         | 9/13 [00:02<00:01,  3.32it/s][A
 77%|███████████████████████████████████████████████████████████████                   | 10/13 [00:02<00:00,  3.30it/s][A
 85%|█████████████████████████████████████████████████████████████████████▍            | 11/13 [00:03<00:00,  3.31it/s][A
 92%|███████████

一阶特征处理后： (180000, 463)
预处理完毕 (180000, 463)
开始特征工程...
data.shape (180000, 464)


In [6]:
if __name__ == '__main__':
   
    print('开始模型训练...')
    train = data[~data['loan_default'].isnull()].copy()
    target = train_label
    test = data[data['loan_default'].isnull()].copy()

    target_encode_cols = ['branch_id', 'supplier_id', 'manufacturer_id', 'year_of_birth','area_id','employee_code_id','age']

    kflod_num = 5
    ss = 0.8
    fs = 0.4

    class_list = ['branch_id', 'supplier_id', 'manufacturer_id', 'year_of_birth','area_id','employee_code_id','age']
    MeanEnocodeFeature = class_list  # 声明需要平均数编码的特征
    ME = MeanEncoder(MeanEnocodeFeature, target_type='classification')  # 声明平均数编码的类
    train = ME.fit_transform(train, target)  # 对训练数据集的X和y进行拟合
    # x_train_fav = ME.fit_transform(x_train,y_train_fav)#对训练数据集的X和y进行拟合
    test = ME.transform(test)  # 对测试集进行编码
    print('num0:mean_encode train.shape', train.shape, test.shape)

    train, test = kfold_stats_feature(train, test, target_encode_cols, kflod_num)
    print('num1:target_encode train.shape', train.shape, test.shape)
    ### target encoding目标编码，回归场景相对来说做目标编码的选择更多，不仅可以做均值编码，还可以做标准差编码、中位数编码等
    enc_cols = []
    stats_default_dict = {
        'max': train['loan_default'].max(),
        'min': train['loan_default'].min(),
        'median': train['loan_default'].median(),
        'mean': train['loan_default'].mean(),
        'sum': train['loan_default'].sum(),
        'std': train['loan_default'].std(),
        'skew': train['loan_default'].skew(),
        'kurt': train['loan_default'].kurt(),
        'mad': train['loan_default'].mad()
    }
    ### 暂且选择这三种编码
    enc_stats = ['max', 'min', 'skew', 'std']
    skf = KFold(n_splits=kflod_num, shuffle=True, random_state=6666)
    for f in tqdm(['branch_id', 'supplier_id', 'manufacturer_id', 'year_of_birth','area_id','employee_code_id','age']):
        enc_dict = {}
        for stat in enc_stats:
            enc_dict['{}_target_{}'.format(f, stat)] = stat
            train['{}_target_{}'.format(f, stat)] = 0
            test['{}_target_{}'.format(f, stat)] = 0
            enc_cols.append('{}_target_{}'.format(f, stat))
        for i, (trn_idx, val_idx) in enumerate(skf.split(train, target)):
            trn_x, val_x = train.iloc[trn_idx].reset_index(drop=True), train.iloc[val_idx].reset_index(drop=True)
            enc_df = trn_x.groupby(f, as_index=False)['loan_default'].agg(enc_dict)
            val_x = val_x[[f]].merge(enc_df, on=f, how='left')
            test_x = test[[f]].merge(enc_df, on=f, how='left')
            for stat in enc_stats:
                val_x['{}_target_{}'.format(f, stat)] = val_x['{}_target_{}'.format(f, stat)].fillna(
                    stats_default_dict[stat])
                test_x['{}_target_{}'.format(f, stat)] = test_x['{}_target_{}'.format(f, stat)].fillna(
                    stats_default_dict[stat])
                train.loc[val_idx, '{}_target_{}'.format(f, stat)] = val_x['{}_target_{}'.format(f, stat)].values
                test['{}_target_{}'.format(f, stat)] += test_x['{}_target_{}'.format(f, stat)].values / skf.n_splits

    print('num2:target_encode train.shape', train.shape, test.shape)

    train.drop(['branch_id', 'supplier_id', 'manufacturer_id', 'year_of_birth','area_id','employee_code_id','age'], axis=1, inplace=True)
    test.drop(['branch_id', 'supplier_id', 'manufacturer_id', 'year_of_birth','area_id','employee_code_id','age'], axis=1, inplace=True)
    print('输入数据维度：', train.shape, test.shape)
    
    lgb_preds, lgb_oof, lgb_score, l_feaNum = lgb_model(train=train, target=target, test=test, k=kflod_num)
#     xgb_preds, xgb_oof, xgb_score, x_feaNum = xgb_model(train=train, target=target, test=test, k=kflod_num)



开始模型训练...
num0:mean_encode train.shape (150000, 478) (30000, 478)


  0%|                                                                                            | 0/7 [00:00<?, ?it/s]

num1:target_encode train.shape (150000, 485) (30000, 485)


100%|████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:13<00:00,  1.99s/it]


num2:target_encode train.shape (150000, 513) (30000, 513)
输入数据维度： (150000, 506) (30000, 506)
Current num of lgb features: 505
Training until validation scores don't improve for 500 rounds
[500]	training's auc: 0.702177	valid_1's auc: 0.662789
[1000]	training's auc: 0.725869	valid_1's auc: 0.665903
[1500]	training's auc: 0.746329	valid_1's auc: 0.668254
[2000]	training's auc: 0.764208	valid_1's auc: 0.670289
[2500]	training's auc: 0.779795	valid_1's auc: 0.67134
[3000]	training's auc: 0.79466	valid_1's auc: 0.672238
[3500]	training's auc: 0.808179	valid_1's auc: 0.67298
[4000]	training's auc: 0.820807	valid_1's auc: 0.673897
[4500]	training's auc: 0.83245	valid_1's auc: 0.674679
[5000]	training's auc: 0.843002	valid_1's auc: 0.675538
[5500]	training's auc: 0.852817	valid_1's auc: 0.676171
[6000]	training's auc: 0.862141	valid_1's auc: 0.676849
[6500]	training's auc: 0.870556	valid_1's auc: 0.677233
[7000]	training's auc: 0.878495	valid_1's auc: 0.677485
[7500]	training's auc: 0.885917	v

Training until validation scores don't improve for 500 rounds
[500]	training's auc: 0.700161	valid_1's auc: 0.674363
[1000]	training's auc: 0.72357	valid_1's auc: 0.677943
[1500]	training's auc: 0.743683	valid_1's auc: 0.679695
[2000]	training's auc: 0.762281	valid_1's auc: 0.681442
[2500]	training's auc: 0.77854	valid_1's auc: 0.682418
[3000]	training's auc: 0.793237	valid_1's auc: 0.683219
[3500]	training's auc: 0.806903	valid_1's auc: 0.683856
[4000]	training's auc: 0.819142	valid_1's auc: 0.684678
[4500]	training's auc: 0.830622	valid_1's auc: 0.685345
[5000]	training's auc: 0.841007	valid_1's auc: 0.685722
[5500]	training's auc: 0.850686	valid_1's auc: 0.686385
[6000]	training's auc: 0.859839	valid_1's auc: 0.686946
[6500]	training's auc: 0.868114	valid_1's auc: 0.687413
[7000]	training's auc: 0.876051	valid_1's auc: 0.687638
[7500]	training's auc: 0.883473	valid_1's auc: 0.687923
[8000]	training's auc: 0.890274	valid_1's auc: 0.688129
[8500]	training's auc: 0.896593	valid_1's auc

In [8]:
lgb_score = round(lgb_score, 5)
outpath = '../user_data/'

test = pd.read_csv('../data/test.csv')
sample_submit = test[['customer_id']]
test['loan_default']=-1
# test['loan_default'] = 0.5 * lgb_preds + 0.5 * xgb_preds
test['loan_default'] = lgb_preds


test['loan_default'] = test['loan_default'].apply(lambda x: 1 if x > 0.24 else 0).values
test_ = pd.read_csv('../data/test.csv')
sample_submit = test_[['customer_id']]
sample_submit['loan_default'] = test['loan_default']
sample_submit.to_csv(outpath+'final_sub_lgb0.csv', index=False)

train_df  = pd.read_csv('../data/train.csv')
subVal_df = train_df[['customer_id']].copy()
subVal_df['loan_default'] = lgb_oof

all_auc_score = roc_auc_score(train_label, subVal_df['loan_default'])
print('整体指标得分：', all_auc_score)
all_auc_score = round(all_auc_score, 5)

# subVal_df.to_csv(outpath+'force_alln_fea1_cross1D_lgb1Val0.csv',index=False)
# sub_df.to_csv(
#     outpath + str(all_auc_score) + '_' + str(feaNum) + '_' + nowtime + '_{}_{}_{}_xgb.csv'.format(ss, fs,
#                                                                                                   kflod_num),
#     index=False)
# subVal_df.to_csv(
#     outpath + str(all_auc_score) + '_' + str(feaNum) + '_' + nowtime + '_{}_{}_{}_subVal.csv'.format(ss, fs,
#                                                                                                      kflod_num),
#     index=False)

整体指标得分： 0.6901580870888346


In [None]:
def lgb_model(train, target, test, k):

    
    saveFeature_df = pd.read_csv('../feature/lgb_importance.csv')

    
    saveFeature_list = list(saveFeature_df['Feature'][:528])
#     saveFeature_list=list(train.columns)
    feats = [f for f in saveFeature_list if f not in ['customer_id', 'loan_default']]
    feaNum = len(feats)
    print('Current num of features:', len(feats))
    


    seeds = [2020,666666]
    output_preds = 0
    lgb_oof_probs = np.zeros(train.shape[0])

    for seed in seeds:
        folds = StratifiedKFold(n_splits=k, shuffle=True, random_state=seed)
        oof_probs = np.zeros(train.shape[0])

        offline_score = []
        feature_importance_df = pd.DataFrame()
        params = {
                    'boosting_type': 'gbdt','objective': 'binary','metric': 'auc','learning_rate': 0.01,
                    'bagging_fraction': 1.0, 'bagging_freq': 44, 'feature_fraction': 0.5, 'max_depth': 6,
                    'min_child_weight': 10.0, 'min_data_in_leaf': 33, 'min_split_gain': 0.14174021024592806,
                    'num_leaves': 29, 'reg_alpha': 7.588866417707964, 'reg_lambda': 10.0,
                    'seed': seed,'n_jobs': -1,'verbose': -1,
                  }
        for i, (train_index, test_index) in enumerate(folds.split(train, target)):
            
            train_y, test_y = target[train_index], target[test_index]
            train_X, test_X = train[feats].iloc[train_index, :], train[feats].iloc[test_index, :]
            train_matrix = lgb.Dataset(train_X, label=train_y)
            valid_matrix = lgb.Dataset(test_X, label=test_y)
            test_matrix = test[feats]
            watchlist = [train_matrix, valid_matrix]
            
            model = lgb.train(params, train_matrix, num_boost_round=20000, valid_sets=watchlist,
                          verbose_eval=500, early_stopping_rounds=500)
            
            val_pred = model.predict(test_X, num_iteration=model.best_iteration)
            train_pred = model.predict(train_X, num_iteration=model.best_iteration)
            
            
    
     
            lgb_oof_probs[test_index] += val_pred / len(seeds)
            # oof_probs[test_index] += val_pred
            test_pred = model.predict(test_matrix, num_iteration=model.best_iteration,predict_disable_shape_check=True)
#             test_pred = model.predict(test_matrix, ntree_limit=model.best_ntree_limit)

            # 绘制roc曲线
            train_auc_value, valid_auc_value = plotroc(train_y, train_pred, test_y, val_pred)
            print('train_auc:{},valid_auc{}'.format(train_auc_value, valid_auc_value))
            offline_score.append(valid_auc_value)
            print(offline_score)
            output_preds += test_pred / k / len(seeds)
            

            fold_importance_df = pd.DataFrame()
#             booster = model.booster_
            fold_importance_df["Feature"] = model.feature_name()
            fold_importance_df["importance"] = model.feature_importance(importance_type='split')
            fold_importance_df["fold"] = i + 1

            feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)



        print('all_auc:', roc_auc_score(target.values, oof_probs))
        print('OOF-MEAN-AUC:%.6f, OOF-STD-AUC:%.6f' % (np.mean(offline_score), np.std(offline_score)))
        feature_sorted = feature_importance_df.groupby(['Feature'])['importance'].mean().sort_values(ascending=False)
#         feature_sorted.to_csv('../feature/lgb_importance.csv')
        top_features = feature_sorted.index
        print(feature_importance_df.groupby(['Feature'])['importance'].mean().sort_values(ascending=False).head(50))
    return output_preds, lgb_oof_probs, np.mean(offline_score), feaNum


DATA_PATH = '../data/'
print('读取数据...')
data, train_label = data_preprocess(DATA_PATH=DATA_PATH)

print('开始特征工程...')
print('data.shape', data.shape)
print('开始模型训练...')
train = data[~data['loan_default'].isnull()].copy()
target = train_label
test = data[data['loan_default'].isnull()].copy()

target_encode_cols = ['branch_id', 'supplier_id', 'manufacturer_id', 'year_of_birth','area_id','employee_code_id','age']

kflod_num = 5
ss = 0.8
fs = 0.4

class_list = ['branch_id', 'supplier_id', 'manufacturer_id', 'year_of_birth','area_id','employee_code_id','age']
MeanEnocodeFeature = class_list  # 声明需要平均数编码的特征
ME = MeanEncoder(MeanEnocodeFeature, target_type='classification')  # 声明平均数编码的类
train = ME.fit_transform(train, target)  # 对训练数据集的X和y进行拟合
# x_train_fav = ME.fit_transform(x_train,y_train_fav)#对训练数据集的X和y进行拟合
test = ME.transform(test)  # 对测试集进行编码
print('num0:mean_encode train.shape', train.shape, test.shape)

train, test = kfold_stats_feature(train, test, target_encode_cols, kflod_num)
print('num1:target_encode train.shape', train.shape, test.shape)
### target encoding目标编码，回归场景相对来说做目标编码的选择更多，不仅可以做均值编码，还可以做标准差编码、中位数编码等
enc_cols = []
stats_default_dict = {
    'max': train['loan_default'].max(),
    'min': train['loan_default'].min(),
    'median': train['loan_default'].median(),
    'mean': train['loan_default'].mean(),
    'sum': train['loan_default'].sum(),
    'std': train['loan_default'].std(),
    'skew': train['loan_default'].skew(),
    'kurt': train['loan_default'].kurt(),
    'mad': train['loan_default'].mad()
}
### 暂且选择这三种编码
enc_stats = ['max', 'min', 'skew', 'std']
skf = KFold(n_splits=kflod_num, shuffle=True, random_state=6666)
for f in tqdm(['branch_id', 'supplier_id', 'manufacturer_id', 'year_of_birth','area_id','employee_code_id','age']):
    enc_dict = {}
    for stat in enc_stats:
        enc_dict['{}_target_{}'.format(f, stat)] = stat
        train['{}_target_{}'.format(f, stat)] = 0
        test['{}_target_{}'.format(f, stat)] = 0
        enc_cols.append('{}_target_{}'.format(f, stat))
    for i, (trn_idx, val_idx) in enumerate(skf.split(train, target)):
        trn_x, val_x = train.iloc[trn_idx].reset_index(drop=True), train.iloc[val_idx].reset_index(drop=True)
        enc_df = trn_x.groupby(f, as_index=False)['loan_default'].agg(enc_dict)
        val_x = val_x[[f]].merge(enc_df, on=f, how='left')
        test_x = test[[f]].merge(enc_df, on=f, how='left')
        for stat in enc_stats:
            val_x['{}_target_{}'.format(f, stat)] = val_x['{}_target_{}'.format(f, stat)].fillna(
                stats_default_dict[stat])
            test_x['{}_target_{}'.format(f, stat)] = test_x['{}_target_{}'.format(f, stat)].fillna(
                stats_default_dict[stat])
            train.loc[val_idx, '{}_target_{}'.format(f, stat)] = val_x['{}_target_{}'.format(f, stat)].values
            test['{}_target_{}'.format(f, stat)] += test_x['{}_target_{}'.format(f, stat)].values / skf.n_splits

print('num2:target_encode train.shape', train.shape, test.shape)

train.drop(['branch_id', 'supplier_id', 'manufacturer_id', 'year_of_birth','area_id','employee_code_id','age'], axis=1, inplace=True)
test.drop(['branch_id', 'supplier_id', 'manufacturer_id', 'year_of_birth','area_id','employee_code_id','age'], axis=1, inplace=True)
print('输入数据维度：', train.shape, test.shape)

lgb_preds, lgb_oof, lgb_score, feaNum = lgb_model(train=train, target=target, test=test, k=kflod_num)