# 信用智能评分 baseline

In [74]:
import numpy as np 
import pandas as pd 
import lightgbm as lgb
import xgboost as xgb
from sklearn.linear_model import BayesianRidge
from sklearn.model_selection import KFold, RepeatedKFold
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from scipy import sparse
import warnings
import time
import sys
import os
import re
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import log_loss
# Silence all warnings for the session; the FutureWarning-specific filter is
# also covered by the blanket "ignore" filter that follows it.
warnings.simplefilter('ignore', category=FutureWarning)
warnings.filterwarnings(action="ignore")
# Display every column, and up to 100 characters per cell, when rendering frames.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 100)

In [75]:
## Load the training and test sets
train, test = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in ('../data/train_dataset.csv', '../data/test_dataset.csv')
)
train.head()

Unnamed: 0,用户编码,用户实名制是否通过核实,用户年龄,是否大学生客户,是否黑名单客户,是否4G不健康客户,用户网龄（月）,用户最近一次缴费距今时长（月）,缴费用户最近一次缴费金额（元）,用户近6个月平均消费值（元）,用户账单当月总费用（元）,用户当月账户余额（元）,缴费用户当前是否欠费缴费,用户话费敏感度,当月通话交往圈人数,是否经常逛商场的人,近三个月月均商场出现次数,当月是否逛过福州仓山万达,当月是否到过福州山姆会员店,当月是否看电影,当月是否景点游览,当月是否体育场馆消费,当月网购类应用使用次数,当月物流快递类应用使用次数,当月金融理财类应用使用总次数,当月视频播放类应用使用次数,当月飞机类应用使用次数,当月火车类应用使用次数,当月旅游资讯类应用使用次数,信用分
0,a4651f98c82948b186bdcdc8108381b4,1,44,0,0,0,186,1,99.8,163.86,159.2,180,0,3,83,1,75,0,0,0,1,1,713,0,2740,7145,0,0,30,664
1,aeb10247db4e4d67b2550bbc42ff9827,1,18,0,0,1,5,1,29.94,153.28,145.1,110,0,3,21,1,16,0,0,0,0,0,414,0,2731,44862,0,0,0,530
2,5af23a1e0e77410abb25e9a7eee510aa,1,47,0,0,0,145,1,49.9,109.64,120.2,70,0,1,59,0,1,0,0,0,0,0,3391,0,0,4804,0,0,1,643
3,43c64379d3c24a15b8478851b22049e4,1,55,0,0,0,234,1,99.8,92.97,167.42,90,0,3,78,1,26,0,0,0,1,1,500,0,1931,3141,0,0,5,649
4,f1687f3b8a6f4910bd0b13eb634056e2,1,40,0,0,0,76,1,49.9,95.47,101.0,80,0,3,70,1,44,0,0,0,1,0,522,0,64,59,0,0,0,648


## 数据预处理

In [76]:
## Drop near-constant columns: a column is removed when its single most frequent
## value (NaN counted as a value) covers more than 98% of the training rows.
## (The previous comment said 90%, but the threshold actually applied is 0.98.)
DOMINANT_VALUE_THRESHOLD = 0.98
good_cols = list(train.columns)
for col in train.columns:
    # value_counts() sorts by frequency, so the first entry is the dominant value's share.
    rate = train[col].value_counts(normalize=True, dropna=False).values[0]
    if rate > DOMINANT_VALUE_THRESHOLD:
        good_cols.remove(col)
        print(col, rate)

用户实名制是否通过核实 0.99022
是否大学生客户 0.99628
当月物流快递类应用使用次数 0.98558
当月飞机类应用使用次数 0.98628


In [77]:
## Summary statistics of the credit-score column
train.loc[:, '信用分'].describe()

count    50000.000000
mean       618.053060
std         42.443022
min        422.000000
25%        594.000000
50%        627.000000
75%        649.000000
max        719.000000
Name: 信用分, dtype: float64

In [78]:
## Pull out the label, keep only the retained columns, then stack train + test
## so that later feature engineering is applied to both at once.
target = train['信用分']
# In-place removal: the label must not be selected as a feature column.
good_cols.remove('信用分')
train, test = train[good_cols], test[good_cols]
data = pd.concat((train, test), ignore_index=True)
data.head()

Unnamed: 0,用户编码,用户年龄,是否黑名单客户,是否4G不健康客户,用户网龄（月）,用户最近一次缴费距今时长（月）,缴费用户最近一次缴费金额（元）,用户近6个月平均消费值（元）,用户账单当月总费用（元）,用户当月账户余额（元）,缴费用户当前是否欠费缴费,用户话费敏感度,当月通话交往圈人数,是否经常逛商场的人,近三个月月均商场出现次数,当月是否逛过福州仓山万达,当月是否到过福州山姆会员店,当月是否看电影,当月是否景点游览,当月是否体育场馆消费,当月网购类应用使用次数,当月金融理财类应用使用总次数,当月视频播放类应用使用次数,当月火车类应用使用次数,当月旅游资讯类应用使用次数
0,a4651f98c82948b186bdcdc8108381b4,44,0,0,186,1,99.8,163.86,159.2,180,0,3,83,1,75,0,0,0,1,1,713,2740,7145,0,30
1,aeb10247db4e4d67b2550bbc42ff9827,18,0,1,5,1,29.94,153.28,145.1,110,0,3,21,1,16,0,0,0,0,0,414,2731,44862,0,0
2,5af23a1e0e77410abb25e9a7eee510aa,47,0,0,145,1,49.9,109.64,120.2,70,0,1,59,0,1,0,0,0,0,0,3391,0,4804,0,1
3,43c64379d3c24a15b8478851b22049e4,55,0,0,234,1,99.8,92.97,167.42,90,0,3,78,1,26,0,0,0,1,1,500,1931,3141,0,5
4,f1687f3b8a6f4910bd0b13eb634056e2,40,0,0,76,1,49.9,95.47,101.0,80,0,3,70,1,44,0,0,0,1,0,522,64,59,0,0


In [79]:
# Preview the first five label values.
target.head(n=5)

0    664
1    530
2    643
3    649
4    648
Name: 信用分, dtype: int64

In [80]:
## Categorical features
categorical_columns = [
    '是否黑名单客户',
    '是否4G不健康客户',
    '缴费用户当前是否欠费缴费',
    '用户话费敏感度',
    '是否经常逛商场的人',
    '当月是否逛过福州仓山万达',
    '当月是否到过福州山姆会员店',
    '当月是否看电影',
    '当月是否景点游览',
    '当月是否体育场馆消费',
]
## Numerical features: every remaining column except the user id
all_columns = train.columns.tolist()
numerical_columns = [col for col in all_columns if col not in categorical_columns]
numerical_columns.remove('用户编码')

In [82]:
## Label-encode the categorical features
# Each distinct value gets an integer code in order of first appearance.
# The codes are built from the non-null values only: pairing `unique()` (which
# includes NaN) with `range(nunique())` (which does not count NaN) would silently
# leave one real category unmapped whenever a column contains missing values.
# Missing values stay NaN after the mapping.
for f in categorical_columns:
    observed = data[f].dropna().unique()
    data[f] = data[f].map({value: code for code, value in enumerate(observed)})

In [97]:
# New feature: current-month bill total minus the most recent payment amount.
bill_gap = data['用户账单当月总费用（元）'] - data['缴费用户最近一次缴费金额（元）']
data['当月账单-最近缴费'] = bill_gap
data['当月账单-最近缴费']

0         59.40
1        115.16
2         70.30
3         67.62
4         51.10
5        103.60
6        117.98
7         55.89
8        -25.01
9         -6.94
10       108.92
11       -11.80
12         4.10
13        88.00
14        98.30
15       111.00
16        60.00
17       348.00
18        53.26
19       101.00
20        98.00
21        62.50
22        49.00
23       -83.08
24        62.00
25        50.70
26        -3.79
27        28.12
28       116.20
29       115.79
          ...  
99970     -9.90
99971     95.44
99972    -40.80
99973    -71.54
99974     -9.70
99975    -18.70
99976    -26.90
99977    -52.10
99978     58.30
99979     80.44
99980     -8.30
99981     62.96
99982    139.00
99983     38.20
99984     25.81
99985     51.37
99986    112.00
99987    115.37
99988     51.20
99989    239.14
99990     68.50
99991     93.10
99992     29.69
99993    129.50
99994    -11.90
99995   -253.40
99996     54.95
99997     48.26
99998     56.06
99999     51.02
Name: 当月账单-最近缴费, Length:

In [98]:
# Preview the combined frame after encoding and feature creation.
data.head(n=5)

Unnamed: 0,用户编码,用户年龄,是否黑名单客户,是否4G不健康客户,用户网龄（月）,用户最近一次缴费距今时长（月）,缴费用户最近一次缴费金额（元）,用户近6个月平均消费值（元）,用户账单当月总费用（元）,用户当月账户余额（元）,缴费用户当前是否欠费缴费,用户话费敏感度,当月通话交往圈人数,是否经常逛商场的人,近三个月月均商场出现次数,当月是否逛过福州仓山万达,当月是否到过福州山姆会员店,当月是否看电影,当月是否景点游览,当月是否体育场馆消费,当月网购类应用使用次数,当月金融理财类应用使用总次数,当月视频播放类应用使用次数,当月火车类应用使用次数,当月旅游资讯类应用使用次数,当月账单-最近缴费
0,a4651f98c82948b186bdcdc8108381b4,44,0,0,186,1,99.8,163.86,159.2,180,0,0,83,0,75,0,0,0,0,0,713,2740,7145,0,30,59.4
1,aeb10247db4e4d67b2550bbc42ff9827,18,0,1,5,1,29.94,153.28,145.1,110,0,0,21,0,16,0,0,0,1,1,414,2731,44862,0,0,115.16
2,5af23a1e0e77410abb25e9a7eee510aa,47,0,0,145,1,49.9,109.64,120.2,70,0,1,59,1,1,0,0,0,1,1,3391,0,4804,0,1,70.3
3,43c64379d3c24a15b8478851b22049e4,55,0,0,234,1,99.8,92.97,167.42,90,0,0,78,0,26,0,0,0,0,0,500,1931,3141,0,5,67.62
4,f1687f3b8a6f4910bd0b13eb634056e2,40,0,0,76,1,49.9,95.47,101.0,80,0,0,70,0,44,0,0,0,0,1,522,64,59,0,0,51.1


In [99]:
## Split the combined frame back into its train and test parts
# The first len(train) rows of `data` are the training rows (train was concatenated first).
n_train = train.shape[0]
train, test = data[:n_train], data[n_train:]
print(train.shape)
print(test.shape)

(50000, 26)
(50000, 26)


In [100]:
## One-hot encode the categorical features
train = pd.get_dummies(train, columns=categorical_columns)
test = pd.get_dummies(test, columns=categorical_columns)
# The two frames are encoded independently, so a category present in only one of
# them would produce mismatched columns. Force test onto train's column layout:
# dummies missing from test are added as 0, test-only dummies are dropped.
test = test.reindex(columns=train.columns, fill_value=0)

In [101]:
# The user id is an identifier, not a feature: remove it from both frames in place.
for frame in (train, test):
    frame.drop(columns='用户编码', inplace=True)

In [117]:
# NOTE: `best_features` is assigned by `best_features = featureSelect(...)` in the
# feature-selection section further down this notebook, so that section has to be
# executed before this cell.
selected_train = train[best_features]
X_train, X_test = selected_train.values, test[best_features].values
y_train = target.values
selected_train.head()

Unnamed: 0,用户年龄,用户网龄（月）,缴费用户最近一次缴费金额（元）,用户近6个月平均消费值（元）,用户账单当月总费用（元）,用户当月账户余额（元）,当月通话交往圈人数,近三个月月均商场出现次数,当月网购类应用使用次数,当月金融理财类应用使用总次数,当月视频播放类应用使用次数,当月火车类应用使用次数,当月旅游资讯类应用使用次数,是否黑名单客户_0,是否黑名单客户_1,是否4G不健康客户_0,缴费用户当前是否欠费缴费_0,缴费用户当前是否欠费缴费_1,用户话费敏感度_0,用户话费敏感度_1,用户话费敏感度_2,用户话费敏感度_3,用户话费敏感度_4,用户话费敏感度_5,是否经常逛商场的人_0,是否经常逛商场的人_1,当月是否逛过福州仓山万达_0,当月是否逛过福州仓山万达_1,当月是否到过福州山姆会员店_0,当月是否到过福州山姆会员店_1,当月是否看电影_0,当月是否看电影_1,当月是否景点游览_0,当月是否景点游览_1,当月是否体育场馆消费_0,当月是否体育场馆消费_1
0,44,186,99.8,163.86,159.2,180,83,75,713,2740,7145,0,30,1,0,1,1,0,1,0,0,0,0,0,1,0,1,0,1,0,1,0,1,0,1,0
1,18,5,29.94,153.28,145.1,110,21,16,414,2731,44862,0,0,1,0,0,1,0,1,0,0,0,0,0,1,0,1,0,1,0,1,0,0,1,0,1
2,47,145,49.9,109.64,120.2,70,59,1,3391,0,4804,0,1,1,0,1,1,0,0,1,0,0,0,0,0,1,1,0,1,0,1,0,0,1,0,1
3,55,234,99.8,92.97,167.42,90,78,26,500,1931,3141,0,5,1,0,1,1,0,1,0,0,0,0,0,1,0,1,0,1,0,1,0,1,0,1,0
4,40,76,49.9,95.47,101.0,80,70,44,522,64,59,0,0,1,0,1,1,0,1,0,0,0,0,0,1,0,1,0,1,0,1,0,1,0,0,1


## 开始训练

In [118]:
## LightGBM: 5-fold cross-validation producing out-of-fold (OOF) and test predictions
# `min_child_samples` is documented by LightGBM as an alias of `min_data_in_leaf`.
# The dict previously set both (20 and 30), which is contradictory; only the
# primary name is kept.
# NOTE(review): assumes LightGBM gave precedence to the primary name, i.e. the
# effective value was already 20 — confirm against the installed version.
param = {'num_leaves': 100,
         'min_data_in_leaf': 20,
         'objective': 'regression',
         'max_depth': -1,
         'learning_rate': 0.02,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9,
         "bagging_seed": 11,
         "metric": 'mae',
         "lambda_l1": 0.1,
         "verbosity": -1}
folds = KFold(n_splits=5, shuffle=True, random_state=2018)
oof_lgb = np.zeros(len(train))
predictions_lgb = np.zeros(len(test))
# Upper bound on boosting rounds; early stopping normally ends training sooner.
num_round = 10000

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("fold n°{}".format(fold_+1))
    trn_data = lgb.Dataset(X_train[trn_idx], y_train[trn_idx])
    val_data = lgb.Dataset(X_train[val_idx], y_train[val_idx])

    clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data],
                    verbose_eval=100, early_stopping_rounds=200)
    # Held-out predictions fill this fold's slice of the OOF vector.
    oof_lgb[val_idx] = clf.predict(X_train[val_idx], num_iteration=clf.best_iteration)
    # Test predictions are averaged over the folds.
    predictions_lgb += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits

# scikit-learn's signature is (y_true, y_pred); MAE is symmetric, so the value is unchanged.
cv_mae = mean_absolute_error(target, oof_lgb)
print("CV score: {:<8.8f}".format(cv_mae))
print('final score: {:<8.8f}'.format(1 / (1 + cv_mae)))

fold n°1
Training until validation scores don't improve for 200 rounds.
[100]	training's l1: 15.2792	valid_1's l1: 16.1132
[200]	training's l1: 13.8199	valid_1's l1: 15.2205
[300]	training's l1: 13.2198	valid_1's l1: 15.0904
[400]	training's l1: 12.781	valid_1's l1: 15.0614
[500]	training's l1: 12.4053	valid_1's l1: 15.049
[600]	training's l1: 12.062	valid_1's l1: 15.0481
[700]	training's l1: 11.7366	valid_1's l1: 15.04
[800]	training's l1: 11.4338	valid_1's l1: 15.0431
[900]	training's l1: 11.1505	valid_1's l1: 15.0466
Early stopping, best iteration is:
[738]	training's l1: 11.622	valid_1's l1: 15.0386
[0. 0. 0. ... 0. 0. 0.]
fold n°2
Training until validation scores don't improve for 200 rounds.
[100]	training's l1: 15.355	valid_1's l1: 15.869
[200]	training's l1: 13.8992	valid_1's l1: 14.9259
[300]	training's l1: 13.3092	valid_1's l1: 14.7836
[400]	training's l1: 12.8768	valid_1's l1: 14.7484
[500]	training's l1: 12.4993	valid_1's l1: 14.7339
[600]	training's l1: 12.1482	valid_1's l

In [119]:
## XGBoost: 5-fold cross-validation producing out-of-fold (OOF) and test predictions
# NOTE(review): 'reg:linear', 'silent' and `ntree_limit` appear to be deprecated in
# newer XGBoost releases — confirm against the installed version before upgrading.
xgb_params = {
    'eta': 0.005,
    'max_depth': 10,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
    'silent': True,
    'nthread': 4,
}

folds = KFold(n_splits=5, shuffle=True, random_state=2018)
oof_xgb = np.zeros(len(train))
predictions_xgb = np.zeros(len(test))

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("fold n°{}".format(fold_+1))
    trn_data = xgb.DMatrix(X_train[trn_idx], y_train[trn_idx])
    val_data = xgb.DMatrix(X_train[val_idx], y_train[val_idx])

    # The last entry of `evals` is the one early stopping monitors.
    clf = xgb.train(
        params=xgb_params,
        dtrain=trn_data,
        num_boost_round=20000,
        evals=[(trn_data, 'train'), (val_data, 'valid_data')],
        early_stopping_rounds=100,
        verbose_eval=100,
    )
    best_limit = clf.best_ntree_limit
    oof_xgb[val_idx] = clf.predict(xgb.DMatrix(X_train[val_idx]), ntree_limit=best_limit)
    predictions_xgb += clf.predict(xgb.DMatrix(X_test), ntree_limit=best_limit) / folds.n_splits

cv_mae = mean_absolute_error(oof_xgb, target)
print("CV score: {:<8.8f}".format(cv_mae))
print('final score: {:<8.8f}'.format(1 / (1 + cv_mae)))


fold n°1
[0]	train-mae:614.328	valid_data-mae:615.054
Multiple eval metrics have been passed: 'valid_data-mae' will be used for early stopping.

Will train until valid_data-mae hasn't improved in 100 rounds.
[100]	train-mae:372.211	valid_data-mae:372.724
[200]	train-mae:225.539	valid_data-mae:225.902
[300]	train-mae:136.69	valid_data-mae:136.958
[400]	train-mae:82.9324	valid_data-mae:83.2282
[500]	train-mae:50.6494	valid_data-mae:51.2022
[600]	train-mae:31.8311	valid_data-mae:32.8916
[700]	train-mae:21.5657	valid_data-mae:23.197
[800]	train-mae:16.4594	valid_data-mae:18.6204
[900]	train-mae:14.099	valid_data-mae:16.6176
[1000]	train-mae:12.9612	valid_data-mae:15.7646
[1100]	train-mae:12.3466	valid_data-mae:15.3969
[1200]	train-mae:11.9525	valid_data-mae:15.2321
[1300]	train-mae:11.6508	valid_data-mae:15.1472
[1400]	train-mae:11.4024	valid_data-mae:15.101
[1500]	train-mae:11.192	valid_data-mae:15.0724
[1600]	train-mae:10.9921	valid_data-mae:15.0539
[1700]	train-mae:10.7931	valid_data-ma

In [120]:
## Stack the LightGBM and XGBoost predictions with a BayesianRidge meta-model
# Meta-features: one column per base model (OOF predictions for train,
# fold-averaged predictions for test).
train_stack = np.vstack([oof_lgb, oof_xgb]).transpose()
test_stack = np.vstack([predictions_lgb, predictions_xgb]).transpose()

folds_stack = RepeatedKFold(n_splits=5, n_repeats=2, random_state=2019)
# Total number of meta-models fitted (n_splits * n_repeats). Derived from the
# splitter so the test average stays correct if either setting is changed.
n_stack_models = folds_stack.get_n_splits()
oof_stack = np.zeros(train_stack.shape[0])
predictions = np.zeros(test_stack.shape[0])

for fold_, (trn_idx, val_idx) in enumerate(folds_stack.split(train_stack, target)):
    print("fold {}".format(fold_))
    trn_data, trn_y = train_stack[trn_idx], target.iloc[trn_idx].values
    val_data, val_y = train_stack[val_idx], target.iloc[val_idx].values

    clf_3 = BayesianRidge()
    clf_3.fit(trn_data, trn_y)

    # Every row is held out once per repeat, so the value written by a later
    # repeat overwrites the earlier one; the CV score reflects the last repeat.
    oof_stack[val_idx] = clf_3.predict(val_data)
    predictions += clf_3.predict(test_stack) / n_stack_models

# scikit-learn's signature is (y_true, y_pred); MAE is symmetric, so the value is unchanged.
cv_mae = mean_absolute_error(target, oof_stack)
print("CV score: {:<8.8f}".format(cv_mae))
print('final score: {:<8.8f}'.format(1 / (1 + cv_mae)))

fold 0
fold 1
fold 2
fold 3
fold 4
fold 5
fold 6
fold 7
fold 8
fold 9
CV score: 14.73582840
final score: 0.06354924


## 保存结果

In [121]:
# Round to the nearest integer score. A bare astype(int) truncates toward zero,
# which shifts every positive prediction down by up to one point.
predictions = np.round(predictions).astype(int)
print(predictions)

[602 533 669 ... 547 537 557]


In [122]:
# Fill the sample-submission frame with the predictions and save it.
# NOTE(review): the column label is " score" with a leading space — confirm it
# matches the header of submit_example.csv.
sub_df = pd.read_csv("../data/submit_example.csv").assign(**{" score": predictions})
sub_df.to_csv("../data/submission_63549.csv", index=False)

## 特征选择（注意：本节需在上文“开始训练”之前运行，以生成 best_features）

In [110]:
def modeling_cross_validation(params, X, y, nr_folds=5):
    """Score a LightGBM configuration with K-fold cross-validation.

    Args:
        params: parameter dict passed straight to ``lgb.train``.
        X: feature array, indexed by row with the fold indices.
        y: target array aligned row-for-row with ``X``.
        nr_folds: number of folds.

    Returns:
        ``1 / (1 + MAE)`` of the out-of-fold predictions against ``y``
        (higher is better).
    """
    oof_preds = np.zeros(X.shape[0])
    # Contiguous, unshuffled folds, as before. `random_state` is omitted because it
    # has no effect without shuffling and recent scikit-learn rejects the combination.
    folds = KFold(n_splits=nr_folds, shuffle=False)

    for fold_, (trn_idx, val_idx) in enumerate(folds.split(X, y)):
        print("fold n°{}".format(fold_+1))
        trn_data = lgb.Dataset(X[trn_idx], y[trn_idx])
        val_data = lgb.Dataset(X[val_idx], y[val_idx])

        # Upper bound on boosting rounds; early stopping normally ends training sooner.
        num_round = 20000
        clf = lgb.train(params, trn_data, num_round, valid_sets=[trn_data, val_data],
                        verbose_eval=1000, early_stopping_rounds=100)
        oof_preds[val_idx] = clf.predict(X[val_idx], num_iteration=clf.best_iteration)

    # Score against the `y` argument rather than the notebook-level `target`, so
    # the function is correct for any (X, y) pair it is given.
    score = mean_absolute_error(y, oof_preds)

    return 1 / (1 + score)

In [112]:
def featureSelect(init_cols, tolerance=0.00001):
    """Greedy backward feature elimination scored by LightGBM cross-validation.

    Each column of ``init_cols`` is tentatively removed, in order. The removal is
    kept when the CV score drops by less than ``tolerance`` (or improves);
    otherwise the column is put back. Reads the notebook-level ``train`` frame
    and ``target`` series.

    Args:
        init_cols: list of candidate column names in ``train``.
        tolerance: largest score drop (in ``1 / (1 + MAE)`` units) still accepted
            as a successful removal. Defaults to the previously hard-coded 0.00001.

    Returns:
        The list of retained column names.
    """
    # `min_child_samples` was previously also set to 30; LightGBM documents it as
    # an alias of `min_data_in_leaf`, so the duplicate entry is dropped.
    params = {'num_leaves': 120,
              'min_data_in_leaf': 30,
              'objective': 'regression',
              'max_depth': -1,
              'learning_rate': 0.05,
              "boosting": "gbdt",
              "feature_fraction": 0.9,
              "bagging_freq": 1,
              "bagging_fraction": 0.9,
              "bagging_seed": 11,
              "metric": 'mse',
              "lambda_l1": 0.02,
              "verbosity": -1}
    best_cols = init_cols.copy()
    best_score = modeling_cross_validation(params, train[init_cols].values, target.values, nr_folds=5)
    print("初始CV score: {:<8.8f}".format(best_score))
    # Iterate over the untouched input list while editing the working copy.
    for f in init_cols:
        best_cols.remove(f)
        score = modeling_cross_validation(params, train[best_cols].values, target.values, nr_folds=5)
        diff = best_score - score
        print('-'*10)
        if diff < tolerance:
            print("当前移除特征: {}, CV score: {:<8.8f}, 最佳cv score: {:<8.8f}, 有效果,删除！！".format(f, score, best_score))
            # The reference score becomes the new score even when it is marginally
            # lower (within tolerance) than the previous best.
            best_score = score
        else:
            print("当前移除特征: {}, CV score: {:<8.8f}, 最佳cv score: {:<8.8f}, 没效果,保留！！".format(f, score, best_score))
            best_cols.append(f)
    print('-'*10)
    print("优化后CV score: {:<8.8f}".format(best_score))

    return best_cols
    
# Run the selection over every column of `train`. The resulting list is consumed
# by the `X_train = train[best_features].values` cell in the training section.
best_features = featureSelect(train.columns.tolist())

fold n°1
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[197]	training's l2: 244.626	valid_1's l2: 373.186
fold n°2
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[188]	training's l2: 248.914	valid_1's l2: 375.628
fold n°3
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[242]	training's l2: 231.28	valid_1's l2: 361.859
fold n°4
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[150]	training's l2: 265.603	valid_1's l2: 374.941
fold n°5
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[154]	training's l2: 262.36	valid_1's l2: 383.211
初始CV score: 0.06310493
fold n°1
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[178]	training's l2: 273.007	valid_1's l2: 401.258
fold n°2
Training until validat

Early stopping, best iteration is:
[229]	training's l2: 238.024	valid_1's l2: 363.378
fold n°4
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[156]	training's l2: 265.507	valid_1's l2: 376.208
fold n°5
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[138]	training's l2: 272.882	valid_1's l2: 384.367
----------
当前移除特征: 近三个月月均商场出现次数, CV score: 0.06300965, 最佳cv score: 0.06317508, 没效果,保留！！
fold n°1
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[227]	training's l2: 237.12	valid_1's l2: 373.619
fold n°2
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[202]	training's l2: 246.751	valid_1's l2: 375.197
fold n°3
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[199]	training's l2: 249.84	valid_1's l2: 362.351
fold n°4
Training until validation sco

Early stopping, best iteration is:
[155]	training's l2: 263.477	valid_1's l2: 381.754
----------
当前移除特征: 是否4G不健康客户_0, CV score: 0.06316573, 最佳cv score: 0.06318591, 没效果,保留！！
fold n°1
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[193]	training's l2: 248.738	valid_1's l2: 372.497
fold n°2
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[196]	training's l2: 247.791	valid_1's l2: 374.342
fold n°3
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[218]	training's l2: 241.407	valid_1's l2: 360.336
fold n°4
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[146]	training's l2: 268.715	valid_1's l2: 374.236
fold n°5
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[140]	training's l2: 270.603	valid_1's l2: 381.867
----------
当前移除特征: 是否4G不健康客户_1, CV sc

Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[228]	training's l2: 235.281	valid_1's l2: 373.125
fold n°3
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[199]	training's l2: 249.391	valid_1's l2: 361.525
fold n°4
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[162]	training's l2: 261.951	valid_1's l2: 375.653
fold n°5
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[161]	training's l2: 260.814	valid_1's l2: 383.047
----------
当前移除特征: 是否经常逛商场的人_0, CV score: 0.06318047, 最佳cv score: 0.06323320, 没效果,保留！！
fold n°1
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[165]	training's l2: 261.077	valid_1's l2: 372.056
fold n°2
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[194]	training's l2: 247.

Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[162]	training's l2: 261.833	valid_1's l2: 377.522
fold n°5
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[168]	training's l2: 257.736	valid_1's l2: 381.149
----------
当前移除特征: 当月是否景点游览_1, CV score: 0.06309750, 最佳cv score: 0.06323320, 没效果,保留！！
fold n°1
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[190]	training's l2: 249.634	valid_1's l2: 373.016
fold n°2
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[188]	training's l2: 250.494	valid_1's l2: 374.471
fold n°3
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[201]	training's l2: 247.78	valid_1's l2: 360.527
fold n°4
Training until validation scores don't improve for 100 rounds.
Early stopping, best iteration is:
[172]	training's l2: 256.81

In [113]:
best_features

['用户年龄',
 '用户网龄（月）',
 '缴费用户最近一次缴费金额（元）',
 '用户近6个月平均消费值（元）',
 '用户账单当月总费用（元）',
 '用户当月账户余额（元）',
 '当月通话交往圈人数',
 '近三个月月均商场出现次数',
 '当月网购类应用使用次数',
 '当月金融理财类应用使用总次数',
 '当月视频播放类应用使用次数',
 '当月火车类应用使用次数',
 '当月旅游资讯类应用使用次数',
 '是否黑名单客户_0',
 '是否黑名单客户_1',
 '是否4G不健康客户_0',
 '缴费用户当前是否欠费缴费_0',
 '缴费用户当前是否欠费缴费_1',
 '用户话费敏感度_0',
 '用户话费敏感度_1',
 '用户话费敏感度_2',
 '用户话费敏感度_3',
 '用户话费敏感度_4',
 '用户话费敏感度_5',
 '是否经常逛商场的人_0',
 '是否经常逛商场的人_1',
 '当月是否逛过福州仓山万达_0',
 '当月是否逛过福州仓山万达_1',
 '当月是否到过福州山姆会员店_0',
 '当月是否到过福州山姆会员店_1',
 '当月是否看电影_0',
 '当月是否看电影_1',
 '当月是否景点游览_0',
 '当月是否景点游览_1',
 '当月是否体育场馆消费_0',
 '当月是否体育场馆消费_1']