In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.metrics import roc_auc_score
%matplotlib inline

In [2]:
# Plot styling, then the three raw CSV tables this notebook works from.
sns.set_style('whitegrid')
train_data, test_data, bureau_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        'E:/pnk/kaggle_credit/application_train.csv',
        'E:/pnk/kaggle_credit/application_test.csv',
        'E:/pnk/kaggle_credit/bureau.csv',
    )
)

# 数据不平衡，重采样


In [82]:
# Class balance of the training frame: rows with TARGET == 1, then the rest.
n_positive = train_data['TARGET'].sum()
print (n_positive, len(train_data) - n_positive)

24825 282686


In [50]:
# Build two equally sized groups: TARGET == 0 rows drawn at random with
# replacement, and TARGET == 1 rows repeated OVERSAMPLE_FACTOR times.
np.random.seed(1)
OVERSAMPLE_FACTOR = 4
# reset_index(drop=True) returns a new frame, so nothing is modified in place
# on a filtered slice of train_data (that pattern raises SettingWithCopyWarning).
safe_data = train_data[train_data['TARGET']==0].reset_index(drop=True)
bad_data = train_data[train_data['TARGET']==1]
# Derived from the data rather than hard-coded; with the 24825 positives
# reported above this is the same 99300 the cell used before.
n_per_class = OVERSAMPLE_FACTOR * len(bad_data)
safe_part_index = np.random.randint(low=0, high=len(safe_data), size=n_per_class)
safe_data = safe_data.loc[safe_part_index]
bad_data = pd.concat([bad_data] * OVERSAMPLE_FACTOR).reset_index(drop=True)

In [51]:
# Keep the balanced sample under its own name instead of rebinding train_data.
# The cross-validation and final-model cells further down do their own
# balancing starting from the full training frame, so overwriting train_data
# here would make a top-to-bottom run train on already-duplicated rows.
train_data_balanced = pd.concat([safe_data, bad_data]).reset_index(drop=True)

In [5]:
# Row counts of the two resampled groups.
print (safe_data.shape[0], bad_data.shape[0])

99300 99300


# 探索bureau文件特征

In [3]:
# Remove two bureau columns before any feature work.
unused_bureau_cols = ['DAYS_ENDDATE_FACT', 'AMT_ANNUITY']
bureau_data = bureau_data.drop(columns=unused_bureau_cols)

In [4]:
# Partition the bureau columns by dtype: object columns are treated as
# categorical, everything else as numeric.
cat_variables = [col for col in bureau_data.columns if bureau_data[col].dtype == 'object']
num_variables = [col for col in bureau_data.columns if bureau_data[col].dtype != 'object']

In [5]:
# Fraction of non-null rows for every numeric bureau column.
n_bureau_rows = len(bureau_data)
num_dict = {col: bureau_data[col].count()/n_bureau_rows for col in num_variables}
print(num_dict)

{'SK_ID_CURR': 1.0, 'SK_ID_BUREAU': 1.0, 'DAYS_CREDIT': 1.0, 'CREDIT_DAY_OVERDUE': 1.0, 'DAYS_CREDIT_ENDDATE': 0.93850426583579383, 'AMT_CREDIT_MAX_OVERDUE': 0.34486736408401636, 'CNT_CREDIT_PROLONG': 1.0, 'AMT_CREDIT_SUM': 0.99999242613147765, 'AMT_CREDIT_SUM_DEBT': 0.84988068244051018, 'AMT_CREDIT_SUM_LIMIT': 0.65522585275933509, 'AMT_CREDIT_SUM_OVERDUE': 1.0, 'DAYS_CREDIT_UPDATE': 1.0}


In [6]:
def fill_with0(origin_data, fillmid_cols):
    """Replace missing values with 0 in the listed columns of ``origin_data``.

    The frame is modified in place and nothing is returned.

    Parameters
    ----------
    origin_data : pandas.DataFrame
        Frame whose columns are filled.
    fillmid_cols : iterable
        Labels of the columns to fill; other columns are left untouched.
    """
    for i in fillmid_cols:
        # Assign the filled column back to the frame. Calling
        # fillna(inplace=True) on origin_data[i] mutates a temporary object
        # and does not reach the frame when pandas Copy-on-Write is active.
        origin_data[i] = origin_data[i].fillna(0)
    return
# Zero-fill every numeric bureau column (the keys of num_dict).
fill_with0(bureau_data, list(num_dict))

In [7]:
# One-hot encode bureau_data; pd.get_dummies is applied to the whole frame.
bureau_data = pd.get_dummies(bureau_data)

In [8]:
# Inspect the column labels after one-hot encoding (display only).
bureau_data.columns

Index(['SK_ID_CURR', 'SK_ID_BUREAU', 'DAYS_CREDIT', 'CREDIT_DAY_OVERDUE',
       'DAYS_CREDIT_ENDDATE', 'AMT_CREDIT_MAX_OVERDUE', 'CNT_CREDIT_PROLONG',
       'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM_LIMIT',
       'AMT_CREDIT_SUM_OVERDUE', 'DAYS_CREDIT_UPDATE', 'CREDIT_ACTIVE_Active',
       'CREDIT_ACTIVE_Bad debt', 'CREDIT_ACTIVE_Closed', 'CREDIT_ACTIVE_Sold',
       'CREDIT_CURRENCY_currency 1', 'CREDIT_CURRENCY_currency 2',
       'CREDIT_CURRENCY_currency 3', 'CREDIT_CURRENCY_currency 4',
       'CREDIT_TYPE_Another type of loan', 'CREDIT_TYPE_Car loan',
       'CREDIT_TYPE_Cash loan (non-earmarked)', 'CREDIT_TYPE_Consumer credit',
       'CREDIT_TYPE_Credit card', 'CREDIT_TYPE_Interbank credit',
       'CREDIT_TYPE_Loan for business development',
       'CREDIT_TYPE_Loan for purchase of shares (margin lending)',
       'CREDIT_TYPE_Loan for the purchase of equipment',
       'CREDIT_TYPE_Loan for working capital replenishment',
       'CREDIT_TYPE_Microloan', 'CR

In [9]:
# Decide how each bureau column is aggregated per SK_ID_CURR:
#   sum_features    -> summed
#   minmax_features -> reduced with min and with max
# Both lists end with the grouping key itself.
id_cols = ['SK_ID_CURR', 'SK_ID_BUREAU']
dummy_prefixes = ('CREDIT_ACTIVE', 'CREDIT_CURRENCY', 'CREDIT_TYPE')
sum_features = ['CREDIT_DAY_OVERDUE','AMT_CREDIT_MAX_OVERDUE','CNT_CREDIT_PROLONG','AMT_CREDIT_SUM',
               'AMT_CREDIT_SUM_DEBT','AMT_CREDIT_SUM_LIMIT','AMT_CREDIT_SUM_OVERDUE']
# str.startswith accepts a tuple of prefixes.
sum_features += [col for col in bureau_data.columns if col.startswith(dummy_prefixes)]
minmax_features = [col for col in bureau_data.columns
                   if col not in id_cols and col not in sum_features]
sum_features.append('SK_ID_CURR')
minmax_features.append('SK_ID_CURR')
print(sum_features)
print(minmax_features)

['CREDIT_DAY_OVERDUE', 'AMT_CREDIT_MAX_OVERDUE', 'CNT_CREDIT_PROLONG', 'AMT_CREDIT_SUM', 'AMT_CREDIT_SUM_DEBT', 'AMT_CREDIT_SUM_LIMIT', 'AMT_CREDIT_SUM_OVERDUE', 'CREDIT_ACTIVE_Active', 'CREDIT_ACTIVE_Bad debt', 'CREDIT_ACTIVE_Closed', 'CREDIT_ACTIVE_Sold', 'CREDIT_CURRENCY_currency 1', 'CREDIT_CURRENCY_currency 2', 'CREDIT_CURRENCY_currency 3', 'CREDIT_CURRENCY_currency 4', 'CREDIT_TYPE_Another type of loan', 'CREDIT_TYPE_Car loan', 'CREDIT_TYPE_Cash loan (non-earmarked)', 'CREDIT_TYPE_Consumer credit', 'CREDIT_TYPE_Credit card', 'CREDIT_TYPE_Interbank credit', 'CREDIT_TYPE_Loan for business development', 'CREDIT_TYPE_Loan for purchase of shares (margin lending)', 'CREDIT_TYPE_Loan for the purchase of equipment', 'CREDIT_TYPE_Loan for working capital replenishment', 'CREDIT_TYPE_Microloan', 'CREDIT_TYPE_Mobile operator loan', 'CREDIT_TYPE_Mortgage', 'CREDIT_TYPE_Real estate loan', 'CREDIT_TYPE_Unknown type of loan', 'SK_ID_CURR']
['DAYS_CREDIT', 'DAYS_CREDIT_ENDDATE', 'DAYS_CREDIT_UPD

In [10]:
# Per-SK_ID_CURR totals of the additive bureau features, with the key
# restored as a regular column.
bureau_sum = (
    bureau_data[sum_features]
    .groupby(by=['SK_ID_CURR'])
    .sum()
    .reset_index()
)

In [11]:
# Per-SK_ID_CURR minimum of the min/max features.
bureau_min = bureau_data[minmax_features].groupby(by=['SK_ID_CURR']).agg('min')
# Derive the new names from the aggregated columns themselves. A hard-coded
# positional list would mislabel columns (or fail) if minmax_features changed.
bureau_min = bureau_min.add_suffix('_MIN').reset_index()

In [12]:
# Per-SK_ID_CURR maximum of the min/max features.
bureau_max = bureau_data[minmax_features].groupby(by=['SK_ID_CURR']).agg('max')
# Derive the new names from the aggregated columns themselves. A hard-coded
# positional list would mislabel columns (or fail) if minmax_features changed.
bureau_max = bureau_max.add_suffix('_MAX').reset_index()

In [13]:
# Attach the min and max aggregates to the summed table, one row per SK_ID_CURR.
for extreme_frame in (bureau_min, bureau_max):
    bureau_sum = bureau_sum.merge(extreme_frame, on='SK_ID_CURR', how='inner')

In [14]:
# Summary statistics of the encoded bureau table (display only).
bureau_data.describe()

Unnamed: 0,SK_ID_CURR,SK_ID_BUREAU,DAYS_CREDIT,CREDIT_DAY_OVERDUE,DAYS_CREDIT_ENDDATE,AMT_CREDIT_MAX_OVERDUE,CNT_CREDIT_PROLONG,AMT_CREDIT_SUM,AMT_CREDIT_SUM_DEBT,AMT_CREDIT_SUM_LIMIT,...,CREDIT_TYPE_Interbank credit,CREDIT_TYPE_Loan for business development,CREDIT_TYPE_Loan for purchase of shares (margin lending),CREDIT_TYPE_Loan for the purchase of equipment,CREDIT_TYPE_Loan for working capital replenishment,CREDIT_TYPE_Microloan,CREDIT_TYPE_Mobile operator loan,CREDIT_TYPE_Mortgage,CREDIT_TYPE_Real estate loan,CREDIT_TYPE_Unknown type of loan
count,1716428.0,1716428.0,1716428.0,1716428.0,1716428.0,1716428.0,1716428.0,1716428.0,1716428.0,1716428.0,...,1716428.0,1716428.0,1716428.0,1716428.0,1716428.0,1716428.0,1716428.0,1716428.0,1716428.0,1716428.0
mean,278214.9,5924434.0,-1142.108,0.8181666,479.1227,1319.262,0.006410406,354991.9,116506.0,4081.739,...,5.826053e-07,0.001150645,2.330421e-06,1.10695e-05,0.0002732419,0.007231879,5.826053e-07,0.01071469,1.573034e-05,0.0003233459
std,102938.6,532265.7,795.1649,36.54443,4839.776,121006.5,0.09622391,1149807.0,626405.8,36571.69,...,0.0007632858,0.03390165,0.00152657,0.003327068,0.01652778,0.08473242,0.0007632858,0.1029558,0.00396612,0.01797892
min,100001.0,5000000.0,-2922.0,0.0,-42060.0,0.0,0.0,0.0,-4705600.0,-586406.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,188866.8,5463954.0,-1666.0,0.0,-1074.0,0.0,0.0,51300.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,278055.0,5926304.0,-987.0,0.0,-237.0,0.0,0.0,125518.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,367426.0,6385681.0,-474.0,0.0,389.0,0.0,0.0,315000.0,1975.5,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,456255.0,6843457.0,0.0,2792.0,31199.0,115987200.0,9.0,585000000.0,170100000.0,4705600.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [15]:
# Number of columns per dtype in the encoded bureau table (display only).
bureau_data.dtypes.value_counts()

uint8      23
int64       6
float64     6
dtype: int64

# 百分比特征

In [16]:
# Ratio features relative to the total credit amount. A zero denominator
# produces inf or NaN here; the infinities are replaced in a later cell.
total_credit = bureau_sum['AMT_CREDIT_SUM']
bureau_sum['AMT_OVERDUE_PERCENT'] = bureau_sum['AMT_CREDIT_MAX_OVERDUE'] / total_credit
bureau_sum['AMT_DEBT_PERCENT'] = bureau_sum['AMT_CREDIT_SUM_DEBT'] / total_credit
# Left merges keep every application row, with or without bureau history.
train_data = train_data.merge(bureau_sum, on='SK_ID_CURR', how='left')
test_data = test_data.merge(bureau_sum, on='SK_ID_CURR', how='left')
# CNT_CREDIT_PROLONG was zero-filled before aggregation, so after the merge it
# is null only on rows that found no match in bureau_sum.
train_data['HAS_BUREAU'] = train_data['CNT_CREDIT_PROLONG'].notna().astype(int)
test_data['HAS_BUREAU'] = test_data['CNT_CREDIT_PROLONG'].notna().astype(int)

# drop掉50%以上缺失字段+label encoding

In [17]:
# Number of columns per dtype in train_data after the bureau merge (display only).
train_data.dtypes.value_counts()

float64    103
int64       41
object      16
int32        1
dtype: int64

In [18]:
# Partition the training columns by dtype: object columns are categorical,
# everything else numeric.
cat_variables = [col for col in train_data.columns if train_data[col].dtype == 'object']
num_variables = [col for col in train_data.columns if train_data[col].dtype != 'object']

In [19]:
# Non-null fraction of every numeric training column; columns filled in fewer
# than half of the rows are removed from both frames.
n_train_rows = len(train_data)
num_dict = {col: train_data[col].count()/n_train_rows for col in num_variables}
sparse_cols = [name for name, coverage in num_dict.items() if coverage < 0.5]
train_data.drop(labels=sparse_cols, axis=1, inplace=True)
test_data.drop(labels=sparse_cols, axis=1, inplace=True)

In [20]:
# Categorical columns with at most two distinct non-null values are label
# encoded rather than one-hot encoded.
lb_cols = [col for col in cat_variables if train_data[col].nunique() <= 2]
print(lb_cols)

['NAME_CONTRACT_TYPE', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'EMERGENCYSTATE_MODE']


In [21]:
# One shared label encoder, refit for each column in lb_cols.
le = preprocessing.LabelEncoder()
le_count = 0

for col in lb_cols:
    # Missing values become their own 'NAN' category in both frames.
    train_filled = train_data[col].fillna('NAN')
    test_filled = test_data[col].fillna('NAN')
    # Classes are learned from the training column only, then applied to both.
    train_data[col] = le.fit_transform(train_filled)
    test_data[col] = le.transform(test_filled)
    # Count the encoded columns for the summary line below.
    le_count += 1

print('%d columns were label encoded.' % le_count)

4 columns were label encoded.


# 独热编码

In [22]:
# One-hot encode whatever categorical columns remain, in both frames.
train_data, test_data = (
    pd.get_dummies(frame) for frame in (train_data, test_data)
)

In [23]:
# Save the label before aligning: the inner join on columns keeps only
# columns present in both frames.
train_target = train_data.loc[:, 'TARGET']
train_data, test_data = train_data.align(test_data, join = 'inner', axis = 1)
print(*(frame.shape for frame in (train_data, test_data)))

(307511, 239) (48744, 239)


# 连续变量补缺

In [24]:
# Remove from num_dict the low-coverage columns (non-null share below 0.5) and
# then the label, leaving only numeric feature columns.
pop_item = [k for k, v in num_dict.items() if v < 0.5]
for i in pop_item:
    del num_dict[i]
num_dict.pop('TARGET')

1.0

In [25]:
def fill_withmid(origin_data, fillmid_cols):
    """Fill missing values in the listed columns with that column's median.

    The median is the 0.5 quantile of the column's own non-null values in
    ``origin_data``. The frame is modified in place and nothing is returned.

    Parameters
    ----------
    origin_data : pandas.DataFrame
        Frame whose columns are filled.
    fillmid_cols : iterable
        Labels of the columns to fill; other columns are left untouched.
    """
    for i in fillmid_cols:
        mid_value = origin_data[i].quantile(.5)
        # Assign the filled column back to the frame. Calling
        # fillna(inplace=True) on origin_data[i] mutates a temporary object
        # and does not reach the frame when pandas Copy-on-Write is active.
        origin_data[i] = origin_data[i].fillna(mid_value)
    return
# Median-impute the numeric feature columns. Each frame is filled with its own
# column medians, because fill_withmid computes them from the frame it is given.
numeric_feature_cols = list(num_dict.keys())
for frame in (train_data, test_data):
    fill_withmid(frame, numeric_feature_cols)

# 连续变量与TARGET的相关系数

In [26]:
# Numeric features plus the label in a separate frame, then the full pairwise
# correlation matrix.
train_num = train_data[list(num_dict)].assign(TARGET=train_target)
train_corr = train_num.corr()

In [27]:
# Keep only each feature's correlation with TARGET, sorted ascending.
# The isinstance guard makes the cell safe to re-run: once train_corr is
# already the reduced Series, indexing it with 'TARGET' again would yield a
# scalar and .sort_values() would fail.
if isinstance(train_corr, pd.DataFrame):
    train_corr = train_corr['TARGET'].sort_values()
print('Most Positive Correlations:\n', train_corr.tail(15))
print('\nMost Negative Correlations:\n', train_corr.head(15))

Most Positive Correlations:
 DAYS_REGISTRATION              0.041975
FLAG_DOCUMENT_3                0.044346
REG_CITY_NOT_LIVE_CITY         0.044395
FLAG_EMP_PHONE                 0.045982
DAYS_CREDIT_MAX                0.048847
REG_CITY_NOT_WORK_CITY         0.050994
DAYS_ID_PUBLISH                0.051457
AMT_DEBT_PERCENT               0.052563
DAYS_LAST_PHONE_CHANGE         0.055218
REGION_RATING_CLIENT           0.058899
CREDIT_ACTIVE_Active           0.060544
REGION_RATING_CLIENT_W_CITY    0.060893
DAYS_CREDIT_MIN                0.067388
DAYS_BIRTH                     0.078239
TARGET                         1.000000
Name: TARGET, dtype: float64

Most Negative Correlations:
 EXT_SOURCE_2                 -0.160295
EXT_SOURCE_3                 -0.155892
DAYS_EMPLOYED                -0.044932
AMT_GOODS_PRICE              -0.039623
FLOORSMAX_AVG                -0.039385
FLOORSMAX_MEDI               -0.039157
FLOORSMAX_MODE               -0.038377
REGION_POPULATION_RELATIVE   -0.037227


In [28]:
# Special column: DAYS_EMPLOYED uses 365243 as a sentinel value. Record those
# rows in a new indicator column, then overwrite the sentinel with -20000.
sentinel_rows = train_data['DAYS_EMPLOYED'] == 365243
train_data['DAYS_EMPLOYED_NEW'] = sentinel_rows.astype(int)
train_data.loc[sentinel_rows, 'DAYS_EMPLOYED'] = -20000

In [29]:
# Same sentinel handling for the test frame: flag rows where DAYS_EMPLOYED is
# 365243, then overwrite the sentinel with -20000.
sentinel_rows = test_data['DAYS_EMPLOYED'] == 365243
test_data['DAYS_EMPLOYED_NEW'] = sentinel_rows.astype(int)
test_data.loc[sentinel_rows, 'DAYS_EMPLOYED'] = -20000

# 找出可以转为分类变量的连续变量（unique值少的）

In [30]:
# Numeric columns with more than 3 and fewer than 20 distinct values are
# candidates for binning. They are visited in ascending order of absolute
# correlation with TARGET.
maybe_cat = []
for col in abs(train_corr).sort_values().index:
    n_unique = train_num[col].nunique()
    if 3 < n_unique < 20:
        maybe_cat.append(col)
print(maybe_cat)

['AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_DAY', 'CREDIT_TYPE_Another type of loan', 'CNT_CREDIT_PROLONG', 'CREDIT_CURRENCY_currency 3', 'CREDIT_TYPE_Unknown type of loan', 'CREDIT_TYPE_Loan for business development', 'CREDIT_TYPE_Loan for working capital replenishment', 'AMT_REQ_CREDIT_BUREAU_QRT', 'CREDIT_CURRENCY_currency 2', 'CNT_FAM_MEMBERS', 'CREDIT_ACTIVE_Sold', 'CNT_CHILDREN', 'CREDIT_TYPE_Car loan', 'CREDIT_TYPE_Mortgage', 'DEF_60_CNT_SOCIAL_CIRCLE', 'DEF_30_CNT_SOCIAL_CIRCLE']


In [31]:
# Bin every candidate column into equal-width intervals. The edges come from
# train_num and are reused for the test frame so both share the same bins.
new_cat = []
for col in maybe_cat:
    # Number of edges is nunique // 2 + 1, which gives one fewer interval.
    n_edges = train_num[col].nunique()//2 + 1
    # Start one below the minimum so the minimum falls inside the first bin.
    bin_edges = np.linspace(train_num[col].min()-1, train_num[col].max(), num = n_edges)
    labels = [str(i) for i in range(n_edges-1)]
    col_name = col + '_CUT'
    new_cat.append(col_name)
    train_data[col_name] = pd.cut(train_data[col], bins = bin_edges, labels=labels)
    test_data[col_name] = pd.cut(test_data[col], bins = bin_edges, labels=labels)

In [32]:
# One-hot encode the newly created binned columns in both frames.
train_data, test_data = (
    pd.get_dummies(frame, columns=new_cat) for frame in (train_data, test_data)
)

In [33]:
# Keep only the columns both frames share after the second round of encoding.
train_data, test_data = train_data.align(test_data, join = 'inner', axis = 1)
print(*(frame.shape for frame in (train_data, test_data)))

(307511, 312) (48744, 312)


In [34]:
# Summary statistics of the training features after binning and encoding (display only).
train_data.describe()

Unnamed: 0,SK_ID_CURR,NAME_CONTRACT_TYPE,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,AMT_GOODS_PRICE,REGION_POPULATION_RELATIVE,...,CREDIT_TYPE_Mortgage_CUT_3,DEF_60_CNT_SOCIAL_CIRCLE_CUT_0,DEF_60_CNT_SOCIAL_CIRCLE_CUT_1,DEF_60_CNT_SOCIAL_CIRCLE_CUT_2,DEF_60_CNT_SOCIAL_CIRCLE_CUT_3,DEF_30_CNT_SOCIAL_CIRCLE_CUT_0,DEF_30_CNT_SOCIAL_CIRCLE_CUT_1,DEF_30_CNT_SOCIAL_CIRCLE_CUT_2,DEF_30_CNT_SOCIAL_CIRCLE_CUT_3,DEF_30_CNT_SOCIAL_CIRCLE_CUT_4
count,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,...,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0,307511.0
mean,278180.518577,0.095213,0.340108,0.693673,0.417052,168797.9,599026.0,27108.487841,538316.3,0.020868,...,3e-06,0.999984,1.3e-05,0.0,3e-06,0.99999,7e-06,0.0,0.0,3e-06
std,102790.175348,0.293509,0.473746,0.460968,0.722121,237123.1,402490.8,14493.461065,369289.0,0.013831,...,0.001803,0.004032,0.003607,0.0,0.001803,0.003123,0.00255,0.0,0.0,0.001803
min,100002.0,0.0,0.0,0.0,0.0,25650.0,45000.0,1615.5,40500.0,0.00029,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,189145.5,0.0,0.0,0.0,0.0,112500.0,270000.0,16524.0,238500.0,0.010006,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
50%,278202.0,0.0,0.0,1.0,0.0,147150.0,513531.0,24903.0,450000.0,0.01885,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
75%,367142.5,0.0,1.0,1.0,1.0,202500.0,808650.0,34596.0,679500.0,0.028663,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
max,456255.0,1.0,1.0,1.0,19.0,117000000.0,4050000.0,258025.5,4050000.0,0.072508,...,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0


# Polynomial features(随机森林筛选重要特征进行poly化)

In [35]:
# Some ratio values are infinite: map +inf to 1 and -inf to 0 in both frames.
for frame in (train_data, test_data):
    for ratio_col in ('AMT_OVERDUE_PERCENT', 'AMT_DEBT_PERCENT'):
        frame[ratio_col] = frame[ratio_col].replace({np.inf: 1, -np.inf: 0})

In [36]:
# Extra-trees model fitted on every current column of train_data; its feature
# importances are read in the next cell.
forest = ExtraTreesClassifier(n_jobs=6, random_state=0, n_estimators=150)

forest.fit(train_data, train_target)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=6,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [37]:
# Importances from the fitted forest, their spread across the individual
# trees, and the column positions ordered from most to least important.
importances = forest.feature_importances_
per_tree_importances = [tree.feature_importances_ for tree in forest.estimators_]
std = np.std(per_tree_importances, axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f, feature_idx in enumerate(indices[:train_data.shape[1]]):
    print("%d. feature %s (%f)" % (f + 1, train_data.columns[feature_idx], importances[feature_idx]))

Feature ranking:
1. feature EXT_SOURCE_2 (0.028730)
2. feature EXT_SOURCE_3 (0.024233)
3. feature DAYS_BIRTH (0.016934)
4. feature DAYS_ID_PUBLISH (0.016559)
5. feature DAYS_REGISTRATION (0.016075)
6. feature DAYS_LAST_PHONE_CHANGE (0.016053)
7. feature SK_ID_CURR (0.015724)
8. feature HOUR_APPR_PROCESS_START (0.015578)
9. feature AMT_INCOME_TOTAL (0.015365)
10. feature REGION_POPULATION_RELATIVE (0.015364)
11. feature AMT_ANNUITY (0.015235)
12. feature AMT_CREDIT (0.015201)
13. feature AMT_GOODS_PRICE (0.014812)
14. feature DAYS_CREDIT_MIN (0.014300)
15. feature AMT_DEBT_PERCENT (0.014258)
16. feature DAYS_EMPLOYED (0.013951)
17. feature DAYS_CREDIT_MAX (0.013284)
18. feature DAYS_CREDIT_ENDDATE_MAX (0.013053)
19. feature DAYS_CREDIT_UPDATE_MIN (0.013007)
20. feature DAYS_CREDIT_ENDDATE_MIN (0.012978)
21. feature OBS_60_CNT_SOCIAL_CIRCLE (0.012798)
22. feature CREDIT_ACTIVE_Active (0.012797)
23. feature OBS_30_CNT_SOCIAL_CIRCLE (0.012765)
24. feature AMT_REQ_CREDIT_BUREAU_YEAR (0.0126

In [38]:
# Degree-3 polynomial expansion of the 35 highest-ranked features.
pf = preprocessing.PolynomialFeatures(degree=3, interaction_only=False, include_bias=False)
# Select by the training column names for both frames, so the test selection
# cannot drift from the training one if the column orders ever differ.
top_features = train_data.columns[indices[0:35]]
train_poly = pf.fit_transform(train_data[top_features])
# Reuse the transformer fitted on the training data instead of re-fitting it
# on the test data.
test_poly = pf.transform(test_data[top_features])
print('Polynomial Features shape: ', train_poly.shape)

Polynomial Features shape:  (307511, 8435)


# 对Poly特征进行PCA降维

In [39]:
# Scale every polynomial feature to [0, 1] using ranges learned on the
# training matrix, then fit a 40-component PCA on the scaled training matrix.
scaler = preprocessing.MinMaxScaler(feature_range = (0, 1))
scaler.fit(train_poly)
train_poly, test_poly = scaler.transform(train_poly), scaler.transform(test_poly)
pca = PCA(n_components=40)
pca.fit(train_poly)

PCA(copy=True, iterated_power='auto', n_components=40, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [40]:
# Variance share captured by each component, then their total.
variance_ratios = pca.explained_variance_ratio_
print(variance_ratios)
print(variance_ratios.sum())

[ 0.22932364  0.12531627  0.09240243  0.05367791  0.04974506  0.04581404
  0.03522546  0.03387151  0.02758265  0.02633302  0.02453296  0.02342226
  0.0209767   0.0158931   0.01237814  0.01154838  0.01013906  0.00713305
  0.00626972  0.00596654  0.0046552   0.00375413  0.0033979   0.00330717
  0.00302226  0.00287486  0.00283576  0.00266432  0.00251222  0.00233027
  0.00221429  0.00220502  0.00200829  0.00196667  0.00187039  0.00182345
  0.001743    0.00169192  0.00167019  0.00160733]
0.907706497661


In [41]:
# Project both scaled polynomial matrices onto the fitted components.
train_poly, test_poly = pca.transform(train_poly), pca.transform(test_poly)

In [42]:
# Wrap the PCA components in dataframes with string column labels. Default
# integer labels would leave the merged frames with mixed str/int column
# names, which recent scikit-learn releases refuse as feature names.
pca_cols = ['POLY_PCA_%d' % i for i in range(train_poly.shape[1])]
train_poly = pd.DataFrame(train_poly, columns=pca_cols)
test_poly = pd.DataFrame(test_poly, columns=pca_cols)

# Correlation of each component with the target, computed on a temporary copy
# so that TARGET is never stored in the feature frame.
poly_corrs = train_poly.assign(TARGET=train_target).corr()['TARGET'].sort_values()

# The component rows are in the same order as train_data / test_data, so they
# are attached by row index.
train_data_poly = train_data.join(train_poly)
test_data_poly = test_data.join(test_poly)

# Align the dataframes
train_data_poly, test_data_poly = train_data_poly.align(test_data_poly, join = 'inner', axis = 1)

# Print out the new shapes
print('Training data with polynomial features shape: ', train_data_poly.shape)
print('Testing data with polynomial features shape:  ', test_data_poly.shape)

Training data with polynomial features shape:  (307511, 352)
Testing data with polynomial features shape:   (48744, 352)


# 最后一次随机森林0.714

In [127]:
# Extra-trees model refitted on the features extended with the PCA components.
forest = ExtraTreesClassifier(n_jobs=6, random_state=0, n_estimators=150)

forest.fit(train_data_poly, train_target)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=6,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [128]:
# Class probabilities for the test rows from the extra-trees model.
r_result = forest.predict_proba(test_data_poly)

In [129]:
# Show the class order, so that column 1 of predict_proba can be read as the
# probability of class 1 (display only).
forest.classes_

array([0, 1], dtype=int64)

In [130]:
# Keep the probability of class 1 and pair it with the applicant id.
r_result = pd.DataFrame({'TARGET': r_result[:, 1]})
r_result['SK_ID_CURR'] = test_data['SK_ID_CURR']

In [131]:
# Write the extra-trees predictions to disk.
# NOTE(review): to_csv is called without index=False, so the row index is
# written as an extra unnamed first column — confirm the consumer accepts that.
r_result.to_csv('E:/pnk/kaggle_credit/test_result7.csv')

# 普通GBDT、0.731、0.7466（StratifiedKFold应对不平衡数据，split后再做balance）

In [110]:
#train_data_poly['TARGET'] = train_target
#train_data_poly = train_data_poly.sample(frac=1).reset_index(drop=True)
#train_target = train_data_poly['TARGET']
#train_data_poly.drop(labels='TARGET', axis=1, inplace=True)

In [46]:
def _balanced_training_fold(fold_x, fold_y, minority_repeats=4):
    """Return a class-balanced (features, target) pair for one training fold.

    Rows with target 1 are repeated ``minority_repeats`` times, and the same
    number of target-0 rows is drawn at random with replacement using the
    global NumPy random state. Inputs are not modified.
    """
    fold = fold_x.assign(TARGET=fold_y)
    safe = fold[fold['TARGET'] == 0].reset_index(drop=True)
    bad = fold[fold['TARGET'] == 1]
    bad = pd.concat([bad] * minority_repeats).reset_index(drop=True)
    picks = np.random.randint(low=0, high=len(safe), size=len(bad))
    balanced = pd.concat([safe.loc[picks], bad]).reset_index(drop=True)
    return balanced.drop(labels='TARGET', axis=1), balanced['TARGET']


# Manual grid search. The split is stratified on the original class ratio and
# only the training part of each fold is balanced, so every validation fold
# keeps the original distribution.
skf = StratifiedKFold(n_splits=5)
np.random.seed(1)
n_estimators = [500, 1000]
max_depth = [2, 3]
fold_scores = {}   # (n_estimators, max_depth) -> list of per-fold AUC values
score_dict = {}    # (n_estimators, max_depth) -> mean AUC over the folds
for n_es in n_estimators:
    for m_dep in max_depth:
        gbc = GradientBoostingClassifier(n_estimators=n_es, max_depth=m_dep, random_state=1)
        fold_scores[(n_es, m_dep)] = []
        for train_index, valid_index in skf.split(train_data_poly, train_target):
            # split() yields positions, hence iloc rather than label lookups.
            train_x, train_y = _balanced_training_fold(
                train_data_poly.iloc[train_index], train_target.iloc[train_index])
            valid_x = train_data_poly.iloc[valid_index]
            valid_y = train_target.iloc[valid_index]
            print('fitting for %s n_estimators, %s max_depth...' % (n_es, m_dep))
            gbc.fit(train_x, train_y)
            valid_result = gbc.predict_proba(valid_x)
            t_score = roc_auc_score(valid_y, valid_result[:,1])
            print('result for %s n_estimators, %s max_depth: %s' % (n_es, m_dep, t_score))
            fold_scores[(n_es, m_dep)].append(t_score)
        # Average over all folds. Assigning inside the fold loop kept only the
        # score of the last fold.
        score_dict[(n_es, m_dep)] = float(np.mean(fold_scores[(n_es, m_dep)]))

fitting for 50 n_estimators, 3 max_depth...
result for 50 n_estimators, 3 max_depth: 0.740760805849
fitting for 50 n_estimators, 3 max_depth...
result for 50 n_estimators, 3 max_depth: 0.743665760791
fitting for 50 n_estimators, 3 max_depth...
result for 50 n_estimators, 3 max_depth: 0.738185206843
fitting for 50 n_estimators, 3 max_depth...
result for 50 n_estimators, 3 max_depth: 0.745710467996
fitting for 50 n_estimators, 3 max_depth...
result for 50 n_estimators, 3 max_depth: 0.738876256761
fitting for 50 n_estimators, 5 max_depth...
result for 50 n_estimators, 5 max_depth: 0.745817638821
fitting for 50 n_estimators, 5 max_depth...
result for 50 n_estimators, 5 max_depth: 0.750674114952
fitting for 50 n_estimators, 5 max_depth...
result for 50 n_estimators, 5 max_depth: 0.745515556024
fitting for 50 n_estimators, 5 max_depth...
result for 50 n_estimators, 5 max_depth: 0.751304752953
fitting for 50 n_estimators, 5 max_depth...
result for 50 n_estimators, 5 max_depth: 0.740077469965


In [50]:
# With the hyper-parameters chosen, balance the whole training set and use it
# to fit the final model.
# NOTE(review): positives are repeated 3 times here, while the
# cross-validation cell repeats them 4 times — confirm this is intended.
np.random.seed(1)
MINORITY_REPEATS = 3
# Work on a labelled copy so train_data_poly never temporarily carries a
# TARGET column (it would stay there if this cell failed midway).
labelled = train_data_poly.assign(TARGET=train_target)
safe_data = labelled[labelled['TARGET']==0].reset_index(drop=True)
bad_data = labelled[labelled['TARGET']==1]
bad_data = pd.concat([bad_data] * MINORITY_REPEATS).reset_index(drop=True)
# Draw as many negatives (with replacement) as there are repeated positives.
# With 24825 positives this equals the 74475 that was hard-coded before.
safe_part_index = np.random.randint(low=0, high=len(safe_data), size=len(bad_data))
safe_data = safe_data.loc[safe_part_index]
gbdt_features = pd.concat([safe_data,bad_data]).reset_index(drop=True)
gbdt_target = gbdt_features['TARGET']
gbdt_features = gbdt_features.drop(labels='TARGET', axis=1)

In [51]:
# Final gradient-boosting model, fitted on the balanced training set.
gbc = GradientBoostingClassifier(random_state=1, max_depth=3, n_estimators=500)
gbc.fit(gbdt_features, gbdt_target)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=500,
              presort='auto', random_state=1, subsample=1.0, verbose=0,
              warm_start=False)

In [52]:
# Class probabilities for the test rows from the gradient-boosting model.
r_result = gbc.predict_proba(test_data_poly)

In [53]:
# Keep the second probability column and pair it with the applicant id.
r_result = pd.DataFrame({'TARGET': r_result[:, 1]})
r_result['SK_ID_CURR'] = test_data['SK_ID_CURR']

In [54]:
# Write the gradient-boosting predictions to disk.
# NOTE(review): to_csv is called without index=False, so the row index is
# written as an extra unnamed first column — confirm the consumer accepts that.
r_result.to_csv('E:/pnk/kaggle_credit/test_result12.csv')

In [None]:
# Exhaustive grid search over the gradient-boosting settings below, one search
# per scoring metric. random_state is part of the grid, so every combination
# is evaluated under two seeds.
tuned_parameters = [{'n_estimators': [10,50,100,150,250], 'max_depth': [3, 5, 10, 18],
                     'random_state': [1, 10]}]

scores = ['roc_auc']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(GradientBoostingClassifier(), tuned_parameters,
                       scoring='%s' % score, cv=5, n_jobs=6)
    clf.fit(train_data_poly, train_target)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    # One line per parameter combination: mean score and two standard deviations.
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for row, params in enumerate(clf.cv_results_['params']):
        mean, std = means[row], stds[row]
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

# Tuning hyper-parameters for roc_auc

