In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest, SelectFromModel
from sklearn.feature_selection import chi2
from sklearn import preprocessing
from sklearn.ensemble import ExtraTreesClassifier
%matplotlib inline

In [198]:
sns.set_style('whitegrid')
train_data = pd.read_csv('E:/pnk/kaggle_credit/application_train.csv')
test_data = pd.read_csv('E:/pnk/kaggle_credit/application_test.csv')

# 数据不平衡，重采样


In [199]:
print (train_data['TARGET'].sum(), len(train_data)-train_data['TARGET'].sum())

24825 282686


In [200]:
np.random.seed(1)
safe_data = train_data[train_data['TARGET']==0]
bad_data = train_data[train_data['TARGET']==1]
safe_part_index = np.random.randint(low=0, high=len(safe_data),size=99300)
safe_data.reset_index(drop=True,inplace=True)
safe_data = safe_data.loc[safe_part_index]
bad_data = pd.concat([bad_data,bad_data,bad_data,bad_data])
bad_data.reset_index(drop=True,inplace=True)

In [201]:
print (len(safe_data),len(bad_data))

99300 99300


In [202]:
train_data = pd.concat([safe_data,bad_data])
train_data.reset_index(drop=True,inplace=True)

# drop掉50%以上缺失字段+label encoding

In [203]:
train_data.dtypes.value_counts()

float64    65
int64      41
object     16
dtype: int64

In [204]:
num_variables = []
cat_variables = []
for col in train_data.columns:
    if train_data[col].dtype=='object':
        cat_variables.append(col)
    else:
        num_variables.append(col)

In [205]:
num_dict = {}
for col in num_variables:
    num_dict[col] = train_data[col].count()/len(train_data)
for k, v in num_dict.items():
    if v < 0.5:
        train_data.drop(labels=k,axis=1,inplace=True)
        test_data.drop(labels=k,axis=1,inplace=True)

In [206]:
lb_cols = []
for col in cat_variables:
    if train_data[col].nunique() <= 2:
        lb_cols.append(col)
print(lb_cols)

['NAME_CONTRACT_TYPE', 'CODE_GENDER', 'FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'EMERGENCYSTATE_MODE']


In [207]:
# Create a label encoder objectcat_variables
le = preprocessing.LabelEncoder()
le_count = 0

# Iterate through the columns
for col in lb_cols:
    # Train on the training data
    train_data[col] = train_data[col].fillna('NAN')
    test_data[col] = test_data[col].fillna('NAN')
    le.fit(train_data[col])
    # Transform both training and testing data
    train_data[col] = le.transform(train_data[col])
    test_data[col] = le.transform(test_data[col])
    # Keep track of how many columns were label encoded
    le_count += 1
            
print('%d columns were label encoded.' % le_count)

5 columns were label encoded.


# 独热编码方法

In [97]:
first_chi2cols = ['NAME_CONTRACT_TYPE','CODE_GENDER','FLAG_OWN_CAR','FLAG_OWN_REALTY','NAME_TYPE_SUITE','NAME_INCOME_TYPE'
                 ,'NAME_EDUCATION_TYPE','NAME_FAMILY_STATUS','NAME_HOUSING_TYPE','OCCUPATION_TYPE','WEEKDAY_APPR_PROCESS_START'
                 ,'ORGANIZATION_TYPE','FLAG_MOBIL','FLAG_EMP_PHONE','FLAG_WORK_PHONE','FLAG_CONT_MOBILE','FLAG_PHONE'
                 ,'FLAG_EMAIL','REGION_RATING_CLIENT','REGION_RATING_CLIENT_W_CITY','REG_REGION_NOT_LIVE_REGION'
                 ,'REG_REGION_NOT_WORK_REGION','LIVE_REGION_NOT_WORK_REGION','REG_CITY_NOT_LIVE_CITY'
                 ,'REG_CITY_NOT_WORK_CITY','LIVE_CITY_NOT_WORK_CITY','FLAG_DOCUMENT_2','FLAG_DOCUMENT_3'
                 ,'FLAG_DOCUMENT_4','FLAG_DOCUMENT_5','FLAG_DOCUMENT_6','FLAG_DOCUMENT_7'
                 ,'FLAG_DOCUMENT_8','FLAG_DOCUMENT_9','FLAG_DOCUMENT_10','FLAG_DOCUMENT_11'
                 ,'FLAG_DOCUMENT_12','FLAG_DOCUMENT_13','FLAG_DOCUMENT_14','FLAG_DOCUMENT_15'
                 ,'FLAG_DOCUMENT_16','FLAG_DOCUMENT_17','FLAG_DOCUMENT_18','FLAG_DOCUMENT_19'
                 ,'FLAG_DOCUMENT_20','FLAG_DOCUMENT_21']
dummy_features = ['NAME_CONTRACT_TYPE','CODE_GENDER','FLAG_OWN_CAR','FLAG_OWN_REALTY','NAME_TYPE_SUITE',
                  'NAME_INCOME_TYPE','NAME_EDUCATION_TYPE','NAME_FAMILY_STATUS','NAME_HOUSING_TYPE','OCCUPATION_TYPE',
                 'WEEKDAY_APPR_PROCESS_START','ORGANIZATION_TYPE']
def get_cat_dummy(origin_data, dummy_features):
    return pd.get_dummies(origin_data, prefix=dummy_features, columns=dummy_features)
def get_cat_features(origin_data, cat_features, dummy_features, k=15):
    model_chi = SelectKBest(chi2, k=k)
    chi_data = origin_data[cat_features]
    chi_target = origin_data['TARGET']
    chi_matrix = get_cat_dummy(chi_data, dummy_features)
    model_chi.fit(chi_matrix, chi_target)
    return chi_matrix.columns[model_chi.pvalues_.argsort()[0:k]]

# 分类变量单独做卡方检验

In [98]:
get_cat_features(train_data, first_chi2cols, dummy_features, k=30)

Index(['NAME_EDUCATION_TYPE_Higher education', 'NAME_INCOME_TYPE_Pensioner',
       'ORGANIZATION_TYPE_XNA', 'REG_CITY_NOT_WORK_CITY', 'CODE_GENDER_M',
       'NAME_INCOME_TYPE_Working', 'REG_CITY_NOT_LIVE_CITY',
       'OCCUPATION_TYPE_Laborers', 'CODE_GENDER_F',
       'NAME_CONTRACT_TYPE_Revolving loans', 'FLAG_DOCUMENT_6',
       'LIVE_CITY_NOT_WORK_CITY', 'OCCUPATION_TYPE_Drivers',
       'NAME_EDUCATION_TYPE_Secondary / secondary special',
       'NAME_HOUSING_TYPE_With parents', 'NAME_INCOME_TYPE_State servant',
       'ORGANIZATION_TYPE_Self-employed', 'FLAG_WORK_PHONE', 'FLAG_DOCUMENT_3',
       'NAME_FAMILY_STATUS_Single / not married',
       'OCCUPATION_TYPE_Accountants', 'NAME_FAMILY_STATUS_Civil marriage',
       'OCCUPATION_TYPE_Low-skill Laborers', 'OCCUPATION_TYPE_Managers',
       'REGION_RATING_CLIENT_W_CITY', 'REGION_RATING_CLIENT',
       'ORGANIZATION_TYPE_Construction', 'FLAG_EMP_PHONE',
       'OCCUPATION_TYPE_Core staff',
       'ORGANIZATION_TYPE_Business Enti

# 连续变量补缺

In [99]:
first_ctcols = ['CNT_CHILDREN','AMT_INCOME_TOTAL','AMT_CREDIT','AMT_ANNUITY','AMT_GOODS_PRICE',
                'REGION_POPULATION_RELATIVE','DAYS_BIRTH','DAYS_EMPLOYED','DAYS_REGISTRATION','DAYS_ID_PUBLISH',
                'CNT_FAM_MEMBERS','HOUR_APPR_PROCESS_START','EXT_SOURCE_2','OBS_30_CNT_SOCIAL_CIRCLE',
                'DEF_30_CNT_SOCIAL_CIRCLE','OBS_60_CNT_SOCIAL_CIRCLE','DEF_60_CNT_SOCIAL_CIRCLE','DAYS_LAST_PHONE_CHANGE',
                'AMT_REQ_CREDIT_BUREAU_HOUR','AMT_REQ_CREDIT_BUREAU_DAY','AMT_REQ_CREDIT_BUREAU_WEEK',
                'AMT_REQ_CREDIT_BUREAU_MON','AMT_REQ_CREDIT_BUREAU_QRT']
features = []
features.extend(first_chi2cols)
features.extend(first_ctcols)
features = list(set(features))

In [100]:
train_data['NAME_TYPE_SUITE'] = train_data['NAME_TYPE_SUITE'].fillna('Nodata')
train_data['OCCUPATION_TYPE'] = train_data['OCCUPATION_TYPE'].fillna('Nodata')
train_data['OWN_CAR_AGE'] = train_data['OWN_CAR_AGE'].fillna(-100)
fillmid_cols = ['AMT_ANNUITY','AMT_GOODS_PRICE','CNT_FAM_MEMBERS','EXT_SOURCE_2',
               'OBS_30_CNT_SOCIAL_CIRCLE','DEF_30_CNT_SOCIAL_CIRCLE','OBS_60_CNT_SOCIAL_CIRCLE','DEF_60_CNT_SOCIAL_CIRCLE',
               'DAYS_LAST_PHONE_CHANGE','AMT_REQ_CREDIT_BUREAU_HOUR','AMT_REQ_CREDIT_BUREAU_DAY','AMT_REQ_CREDIT_BUREAU_WEEK',
               'AMT_REQ_CREDIT_BUREAU_MON','AMT_REQ_CREDIT_BUREAU_QRT']
def fill_withmid(origin_data, fillmid_cols):
    for i in fillmid_cols:
        mid_value = origin_data[i].quantile(.5)
        origin_data[i].fillna(mid_value,inplace=True)
    return
fill_withmid(train_data, fillmid_cols)

In [101]:
test_data['NAME_TYPE_SUITE'] = test_data['NAME_TYPE_SUITE'].fillna('Nodata')
test_data['OCCUPATION_TYPE'] = test_data['OCCUPATION_TYPE'].fillna('Nodata')
test_data['OWN_CAR_AGE'] = test_data['OWN_CAR_AGE'].fillna(-100)
fill_withmid(test_data, fillmid_cols)

In [102]:
train_x=train_data[features]
train_y=train_data['TARGET']

In [103]:
test_x=test_data[features]

# 独热编码

In [104]:
train_x = get_cat_dummy(train_x, dummy_features)

In [105]:
test_x = get_cat_dummy(test_x, dummy_features)

In [106]:
forest = ExtraTreesClassifier(n_estimators=150,
                              random_state=0,n_jobs=6)

forest.fit(train_x, train_y)
importances = forest.feature_importances_
std = np.std([tree.feature_importances_ for tree in forest.estimators_],
             axis=0)
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")

for f in range(train_x.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

Feature ranking:
1. feature 37 (0.058736)
2. feature 48 (0.031231)
3. feature 18 (0.029571)
4. feature 23 (0.028970)
5. feature 42 (0.027448)
6. feature 19 (0.027382)
7. feature 21 (0.026920)
8. feature 43 (0.026312)
9. feature 26 (0.026061)
10. feature 45 (0.025721)
11. feature 6 (0.025395)
12. feature 7 (0.024745)
13. feature 27 (0.020267)
14. feature 54 (0.019988)
15. feature 50 (0.016485)
16. feature 1 (0.014179)
17. feature 14 (0.013644)
18. feature 31 (0.013502)
19. feature 10 (0.012142)
20. feature 121 (0.011725)
21. feature 122 (0.011675)
22. feature 63 (0.011500)
23. feature 64 (0.011481)
24. feature 120 (0.011359)
25. feature 117 (0.011355)
26. feature 116 (0.011326)
27. feature 53 (0.011139)
28. feature 4 (0.011125)
29. feature 128 (0.010859)
30. feature 87 (0.010816)
31. feature 29 (0.010756)
32. feature 72 (0.010402)
33. feature 20 (0.010026)
34. feature 118 (0.009463)
35. feature 0 (0.009077)
36. feature 66 (0.008957)
37. feature 80 (0.008771)
38. feature 165 (0.008573)
3

In [107]:
for f in range(train_x.shape[1]):
    print(train_x.columns[indices[f]])

EXT_SOURCE_2
DAYS_BIRTH
DAYS_LAST_PHONE_CHANGE
DAYS_ID_PUBLISH
DAYS_REGISTRATION
AMT_GOODS_PRICE
AMT_CREDIT
AMT_ANNUITY
REGION_POPULATION_RELATIVE
HOUR_APPR_PROCESS_START
AMT_INCOME_TOTAL
DAYS_EMPLOYED
OBS_30_CNT_SOCIAL_CIRCLE
OBS_60_CNT_SOCIAL_CIRCLE
CNT_FAM_MEMBERS
CNT_CHILDREN
AMT_REQ_CREDIT_BUREAU_QRT
FLAG_PHONE
AMT_REQ_CREDIT_BUREAU_MON
WEEKDAY_APPR_PROCESS_START_TUESDAY
WEEKDAY_APPR_PROCESS_START_WEDNESDAY
FLAG_OWN_REALTY_N
FLAG_OWN_REALTY_Y
WEEKDAY_APPR_PROCESS_START_THURSDAY
WEEKDAY_APPR_PROCESS_START_MONDAY
WEEKDAY_APPR_PROCESS_START_FRIDAY
REGION_RATING_CLIENT
REGION_RATING_CLIENT_W_CITY
ORGANIZATION_TYPE_Business Entity Type 3
NAME_FAMILY_STATUS_Married
FLAG_WORK_PHONE
NAME_TYPE_SUITE_Unaccompanied
DEF_30_CNT_SOCIAL_CIRCLE
WEEKDAY_APPR_PROCESS_START_SATURDAY
FLAG_DOCUMENT_3
NAME_TYPE_SUITE_Family
NAME_INCOME_TYPE_Working
ORGANIZATION_TYPE_Self-employed
OCCUPATION_TYPE_Laborers
OCCUPATION_TYPE_Nodata
NAME_EDUCATION_TYPE_Secondary / secondary special
DEF_60_CNT_SOCIAL_CIRCLE
N

In [108]:
critical_features = list(train_x.columns[indices[0:80]])

In [109]:
train_matrix = train_x[critical_features]
test_matrix = test_x[critical_features]

In [110]:
forest = ExtraTreesClassifier(n_estimators=150,
                              random_state=0,n_jobs=6)

forest.fit(train_matrix, train_y)

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=150, n_jobs=6,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [111]:
r_result = forest.predict_proba(test_matrix)

In [112]:
forest.classes_

array([0, 1], dtype=int64)

In [113]:
r_result=pd.DataFrame(list(r_result[:,1]),columns=['TARGET'])
r_result['SK_ID_CURR'] = test_data['SK_ID_CURR']

In [114]:
r_result.to_csv('E:/pnk/kaggle_credit/test_result6.csv')