# [作業目標]
- 使用 Day 12 剛學到的方法, 對較完整的資料生成離散化特徵
- 觀察上述離散化特徵, 對於目標值的預測有沒有幫助

# [作業重點]
- 仿照 Day 12 的語法, 將年齡資料 ('DAYS_BIRTH' 除以 365) 離散化
- 繪製上述的 "離散化標籤" 與目標值 ('TARGET') 的長條圖

In [1]:
# 載入需要的套件
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# 設定 data_path
dir_data = './data/'

### 之前做過的處理

In [2]:
# 讀取資料檔
f_app_train = os.path.join(dir_data, 'application_train.csv')
app_train = pd.read_csv(f_app_train)
app_train.shape

(307511, 122)

In [3]:
# 將只有兩種值的類別型欄位, 做 Label Encoder, 計算相關係數時讓這些欄位可以被包含在內
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# 檢查每一個 column
for col in app_train:
    if app_train[col].dtype == 'object':
        # 如果只有兩種值的類別型欄位
        if len(list(app_train[col].unique())) <= 2:
            # 就做 Label Encoder, 以加入相關係數檢查
            app_train[col] = le.fit_transform(app_train[col])            
print(app_train.shape)
app_train.head()

(307511, 122)


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,0,M,0,1,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,0,F,0,0,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,1,M,1,1,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,0,F,0,1,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,0,M,0,1,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
# 受雇日數為異常值的資料, 另外設一個欄位記錄, 並將異常的日數轉成空值 (np.nan)
app_train['DAYS_EMPLOYED_ANOM'] = app_train["DAYS_EMPLOYED"] == 365243
app_train['DAYS_EMPLOYED'].replace({365243: np.nan}, inplace = True)

# 出生日數 (DAYS_BIRTH) 取絕對值 
app_train['DAYS_BIRTH'] = abs(app_train['DAYS_BIRTH'])

## 練習時間
參考 Day 12 範例程式，離散化你覺得有興趣的欄位，並嘗試找出有趣的訊息

In [5]:
app_train['YEARS_BIRTH'] = app_train['DAYS_BIRTH'] / 365 # 轉成年齡
app_train['YEARS_BIRTH'].value_counts()

37.668493    43
36.934247    42
49.994521    41
27.452055    41
28.197260    40
43.208219    40
39.438356    39
39.087671    39
31.956164    39
36.336986    39
40.523288    38
41.210959    38
37.216438    38
37.353425    38
37.775342    38
42.643836    38
36.931507    38
38.057534    38
40.794521    37
35.084932    37
40.520548    37
27.923288    37
27.769863    37
36.210959    37
39.112329    37
30.432877    37
54.997260    37
42.638356    37
27.690411    37
40.457534    37
             ..
68.273973     1
68.419178     1
68.849315     1
21.736986     1
68.539726     1
68.706849     1
68.156164     1
68.975342     1
68.939726     1
68.616438     1
68.726027     1
67.356164     1
68.213699     1
68.109589     1
65.358904     1
69.016438     1
21.909589     1
68.841096     1
66.443836     1
68.164384     1
68.789041     1
68.764384     1
21.619178     1
68.624658     1
66.939726     1
68.416438     1
68.942466     1
21.408219     1
68.808219     1
68.580822     1
Name: YEARS_BIRTH, Lengt

In [6]:
sub_df=app_train['YEARS_BIRTH'].sort_values()
print('max age=\n',sub_df.tail(1))#最大年齡
print('min age=\n',sub_df.head(1))#最小年齡

max age=
 265026    69.120548
Name: YEARS_BIRTH, dtype: float64
min age=
 235444    20.517808
Name: YEARS_BIRTH, dtype: float64


In [7]:
#分為：20~70，每5歲切一組
app_train['YEARS_BINNED'] = pd.cut(app_train['YEARS_BIRTH'], bins = range(20, 70, 5))

# 顯示不同年齡群的數量
print(app_train['YEARS_BINNED'].value_counts())
app_train.head()

(35, 40]    42868
(40, 45]    41406
(30, 35]    39440
(45, 50]    35135
(50, 55]    34942
(55, 60]    33120
(25, 30]    32862
(60, 65]    27476
(20, 25]    12159
Name: YEARS_BINNED, dtype: int64


Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR,DAYS_EMPLOYED_ANOM,YEARS_BIRTH,YEARS_BINNED
0,100002,1,0,M,0,1,0,202500.0,406597.5,24700.5,...,0,0.0,0.0,0.0,0.0,0.0,1.0,False,25.920548,"(25, 30]"
1,100003,0,0,F,0,0,0,270000.0,1293502.5,35698.5,...,0,0.0,0.0,0.0,0.0,0.0,0.0,False,45.931507,"(45, 50]"
2,100004,0,1,M,1,1,0,67500.0,135000.0,6750.0,...,0,0.0,0.0,0.0,0.0,0.0,0.0,False,52.180822,"(50, 55]"
3,100006,0,0,F,0,1,0,135000.0,312682.5,29686.5,...,0,,,,,,,False,52.068493,"(50, 55]"
4,100007,0,0,M,0,1,0,121500.0,513000.0,21865.5,...,0,0.0,0.0,0.0,0.0,0.0,0.0,False,54.608219,"(50, 55]"


In [8]:
app_train['YEARS_BINNED'].value_counts().sort_index()#列出每個年齡群的數目

(20, 25]    12159
(25, 30]    32862
(30, 35]    39440
(35, 40]    42868
(40, 45]    41406
(45, 50]    35135
(50, 55]    34942
(55, 60]    33120
(60, 65]    27476
Name: YEARS_BINNED, dtype: int64

In [9]:
sub_df1=app_train['AMT_ANNUITY'].fillna(0).sort_values()
filter = app_train["AMT_ANNUITY"] > 0 # 大於 0
print('max AMT_ANNUITY=\n',sub_df1.tail(1))#最大AMT_ANNUITY
print('min AMT_ANNUITY=\n',sub_df1[filter].head(1))#最小AMT_ANNUITY

max AMT_ANNUITY=
 17948    258025.5
Name: AMT_ANNUITY, dtype: float64
min AMT_ANNUITY=
 277186    1615.5
Name: AMT_ANNUITY, dtype: float64


In [10]:
app_train['AMT_ANNUITY_BINNED'] = pd.cut(app_train['AMT_ANNUITY'], bins = range(0, 300000, 10000))
print(app_train['AMT_ANNUITY_BINNED'].value_counts())

(20000, 30000]      92466
(10000, 20000]      83222
(30000, 40000]      58562
(40000, 50000]      28669
(0, 10000]          23283
(50000, 60000]      13165
(60000, 70000]       5051
(70000, 80000]       1558
(80000, 90000]        607
(90000, 100000]       411
(100000, 110000]      188
(110000, 120000]      125
(120000, 130000]       72
(130000, 140000]       39
(220000, 230000]       24
(170000, 180000]       15
(140000, 150000]       13
(150000, 160000]        9
(210000, 220000]        7
(160000, 170000]        4
(180000, 190000]        3
(200000, 210000]        3
(190000, 200000]        1
(230000, 240000]        1
(250000, 260000]        1
(270000, 280000]        0
(240000, 250000]        0
(260000, 270000]        0
(280000, 290000]        0
Name: AMT_ANNUITY_BINNED, dtype: int64


In [11]:
app_train['AMT_ANNUITY_BINNED'].value_counts().sort_index()

(0, 10000]          23283
(10000, 20000]      83222
(20000, 30000]      92466
(30000, 40000]      58562
(40000, 50000]      28669
(50000, 60000]      13165
(60000, 70000]       5051
(70000, 80000]       1558
(80000, 90000]        607
(90000, 100000]       411
(100000, 110000]      188
(110000, 120000]      125
(120000, 130000]       72
(130000, 140000]       39
(140000, 150000]       13
(150000, 160000]        9
(160000, 170000]        4
(170000, 180000]       15
(180000, 190000]        3
(190000, 200000]        1
(200000, 210000]        3
(210000, 220000]        7
(220000, 230000]       24
(230000, 240000]        1
(240000, 250000]        0
(250000, 260000]        1
(260000, 270000]        0
(270000, 280000]        0
(280000, 290000]        0
Name: AMT_ANNUITY_BINNED, dtype: int64