### 這堂課目標：三個進化版的機器學習套件
<ul>
<li>XGBoost (Extreme Gradient Boosting)</li>
<li>LightGBM (Light Gradient Boosting Machine)</li>
<li>CatBoost (Categorical Boosting)</li>
</ul>
    
#### 參考資料
- <a href="https://towardsdatascience.com/catboost-vs-light-gbm-vs-xgboost-5f93620723db">總整理（英文）</a>
- <a href="https://codertw.com/程式語言/510420/">總整理（中譯）</a>
- <a href="https://lavanya.ai/2019/06/27/battle-of-the-boosting-algorithms/">另一個探討文章（英文）</a>
- <a href="https://kknews.cc/zh-tw/tech/vlxj8n2.html">另一個探討文章（中譯）</a>
- <a href="https://arxiv.org/pdf/1809.04559.pdf">研究論文</a>

#### 第0步：安裝套件
code originally written by Alvira Swalin

In [1]:
import numpy as np
import pandas as pd
import time
from sklearn.model_selection import train_test_split

#### 1.載入資料

In [2]:
# Read the flights dataset from the CSV file next to the notebook.
flights_csv = "flights_500k.csv"
data = pd.read_csv(flights_csv)
# Keep a reproducible 20% random sample of the rows; this step exists to
# speed things up on slower machines.
data = data.sample(frac=0.2, random_state=10)

In [3]:
# Print the (rows, columns) shape, then display the first rows of the frame.
print(data.shape)
data.head()

(116382, 31)


Unnamed: 0,YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,...,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY
167584,2015,6,25,4,DL,2114,N371DA,COS,ATL,630,...,1326.0,126.0,0,0,,0.0,0.0,126.0,0.0,0.0
560454,2015,11,21,6,VX,1916,N844VA,SFO,LAS,2105,...,2223.0,-7.0,0,0,,,,,,
218029,2015,5,21,4,AA,1221,N4YBAA,FAT,DFW,600,...,1109.0,-6.0,0,0,,,,,,
156358,2015,5,12,2,WN,734,N941WN,MDW,SEA,1450,...,1708.0,-7.0,0,0,,,,,,
408552,2015,2,20,5,WN,4905,N431WN,FLL,BUF,1510,...,1757.0,-23.0,0,0,,,,,,


#### 2.清洗資料（data cleaning / Feature engineering）

In [4]:
## Keep only the columns used by the models: ten feature columns plus the
## ARRIVAL_DELAY column that becomes the prediction target.
selected_columns = [
    "MONTH",
    "DAY",
    "DAY_OF_WEEK",
    "AIRLINE",
    "FLIGHT_NUMBER",
    "DESTINATION_AIRPORT",
    "ORIGIN_AIRPORT",
    "AIR_TIME",
    "DEPARTURE_TIME",
    "DISTANCE",
    "ARRIVAL_DELAY",
]
data = data[selected_columns]
print(data.shape)
data.head()

(116382, 11)


Unnamed: 0,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,DESTINATION_AIRPORT,ORIGIN_AIRPORT,AIR_TIME,DEPARTURE_TIME,DISTANCE,ARRIVAL_DELAY
167584,6,25,4,DL,2114,ATL,COS,142.0,846.0,1184,126.0
560454,11,21,6,VX,1916,LAS,SFO,60.0,2103.0,414,-7.0
218029,5,21,4,AA,1221,DFW,FAT,173.0,556.0,1313,-6.0
156358,5,12,2,WN,734,SEA,MDW,244.0,1448.0,1733,-7.0
408552,2,20,5,WN,4905,BUF,FLL,155.0,1504.0,1165,-23.0


In [5]:
## Drop every row that contains at least one missing value.
data = data.dropna()
print(data.shape)
data.head()

(114320, 11)


Unnamed: 0,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,DESTINATION_AIRPORT,ORIGIN_AIRPORT,AIR_TIME,DEPARTURE_TIME,DISTANCE,ARRIVAL_DELAY
167584,6,25,4,DL,2114,ATL,COS,142.0,846.0,1184,126.0
560454,11,21,6,VX,1916,LAS,SFO,60.0,2103.0,414,-7.0
218029,5,21,4,AA,1221,DFW,FAT,173.0,556.0,1313,-6.0
156358,5,12,2,WN,734,SEA,MDW,244.0,1448.0,1733,-7.0
408552,2,20,5,WN,4905,BUF,FLL,155.0,1504.0,1165,-23.0


In [6]:
## Build the prediction target: 1 when ARRIVAL_DELAY is greater than the
## cutoff, 0 otherwise (the raw delay values are overwritten in place).
delay_cutoff = 10
data["ARRIVAL_DELAY"] = data["ARRIVAL_DELAY"].gt(delay_cutoff).mul(1)
data.head()

Unnamed: 0,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,DESTINATION_AIRPORT,ORIGIN_AIRPORT,AIR_TIME,DEPARTURE_TIME,DISTANCE,ARRIVAL_DELAY
167584,6,25,4,DL,2114,ATL,COS,142.0,846.0,1184,1
560454,11,21,6,VX,1916,LAS,SFO,60.0,2103.0,414,0
218029,5,21,4,AA,1221,DFW,FAT,173.0,556.0,1313,0
156358,5,12,2,WN,734,SEA,MDW,244.0,1448.0,1733,0
408552,2,20,5,WN,4905,BUF,FLL,155.0,1504.0,1165,0


In [7]:
## Replace each categorical column with integer labels.
cols = ["AIRLINE", "FLIGHT_NUMBER", "DESTINATION_AIRPORT", "ORIGIN_AIRPORT"]
for col in cols:
    as_category = data[col].astype("category")
    # Category codes are shifted by one, so the labels start at 1.
    data[col] = as_category.cat.codes + 1
data.head()

Unnamed: 0,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,DESTINATION_AIRPORT,ORIGIN_AIRPORT,AIR_TIME,DEPARTURE_TIME,DISTANCE,ARRIVAL_DELAY
167584,6,25,4,4,2110,298,346,142.0,846.0,1184,1
560454,11,21,6,13,1913,452,551,60.0,2103.0,414,0
218029,5,21,4,1,1218,364,383,173.0,556.0,1313,0
156358,5,12,2,14,732,555,468,244.0,1448.0,1733,0
408552,2,20,5,14,4822,327,387,155.0,1504.0,1165,0


In [8]:
# Count how many rows fall in each class of the binary target.
data["ARRIVAL_DELAY"].value_counts()

0    89193
1    25127
Name: ARRIVAL_DELAY, dtype: int64

#### 3.分訓練集跟測試集（train test split）

In [9]:
# Separate the feature columns from the target column, then hold out 25% of
# the rows as the test set (fixed seed so the split is reproducible).
features = data.drop(["ARRIVAL_DELAY"], axis=1)
target = data["ARRIVAL_DELAY"]
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.25, random_state=10
)

## XGBoost

#### 4.定義模型

In [10]:
import xgboost as xgb
from sklearn import metrics

def auc(m, train, test):
    """Return ``(train_auc, test_auc)`` for the fitted classifier *m*.

    *train* and *test* are the feature matrices; the true labels come from
    the module-level ``y_train`` and ``y_test``. Scores are the second
    column of ``m.predict_proba(...)``.
    """
    train_scores = m.predict_proba(train)[:, 1]
    train_auc = metrics.roc_auc_score(y_train, train_scores)
    test_scores = m.predict_proba(test)[:, 1]
    test_auc = metrics.roc_auc_score(y_test, test_scores)
    return (train_auc, test_auc)
# XGBoost classifier configuration used for the training step below.
model = xgb.XGBClassifier(
    max_depth=50,
    min_child_weight=1,
    n_estimators=200,
    learning_rate=0.16,
    verbosity=1,
)



  # NOTE(review): this fragment looks like a stray paste. Neither `yaml` nor
  # `f` is defined in the visible part of the file, and `data` is the flights
  # DataFrame everywhere else, so running these lines would overwrite it.
  # Confirm whether they belong here at all.
  #
  # yaml.safe_load replaces the bare yaml.load: without an explicit Loader,
  # yaml.load can construct arbitrary Python objects on older PyYAML and
  # raises TypeError on PyYAML >= 6.
  data = yaml.safe_load(f.read()) or {}
  # NOTE(review): `f` was already consumed by f.read() on the previous line,
  # so this presumably parses an empty stream -- confirm the intended source.
  defaults = yaml.safe_load(f)


#### 5.訓練模型

In [11]:
# Data scored after every boosting round; training stops once the AUC on it
# has not improved for 2 consecutive rounds.
# NOTE(review): the evaluation set is the test split itself, so the test AUC
# reported later is also the quantity early stopping was tuned on.
eval_set = [(X_test, y_test)]
model.fit(
    X_train,
    y_train,
    eval_set=eval_set,
    eval_metric="auc",
    early_stopping_rounds=2,
    verbose=True,
)

[0]	validation_0-auc:0.63007
Will train until validation_0-auc hasn't improved in 2 rounds.
[1]	validation_0-auc:0.64879
[2]	validation_0-auc:0.65687
[3]	validation_0-auc:0.66291
[4]	validation_0-auc:0.66546
[5]	validation_0-auc:0.67162
[6]	validation_0-auc:0.67534
[7]	validation_0-auc:0.67919
[8]	validation_0-auc:0.68148
[9]	validation_0-auc:0.68384
[10]	validation_0-auc:0.68640
[11]	validation_0-auc:0.68832
[12]	validation_0-auc:0.69072
[13]	validation_0-auc:0.69216
[14]	validation_0-auc:0.69382
[15]	validation_0-auc:0.69590
[16]	validation_0-auc:0.69699
[17]	validation_0-auc:0.69884
[18]	validation_0-auc:0.70010
[19]	validation_0-auc:0.70135
[20]	validation_0-auc:0.70226
[21]	validation_0-auc:0.70282
[22]	validation_0-auc:0.70354
[23]	validation_0-auc:0.70410
[24]	validation_0-auc:0.70452
[25]	validation_0-auc:0.70532
[26]	validation_0-auc:0.70607
[27]	validation_0-auc:0.70681
[28]	validation_0-auc:0.70757
[29]	validation_0-auc:0.70813
[30]	validation_0-auc:0.70861
[31]	validation_0

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
       importance_type='gain', interaction_constraints=None,
       learning_rate=0.16, max_delta_step=0, max_depth=50,
       min_child_weight=1, missing=nan, monotone_constraints=None,
       n_estimators=200, n_jobs=0, num_parallel_tree=1,
       objective='binary:logistic', random_state=0, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
       validate_parameters=False, verbosity=1)

#### 6.評估模型表現

In [12]:
# Display the (train AUC, test AUC) pair for the fitted XGBoost model.
auc(model, X_train, X_test)

(1.0, 0.7173158083296013)

In [16]:
# Display the model's predictions for the first 100 rows of the test set.
model.predict(X_test[:100])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

#### 7.調整參數（重複5,6）

## LightGBM

#### 4.定義模型

In [23]:
import lightgbm as lgb
from sklearn import metrics

# LightGBM classifier configuration; this rebinds `model`, replacing the
# XGBoost classifier defined earlier.
model = lgb.LGBMClassifier(
    learning_rate=0.1,
    n_estimators=300,
    max_depth=50,
    num_leaves=900,
    silent=False,
)

#### 5.訓練模型

In [24]:
# Train with early stopping on the same `eval_set` built for the XGBoost run.
# NOTE(review): the categorical columns were converted to integer codes
# earlier, so 'auto' presumably treats them as plain numeric features --
# confirm against the LightGBM documentation.
model.fit(
    X_train,
    y_train,
    eval_set=eval_set,
    eval_metric='auc',
    early_stopping_rounds=2,
    categorical_feature='auto',
)

[1]	valid_0's auc: 0.649466	valid_0's binary_logloss: 0.516854
Training until validation scores don't improve for 2 rounds
[2]	valid_0's auc: 0.662754	valid_0's binary_logloss: 0.510482
[3]	valid_0's auc: 0.668986	valid_0's binary_logloss: 0.505514
[4]	valid_0's auc: 0.676477	valid_0's binary_logloss: 0.500946
[5]	valid_0's auc: 0.682268	valid_0's binary_logloss: 0.496916
[6]	valid_0's auc: 0.687497	valid_0's binary_logloss: 0.493426
[7]	valid_0's auc: 0.690124	valid_0's binary_logloss: 0.490761
[8]	valid_0's auc: 0.691	valid_0's binary_logloss: 0.488933
[9]	valid_0's auc: 0.694617	valid_0's binary_logloss: 0.486524
[10]	valid_0's auc: 0.696161	valid_0's binary_logloss: 0.484875
[11]	valid_0's auc: 0.69816	valid_0's binary_logloss: 0.483084
[12]	valid_0's auc: 0.699375	valid_0's binary_logloss: 0.481686
[13]	valid_0's auc: 0.701199	valid_0's binary_logloss: 0.480332
[14]	valid_0's auc: 0.702114	valid_0's binary_logloss: 0.47924
[15]	valid_0's auc: 0.704176	valid_0's binary_logloss: 0.4

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
        importance_type='split', learning_rate=0.1, max_depth=50,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=300, n_jobs=-1, num_leaves=900, objective=None,
        random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=False,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

#### 6.評估模型表現

In [25]:
# Display the (train AUC, test AUC) pair for the fitted LightGBM model.
auc(model, X_train, X_test)

(0.956441350361449, 0.7146763063143753)

#### 7.調整參數（重複5,6）

## Catboost

#### 4.定義模型

In [26]:
# Preview the prepared frame again before configuring CatBoost.
data.head()

Unnamed: 0,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,DESTINATION_AIRPORT,ORIGIN_AIRPORT,AIR_TIME,DEPARTURE_TIME,DISTANCE,ARRIVAL_DELAY
167584,6,25,4,4,2110,298,346,142.0,846.0,1184,1
560454,11,21,6,13,1913,452,551,60.0,2103.0,414,0
218029,5,21,4,1,1218,364,383,173.0,556.0,1313,0
156358,5,12,2,14,732,555,468,244.0,1448.0,1733,0
408552,2,20,5,14,4822,327,387,155.0,1504.0,1165,0


In [27]:
import catboost as cb
# Column positions 0-6 of the feature frame (MONTH through ORIGIN_AIRPORT)
# are passed to CatBoost as categorical features.
cat_features_index = list(range(7))


# Classifier that will be trained with categorical features
clf = cb.CatBoostClassifier(
    eval_metric="AUC",
    one_hot_max_size=31,
    depth=10,
    iterations=500,
    l2_leaf_reg=9,
    learning_rate=0.15,
)


#### 5.訓練模型

In [28]:
# Train CatBoost with the categorical column positions and early stopping on
# the same `eval_set` used by the earlier models.
clf.fit(
    X_train,
    y_train,
    cat_features=cat_features_index,
    eval_set=eval_set,
    early_stopping_rounds=2,
)

0:	test: 0.6297311	best: 0.6297311 (0)	total: 360ms	remaining: 2m 59s
1:	test: 0.6435833	best: 0.6435833 (1)	total: 661ms	remaining: 2m 44s
2:	test: 0.6464603	best: 0.6464603 (2)	total: 1.14s	remaining: 3m 8s
3:	test: 0.6468310	best: 0.6468310 (3)	total: 1.34s	remaining: 2m 46s
4:	test: 0.6508252	best: 0.6508252 (4)	total: 2.02s	remaining: 3m 20s
5:	test: 0.6529403	best: 0.6529403 (5)	total: 2.6s	remaining: 3m 34s
6:	test: 0.6572084	best: 0.6572084 (6)	total: 2.96s	remaining: 3m 28s
7:	test: 0.6624582	best: 0.6624582 (7)	total: 3.37s	remaining: 3m 27s
8:	test: 0.6652365	best: 0.6652365 (8)	total: 3.84s	remaining: 3m 29s
9:	test: 0.6674497	best: 0.6674497 (9)	total: 4.27s	remaining: 3m 29s
10:	test: 0.6684949	best: 0.6684949 (10)	total: 4.59s	remaining: 3m 24s
11:	test: 0.6712439	best: 0.6712439 (11)	total: 5.2s	remaining: 3m 31s
12:	test: 0.6721717	best: 0.6721717 (12)	total: 5.93s	remaining: 3m 42s
13:	test: 0.6744488	best: 0.6744488 (13)	total: 6.44s	remaining: 3m 43s
14:	test: 0.676

<catboost.core.CatBoostClassifier at 0x1a2cc53160>

#### 6.評估模型表現

In [29]:
# Display the (train AUC, test AUC) pair for the fitted CatBoost model.
auc(clf, X_train, X_test)

(0.7547017963865237, 0.7082117832512316)

# 無作業