In [1]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib as mpl
import seaborn as sns
import re,pip,conda
import time
import os

In [2]:
# Print the installed version of every core dependency so that the results
# recorded in this notebook can be tied to a concrete environment.
for package in [sklearn,mpl,np,pd,sns,pip,conda]:
    # A module's canonical name is available as `__name__`; reading it directly
    # avoids regex-parsing the module repr, whose exact layout (and the way
    # re.findall reports empty matches) is not a stable interface.
    print(package.__name__,package.__version__)

sklearn 0.23.1
matplotlib 3.2.2
numpy 1.18.5
pandas 1.0.5
seaborn 0.10.1
pip 21.0.1
conda 4.10.3


# GBDT梯度提升树
GBDT 是工业界应用最多的集成算法之一。

GBDT包含Boosting三要素：
- 损失函数
- 弱评估器
- 综合集成结果

遵循 Boosting 算法的基本流程

GBDT 关键改变：
- 弱评估器：
    - 无论GBDT整体在执行回归、分类、排序任务，弱评估器一定是回归器。
    - 通过sigmoid 或 softmax函数输出具体的分类结果。
- 损失函数：任意可微的函数
- 拟合残差：
    - GBDT 不像 AdaBoost 那样通过调整`数据分布`来间接影响后续弱评估器，而是通过修改后续弱评估器的`拟合目标`来直接影响后续弱评估器的结构。
    - GBDT 不修改样本权重，而是通过拟合残差 $y_i - H(x_i)$ 来影响后续弱评估器的结构。
- 抽样思想：加入了随机森林中随机抽样的思想。每次建树前，对样本和特征进行抽样来增大弱评估器之间的独立性。

In [3]:
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor as GBR
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn.ensemble import AdaBoostRegressor as ABR
from sklearn.ensemble import RandomForestRegressor as RFR
from sklearn.model_selection import cross_validate,KFold

In [4]:
# Location of the encoded house-price training CSV.  The original absolute
# path stays as the default so existing runs behave the same; set the
# HOUSEPRICE_TRAIN_CSV environment variable to run the notebook elsewhere
# without editing this cell.
DATA_PATH = os.environ.get(
    "HOUSEPRICE_TRAIN_CSV",
    r"/Users/feishuoren/Projects/machine_learning/datasets/HousePrice/train_encode.csv",
)

# The first CSV column is used as the row index rather than as a feature.
data = pd.read_csv(DATA_PATH,index_col=0)
data.head()

Unnamed: 0,Id,住宅类型,住宅区域,街道接触面积(英尺),住宅面积,街道路面状况,巷子路面状况,住宅形状(大概),住宅现状,水电气,...,泳池面积,泳池质量,篱笆质量,其他配置,其他配置的价值,销售月份,销售年份,销售类型,销售状态,SalePrice
0,0.0,5.0,3.0,36.0,327.0,1.0,0.0,3.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,2.0,8.0,4.0,208500
1,1.0,0.0,3.0,51.0,498.0,1.0,0.0,3.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,1.0,8.0,4.0,181500
2,2.0,5.0,3.0,39.0,702.0,1.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,8.0,2.0,8.0,4.0,223500
3,3.0,6.0,3.0,31.0,489.0,1.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,8.0,0.0,140000
4,4.0,5.0,3.0,55.0,925.0,1.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,2.0,8.0,4.0,250000


In [5]:
# Regression set-up: every column except the last one is a feature, and the
# last column (SalePrice) is the target.
X, y = data.iloc[:, :-1], data.iloc[:, -1]

In [6]:
# Dimensions of the feature matrix as (rows, columns).
X.shape

(1460, 80)

In [7]:
# Summary statistics of the regression target.
y.describe()

count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

In [8]:
# Define the cross-validation scheme: 5 shuffled folds with a fixed seed, so
# every model evaluated with `cv` is scored on the same splits.
cv = KFold(n_splits=5,shuffle=True,random_state=1412)

def RMSE(result,name):
    """Return the absolute value of the mean of ``result[name]``.

    Parameters
    ----------
    result : mapping
        Maps score names to arrays of per-fold scores (for example the dict
        returned by ``cross_validate``).
    name : str
        Key of the score array to summarise, e.g. ``"train_score"``.

    Returns
    -------
    The mean of the selected scores with its sign dropped, so negated error
    scores are reported as positive magnitudes.
    """
    fold_scores = result[name]
    mean_score = fold_scores.mean()
    return abs(mean_score)

In [9]:
# Gradient boosting regressor with default hyper-parameters (only the seed is
# fixed), evaluated with the shared 5-fold splitter.
gbr = GBR(random_state=1412)
result_gbdt = cross_validate(
    gbr,
    X,
    y,
    cv=cv,
    scoring="neg_root_mean_squared_error",  # negated RMSE: closer to 0 is better
    return_train_score=True,
    verbose=True,
    n_jobs=-1,
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    8.8s remaining:   13.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    9.0s finished


In [10]:
# Raw cross-validation output: per-fold timings and negated-RMSE scores.
result_gbdt

{'fit_time': array([2.89450908, 2.91139889, 2.87773395, 2.96040297, 2.89958405]),
 'score_time': array([0.01958275, 0.02070498, 0.01994514, 0.0180521 , 0.02827597]),
 'test_score': array([-25841.43338683, -44920.79906295, -24242.00022617, -20817.99371904,
        -28097.54532128]),
 'train_score': array([-13716.93826094, -13750.03069928, -14480.27626562, -14515.79537532,
        -13490.91346828])}

In [11]:
# Mean training RMSE across the folds.
RMSE(result_gbdt,"train_score")

13990.790813889864

In [12]:
# Mean validation RMSE across the folds.
RMSE(result_gbdt,"test_score")

28783.954343252786

In [13]:
# Compare gradient boosting regression against other ensemble regressors.
# `modelname` and `models` are parallel lists: label i describes model i.
# The "-TPE" entries use explicit hyper-parameters instead of the defaults
# (presumably found by a TPE search -- confirm against the tuning notebook).
# The last label was previously misspelled "Ada-TPR".
modelname = ["GBDT","RF","AdaBoost","RF-TPE","Ada-TPE"]

models = [
    GBR(random_state=1412),
    RFR(random_state=1412,n_jobs=-1),
    ABR(random_state=1412),
    RFR(n_estimators=89,max_depth=22,max_features=14,min_impurity_decrease=0
        ,random_state=1412,verbose=False,n_jobs=-1),
    ABR(n_estimators=39,learning_rate=0.94,loss="exponential"
        ,random_state=1412),
]

# One colour per model, in the same order as `modelname`.
colors=["green","gray","orange","red","blue"]

In [14]:
# Cross-validate every candidate model on the same folds and report its mean
# train/validation RMSE together with the time the evaluation took.
for name,model in zip(modelname,models):
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # durations; time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()
    result = cross_validate(model,X,y,cv=cv,scoring="neg_root_mean_squared_error"
                           ,return_train_score=True
                           ,verbose=False
                           ,n_jobs=-1)
    elapsed = time.perf_counter()-start
    print(name)
    print("\t train_score:{:.3f}".format(RMSE(result,"train_score")))
    print("\t test_score:{:.3f}".format(RMSE(result,"test_score")))
    print("\t time:{:.2f}s".format(elapsed))
    print("\n")

GBDT
	 train_score:13990.791
	 test_score:28783.954
	 time:5.45s


RF
	 train_score:11177.272
	 test_score:30571.267
	 time:7.83s


AdaBoost
	 train_score:27062.107
	 test_score:35345.931
	 time:1.80s


RF-TPE
	 train_score:11208.818
	 test_score:28346.673
	 time:1.68s


Ada-TPR
	 train_score:27401.542
	 test_score:35169.730
	 time:1.33s




In [15]:
# 梯度提升树分类

In [16]:
# Preview the data again before building the classification task.
data.head()

Unnamed: 0,Id,住宅类型,住宅区域,街道接触面积(英尺),住宅面积,街道路面状况,巷子路面状况,住宅形状(大概),住宅现状,水电气,...,泳池面积,泳池质量,篱笆质量,其他配置,其他配置的价值,销售月份,销售年份,销售类型,销售状态,SalePrice
0,0.0,5.0,3.0,36.0,327.0,1.0,0.0,3.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,2.0,8.0,4.0,208500
1,1.0,0.0,3.0,51.0,498.0,1.0,0.0,3.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.0,1.0,8.0,4.0,181500
2,2.0,5.0,3.0,39.0,702.0,1.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,8.0,2.0,8.0,4.0,223500
3,3.0,6.0,3.0,31.0,489.0,1.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,8.0,0.0,140000
4,4.0,5.0,3.0,55.0,925.0,1.0,0.0,0.0,3.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,2.0,8.0,4.0,250000


In [17]:
# Classification data: the second-to-last column is the label, and every
# column before it is a feature (the last column, SalePrice, is left out).
x_clf, y_clf = data.iloc[:, :-2], data.iloc[:, -2]

In [18]:
# Distinct class labels present in the classification target.
np.unique(y_clf)

array([0., 1., 2., 3., 4., 5.])

In [19]:
# GBDT for classification.
# The seed is fixed, as for every other estimator in this notebook, so the
# cross-validation scores are reproducible from run to run.
clf = GBC(random_state=1412)
cv = KFold(n_splits=5,shuffle=True,random_state=1412)
# No `scoring` is passed, so cross_validate uses the estimator's own
# `score` method.
result_clf = cross_validate(clf,x_clf,y_clf,cv=cv
                           ,return_train_score=True
                           ,verbose=True
                           ,n_jobs=-1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   29.3s remaining:   44.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   29.6s finished


In [20]:
# Mean training score across the folds.
result_clf["train_score"].mean()

0.9909246575342466

In [21]:
# Mean validation score across the folds.
result_clf["test_score"].mean()

0.9013698630136986

In [22]:
from sklearn.tree import DecisionTreeRegressor as DTR

In [23]:
# Two fitted-model candidates for the boosting `init` estimator: a single
# decision tree and a random forest with explicit hyper-parameters.
tree_reg = DTR(random_state=1412)
rf = RFR(
    n_estimators=89,
    max_depth=22,
    max_features=14,
    min_impurity_decrease=0,
    random_state=1412,
    verbose=False,
    n_jobs=-1,
)

In [24]:
# Compare starting predictors (`init`) for gradient boosting: a decision tree,
# a random forest, the constant "zero", and the library default (None).
# The splitter does not depend on `init`, so it is built once here instead of
# being rebuilt on every iteration.
cv = KFold(n_splits=5,shuffle=True,random_state=1412)
for init in [tree_reg,rf,"zero",None]:
    reg = GBR(init=init,random_state=1412)
    result_reg = cross_validate(reg,X,y,cv=cv,scoring="neg_root_mean_squared_error"
                               ,return_train_score=True
                               ,verbose=False
                               ,n_jobs=-1)
    # For each `init`: a blank separator, then mean train RMSE, then mean
    # validation RMSE.
    print("\n")
    print(RMSE(result_reg,"train_score"))
    print(RMSE(result_reg,"test_score"))



0.0
42065.93924112058


5669.291478825804
27213.78342923997


13990.791639702458
28739.882050269225


13990.790813889864
28783.954343252786


**sklearn 中 GBDT 的相关参数：**

|类型|参数/属性|
|----|----|
|迭代过程|参数：n_estimators,learning_rate,loss,alpha,init<br />属性：loss_,init_,estimators_|
|弱评估器结构|criterion,max_depth,min_samples_split,<br />min_samples_leaf,min_weight_fraction_leaf,<br />max_leaf_nodes,min_impurity_decrease|
|提前停止|参数：validation_fraction,n_iter_no_change,tol<br />属性：n_estimators_|
|弱评估器的训练数据|参数：subsample,max_features,random_state<br />属性：oob_improvement_,train_score_|
|其他|ccp_alpha,warm_start|