In [2]:
pip install xgboost -i https://mirrors.aliyun.com/pypi/simple/

Looking in indexes: https://mirrors.aliyun.com/pypi/simple/
Collecting xgboost
  Downloading https://mirrors.aliyun.com/pypi/packages/24/14/d9ecb9fa86727f51bfb35f1c2b0428ebc6cd5ffde24c5e2dc583d3575a6f/xgboost-1.6.2-py3-none-win_amd64.whl (125.4MB)
Installing collected packages: xgboost
Successfully installed xgboost-1.6.2
Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'python -m pip install --upgrade pip' command.


In [3]:
from xgboost import XGBRegressor

import pandas as pd

In [21]:
data_path = 'bike-sharing/hour.csv'  #读取数据到内存，rides为一个dataframe对象
rides = pd.read_csv(data_path)

dummy_fields = ['season', 'weathersit', 'mnth', 'hr', 'weekday'] #所有类型编码变量的名称
for each in dummy_fields:
    #取出所有类型变量，并将它们转变为独热编码
    dummies = pd.get_dummies(rides[each], prefix=each, drop_first=False)
    #将新的独热编码变量与原有的所有变量合并到一起
    rides = pd.concat([rides, dummies], axis=1)

#将原来的类型变量从数据表中删除
fields_to_drop = ['instant', 'dteday', 'season', 'weathersit', 'weekday', 'atemp', 'mnth', 'workingday', 
    'hr'] #要删除的类型变量的名称
data = rides.drop(fields_to_drop, axis=1) #将它们从数据库的变量中删除

data

Unnamed: 0,yr,mnth,hr,holiday,temp,hum,windspeed,casual,registered,cnt,...,weathersit_2,weathersit_3,weathersit_4,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6
0,0,1,0,0,-1.334609,0.947345,-1.553844,3,13,-0.956312,...,0,0,0,0,0,0,0,0,0,1
1,0,1,1,0,-1.438475,0.895513,-1.553844,8,32,-0.823998,...,0,0,0,0,0,0,0,0,0,1
2,0,1,2,0,-1.438475,0.895513,-1.553844,5,27,-0.868103,...,0,0,0,0,0,0,0,0,0,1
3,0,1,3,0,-1.334609,0.636351,-1.553844,3,10,-0.972851,...,0,0,0,0,0,0,0,0,0,1
4,0,1,4,0,-1.334609,0.636351,-1.553844,0,1,-1.039008,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17374,1,12,19,0,-1.230743,-0.141133,-0.211685,11,108,-0.388467,...,1,0,0,0,1,0,0,0,0,0
17375,1,12,20,0,-1.230743,-0.141133,-0.211685,8,81,-0.553859,...,1,0,0,0,1,0,0,0,0,0
17376,1,12,21,0,-1.230743,-0.141133,-0.211685,7,83,-0.548346,...,0,0,0,0,1,0,0,0,0,0
17377,1,12,22,0,-1.230743,-0.348463,-0.456086,13,48,-0.708224,...,0,0,0,0,1,0,0,0,0,0


In [22]:
#将最后一个月的数据作为测试集
test_data = data[-30*24:] #选出训练集
train_data = data[:-30*24] #选出测试集

#目标列包含的字段
target_fields = ['cnt','casual', 'registered'] 

#训练集划分成特征变量列和目标特征列
features, targets = train_data.drop(target_fields, axis=1), train_data[target_fields]

#测试集划分成特征变量列和目标特征列
test_features, test_targets = test_data.drop(target_fields, axis=1), test_data[target_fields]



调参刚开始的时候，一般要先初始化一些值：

1，选择较高的学习速率（learning rate）。一般情况下，学习速率的值为0.1。但是对于不同的问题，理想的学习速率有时候会在0.05到0.3之间波动。选择对应于此学习速率的理想决策树数量。 Xgboost有一个很有用的函数“cv”，这个函数可以在每一次迭代中使用交叉验证，并返回理想的决策树数量。

2，对于给定的学习速率和决策树数量，进行决策树特定参数调优（max_depth，min_child_weight，gamma，subsample，colsample_bytree）。在确定一棵树的过程中，我们可以选择不同的参数。

3，Xgboost的正则化参数的调优。（lambda，alpha）。这些参数可以降低模型的复杂度，从而提高模型的表现。

4，降低学习速率，确定理想参数。

In [9]:
from sklearn.model_selection import GridSearchCV

param_test1 = {

'n_estimators':range(100,2000,100)

}
gsearch1 = GridSearchCV(estimator=XGBRegressor(learning_rate=0.5,max_depth=5,

min_child_weight=1,gamma=0,subsample=0.8,colsample_bytree=0.8,

nthread=4,scale_pos_weight=1,seed=27,),

param_grid=param_test1,cv=5)

gsearch1.fit(features, targets)

print(gsearch1.best_params_, gsearch1.best_score_)

{'n_estimators': 100} 0.7746599275566237


In [10]:
param_test2 = {

'max_depth':range(3,10,2),

'min_child_weight':range(1,6,2)

}

gsearch2 = GridSearchCV(estimator=XGBRegressor(learning_rate=0.1,n_estimators=200),

param_grid=param_test2)

gsearch2.fit(features, targets)

print(gsearch2.best_params_, gsearch2.best_score_)

{'max_depth': 7, 'min_child_weight': 3} 0.8170068418527869


In [11]:
param_test4 = {
    'gamma': [i / 10.0 for i in range(0, 5)]
}
gsearch4 = GridSearchCV(estimator=XGBRegressor(learning_rate=0.1, n_estimators=200, max_depth=5, min_child_weight=5),
                        param_grid=param_test4)
gsearch4.fit(features, targets)
print(gsearch4.best_params_, gsearch4.best_score_)

{'gamma': 0.0} 0.7980945444054409


In [12]:
param_test5 = {
    'subsample': [i / 10.0 for i in range(6, 10)],
    'colsample_bytree': [i / 10.0 for i in range(6, 10)]
}
gsearch5 = GridSearchCV(
    estimator=XGBRegressor(learning_rate=0.1, n_estimators=200, max_depth=5, min_child_weight=5, gamma=0.0),
    param_grid=param_test5)
gsearch5.fit(features, targets)
print(gsearch5.best_params_, gsearch5.best_score_)

{'colsample_bytree': 0.7, 'subsample': 0.6} 0.8120718723386627


In [28]:
param_test6 = {
    'reg_alpha': [0, 0.001, 0.005, 0.01, 0.05]
}
gsearch6 = GridSearchCV(
    estimator=XGBRegressor(learning_rate=0.1, n_estimators=200, max_depth=5, min_child_weight=5, gamma=0.0,
                           colsample_bytree=0.9, subsample=0.7),
    param_grid=param_test6)
gsearch6.fit(features, targets)
print(gsearch6.best_params_, gsearch6.best_score_)

KeyboardInterrupt: 

In [29]:
reg = XGBRegressor(learning_rate=0.1, n_estimators=100, max_depth=7, min_child_weight=5, gamma=0.0,
                   colsample_bytree=0.7, subsample=0.7, reg_alpha=0.05)
        
reg.fit(features,targets)
y_pred = reg.predict(test_features)

# 输出预测结果至my_XGB_prediction.csv
# rides['y'] = y_pred
# rides.to_csv('output/my_XGB_tc2_prediction.csv', index=False)

In [30]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error 
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error

mse = mean_squared_error(test_targets, y_pred)
evs = explained_variance_score(test_targets, y_pred)
print("\n#### xgboost的表现 ####")
print('平均绝对误差:',mean_absolute_error(test_targets, y_pred))
print("均方误差:", round(mse, 2))
print("解释方差分:", round(evs, 2))
print("R方值(R2_score)",r2_score(test_targets,y_pred))

print("分类模型分数",reg.score(features,targets))


#### xgboost的表现 ####
平均绝对误差: 16.928750542804494
均方误差: 1472.4
解释方差分: 0.78
R方值(R2_score) 0.7571025153986238
分类模型分数 0.9569351196861668
