In [1]:
# Load the data, build the TF-IDF vector space and create the train/test split.
from instrument import read_bunch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

try:
    # Public import location of Bunch.
    from sklearn.utils import Bunch
except ImportError:
    # Fallback for older scikit-learn releases that only expose the private path.
    from sklearn.datasets.base import Bunch

# Load the pre-built bunches from local disk.
train_bunch_path = './data_bunch/train_bunch.dat'
validate_bunch_path = './data_bunch/validate_bunch.dat'
train_bunch = read_bunch(train_bunch_path)
validate_bunch = read_bunch(validate_bunch_path)

# TF-IDF settings.
stop_words_list = None
max_df = 0.7

# Create the TF-IDF vector space from the training data.
tfidf_train = Bunch(Id=train_bunch.news_id, Label=train_bunch.news_pic_label, tdm=[], vocabulary={})
train_vectorizer = TfidfVectorizer(stop_words=stop_words_list, sublinear_tf=True, max_df=max_df)
tfidf_train.tdm = train_vectorizer.fit_transform(train_bunch.news_words_jieba)  # jieba-segmented text
tfidf_train.vocabulary = train_vectorizer.vocabulary_

# Project the validation data into the SAME vector space.
# The vectorizer fitted on the training data is reused with transform() so the
# validation matrix shares both the vocabulary and the IDF weights learned from
# the training corpus. Re-fitting on the validation text would re-estimate the
# IDF weights from validation data and put the two matrices on different scales.
tfidf_validate = Bunch(Id=validate_bunch.news_id, tdm=[], vocabulary={})
tfidf_validate.vocabulary = tfidf_train.vocabulary
validate_vectorizer = train_vectorizer  # alias kept so the name stays defined for later cells
tfidf_validate.tdm = validate_vectorizer.transform(validate_bunch.news_words_jieba)  # jieba-segmented text

# Split the labelled data into a training part and a held-out test part.
x_train, x_test, y_train, y_test = train_test_split(
    tfidf_train.tdm,
    tfidf_train.Label,
    test_size=0.3,
    random_state=33,
)

# Model building: XGBoost hyper-parameter tuning follows in the next cells.

In [5]:
# Search for the best learning rate (all other hyper-parameters stay fixed).
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import GridSearchCV

# NOTE(review): both the estimator and the grid search below request all cores
# (n_jobs=-1); this may oversubscribe the CPU -- confirm on the target machine.
xgbc_1 = XGBClassifier(
    silent=0,                      # 1 = no run-time messages, 0 = print run-time messages
    learning_rate=0.01,            # learning rate; replaced by each grid candidate below
    min_child_weight=1,            # the smaller this is, the easier it is to overfit
    max_depth=8,                   # depth of each tree; larger values overfit more easily
    gamma=0,                       # larger is more conservative; typical values are 0.1, 0.2
    subsample=0.8,
    max_delta_step=0,              # maximum delta step allowed for each tree's weight estimate
    colsample_bytree=0.8,          # column sub-sampling applied when building each tree
    reg_lambda=1,                  # L2 regularisation; larger values are less prone to overfit
    scale_pos_weight=1,            # > 0; helps faster convergence when classes are imbalanced
    objective='multi:softmax',     # multi-class problem
    num_class=3,                   # number of classes
    n_estimators=900,              # number of trees
    eval_metric='merror',          # multi-class evaluation metric
    seed=1000,
    n_jobs=-1,
)

# Candidate learning rates: 0.01, 0.02, ..., 0.18.
parameters_1 = [
    {
        'learning_rate': [i/100 for i in range(1, 19)],
    }
]

gs_xgb_1 = GridSearchCV(xgbc_1, parameters_1, verbose=True, cv=4, n_jobs=-1)
gs_xgb_1.fit(x_train, y_train)

# Predict once on the held-out split and reuse the result for the report.
y_pred_1 = gs_xgb_1.predict(x_test)
# NOTE(review): the message says "training data", but the score is computed on
# the held-out split (x_test, y_test).
print('The accuracy of classifying training data with XGBoost is :',
      gs_xgb_1.score(x_test, y_test))
print(classification_report(y_test, y_pred_1))

Fitting 4 folds for each of 18 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  18 out of  72 | elapsed: 479.8min remaining: 1439.4min
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed: 582.3min finished
  if diff:
  if diff:


The accuracy of classifying training data with XGBoost is : 0.692931793179318
             precision    recall  f1-score   support

          0       0.70      0.82      0.75      7192
          1       0.52      0.23      0.32      2357
          2       0.72      0.73      0.73      4995

avg / total       0.68      0.69      0.67     14544



  if diff:


In [7]:
# Show the refitted best estimator, its mean cross-validated score and the winning parameters.
(
    gs_xgb_1.best_estimator_,
    gs_xgb_1.best_score_,
    gs_xgb_1.best_params_,
)

(XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
        colsample_bytree=0.8, eval_metric='merror', gamma=0,
        learning_rate=0.09, max_delta_step=0, max_depth=8,
        min_child_weight=1, missing=None, n_estimators=900, n_jobs=1,
        nthread=None, num_class=3, objective='multi:softprob',
        random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
        seed=1000, silent=0, subsample=0.8),
 0.7044436586515794,
 {'learning_rate': 0.09})

In [8]:
# Search for the best subsample ratio (learning rate fixed at the value found above).
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import GridSearchCV

# NOTE(review): both the estimator and the grid search below request all cores
# (n_jobs=-1); this may oversubscribe the CPU -- confirm on the target machine.
xgbc_2 = XGBClassifier(
    silent=0,                      # 1 = no run-time messages, 0 = print run-time messages
    learning_rate=0.09,            # learning rate
    min_child_weight=1,            # the smaller this is, the easier it is to overfit
    max_depth=8,                   # depth of each tree; larger values overfit more easily
    gamma=0,                       # larger is more conservative; typical values are 0.1, 0.2
    subsample=0.8,                 # replaced by each grid candidate below
    max_delta_step=0,              # maximum delta step allowed for each tree's weight estimate
    colsample_bytree=0.8,          # column sub-sampling applied when building each tree
    reg_lambda=1,                  # L2 regularisation; larger values are less prone to overfit
    scale_pos_weight=1,            # > 0; helps faster convergence when classes are imbalanced
    objective='multi:softmax',     # multi-class problem
    num_class=3,                   # number of classes
    n_estimators=900,              # number of trees
    eval_metric='merror',          # multi-class evaluation metric
    seed=1000,
    n_jobs=-1,
)

# Candidate subsample ratios: 0.5, 0.6, ..., 1.0.
parameters_2 = [
    {
        'subsample': [i/10 for i in range(5, 11)],
    }
]

gs_xgb_2 = GridSearchCV(xgbc_2, parameters_2, verbose=True, cv=4, n_jobs=-1)
gs_xgb_2.fit(x_train, y_train)

# Predict once on the held-out split and reuse the result for the report.
y_pred_2 = gs_xgb_2.predict(x_test)
# NOTE(review): the message says "training data", but the score is computed on
# the held-out split (x_test, y_test).
print('The accuracy of classifying training data with XGBoost is :',
      gs_xgb_2.score(x_test, y_test))
print(classification_report(y_test, y_pred_2))

Fitting 4 folds for each of 6 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  22 out of  24 | elapsed: 181.0min remaining: 16.5min
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed: 181.2min finished
  if diff:
  if diff:


The accuracy of classifying training data with XGBoost is : 0.692931793179318
             precision    recall  f1-score   support

          0       0.70      0.82      0.75      7192
          1       0.52      0.23      0.32      2357
          2       0.72      0.73      0.73      4995

avg / total       0.68      0.69      0.67     14544



  if diff:


In [9]:
# Show the refitted best estimator, its mean cross-validated score and the winning parameters.
(
    gs_xgb_2.best_estimator_,
    gs_xgb_2.best_score_,
    gs_xgb_2.best_params_,
)

(XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
        colsample_bytree=0.8, eval_metric='merror', gamma=0,
        learning_rate=0.09, max_delta_step=0, max_depth=8,
        min_child_weight=1, missing=None, n_estimators=900, n_jobs=-1,
        nthread=None, num_class=3, objective='multi:softprob',
        random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
        seed=1000, silent=0, subsample=0.8),
 0.7044436586515794,
 {'subsample': 0.8})

In [10]:
# Search for the best n_estimators (number of trees).
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import GridSearchCV

# NOTE(review): both the estimator and the grid search below request all cores
# (n_jobs=-1); this may oversubscribe the CPU -- confirm on the target machine.
xgbc_3 = XGBClassifier(
    silent=0,                      # 1 = no run-time messages, 0 = print run-time messages
    learning_rate=0.09,            # learning rate
    min_child_weight=1,            # the smaller this is, the easier it is to overfit
    max_depth=8,                   # depth of each tree; larger values overfit more easily
    gamma=0,                       # larger is more conservative; typical values are 0.1, 0.2
    subsample=0.8,
    max_delta_step=0,              # maximum delta step allowed for each tree's weight estimate
    colsample_bytree=0.8,          # column sub-sampling applied when building each tree
    reg_lambda=1,                  # L2 regularisation; larger values are less prone to overfit
    scale_pos_weight=1,            # > 0; helps faster convergence when classes are imbalanced
    objective='multi:softmax',     # multi-class problem
    num_class=3,                   # number of classes
    n_estimators=900,              # number of trees; replaced by each grid candidate below
    eval_metric='merror',          # multi-class evaluation metric
    seed=1000,
    n_jobs=-1,
)

# Candidate tree counts.
parameters_3 = [
    {
        'n_estimators': [800, 1000, 1500, 2000],
    }
]

gs_xgb_3 = GridSearchCV(xgbc_3, parameters_3, verbose=True, cv=4, n_jobs=-1)
gs_xgb_3.fit(x_train, y_train)

# Predict once on the held-out split and reuse the result for the report.
y_pred_3 = gs_xgb_3.predict(x_test)
# NOTE(review): the message says "training data", but the score is computed on
# the held-out split (x_test, y_test).
print('The accuracy of classifying training data with XGBoost is :',
      gs_xgb_3.score(x_test, y_test))
print(classification_report(y_test, y_pred_3))

Fitting 4 folds for each of 4 candidates, totalling 16 fits


[Parallel(n_jobs=-1)]: Done   8 out of  16 | elapsed: 131.2min remaining: 131.2min
[Parallel(n_jobs=-1)]: Done  16 out of  16 | elapsed: 180.3min finished
  if diff:
  if diff:


The accuracy of classifying training data with XGBoost is : 0.693069306930693
             precision    recall  f1-score   support

          0       0.70      0.82      0.75      7192
          1       0.52      0.23      0.32      2357
          2       0.72      0.73      0.72      4995

avg / total       0.68      0.69      0.67     14544



  if diff:


In [11]:
# Show the refitted best estimator, its mean cross-validated score and the winning parameters.
(
    gs_xgb_3.best_estimator_,
    gs_xgb_3.best_score_,
    gs_xgb_3.best_params_,
)

(XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
        colsample_bytree=0.8, eval_metric='merror', gamma=0,
        learning_rate=0.09, max_delta_step=0, max_depth=8,
        min_child_weight=1, missing=None, n_estimators=800, n_jobs=-1,
        nthread=None, num_class=3, objective='multi:softprob',
        random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
        seed=1000, silent=0, subsample=0.8),
 0.7048267326732673,
 {'n_estimators': 800})

In [12]:
# Search for the best max_depth (tree depth).
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import GridSearchCV

# NOTE(review): both the estimator and the grid search below request all cores
# (n_jobs=-1); this may oversubscribe the CPU -- confirm on the target machine.
xgbc_4 = XGBClassifier(
    silent=0,                      # 1 = no run-time messages, 0 = print run-time messages
    learning_rate=0.09,            # learning rate
    min_child_weight=1,            # the smaller this is, the easier it is to overfit
    max_depth=8,                   # depth of each tree; replaced by each grid candidate below
    gamma=0,                       # larger is more conservative; typical values are 0.1, 0.2
    subsample=0.8,
    max_delta_step=0,              # maximum delta step allowed for each tree's weight estimate
    colsample_bytree=0.8,          # column sub-sampling applied when building each tree
    reg_lambda=1,                  # L2 regularisation; larger values are less prone to overfit
    scale_pos_weight=1,            # > 0; helps faster convergence when classes are imbalanced
    objective='multi:softmax',     # multi-class problem
    num_class=3,                   # number of classes
    n_estimators=800,              # number of trees
    eval_metric='merror',          # multi-class evaluation metric
    seed=1000,
    n_jobs=-1,
)

# Candidate depths: 3, 4, ..., 10.
parameters_4 = [
    {
        'max_depth': list(range(3, 11)),
    }
]

gs_xgb_4 = GridSearchCV(xgbc_4, parameters_4, verbose=2, cv=4, n_jobs=-1)
gs_xgb_4.fit(x_train, y_train)

# Predict once on the held-out split and reuse the result for the report.
y_pred_4 = gs_xgb_4.predict(x_test)
# NOTE(review): the message says "training data", but the score is computed on
# the held-out split (x_test, y_test).
print('The accuracy of classifying training data with XGBoost is :',
      gs_xgb_4.score(x_test, y_test))
print(classification_report(y_test, y_pred_4))

Fitting 4 folds for each of 8 candidates, totalling 32 fits


[Parallel(n_jobs=15)]: Done  20 out of  32 | elapsed: 147.4min remaining: 88.4min
[Parallel(n_jobs=15)]: Done  32 out of  32 | elapsed: 182.3min finished
  if diff:
  if diff:


The accuracy of classifying training data with XGBoost is : 0.693069306930693
             precision    recall  f1-score   support

          0       0.70      0.82      0.75      7192
          1       0.52      0.23      0.32      2357
          2       0.72      0.73      0.72      4995

avg / total       0.68      0.69      0.67     14544



  if diff:


In [13]:
# Show the refitted best estimator, its mean cross-validated score and the winning parameters.
(
    gs_xgb_4.best_estimator_,
    gs_xgb_4.best_score_,
    gs_xgb_4.best_params_,
)

(XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
        colsample_bytree=0.8, eval_metric='merror', gamma=0,
        learning_rate=0.09, max_delta_step=0, max_depth=8,
        min_child_weight=1, missing=None, n_estimators=800, n_jobs=15,
        nthread=None, num_class=3, objective='multi:softprob',
        random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
        seed=1000, silent=0, subsample=0.8),
 0.7048267326732673,
 {'max_depth': 8})

In [14]:
# Search for the best reg_lambda (L2 regularisation strength).
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import GridSearchCV

# NOTE(review): both the estimator and the grid search below request all cores
# (n_jobs=-1); this may oversubscribe the CPU -- confirm on the target machine.
xgbc_5 = XGBClassifier(
    silent=0,                      # 1 = no run-time messages, 0 = print run-time messages
    learning_rate=0.09,            # learning rate
    min_child_weight=1,            # the smaller this is, the easier it is to overfit
    max_depth=8,                   # depth of each tree; larger values overfit more easily
    gamma=0,                       # larger is more conservative; typical values are 0.1, 0.2
    subsample=0.8,
    max_delta_step=0,              # maximum delta step allowed for each tree's weight estimate
    colsample_bytree=0.8,          # column sub-sampling applied when building each tree
    reg_lambda=1,                  # L2 regularisation; replaced by each grid candidate below
    scale_pos_weight=1,            # > 0; helps faster convergence when classes are imbalanced
    objective='multi:softmax',     # multi-class problem
    num_class=3,                   # number of classes
    n_estimators=800,              # number of trees
    eval_metric='merror',          # multi-class evaluation metric
    seed=1000,
    n_jobs=-1,
)

# Candidate L2 regularisation strengths.
parameters_5 = {
    'reg_lambda': [0.8, 1, 1.2, 1.5, 2, 5],
}

gs_xgb_5 = GridSearchCV(xgbc_5, parameters_5, verbose=2, cv=4, n_jobs=-1)
gs_xgb_5.fit(x_train, y_train)

# Predict once on the held-out split and reuse the result for the report.
y_pred_5 = gs_xgb_5.predict(x_test)
# NOTE(review): the message says "training data", but the score is computed on
# the held-out split (x_test, y_test).
print('The accuracy of classifying training data with XGBoost is :',
      gs_xgb_5.score(x_test, y_test))
print(classification_report(y_test, y_pred_5))

Fitting 4 folds for each of 6 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done  14 out of  24 | elapsed: 160.1min remaining: 114.4min
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed: 164.6min finished
  if diff:
  if diff:


The accuracy of classifying training data with XGBoost is : 0.693069306930693
             precision    recall  f1-score   support

          0       0.70      0.82      0.75      7192
          1       0.52      0.23      0.32      2357
          2       0.72      0.73      0.72      4995

avg / total       0.68      0.69      0.67     14544



  if diff:


In [15]:
# Show the refitted best estimator, its mean cross-validated score and the winning parameters.
(
    gs_xgb_5.best_estimator_,
    gs_xgb_5.best_score_,
    gs_xgb_5.best_params_,
)

(XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
        colsample_bytree=0.8, eval_metric='merror', gamma=0,
        learning_rate=0.09, max_delta_step=0, max_depth=8,
        min_child_weight=1, missing=None, n_estimators=800, n_jobs=-1,
        nthread=None, num_class=3, objective='multi:softprob',
        random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
        seed=1000, silent=0, subsample=0.8),
 0.7048267326732673,
 {'reg_lambda': 1})