In [2]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.externals import joblib
from sklearn.model_selection import ParameterGrid
from ultis import *

In [3]:
# Raise pandas' display limits so that wide feature tables are shown with
# more rows/columns and a wider console width than the defaults.
for option_name, option_value in (
    ('display.max_rows', 50),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

读取处理好的特征数据

In [4]:
# Load the prepared feature table. The file is ';'-separated; link_ID is read
# with object dtype and time_interval_begin is parsed into datetimes.
df = pd.read_csv(
    'data/training.txt',
    sep=';',
    parse_dates=['time_interval_begin'],
    dtype={'link_ID': object},
)
df.head()

Unnamed: 0,link_ID,date,time_interval_begin,travel_time,imputation1,lagging1,lagging2,lagging3,lagging4,lagging5,length,area,vacation,minute_series,day_of_week,day_of_week_en,hour_en,"week_hour_1.0,1.0","week_hour_1.0,2.0","week_hour_1.0,3.0","week_hour_2.0,1.0","week_hour_2.0,2.0","week_hour_2.0,3.0","week_hour_3.0,1.0","week_hour_3.0,2.0","week_hour_3.0,3.0",links_num_2,links_num_3,links_num_4,links_num_5,width_3,width_6,width_9,width_12,width_15,link_ID_en
0,3377906280028510514,2017-03-01,2017-03-01 06:00:00,1.66236,True,,,,,,48,144,0.0,0.0,3,1.0,1.0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,47
1,3377906280028510514,2017-03-01,2017-03-01 06:02:00,1.681661,True,1.66236,,,,,48,144,0.0,2.0,3,1.0,1.0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,47
2,3377906280028510514,2017-03-01,2017-03-01 06:04:00,1.676155,True,1.681661,1.66236,,,,48,144,0.0,4.0,3,1.0,1.0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,47
3,3377906280028510514,2017-03-01,2017-03-01 06:06:00,1.683786,True,1.676155,1.681661,1.66236,,,48,144,0.0,6.0,3,1.0,1.0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,47
4,3377906280028510514,2017-03-01,2017-03-01 06:08:00,1.683193,True,1.683786,1.676155,1.681661,1.66236,,48,144,0.0,8.0,3,1.0,1.0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,47


时间序列特征

In [6]:
# Number of lag columns used as time-series inputs.
lagging = 5
# Column names ordered from the largest lag index down to lagging1.
lagging_feature = ['lagging{:d}'.format(step) for step in reversed(range(1, lagging + 1))]
lagging_feature

['lagging5', 'lagging4', 'lagging3', 'lagging2', 'lagging1']

In [8]:
# Columns that are left out of the base feature list.
excluded_columns = (
    'time_interval_begin', 'link_ID', 'link_ID_int',
    'date', 'travel_time', 'imputation1',
    'minute_series', 'area', 'hour_en', 'day_of_week',
)
base_feature = [column for column in df.columns.values.tolist() if column not in excluded_columns]

In [11]:
# Remove the lag columns from the base list; they are kept in lagging_feature.
base_feature = [name for name in base_feature if name not in lagging_feature]

In [12]:
# Training inputs: base features followed by the lag columns.
train_feature = base_feature + lagging_feature
# Validation columns: base features plus minute_series and travel_time.
valid_feature = base_feature + ['minute_series', 'travel_time']
print (train_feature)

['length', 'vacation', 'day_of_week_en', 'week_hour_1.0,1.0', 'week_hour_1.0,2.0', 'week_hour_1.0,3.0', 'week_hour_2.0,1.0', 'week_hour_2.0,2.0', 'week_hour_2.0,3.0', 'week_hour_3.0,1.0', 'week_hour_3.0,2.0', 'week_hour_3.0,3.0', 'links_num_2', 'links_num_3', 'links_num_4', 'links_num_5', 'width_3', 'width_6', 'width_9', 'width_12', 'width_15', 'link_ID_en', 'lagging5', 'lagging4', 'lagging3', 'lagging2', 'lagging1']


XGBoost 训练参数：


In [13]:
# Hyperparameter search space: every key maps to the list of candidate values
# to try (currently a single value per key).
params_grid = dict(
    learning_rate=[0.05],
    n_estimators=[100],
    subsample=[0.6],
    colsample_bytree=[0.6],
    max_depth=[7],
    min_child_weight=[1],
    reg_alpha=[2],
    gamma=[0],
)

In [15]:
# Iterable over every combination of the candidate values in params_grid.
grid = ParameterGrid(param_grid=params_grid)

训练模块

In [17]:
def fit_evaluate(df, df_test, params):
    """Fit an XGBoost regressor on ``df`` and score it on ``df_test``.

    Rows of ``df`` containing NaN are dropped; a random 10% of the remaining
    rows is held out as the early-stopping evaluation set. ``df_test`` is
    reduced to the ``valid_feature`` columns and passed through
    ``bucket_data`` before being handed to ``cross_valid``.

    Parameters
    ----------
    df : DataFrame providing the ``train_feature`` columns and ``travel_time``.
    df_test : DataFrame providing the ``valid_feature`` columns.
    params : mapping with the keys ``learning_rate``, ``n_estimators``,
        ``subsample``, ``colsample_bytree``, ``max_depth``, ``gamma``,
        ``min_child_weight`` and ``reg_alpha``.

    Returns
    -------
    tuple
        ``(regressor, cross_valid result, regressor.best_iteration,
        regressor.best_score)``.
    """
    complete_rows = df.dropna()
    features = complete_rows[train_feature].values
    target = complete_rows['travel_time'].values

    # Fixed seed so that the early-stopping split is the same on every run.
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.1, random_state=0)

    valid_data = bucket_data(df_test[valid_feature].values)

    model_kwargs = dict(
        learning_rate=params['learning_rate'],
        n_estimators=params['n_estimators'],
        booster='gbtree',
        objective='reg:linear',
        n_jobs=-1,
        subsample=params['subsample'],
        colsample_bytree=params['colsample_bytree'],
        random_state=0,
        max_depth=params['max_depth'],
        gamma=params['gamma'],
        min_child_weight=params['min_child_weight'],
        reg_alpha=params['reg_alpha'],
    )
    regressor = xgb.XGBRegressor(**model_kwargs)
    # Stop once mape_ln on the held-out split has not improved for 10 rounds.
    regressor.fit(X_train, y_train,
                  eval_set=[(X_test, y_test)],
                  eval_metric=mape_ln,
                  early_stopping_rounds=10,
                  verbose=False)

    validation_loss = cross_valid(regressor, valid_data, lagging=lagging)
    return regressor, validation_loss, regressor.best_iteration, regressor.best_score

In [18]:
def train(df, params, best, vis=False):
    """Run a 5-fold, time-blocked evaluation of one hyperparameter setting.

    ``df`` is split on ``time_interval_begin`` into five consecutive blocks
    (boundaries 2017-03-24, 04-18, 05-12, 06-06 and 06-30). Each block is held
    out once, most recent first, while ``fit_evaluate`` trains on the other
    four. The per-fold results and a summary dict are printed.

    The summary is built on a copy of ``params``; the caller's dict is left
    untouched, so its ``n_estimators`` stays a usable hyperparameter and the
    same dict can be passed to another fit.

    Parameters
    ----------
    df : DataFrame with a ``time_interval_begin`` datetime column plus whatever
        columns ``fit_evaluate`` needs.
    params : mapping of hyperparameters forwarded unchanged to ``fit_evaluate``.
    best : lowest mean loss seen so far.
    vis : accepted for backward compatibility; currently unused.

    Returns
    -------
    The smaller of ``best`` and this setting's mean fold loss.
    """
    stamps = df['time_interval_begin']
    cuts = [pd.to_datetime(day) for day in
            ('2017-03-24', '2017-04-18', '2017-05-12', '2017-06-06', '2017-06-30')]

    # First block: everything up to the first cut; the others are (lower, upper].
    folds = [df.loc[stamps <= cuts[0]]]
    for lower, upper in zip(cuts[:-1], cuts[1:]):
        folds.append(df.loc[(stamps > lower) & (stamps <= upper)])

    losses, best_iterations, best_scores = [], [], []
    regressor = None
    # Hold out the most recent block first, then walk backwards in time.
    for held_out in reversed(range(len(folds))):
        training_part = pd.concat([fold for idx, fold in enumerate(folds) if idx != held_out])
        regressor, fold_loss, fold_iteration, fold_score = fit_evaluate(training_part, folds[held_out], params)
        print (fold_iteration, fold_score, fold_loss)
        losses.append(fold_loss)
        best_iterations.append(fold_iteration)
        best_scores.append(fold_score)

    mean_loss = np.mean(losses)

    # Report on a copy so the caller's hyperparameters are not overwritten
    # (in particular 'n_estimators', which becomes a string in the report).
    report = dict(params)
    report['loss_std'] = np.std(losses)
    report['loss'] = str(losses)
    report['mean_loss'] = mean_loss
    report['n_estimators'] = str(best_iterations)
    report['best_score'] = str(best_scores)

    print (str(report))
    if mean_loss <= best:
        best = mean_loss
        print ("best with: " + str(report))
        #feature_vis(regressor, train_feature)
    return best

In [19]:
# Seed the running best mean loss with 1, then evaluate every parameter
# combination in the grid, keeping the lowest mean loss that train() reports.
best = 1
for params in grid:
    best = train(df, params, best=best)

86 0.174535 0.09750734006537107
99 0.149778 0.22697949568099954
99 0.142421 0.26995488782745247
99 0.141303 0.2782240572632962
99 0.142115 0.2816868987204913
{'colsample_bytree': 0.6, 'gamma': 0, 'learning_rate': 0.05, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': '[86, 99, 99, 99, 99]', 'reg_alpha': 2, 'subsample': 0.6, 'loss_std': 0.06950270135557089, 'loss': '[0.09750734006537107, 0.22697949568099954, 0.26995488782745247, 0.2782240572632962, 0.2816868987204913]', 'mean_loss': 0.2308705359115221, 'best_score': '[0.174535, 0.149778, 0.142421, 0.141303, 0.142115]'}
best with: {'colsample_bytree': 0.6, 'gamma': 0, 'learning_rate': 0.05, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': '[86, 99, 99, 99, 99]', 'reg_alpha': 2, 'subsample': 0.6, 'loss_std': 0.06950270135557089, 'loss': '[0.09750734006537107, 0.22697949568099954, 0.26995488782745247, 0.2782240572632962, 0.2816868987204913]', 'mean_loss': 0.2308705359115221, 'best_score': '[0.174535, 0.149778, 0.142421, 0.14130

### 生成预测序列

In [20]:
# Hyperparameters used to fit the model that produces the submission files.
submit_params = dict(
    learning_rate=0.05,
    n_estimators=100,
    subsample=0.6,
    colsample_bytree=0.6,
    max_depth=7,
    min_child_weight=1,
    reg_alpha=2,
    gamma=0,
)

In [21]:
def xgboost_submit(df, params):
    """Fit the final XGBoost model, save it, and call ``submission``.

    Rows before 2017-07-01 with no NaN values are used for fitting; a random
    10% of them is held out as the early-stopping evaluation set. The fitted
    regressor is dumped to ``model/xgbr.pkl`` and then passed, together with
    the full ``df`` and four ``submission/xgbr*.txt`` paths, to ``submission``.

    The directories of all these paths are created before training starts, so
    a completed fit is not lost to a missing ``model/`` folder at dump time.

    Parameters
    ----------
    df : DataFrame with ``time_interval_begin``, ``travel_time`` and the
        ``train_feature`` columns.
    params : mapping with the keys ``learning_rate``, ``n_estimators``,
        ``subsample``, ``colsample_bytree``, ``max_depth``, ``gamma``,
        ``min_child_weight`` and ``reg_alpha``.
    """
    import os

    model_path = 'model/xgbr.pkl'
    # NOTE(review): assumed to be output files written by submission() - confirm.
    submission_paths = ('submission/xgbr1.txt', 'submission/xgbr2.txt', 'submission/xgbr3.txt',
                        'submission/xgbr4.txt')

    # Fail-safe: make sure every target directory exists before the long fit.
    for out_path in (model_path,) + submission_paths:
        out_dir = os.path.dirname(out_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    train_df = df.loc[df['time_interval_begin'] < pd.to_datetime('2017-07-01')]

    train_df = train_df.dropna()
    X = train_df[train_feature].values
    y = train_df['travel_time'].values

    # Fixed seed so that the early-stopping split is the same on every run.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

    eval_set = [(X_test, y_test)]
    regressor = xgb.XGBRegressor(learning_rate=params['learning_rate'], n_estimators=params['n_estimators'],
                                 booster='gbtree', objective='reg:linear', n_jobs=-1, subsample=params['subsample'],
                                 colsample_bytree=params['colsample_bytree'], random_state=0,
                                 max_depth=params['max_depth'], gamma=params['gamma'],
                                 min_child_weight=params['min_child_weight'], reg_alpha=params['reg_alpha'])
    # Stop once mape_ln on the held-out split has not improved for 10 rounds.
    regressor.fit(X_train, y_train, verbose=True, early_stopping_rounds=10, eval_metric=mape_ln,
                  eval_set=eval_set)
    #feature_vis(regressor, train_feature)
    joblib.dump(regressor, model_path)
    print (regressor)
    submission(train_feature, regressor, df, *submission_paths)

In [22]:
# Fit the final model on the full feature table and write the submission files.
xgboost_submit(df, params=submit_params)

OSError: [WinError -529697949] Windows Error 0xe06d7363