In [30]:
import warnings 
warnings.filterwarnings("ignore")
import pandas as pd
import numpy as np
import os
import logging
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn import linear_model
logging.basicConfig(level=logging.INFO)

In [2]:
# Read the semicolon-separated training table. link_ID is kept as object
# instead of an inferred numeric dtype, and time_interval_begin is parsed
# into datetimes so it can be compared against dates later on.
train_path = '../data/training.txt'
df = pd.read_csv(
    train_path,
    sep=';',
    dtype={'link_ID': object},
    parse_dates=['time_interval_begin'],
)

In [3]:
# Preview the first five rows of the loaded table.
df.head(n=5)

Unnamed: 0,link_ID,date,time_interval_begin,travel_time,imputation1,lagging1,lagging2,lagging3,lagging4,lagging5,...,"links_num_2,1","links_num_2,2","links_num_3,1","links_num_4,1",width_3,width_6,width_9,width_12,width_15,link_ID_en
0,3377906280028510514,2017-03-01,2017-03-01 06:00:00,1.656108,True,,,,,,...,0,0,0,0,1,0,0,0,0,46
1,3377906280028510514,2017-03-01,2017-03-01 06:02:00,1.661686,True,1.656108,,,,,...,0,0,0,0,1,0,0,0,0,46
2,3377906280028510514,2017-03-01,2017-03-01 06:04:00,1.6672,True,1.661686,1.656108,,,,...,0,0,0,0,1,0,0,0,0,46
3,3377906280028510514,2017-03-01,2017-03-01 06:06:00,1.672652,True,1.6672,1.661686,1.656108,,,...,0,0,0,0,1,0,0,0,0,46
4,3377906280028510514,2017-03-01,2017-03-01 06:08:00,1.67804,True,1.672652,1.6672,1.661686,1.656108,,...,0,0,0,0,1,0,0,0,0,46


In [5]:
# Show every column name of the loaded table.
df.columns

Index(['link_ID', 'date', 'time_interval_begin', 'travel_time', 'imputation1',
       'lagging1', 'lagging2', 'lagging3', 'lagging4', 'lagging5', 'length',
       'area', 'vacation', 'minute_series', 'day_of_week', 'day_of_week_en',
       'hour_en', 'week_hour_1.0,1.0', 'week_hour_1.0,2.0',
       'week_hour_1.0,3.0', 'week_hour_2.0,1.0', 'week_hour_2.0,2.0',
       'week_hour_2.0,3.0', 'week_hour_3.0,1.0', 'week_hour_3.0,2.0',
       'week_hour_3.0,3.0', 'links_num_0,1', 'links_num_0,2', 'links_num_1,0',
       'links_num_1,1', 'links_num_1,2', 'links_num_1,3', 'links_num_1,4',
       'links_num_2,0', 'links_num_2,1', 'links_num_2,2', 'links_num_3,1',
       'links_num_4,1', 'width_3', 'width_6', 'width_9', 'width_12',
       'width_15', 'link_ID_en'],
      dtype='object')

In [6]:
# Number of lag columns available in the table (lagging1 .. lagging5).
lagging = 5
# Lag column names ordered from the largest lag to the smallest:
# ['lagging5', 'lagging4', 'lagging3', 'lagging2', 'lagging1'].
lagging_feature = ['lagging%01d' % e for e in range(lagging, 0, -1)]

# Columns that are not fed to the model (travel_time is the regression target).
excluded_columns = {'time_interval_begin', 'link_ID', 'date', 'travel_time', 'imputation1',
                    'minute_series', 'area', 'hour_en', 'day_of_week'}

# Non-lag model inputs. The lag columns are excluded right here so that
# base_feature is already correct when this cell is run on its own; otherwise
# appending lagging_feature later would list every lag column twice.
base_feature = [col for col in df.columns.values.tolist()
                if col not in excluded_columns and col not in lagging_feature]

In [18]:
# Strip the lag columns so base_feature holds only the non-lag inputs.
# Re-running this cell is harmless: the second pass removes nothing.
base_feature = [name for name in base_feature if name not in lagging_feature]

In [19]:
# Display the non-lag feature names.
base_feature

['length',
 'vacation',
 'day_of_week_en',
 'week_hour_1.0,1.0',
 'week_hour_1.0,2.0',
 'week_hour_1.0,3.0',
 'week_hour_2.0,1.0',
 'week_hour_2.0,2.0',
 'week_hour_2.0,3.0',
 'week_hour_3.0,1.0',
 'week_hour_3.0,2.0',
 'week_hour_3.0,3.0',
 'links_num_0,1',
 'links_num_0,2',
 'links_num_1,0',
 'links_num_1,1',
 'links_num_1,2',
 'links_num_1,3',
 'links_num_1,4',
 'links_num_2,0',
 'links_num_2,1',
 'links_num_2,2',
 'links_num_3,1',
 'links_num_4,1',
 'width_3',
 'width_6',
 'width_9',
 'width_12',
 'width_15',
 'link_ID_en']

In [13]:
# Quick check that base_feature is a plain Python list.
type(base_feature)

list

In [20]:
# Final model inputs: the non-lag features followed by the lag columns.
# Concatenation builds a new list, so base_feature itself is left untouched.
train_features = base_feature + lagging_feature

In [21]:
# Display the full, ordered list of model input columns.
train_features

['length',
 'vacation',
 'day_of_week_en',
 'week_hour_1.0,1.0',
 'week_hour_1.0,2.0',
 'week_hour_1.0,3.0',
 'week_hour_2.0,1.0',
 'week_hour_2.0,2.0',
 'week_hour_2.0,3.0',
 'week_hour_3.0,1.0',
 'week_hour_3.0,2.0',
 'week_hour_3.0,3.0',
 'links_num_0,1',
 'links_num_0,2',
 'links_num_1,0',
 'links_num_1,1',
 'links_num_1,2',
 'links_num_1,3',
 'links_num_1,4',
 'links_num_2,0',
 'links_num_2,1',
 'links_num_2,2',
 'links_num_3,1',
 'links_num_4,1',
 'width_3',
 'width_6',
 'width_9',
 'width_12',
 'width_15',
 'link_ID_en',
 'lagging5',
 'lagging4',
 'lagging3',
 'lagging2',
 'lagging1']

In [22]:
# LightGBM training parameters handed to lgb.train.
params = dict(
    # Task definition: gradient-boosted trees, regression objective,
    # evaluated with mean absolute error (shown as "l1" in the training log).
    boosting_type='gbdt',
    objective='regression',
    metric='mae',
    # Tree size and step size.
    num_leaves=63,
    learning_rate=0.01,
    # Column and row subsampling; bagging_freq=1 re-samples rows every iteration,
    # bagging_seed pins that sampling.
    feature_fraction=0.8,
    bagging_fraction=0.9,
    bagging_seed=0,
    bagging_freq=1,
    verbose=1,
    # Regularisation.
    reg_alpha=1,
    reg_lambda=2,
    min_child_weight=6,
)

In [34]:
def lightGBM_submit(df, params, features=None, split_date='2017-07-01'):
    """Train a LightGBM regressor on the rows before ``split_date`` and score the rest.

    Rows whose ``time_interval_begin`` is earlier than ``split_date`` form the
    training pool; any row containing a NaN is dropped from it. 10% of the pool
    is held out at random (``random_state=0``) and used for early stopping.
    Rows on or after ``split_date`` are scored with the trained booster. Their
    lag columns are used exactly as stored in ``df``; they are not refreshed
    from earlier predictions, and rows with NaN lags are scored as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``time_interval_begin``, ``travel_time`` and every feature column.
    params : dict
        LightGBM parameters forwarded to ``lgb.train``.
    features : list of str, optional
        Model input columns. Defaults to the notebook-level ``train_features``.
    split_date : str or datetime-like, optional
        First timestamp that belongs to the scored (test) part.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the rows on or after ``split_date`` with an added
        ``prediction`` column, on the same scale as ``travel_time``.
    """
    feature_cols = list(train_features if features is None else features)
    split_ts = pd.to_datetime(split_date)

    train_df = df.loc[df['time_interval_begin'] < split_ts].dropna()
    X = train_df[feature_cols].values
    y = train_df['travel_time'].values

    # Explicit copy: a column is added below, and writing into a slice of df
    # is ambiguous (SettingWithCopy) and could otherwise leak into the caller's frame.
    test_df = df.loc[df['time_interval_begin'] >= split_ts].copy()

    # NOTE(review): this is a random row split, not a chronological one, so
    # neighbouring time steps end up on both sides. That may be why the train
    # and valid l1 values track each other so closely -- confirm it is intended.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)

    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_evals = lgb.Dataset(X_test, y_test, reference=lgb_train)

    # Early stopping and periodic logging are passed as callbacks. The
    # early_stopping_rounds= / verbose_eval= keyword arguments of lgb.train
    # were removed in LightGBM 4.0, while the callbacks work on old and new
    # releases. log_evaluation was called print_evaluation before 3.3.
    log_every = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

    logging.info('begin train lgb')
    # Use the validation set for hyper-parameter tuning / model selection.
    gbm = lgb.train(params,
                    lgb_train,
                    num_boost_round=1000,
                    valid_sets=[lgb_train, lgb_evals],
                    valid_names=['train', 'valid'],
                    callbacks=[lgb.early_stopping(50), log_every(100)],
                    )

    logging.info('train success')
    logging.info('begin predict')
    test_df['prediction'] = gbm.predict(test_df[feature_cols].values)
    return test_df

In [35]:
# Train the model and score the rows from 2017-07-01 onward; test_df carries
# the model output in its 'prediction' column.
test_df = lightGBM_submit(df,params)

INFO:root:begin train lgb


Training until validation scores don't improve for 50 rounds.
[100]	train's l1: 0.339439	valid's l1: 0.339252
[200]	train's l1: 0.200063	valid's l1: 0.199911
[300]	train's l1: 0.163285	valid's l1: 0.163269
[400]	train's l1: 0.154086	valid's l1: 0.154113
[500]	train's l1: 0.151534	valid's l1: 0.151565
[600]	train's l1: 0.150599	valid's l1: 0.15063
[700]	train's l1: 0.150172	valid's l1: 0.150206
[800]	train's l1: 0.149955	valid's l1: 0.150002
[900]	train's l1: 0.149814	valid's l1: 0.149878


INFO:root:train success
INFO:root:begin predict


[1000]	train's l1: 0.149708	valid's l1: 0.149793
Did not meet early stopping. Best iteration is:
[1000]	train's l1: 0.149708	valid's l1: 0.149793


In [36]:
# Inspect a bounded preview of the scored rows instead of rendering the whole frame.
test_df.head(10)

Unnamed: 0,link_ID,date,time_interval_begin,travel_time,imputation1,lagging1,lagging2,lagging3,lagging4,lagging5,...,"links_num_2,2","links_num_3,1","links_num_4,1",width_3,width_6,width_9,width_12,width_15,link_ID_en,prediction
32940,3377906280028510514,2017-07-01,2017-07-01 06:00:00,1.684471,True,,,,,,...,0,0,0,1,0,0,0,0,46,0.892778
32941,3377906280028510514,2017-07-01,2017-07-01 06:02:00,1.690049,True,1.684471,,,,,...,0,0,0,1,0,0,0,0,46,1.681868
32942,3377906280028510514,2017-07-01,2017-07-01 06:04:00,1.695564,True,1.690049,1.684471,,,,...,0,0,0,1,0,0,0,0,46,1.725928
32943,3377906280028510514,2017-07-01,2017-07-01 06:06:00,1.701015,True,1.695564,1.690049,1.684471,,,...,0,0,0,1,0,0,0,0,46,1.692113
32944,3377906280028510514,2017-07-01,2017-07-01 06:08:00,1.706404,True,1.701015,1.695564,1.690049,1.684471,,...,0,0,0,1,0,0,0,0,46,1.655573
32945,3377906280028510514,2017-07-01,2017-07-01 06:10:00,1.711730,True,1.706404,1.701015,1.695564,1.690049,1.684471,...,0,0,0,1,0,0,0,0,46,1.738458
32946,3377906280028510514,2017-07-01,2017-07-01 06:12:00,1.716994,True,1.711730,1.706404,1.701015,1.695564,1.690049,...,0,0,0,1,0,0,0,0,46,1.740386
32947,3377906280028510514,2017-07-01,2017-07-01 06:14:00,1.722196,True,1.716994,1.711730,1.706404,1.701015,1.695564,...,0,0,0,1,0,0,0,0,46,1.740201
32948,3377906280028510514,2017-07-01,2017-07-01 06:16:00,1.727337,True,1.722196,1.716994,1.711730,1.706404,1.701015,...,0,0,0,1,0,0,0,0,46,1.740201
32949,3377906280028510514,2017-07-01,2017-07-01 06:18:00,1.732417,True,1.727337,1.722196,1.716994,1.711730,1.706404,...,0,0,0,1,0,0,0,0,46,1.753818


In [42]:
# submit

submit_file = '../data/submit_lgb_v1.txt'

# The transforms below modify test_df in place. Guard them so that re-running
# this cell cannot apply expm1 or the +2 minute shift a second time; once
# 'time_interval' exists the frame is already in its final form.
if 'time_interval' not in test_df.columns:
    # Map the model output back with expm1 -- presumably travel_time was
    # log1p-transformed upstream; TODO confirm against the preprocessing step.
    test_df['prediction'] = np.expm1(test_df['prediction'])

    # Shift each interval start by two minutes and render the half-open
    # interval "[start,start+2min)" as text.
    test_df['time_interval_begin'] = test_df['time_interval_begin'] + pd.DateOffset(minutes=2)
    test_df['time_interval'] = test_df['time_interval_begin'].map(
        lambda start: '[' + str(start) + ',' + str(start + pd.DateOffset(minutes=2)) + ')')
    test_df.time_interval = test_df.time_interval.astype(object)

# mode='w' overwrites the file: with mode='a' every re-run of this cell
# appended another full copy of the rows to the submission.
test_df[['link_ID', 'date', 'time_interval', 'prediction']].to_csv(
    submit_file, mode='w', header=False, index=False, sep=';')

### 现在有一个很严重的问题

In [43]:
# The task is multi-step forecasting: steps 1..n are given and steps n+1..n+m must be predicted.
# So once travel_time at time t has been predicted, that prediction has to be used as the
# lagging1 feature when predicting travel_time at time t+1.
# The lagging features must be updated from the previous prediction each time, repeating
# the predict-and-update step until travel_time at the last time step has been predicted.

In [46]:
# Keep only the four columns written to the submission file.
sub_df = test_df.loc[:, ['link_ID', 'date', 'time_interval', 'prediction']]

In [47]:
# Inspect a bounded preview of the submission rows instead of rendering the whole frame.
sub_df.head(10)

Unnamed: 0,link_ID,date,time_interval,prediction
32940,3377906280028510514,2017-07-01,"[2017-07-01 06:02:00,2017-07-01 06:04:00)",1.441903
32941,3377906280028510514,2017-07-01,"[2017-07-01 06:04:00,2017-07-01 06:06:00)",4.375586
32942,3377906280028510514,2017-07-01,"[2017-07-01 06:06:00,2017-07-01 06:08:00)",4.617733
32943,3377906280028510514,2017-07-01,"[2017-07-01 06:08:00,2017-07-01 06:10:00)",4.430943
32944,3377906280028510514,2017-07-01,"[2017-07-01 06:10:00,2017-07-01 06:12:00)",4.236078
32945,3377906280028510514,2017-07-01,"[2017-07-01 06:12:00,2017-07-01 06:14:00)",4.688563
32946,3377906280028510514,2017-07-01,"[2017-07-01 06:14:00,2017-07-01 06:16:00)",4.699544
32947,3377906280028510514,2017-07-01,"[2017-07-01 06:16:00,2017-07-01 06:18:00)",4.698489
32948,3377906280028510514,2017-07-01,"[2017-07-01 06:18:00,2017-07-01 06:20:00)",4.698489
32949,3377906280028510514,2017-07-01,"[2017-07-01 06:20:00,2017-07-01 06:22:00)",4.776613
