In [1]:
import warnings
warnings.filterwarnings("ignore")
from pandas import to_datetime
import numpy as np
import pandas as pd
import datetime

**Read Training and Test Data**

In [2]:
# Raw training table. Later cells rely on it having 'date' and 'id' columns
# (dropped in process_data) and a 'speed' column (used as the model target).
train_data = pd.read_csv('data/train.csv')

In [3]:
# Raw test table; it is run through process_data before prediction, so it
# needs the same 'date' and 'id' columns as the training table.
test_data = pd.read_csv('data/test.csv')

In [4]:
# Public holidays in 2017 and 2018, stored as parallel lists:
# (month[i], day[i]) is one holiday date in the given year.
# NOTE(review): the 2017 dates match Hong Kong's general holidays, so the
# 2018 table uses the Hong Kong calendar as well -- confirm with the data owner.
holiday_2017 = {
    'month': [1, 1, 1, 1, 4, 4, 4, 4, 5, 5, 5, 7, 10, 10, 10, 12, 12],
    'day': [2, 28, 30, 31, 4, 14, 15, 17, 1, 3, 30, 1, 2, 5, 28, 25, 26]
}
# Most holidays fall on different days each year, so the 2018 'day' list must
# not be reused from 2017 (doing so yields impossible dates such as 30 Feb).
holiday_2018 = {
    'month': [1, 2, 2, 2, 3, 3, 4, 4, 5, 5, 6, 7, 9, 10, 10, 12, 12],
    'day': [1, 16, 17, 19, 30, 31, 2, 5, 1, 22, 18, 2, 25, 1, 17, 25, 26]
}

In [5]:
# Storm days per year as parallel lists: (month[i], day[i]) is one storm date.
storm_2017 = {
    # 10-13 June, 22-23 July, 24-27 August
    'month': [6] * 4 + [7] * 2 + [8] * 4,
    'day': [*range(10, 14), 22, 23, *range(24, 28)],
}
storm_2018 = {
    # 7-17 September
    'month': [9] * 11,
    'day': list(range(7, 18)),
}

### Pre-processing ###

In [6]:
def _flag_calendar_days(frame, column, days_by_year):
    """Add a 0/1 ``column`` to ``frame`` marking rows that fall on listed dates.

    ``frame`` is modified in place and must already contain 'year', 'month'
    and 'day' columns. ``days_by_year`` maps a year to a dict holding parallel
    'month' and 'day' lists.

    Raises:
        ValueError: if a year's 'month' and 'day' lists differ in length.
    """
    frame[column] = 0
    for year, days in days_by_year.items():
        months, month_days = days['month'], days['day']
        if len(months) != len(month_days):
            raise ValueError(
                "'month' and 'day' lists for %s/%s differ in length" % (column, year)
            )
        for month, day in zip(months, month_days):
            hit = (frame['year'] == year) & (frame['month'] == month) & (frame['day'] == day)
            frame.loc[hit, column] = 1


def process_data(data, holidays=None, storms=None):
    """Turn a raw table into the calendar feature table used by the model.

    The input is left untouched. The 'date' and 'id' columns are removed; every
    other input column is kept, followed by the engineered columns in this
    order: year, month, day, hour, holiday, storm, Saturday, Sunday,
    working_hour, spring, summer, autumn, winter, early_mon, mid_mon, end_mon,
    weekday_0 .. weekday_6.

    Args:
        data: DataFrame with an 'id' column and a 'date' column formatted as
            "%d/%m/%Y %H:%M".
        holidays: optional mapping ``year -> {'month': [...], 'day': [...]}``.
            Defaults to the notebook-level holiday_2017 / holiday_2018 tables.
        storms: optional mapping of the same shape. Defaults to the
            notebook-level storm_2017 / storm_2018 tables.

    Returns:
        A new DataFrame indexed like ``data``.

    Raises:
        KeyError: if 'date' or 'id' is missing from ``data``.
        ValueError: if a date does not match the expected format, or a
            calendar table has 'month'/'day' lists of different lengths.
    """
    # Resolve defaults at call time so edits to the notebook-level tables are
    # picked up without redefining this function.
    if holidays is None:
        holidays = {2017: holiday_2017, 2018: holiday_2018}
    if storms is None:
        storms = {2017: storm_2017, 2018: storm_2018}

    timestamps = pd.to_datetime(data['date'], format="%d/%m/%Y %H:%M")
    features = data.drop(columns=['date', 'id'])  # returns a new frame

    # calendar components
    weekday = timestamps.dt.weekday  # Monday=0 ... Sunday=6
    features['year'] = timestamps.dt.year
    features['month'] = timestamps.dt.month
    features['day'] = timestamps.dt.day
    features['hour'] = timestamps.dt.hour

    # Holiday / storm flags. Each table is matched against its own year
    # (the 2018 storm table used to be compared with year 2017).
    _flag_calendar_days(features, 'holiday', holidays)
    _flag_calendar_days(features, 'storm', storms)

    # weekend flags
    features['Saturday'] = (weekday == 5).astype('int64')
    features['Sunday'] = (weekday == 6).astype('int64')

    # working hours: 07:00-19:59
    features['working_hour'] = features['hour'].between(7, 19).astype('int64')

    # "Seasons" are calendar quarters (spring = Jan-Mar, ..., winter = Oct-Dec).
    for name, first, last in (('spring', 1, 3), ('summer', 4, 6),
                              ('autumn', 7, 9), ('winter', 10, 12)):
        features[name] = features['month'].between(first, last).astype('int64')

    # position within the month: days 1-10, 11-20, 21 and later
    features['early_mon'] = features['day'].between(1, 10).astype('int64')
    features['mid_mon'] = features['day'].between(11, 20).astype('int64')
    features['end_mon'] = features['day'].ge(21).astype('int64')

    # One-hot weekday. Fixing the categories to 0..6 guarantees all seven
    # columns exist even when a weekday is absent from ``data``, so train and
    # test frames always share the same columns.
    weekday_cat = weekday.astype(pd.CategoricalDtype(categories=list(range(7))))
    features = features.join(pd.get_dummies(weekday_cat, prefix='weekday'))

    return features

In [7]:
# Engineered feature table for training; still contains the 'speed' target,
# which is split off in a later cell.
pro_train_data = process_data(train_data)

### Train ###

In [8]:
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from xgboost.sklearn import XGBRegressor

In [9]:
# Split the processed training table into the target ('speed') and predictors.
y_pre_train = pro_train_data['speed']
X_pre_train = pro_train_data.drop(columns=['speed'])

In [10]:
# Default-parameter regressor; within this notebook it is only referenced by
# the commented-out hyper-parameter search.
xgb = XGBRegressor()

**Warning:** do not run the hyper-parameter search below unless you need to; it is very time-consuming.

In [11]:
# para = {
#     'learning_rate': [0.01,0.03,0.05],
#     'max_depth': [15,20,25],
#     'min_child_weight': [1,2,3],
#     'subsample': [0.5,0.6,0.7],
#     'colsample_bytree': [0.7,0.8,0.9],
#     'n_estimators': [700,750,800,850,900],
#     'gamma':[0.5,0.6,0.7],
#     'reg_alpha': [1,2], 
#     'reg_lambda': [1,2]
# }

# xgb_rscv = RandomizedSearchCV(
#     xgb,
#     para,
#     cv=10,
#     n_jobs=-1,
#     verbose=True, 
#     scoring="neg_mean_squared_error"
# )

In [12]:
# X_train, X_test, y_train, y_test = train_test_split(X_pre_train, y_pre_train, test_size = 0.2, random_state = 100)
# xgb_rscv.fit(X_train,y_train)
# print(xgb_rscv.best_estimator_)
# print(xgb_rscv.best_score_)

The best parameters found by the search are hard-coded into the final model below.

**Final Model**

In [13]:
# Hyper-parameters for the final regressor, grouped by role.
FINAL_MODEL_PARAMS = {
    # boosting schedule
    'n_estimators': 800,
    'learning_rate': 0.03,
    'base_score': 0.5,
    # tree shape
    'max_depth': 20,
    'min_child_weight': 2,
    'gamma': 0.6,
    # row / column sampling
    'subsample': 0.6,
    'colsample_bytree': 0.8,
    # regularisation
    'reg_alpha': 2,
}
final_model = XGBRegressor(**FINAL_MODEL_PARAMS)

In [14]:
# Fit on the complete processed training set (no hold-out split is made here).
final_model.fit(X_pre_train, y_pre_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, gamma=0.6, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.03, max_delta_step=0, max_depth=20,
             min_child_weight=2, missing=nan, monotone_constraints='()',
             n_estimators=800, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=2, reg_lambda=1, scale_pos_weight=1, subsample=0.6,
             tree_method='exact', validate_parameters=1, verbosity=None)

**Test**

In [15]:
# Run the test set through the same process_data transformation used for training.
pro_test_data = process_data(test_data)

In [16]:
# Predict one speed per processed test row; the frame is passed to the model as-is.
# NOTE(review): presumably its columns must match X_pre_train -- confirm.
test_speed = final_model.predict(pro_test_data)

In [17]:
# Start from the sample submission and fill in the predicted speeds.
# Predictions are attached by position, not by id.
testFinal = pd.read_csv('data/sampleSubmission.csv').assign(speed=test_speed)

In [18]:
# Write the submission with only the 'id' and 'speed' columns and no index.
# Note: this goes to ./test.csv, a different path from the input data/test.csv.
testFinal.to_csv('test.csv', encoding='utf-8', columns=['id','speed'], index=False)