In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import warnings
warnings.filterwarnings("ignore")

# Preprocessing Dataset

In [5]:
data = pd.read_csv("train.csv", dayfirst=True, parse_dates=['date'])
test_data = pd.read_csv("test.csv", dayfirst=True, parse_dates=['date'])
data['y-m-d'] = data['date'].dt.date
test_data['y-m-d'] = test_data['date'].dt.date

data['year'] = pd.Series([i.year for i in data['date']])
data['month'] = pd.Series([i.month for i in data['date']])
data['day'] = pd.Series([i.day for i in data['date']])
data['hour'] = pd.Series([i.hour for i in data['date']])

test_data['year'] = pd.Series([i.year for i in test_data['date']])
test_data['month'] = pd.Series([i.month for i in test_data['date']])
test_data['day'] = pd.Series([i.day for i in test_data['date']])
test_data['hour'] = pd.Series([i.hour for i in test_data['date']])

days = ['Mon','Tue','Wed','Thurs','Fri','Sat','Sun']
data.set_index(['date'], inplace=True)
for i in range(7):
    data[days[i]] = (data.index.dayofweek == i).astype(int)

test_data.set_index(['date'], inplace=True)
for i in range(7):
    test_data[days[i]] = (test_data.index.dayofweek == i).astype(int)

# Holiday Feature 

In [6]:
data.set_index('y-m-d', inplace=True)
HKholidays = ['2017-01-02', '2017-1-28', '2017-1-30', '2017-1-31', \
                   '2017-4-4', '2017-4-5', '2017-4-15', '2017-4-17',  \
                   '2017-5-1', '2017-5-3', '2017-5-30', '2017-7-1',   \
                   '2017-10-2', '2017-10-5', '2017-10-28', '2017-12-25', '2017-12-26',
                   '2018-01-01', '2018-2-16', '2018-2-17', '2018-2-19', \
                   '2018-3-30', '2018-3-31', '2018-4-2', '2018-4-5',  \
                   '2018-5-1', '2018-5-22', '2018-6-18', '2018-7-2',   \
                   '2018-9-25', '2018-10-1', '2018-10-17', '2018-12-25', '2018-12-26']
HKholidays = pd.to_datetime(HKholidays)
HKholidays_col = pd.Series(1,index=HKholidays,name='Isholiday')
data = data.join(HKholidays_col)
data['Isholiday'].fillna(0,inplace=True)

test_data = test_data.join(HKholidays_col)
test_data['Isholiday'].fillna(0,inplace=True) 

In [7]:
test_data

Unnamed: 0_level_0,id,y-m-d,year,month,day,hour,Mon,Tue,Wed,Thurs,Fri,Sat,Sun,Isholiday
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2018-01-01 02:00:00,0,2018-01-01,2018,1,1,2,1,0,0,0,0,0,0,0.0
2018-01-01 05:00:00,1,2018-01-01,2018,1,1,5,1,0,0,0,0,0,0,0.0
2018-01-01 07:00:00,2,2018-01-01,2018,1,1,7,1,0,0,0,0,0,0,0.0
2018-01-01 08:00:00,3,2018-01-01,2018,1,1,8,1,0,0,0,0,0,0,0.0
2018-01-01 10:00:00,4,2018-01-01,2018,1,1,10,1,0,0,0,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-31 17:00:00,3499,2018-12-31,2018,12,31,17,1,0,0,0,0,0,0,0.0
2018-12-31 19:00:00,3500,2018-12-31,2018,12,31,19,1,0,0,0,0,0,0,0.0
2018-12-31 21:00:00,3501,2018-12-31,2018,12,31,21,1,0,0,0,0,0,0,0.0
2018-12-31 22:00:00,3502,2018-12-31,2018,12,31,22,1,0,0,0,0,0,0,0.0


In [8]:
data

Unnamed: 0,id,speed,year,month,day,hour,Mon,Tue,Wed,Thurs,Fri,Sat,Sun,Isholiday
2017-01-01,0,43.002930,2017,1,1,0,0,0,0,0,0,0,1,0.0
2017-01-01,1,46.118696,2017,1,1,1,0,0,0,0,0,0,1,0.0
2017-01-01,2,44.294158,2017,1,1,2,0,0,0,0,0,0,1,0.0
2017-01-01,3,41.067468,2017,1,1,3,0,0,0,0,0,0,1,0.0
2017-01-01,4,46.448653,2017,1,1,4,0,0,0,0,0,0,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-31,14001,19.865269,2018,12,31,12,1,0,0,0,0,0,0,0.0
2018-12-31,14002,17.820375,2018,12,31,15,1,0,0,0,0,0,0,0.0
2018-12-31,14003,12.501851,2018,12,31,16,1,0,0,0,0,0,0,0.0
2018-12-31,14004,15.979319,2018,12,31,18,1,0,0,0,0,0,0,0.0


# Model

In [9]:
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error

X, y = data[["day","month","year","hour",'Mon','Tue','Wed','Thurs','Fri','Sat','Sun','Isholiday']], data["speed"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.18, random_state=42)
model = xgb.XGBRegressor(max_depth=7, learning_rate=0.07, n_estimators=400, objective='reg:squarederror') 
model.fit(X_train, y_train)
y_pre = model.predict(X_test)
print('mse:', mean_squared_error(y_test, y_pre))

mse: 10.088266231924132


In [10]:
test_data['speed'] = model.predict(test_data[["day","month","year","hour",'Mon','Tue','Wed','Thurs','Fri','Sat','Sun','Isholiday']])
test_output = test_data[['id', 'speed']].set_index('id')
test_output

Unnamed: 0_level_0,speed
id,Unnamed: 1_level_1
0,48.043549
1,48.058609
2,29.983295
3,19.199633
4,31.651152
...,...
3499,10.844743
3500,22.862059
3501,50.170872
3502,42.258659


In [None]:
test_output.to_csv('Submission_final.csv')