In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from datetime import datetime
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries
# used in the cells below — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

# Read Weather

In [2]:
# Load the daily weather table and index it by calendar date.
# The dates are parsed explicitly after reading instead of through read_csv's
# `date_parser` argument, which is deprecated in pandas 2.x.
df_weather = pd.read_csv('hongkong.csv')
df_weather['date_time'] = pd.to_datetime(df_weather['date_time'], format='%Y-%m-%d')
df_weather = df_weather.set_index('date_time')

# Weather columns that are joined onto the traffic data as model features.
weather_features = df_weather[['tempC','visibility','winddirDegree','windspeedKmph','humidity','cloudcover', 'WindChillC']]
weather_features

Unnamed: 0_level_0,tempC,visibility,winddirDegree,windspeedKmph,humidity,cloudcover,WindChillC
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-01-01,21,10,76,16,80,30,19
2017-01-02,22,10,77,10,80,11,21
2017-01-03,22,10,81,14,81,8,21
2017-01-04,22,10,80,15,82,8,21
2017-01-05,22,10,72,9,83,42,21
...,...,...,...,...,...,...,...
2018-12-27,22,10,39,12,66,43,22
2018-12-28,20,10,36,19,61,27,18
2018-12-29,14,10,21,20,62,44,12
2018-12-30,13,10,10,20,59,70,9


# Extract year/month/day/hour/day-of-week

In [3]:
# Weekday indicator column names, Monday first (matches pandas' dayofweek 0..6).
days = ['Mon','Tue','Wed','Thurs','Fri','Sat','Sun']


def add_calendar_features(frame):
    """Return a copy of ``frame`` indexed by 'date' with calendar columns added.

    Columns added, in this order:
      * 'y-m-d'  – the timestamp truncated to midnight (datetime64); used by
                   later cells as the key for joining per-day tables.
      * 'year', 'month', 'day', 'hour' – components of the timestamp.
      * one 0/1 indicator column per entry of ``days``.

    ``frame`` must contain a datetime-typed 'date' column; it is not modified.
    """
    stamps = frame['date']
    out = frame.assign(
        **{
            # Kept as datetime64 (not datetime.date objects) so that joins
            # against a DatetimeIndex need no implicit type promotion.
            'y-m-d': stamps.dt.normalize(),
            'year': stamps.dt.year,
            'month': stamps.dt.month,
            'day': stamps.dt.day,
            'hour': stamps.dt.hour,
        }
    ).set_index('date')

    for day_number, day_name in enumerate(days):
        out[day_name] = (out.index.dayofweek == day_number).astype(int)
    return out


data = add_calendar_features(
    pd.read_csv("train.csv", dayfirst=True, parse_dates=['date'])
)
test_data = add_calendar_features(
    pd.read_csv("test.csv", dayfirst=True, parse_dates=['date'])
)

In [4]:
# Sanity check: (rows, columns) of the test frame after the calendar columns were added.
test_data.shape

(3504, 13)

# Holiday Feature 

In [5]:
# Holiday dates for 2017 and 2018, written as zero-padded ISO strings so that
# parsing does not depend on pandas' per-element format inference.
# NOTE(review): the 2017 entries contain 2017-04-05 but no Good Friday
# (2017-04-14), although the day after it (04-15) and Easter Monday (04-17)
# are listed — please check against the official holiday calendar.
HKholidays = pd.to_datetime([
    '2017-01-02', '2017-01-28', '2017-01-30', '2017-01-31',
    '2017-04-04', '2017-04-05', '2017-04-15', '2017-04-17',
    '2017-05-01', '2017-05-03', '2017-05-30', '2017-07-01',
    '2017-10-02', '2017-10-05', '2017-10-28', '2017-12-25', '2017-12-26',
    '2018-01-01', '2018-02-16', '2018-02-17', '2018-02-19',
    '2018-03-30', '2018-03-31', '2018-04-02', '2018-04-05',
    '2018-05-01', '2018-05-22', '2018-06-18', '2018-07-02',
    '2018-09-25', '2018-10-01', '2018-10-17', '2018-12-25', '2018-12-26',
])
# Series of 1s keyed by holiday date; joining it marks matching days.
HKholidays_col = pd.Series(1, index=HKholidays, name='Isholiday')

# Re-index each frame by calendar day, attach the holiday flag, and set the
# flag to 0 on days that are not in the list. The fill is assigned back to the
# column: `df[col].fillna(..., inplace=True)` acts on a temporary object and
# does not update the frame when pandas Copy-on-Write is active.
data = data.set_index('y-m-d').join(HKholidays_col)
data['Isholiday'] = data['Isholiday'].fillna(0)

test_data = test_data.set_index('y-m-d').join(HKholidays_col)
test_data['Isholiday'] = test_data['Isholiday'].fillna(0)

In [6]:
# Inspect the training frame after the holiday flag was added.
data

Unnamed: 0,id,speed,year,month,day,hour,Mon,Tue,Wed,Thurs,Fri,Sat,Sun,Isholiday
2017-01-01,0,43.002930,2017,1,1,0,0,0,0,0,0,0,1,0.0
2017-01-01,1,46.118696,2017,1,1,1,0,0,0,0,0,0,1,0.0
2017-01-01,2,44.294158,2017,1,1,2,0,0,0,0,0,0,1,0.0
2017-01-01,3,41.067468,2017,1,1,3,0,0,0,0,0,0,1,0.0
2017-01-01,4,46.448653,2017,1,1,4,0,0,0,0,0,0,1,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-31,14001,19.865269,2018,12,31,12,1,0,0,0,0,0,0,0.0
2018-12-31,14002,17.820375,2018,12,31,15,1,0,0,0,0,0,0,0.0
2018-12-31,14003,12.501851,2018,12,31,16,1,0,0,0,0,0,0,0.0
2018-12-31,14004,15.979319,2018,12,31,18,1,0,0,0,0,0,0,0.0


# Weather Feature

In [7]:
 # Attach the daily weather columns to each training row; both frames are indexed by date.
 data = data.join(weather_features)

In [8]:
 # Attach the same daily weather columns to each test row.
 test_data = test_data.join(weather_features)

In [9]:
# Inspect the test frame with calendar, holiday and weather columns.
test_data

Unnamed: 0,id,year,month,day,hour,Mon,Tue,Wed,Thurs,Fri,Sat,Sun,Isholiday,tempC,visibility,winddirDegree,windspeedKmph,humidity,cloudcover,WindChillC
2018-01-01,0,2018,1,1,2,1,0,0,0,0,0,0,1.0,19,10,65,12,63,23,18
2018-01-01,1,2018,1,1,5,1,0,0,0,0,0,0,1.0,19,10,65,12,63,23,18
2018-01-01,2,2018,1,1,7,1,0,0,0,0,0,0,1.0,19,10,65,12,63,23,18
2018-01-01,3,2018,1,1,8,1,0,0,0,0,0,0,1.0,19,10,65,12,63,23,18
2018-01-01,4,2018,1,1,10,1,0,0,0,0,0,0,1.0,19,10,65,12,63,23,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-31,3499,2018,12,31,17,1,0,0,0,0,0,0,0.0,12,10,138,18,69,79,10
2018-12-31,3500,2018,12,31,19,1,0,0,0,0,0,0,0.0,12,10,138,18,69,79,10
2018-12-31,3501,2018,12,31,21,1,0,0,0,0,0,0,0.0,12,10,138,18,69,79,10
2018-12-31,3502,2018,12,31,22,1,0,0,0,0,0,0,0.0,12,10,138,18,69,79,10


In [10]:
# Inspect the training frame with calendar, holiday and weather columns.
data

Unnamed: 0,id,speed,year,month,day,hour,Mon,Tue,Wed,Thurs,...,Sat,Sun,Isholiday,tempC,visibility,winddirDegree,windspeedKmph,humidity,cloudcover,WindChillC
2017-01-01,0,43.002930,2017,1,1,0,0,0,0,0,...,0,1,0.0,21,10,76,16,80,30,19
2017-01-01,1,46.118696,2017,1,1,1,0,0,0,0,...,0,1,0.0,21,10,76,16,80,30,19
2017-01-01,2,44.294158,2017,1,1,2,0,0,0,0,...,0,1,0.0,21,10,76,16,80,30,19
2017-01-01,3,41.067468,2017,1,1,3,0,0,0,0,...,0,1,0.0,21,10,76,16,80,30,19
2017-01-01,4,46.448653,2017,1,1,4,0,0,0,0,...,0,1,0.0,21,10,76,16,80,30,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2018-12-31,14001,19.865269,2018,12,31,12,1,0,0,0,...,0,0,0.0,12,10,138,18,69,79,10
2018-12-31,14002,17.820375,2018,12,31,15,1,0,0,0,...,0,0,0.0,12,10,138,18,69,79,10
2018-12-31,14003,12.501851,2018,12,31,16,1,0,0,0,...,0,0,0.0,12,10,138,18,69,79,10
2018-12-31,14004,15.979319,2018,12,31,18,1,0,0,0,...,0,0,0.0,12,10,138,18,69,79,10


In [11]:
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Feature matrix (calendar, weekday, holiday and weather columns) and the
# regression target.
X, y = data[["day","month","year","hour",'Mon','Tue','Wed','Thurs','Fri','Sat','Sun','Isholiday','tempC','visibility','winddirDegree','windspeedKmph','humidity','cloudcover', 'WindChillC']], data["speed"]

# Hold out 1% of the rows for a quick check; the remaining 99% train the model.
# NOTE(review): a holdout this small gives a noisy error estimate.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.01, random_state=1)

# `random_state` is the scikit-learn-style name for the RNG seed; the `seed`
# keyword is a deprecated alias in the XGBoost scikit-learn wrapper.
model = xgb.XGBRegressor(max_depth=7, learning_rate=0.07, n_estimators=600, objective='reg:squarederror', random_state=5)
model.fit(X_train, y_train)

# Compare the error on the training rows with the error on the held-out rows.
y_pre = model.predict(X_test)
y_train_pre = model.predict(X_train)
print('t_mse:', mean_squared_error(y_train, y_train_pre))
print('mse:', mean_squared_error(y_test, y_pre))

t_mse: 2.8896036332842696
mse: 7.641321217621748


In [12]:
# Predict with exactly the columns, in exactly the order, that the model was
# fitted on: they are taken from the training matrix X rather than from a
# second hand-typed list that could drift out of sync.
test_data['speed'] = model.predict(test_data[list(X.columns)])

# Submission table: one predicted speed per test id.
test_output = test_data[['id', 'speed']].set_index('id')
test_output

Unnamed: 0_level_0,speed
id,Unnamed: 1_level_1
0,47.331135
1,47.824734
2,40.366383
3,30.949749
4,41.286789
...,...
3499,13.397994
3500,26.039846
3501,47.658215
3502,40.596848


In [13]:
# Write the predictions to disk; the 'id' index becomes the first CSV column.
test_output.to_csv('Submission_fi.csv')