# Traffic speed Prediction

kaggle page: [here](https://www.kaggle.com/c/msbd5001-fall2020)

# Install and import dependencies

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from datetime import datetime

## Clean data


In [2]:
# Folder that holds the competition CSVs; later cells reuse `g_drive` too.
g_drive = "/content/drive/My Drive/HKUST/MSBD5001/" # edit this according to your path

# Read the training set and discard the row identifier, which is not a feature.
df = pd.read_csv(g_drive + "train.csv").drop(columns='id')
# Timestamps are stored day-first, e.g. "31/12/2017 23:00".
df['date'] = pd.to_datetime(df['date'], format='%d/%m/%Y %H:%M')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14006 entries, 0 to 14005
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   date    14006 non-null  datetime64[ns]
 1   speed   14006 non-null  float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 219.0 KB


## Feature Engineering

Parse the datetime into hour of the day (0–23), day of the week (0–6, Monday = 0), day of the year (1–366), week of the year (ISO week number, 1–53), and month of the year (1–12).

Since we want hour 23 to be as close to hour 0 as it is to hour 22, we use sine and cosine functions to convert dayOfWeek, dayOfYear, weekOfYear, hourOfDay, and monthOfYear into features with cyclic ordinal attributes.

### Weather

In [3]:
# Raw weather table; it is cleaned in the following cell.
hk_weather = pd.read_csv(f"{g_drive}weather_data.csv")

In [4]:
# Parse the month-first timestamps, force `rain` to numeric (anything that is
# not a number becomes NaN), then replace every missing value with 0.
hk_weather = hk_weather.assign(
    date=pd.to_datetime(hk_weather['date'], format='%m/%d/%Y %H:%M'),
    rain=pd.to_numeric(hk_weather['rain'], errors='coerce'),
).fillna(value=0.)
hk_weather.head()

Unnamed: 0,date,pressure,max_temp,mean_temp,min_temp,dew,humidity,cloud,rain,sunshine,wind_direction,wind_speed
0,2017-01-01,1021.7,20.8,19.2,18.4,15.6,80,72,0.0,4.6,60,34.2
1,2017-01-02,1020.2,23.3,20.2,18.4,16.7,81,28,0.0,9.3,70,17.6
2,2017-01-03,1019.8,21.3,20.0,18.9,17.1,83,56,0.0,3.8,70,26.1
3,2017-01-04,1018.7,21.7,19.9,18.7,16.3,80,51,0.0,6.3,70,27.7
4,2017-01-05,1016.9,23.4,21.1,18.9,17.5,80,61,0.0,1.7,40,14.3


### Holidays

In [5]:
# Holiday dates for 2017-2018, written day-first, parsed into a datetime
# Series named 'holidays'.
hk_holidays = pd.to_datetime(
    pd.Series(
        [
            '2/1/2017', '28/1/2017', '29/1/2017', '30/1/2017', '31/1/2017',
            '4/4/2017', '14/4/2017', '15/4/2017', '17/4/2017', '1/5/2017',
            '3/5/2017', '30/5/2017', '1/7/2017', '2/10/2017', '5/10/2017',
            '28/10/2017', '25/12/2017', '26/12/2017', '1/1/2018', '16/2/2018',
            '17/2/2018', '18/2/2018', '19/2/2018', '30/3/2018', '31/3/2018',
            '2/4/2018', '5/4/2018', '1/5/2018', '22/5/2018', '18/6/2018',
            '2/7/2018', '25/9/2018', '1/10/2018', '17/10/2018', '25/12/2018',
            '26/12/2018',
        ],
        name='holidays',
    ),
    format='%d/%m/%Y',
)

In [6]:
def minDays(x):
    """Return the distance in whole days from `x` to the closest holiday.

    `x` is a `datetime.date`; it is compared against the calendar date of
    every entry in the module-level `hk_holidays` Series, and the smallest
    absolute difference is returned (0 when `x` is itself a holiday).
    """
    holiday_dates = hk_holidays.dt.date
    return min(abs(holiday - x).days for holiday in holiday_dates)


In [7]:
def featureEngineering(data):
    """Expand the ``date`` column of `data` into model features.

    The returned frame contains, in addition to the input columns:

    * raw calendar fields (hour, day of week, day of year, week of year, month),
    * a sine/cosine pair for each of those fields so that the ends of each
      cycle are numerically adjacent,
    * ``isWeekend`` / ``isHoliday`` flags and ``numDaysFromHoliday``
      (computed with `minDays` against the module-level `hk_holidays`),
    * every column of the module-level `hk_weather`, attached with a
      backward as-of join on ``date``.

    The ``date`` column itself is removed from the result.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a datetime ``date`` column. It is not modified; the
        function works on a copy.

    Returns
    -------
    pandas.DataFrame
        New frame with a fresh 0..n-1 index whose rows are in the same
        order as the rows of `data`.
    """
    # Work on a copy so the caller's frame does not silently gain columns.
    data = data.copy()

    dateTime = data['date']
    hourOfDay = dateTime.dt.hour
    dayOfWeek = dateTime.dt.dayofweek
    dayOfYear = dateTime.dt.dayofyear
    # `Series.dt.weekofyear` is deprecated and absent from pandas 2.x.
    # `isocalendar().week` yields the same ISO week number but as a nullable
    # UInt32, so cast back to the plain int64 the old accessor produced.
    if hasattr(dateTime.dt, "isocalendar"):
        weekOfYear = dateTime.dt.isocalendar().week.astype("int64")
    else:
        weekOfYear = dateTime.dt.weekofyear
    monthOfYear = dateTime.dt.month

    data["hourOfDay"] = hourOfDay
    data["dayOfWeek"] = dayOfWeek
    # NOTE: lowercase "y" is kept on purpose so existing column names do not change.
    data["dayOfyear"] = dayOfYear
    data["weekOfYear"] = weekOfYear
    data["monthOfYear"] = monthOfYear

    # Hour of day
    data["hourOfDay_sin"] = np.sin(2*np.pi*(hourOfDay/24))
    data["hourOfDay_cos"] = np.cos(2*np.pi*(hourOfDay/24))
    # Day of week
    data["dayOfWeek_sin"] = np.sin(2*np.pi*(dayOfWeek/7))
    data["dayOfWeek_cos"] = np.cos(2*np.pi*(dayOfWeek/7))
    # Day of year
    data["dayOfYear_sin"] = np.sin(2*np.pi*(dayOfYear/365))
    data["dayOfYear_cos"] = np.cos(2*np.pi*(dayOfYear/365))
    # Week of year
    data["weekOfYear_sin"] = np.sin(2*np.pi*(weekOfYear/52))
    data["weekOfYear_cos"] = np.cos(2*np.pi*(weekOfYear/52))
    # Month of year
    data["monthOfYear_sin"] = np.sin(2*np.pi*(monthOfYear/12))
    data["monthOfYear_cos"] = np.cos(2*np.pi*(monthOfYear/12))

    data["isWeekend"] = (dayOfWeek >= 5).astype(float)

    dates = dateTime.dt.date
    data["isHoliday"] = dates.isin(hk_holidays.dt.date)
    # Many rows fall on the same calendar day, so evaluate minDays once per
    # distinct date and look the answer up for each row.
    daysToHoliday = {day: minDays(day) for day in dates.unique()}
    data["numDaysFromHoliday"] = dates.map(daysToHoliday.get)

    # merge_asof raises if either side is not sorted on the key. Sort both
    # sides (stable sort) and remember the incoming row position so the
    # original order can be restored afterwards; callers align other data
    # (e.g. the test ids) with the result by position.
    data["_rowOrder"] = np.arange(len(data))
    data = pd.merge_asof(
        data.sort_values('date', kind='mergesort'),
        hk_weather.sort_values('date', kind='mergesort'),
        on='date',
    )
    data = data.sort_values('_rowOrder', kind='mergesort').reset_index(drop=True)
    data.pop('_rowOrder')

    data.pop('date')
    return data

In [None]:
# Uncomment to visualize the purpose of the sine and cosine transformation (requires the engineered columns, i.e. run after `df = featureEngineering(df)`)
# df.sample(500).plot.scatter('hourOfDay_sin','hourOfDay_cos').set_aspect('equal');
# df.sample(500).plot.scatter('dayOfWeek_sin','dayOfWeek_cos').set_aspect('equal');
# df.sample(500).plot.scatter('weekOfYear_sin','weekOfYear_cos').set_aspect('equal');
# df.sample(500).plot.scatter('monthOfYear_sin','monthOfYear_cos').set_aspect('equal');

# Preparing training data and validation data

In [8]:
# Replace the raw training frame with its engineered-feature version.
# Re-running this cell without reloading `df` fails: the 'date' column is gone.
df = df.pipe(featureEngineering)

  


In [9]:
def getDataSet(df, label='speed'):
    """Split `df` into a feature frame and a target series.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns and the `label` column.
    label : str, default 'speed'
        Name of the target column.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        All columns except `label` (original order kept), and the `label`
        column itself.

    Raises
    ------
    ValueError
        If `label` is not one of the columns of `df`.
    """
    feature_names = df.columns.tolist()
    feature_names.remove(label)
    return df[feature_names], df[label]

In [10]:
# No validation split: every row of the engineered frame is used for training.

train_data, train_label = getDataSet(df, label='speed')

# XGBoost

best model

In [16]:
# Labelled training matrix: engineered features plus the speed target.
dtrain = xgb.DMatrix(train_data, train_label)

# Booster configuration. Training runs on GPU 0 with the histogram tree
# method, so this cell needs a GPU runtime.
xgb_params = {
    'eta': 0.0175,
    'max_depth': 9,
    'subsample': 0.80,
    'min_child_weight': 5,
    'objective': 'reg:squarederror',
    'eval_metric': 'mae',
    'lambda': 0.8,   
    'alpha': 0.4, 
    # Start boosting from the mean target value.
    'base_score': np.mean(train_label),
    'gpu_id': 0,
    'tree_method': 'gpu_hist',
    'num_parallel_tree': 3
}

num_boost_rounds = 841
model = xgb.train(
    # `silent` is no longer an XGBoost parameter (it only triggers an
    # "unused parameter" warning); `verbosity=0` is its replacement.
    dict(xgb_params, verbosity=0), 
    dtrain, 
    num_boost_round=num_boost_rounds,
)

In [None]:
# Score the fitted model on the data it was trained on. This is an in-sample
# error, not a validation score.
dtrain = xgb.DMatrix(train_data)
train_pred = model.predict(dtrain)

train_mse = mean_squared_error(train_label, train_pred)
print("training mse: ", train_mse)

# Predicting on test data

In [13]:
# Read the test set and set its row ids aside for the submission file.
# NOTE: `id` shadows the Python builtin; the name is kept because the
# submission cell reads it.
df_test = pd.read_csv(g_drive + "test.csv")
id = df_test.pop('id')
df_test['date'] = pd.to_datetime(df_test['date'], format='%d/%m/%Y %H:%M')

# Same feature pipeline as the training data.
df_test = df_test.pipe(featureEngineering)

  


In [14]:
# Predict on the engineered test features and pair each prediction with its id.
dtest = xgb.DMatrix(df_test)
results = pd.DataFrame(id)
results['speed'] = model.predict(dtest)

# Timestamped file name so successive submissions do not overwrite each other.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
results.to_csv(g_drive + 'sub{}.csv'.format(timestamp), index=False)