In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
import warnings
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

# Silence every warning category for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so library deprecation and
# convergence warnings are hidden as well -- consider narrowing it.
warnings.filterwarnings("ignore")

In [2]:
# Load the input table from a CSV path that is relative to the notebook's
# working directory.
df = pd.read_csv('../../Data/daily_usr_cnt_with_holiday_tot_clean.csv')

In [3]:
# Keep only the date, the four calendar/weather features and the target.
df = df.loc[:, ['date', 'weekday', 'weeknum', 'is_rain', 'is_holiday', 'cnt']]

In [4]:
# Preview the first rows of the reduced frame.
df.head()

Unnamed: 0,date,weekday,weeknum,is_rain,is_holiday,cnt
0,2017-06-21,2,25,0,0,13.0
1,2017-06-22,3,25,0,0,40.0
2,2017-06-23,4,25,0,0,39.0
3,2017-06-24,5,25,1,0,28.0
4,2017-06-25,6,25,0,0,28.0


In [5]:
# Pull the frame apart into the date column, the feature block and the target
# so the features can be encoded on their own.
dates, df_y = df['date'], df['cnt']
df_cate = df.loc[:, ['weekday', 'weeknum', 'is_rain', 'is_holiday']]

In [6]:
# Show the head of each piece, separated by a dashed rule.
print(
    dates.head(),
    '---------------',
    df_cate.head(),
    '---------------',
    df_y.head(),
    sep='\n',
)

0    2017-06-21
1    2017-06-22
2    2017-06-23
3    2017-06-24
4    2017-06-25
Name: date, dtype: object
---------------
   weekday  weeknum  is_rain  is_holiday
0        2       25        0           0
1        3       25        0           0
2        4       25        0           0
3        5       25        1           0
4        6       25        0           0
---------------
0    13.0
1    40.0
2    39.0
3    28.0
4    28.0
Name: cnt, dtype: float64


In [7]:
# Re-code every feature column as consecutive integers starting at 0.
# Note that this shifts values that do not already start at 0 (the recorded
# output shows weeknum 25 becoming 24).
df_cate = df_cate.apply(lambda column: LabelEncoder().fit_transform(column))

In [8]:
# Stitch date, encoded features and target back together column-wise.
df = pd.concat(
    [dates, df_cate, df_y],
    axis='columns',
)

In [9]:
# Preview the frame after the feature columns were label-encoded.
df.head()

Unnamed: 0,date,weekday,weeknum,is_rain,is_holiday,cnt
0,2017-06-21,2,24,0,0,13.0
1,2017-06-22,3,24,0,0,40.0
2,2017-06-23,4,24,0,0,39.0
3,2017-06-24,5,24,1,0,28.0
4,2017-06-25,6,24,0,0,28.0


In [10]:
# Chronological split (no shuffling): the last 15% of rows become the test set,
# then the remainder is split again so that validation is 1.5/8.5 of it.
val_train_data, test_data = train_test_split(
    df,
    test_size=0.15,
    shuffle=False,
)
train_data, valid_data = train_test_split(
    val_train_data,
    test_size=1.5/8.5,
    shuffle=False,
)

In [11]:
# Report (rows, columns) for the train, validation and test partitions.
print(train_data.shape, valid_data.shape, test_data.shape, sep='\n')

(646, 6)
(139, 6)
(139, 6)


In [12]:
# Inspect the last rows of the test partition (the end of the series, since
# the split was done without shuffling).
test_data.tail()

Unnamed: 0,date,weekday,weeknum,is_rain,is_holiday,cnt
919,2019-12-27,4,51,0,0,81.0
920,2019-12-28,5,51,0,0,79.0
921,2019-12-29,6,51,0,0,61.0
922,2019-12-30,0,51,0,0,77.0
923,2019-12-31,1,51,0,0,39.0


In [13]:
# Feature columns fed to the model.
X_param = [
    'weekday',
    'weeknum',
    'is_rain',
    'is_holiday',
]
# Target column, kept as a one-element list so that selecting with it yields a
# one-column DataFrame.
y_param = ['cnt']

In [14]:
# Candidate values for C, gamma and epsilon: the same six powers of ten,
# held in three independent lists.
c_list = [0.001, 0.01, 0.1, 1, 10, 100]
g_list = list(c_list)
e_list = list(c_list)
# Kernels to try.
k_list = ['linear', 'rbf', 'poly', 'sigmoid']

In [15]:
def grid_search_SVR(train_data, valid_data, c_list, g_list, k_list, e_list):
    """Grid-search SVR hyper-parameters, scoring every fit on a hold-out frame.

    The feature columns are taken from the module-level ``X_param`` and the
    target column from ``y_param``. Features are standardised with a
    ``StandardScaler`` that is fitted on ``train_data`` only and then applied
    to ``valid_data``.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Rows used to fit the scaler and every SVR candidate.
    valid_data : pandas.DataFrame
        Rows every candidate is scored on.
    c_list, g_list, e_list : list
        Candidate values for ``C``, ``gamma`` and ``epsilon``.
    k_list : list of str
        Kernel names. For ``'linear'`` the gamma grid is skipped (SVR is built
        without a ``gamma`` argument) and the placeholder string ``"None"`` is
        recorded in the ``gamma`` column.

    Returns
    -------
    pandas.DataFrame
        One row per candidate, in evaluation order, with the columns
        ``kernel, C, gamma, epsilon, MAE, MSE, RMSE, R2``.
    """
    from itertools import product

    # set data
    X_train = train_data[X_param]
    X_valid = valid_data[X_param]
    # SVR expects a 1-D target; selecting with the list ``y_param`` yields a
    # one-column frame, so flatten it explicitly.
    y_train = train_data[y_param].to_numpy().ravel()
    y_valid = valid_data[y_param].to_numpy().ravel()

    # scale data (statistics come from the training rows only)
    scaler = StandardScaler()
    X_train_std = scaler.fit_transform(X_train)
    X_valid_std = scaler.transform(X_valid)

    columns = ['kernel', 'C', 'gamma', 'epsilon', 'MAE', 'MSE', 'RMSE', 'R2']
    # Collect plain rows and build the frame once at the end:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    rows = []

    for k in k_list:
        print(k)
        uses_gamma = k != 'linear'
        gammas = g_list if uses_gamma else ["None"]

        for c, g, e in product(c_list, gammas, e_list):
            svr_kwargs = {'kernel': k, 'C': c, 'epsilon': e, 'max_iter': 1000000}
            if uses_gamma:
                print("({}, {}, {})".format(c, g, e))
                svr_kwargs['gamma'] = g

            # train model
            ksvr = SVR(**svr_kwargs)
            ksvr.fit(X_train_std, y_train)

            # predict the hold-out targets with the fitted kernel SVR
            y_valid_pred = ksvr.predict(X_valid_std)

            # get performance metrics
            MAE = metrics.mean_absolute_error(y_valid, y_valid_pred)
            MSE = metrics.mean_squared_error(y_valid, y_valid_pred)
            RMSE = np.sqrt(MSE)
            R2 = metrics.r2_score(y_valid, y_valid_pred)

            # add result
            rows.append([k, c, g, e, MAE, MSE, RMSE, R2])

    return pd.DataFrame(rows, columns=columns)

### Conduct grid search (fit on train, score on validation)

In [16]:
# Run the full grid: every kernel in k_list combined with every C, gamma and
# epsilon value (gamma is skipped for the linear kernel). Each combination is
# a separate SVR fit on train_data scored on valid_data, so this cell can take
# a long time; the recorded run below was interrupted before finishing.
svr_result = grid_search_SVR(train_data, valid_data, c_list, g_list, k_list, e_list)

linear
rbf
(0.001, 0.001, 0.001)
(0.001, 0.001, 0.01)
(0.001, 0.001, 0.1)
(0.001, 0.001, 1)
(0.001, 0.001, 10)
(0.001, 0.001, 100)
(0.001, 0.01, 0.001)
(0.001, 0.01, 0.01)
(0.001, 0.01, 0.1)
(0.001, 0.01, 1)
(0.001, 0.01, 10)
(0.001, 0.01, 100)
(0.001, 0.1, 0.001)
(0.001, 0.1, 0.01)
(0.001, 0.1, 0.1)
(0.001, 0.1, 1)
(0.001, 0.1, 10)
(0.001, 0.1, 100)
(0.001, 1, 0.001)
(0.001, 1, 0.01)
(0.001, 1, 0.1)
(0.001, 1, 1)
(0.001, 1, 10)
(0.001, 1, 100)
(0.001, 10, 0.001)
(0.001, 10, 0.01)
(0.001, 10, 0.1)
(0.001, 10, 1)
(0.001, 10, 10)
(0.001, 10, 100)
(0.001, 100, 0.001)
(0.001, 100, 0.01)
(0.001, 100, 0.1)
(0.001, 100, 1)
(0.001, 100, 10)
(0.001, 100, 100)
(0.01, 0.001, 0.001)
(0.01, 0.001, 0.01)
(0.01, 0.001, 0.1)
(0.01, 0.001, 1)
(0.01, 0.001, 10)
(0.01, 0.001, 100)
(0.01, 0.01, 0.001)
(0.01, 0.01, 0.01)
(0.01, 0.01, 0.1)
(0.01, 0.01, 1)
(0.01, 0.01, 10)
(0.01, 0.01, 100)
(0.01, 0.1, 0.001)
(0.01, 0.1, 0.01)
(0.01, 0.1, 0.1)
(0.01, 0.1, 1)
(0.01, 0.1, 10)
(0.01, 0.1, 100)
(0.01, 1, 0.001)


KeyboardInterrupt: 

In [None]:
# Preview the first grid-search rows (still in evaluation order here).
svr_result.head()

In [None]:
# Order candidates by validation MSE, best (lowest) first.
svr_result = svr_result.sort_values('MSE')


In [None]:
# Save the results table as a CSV file in the current working directory.
svr_result.to_csv('svr_results.csv')

In [None]:
# Read the hyper-parameters of the top row. This assumes svr_result has
# already been sorted by MSE in ascending order.
# Columns are addressed by label: integer keys on a string-labelled Series
# (``row[0]``) rely on a deprecated positional fallback.
best_row = svr_result.iloc[0]
best_kernel = best_row['kernel']
best_C = best_row['C']
best_gamma = best_row['gamma']  # the string "None" when the kernel is linear
best_epsilon = best_row['epsilon']
print(best_kernel, '/', best_C, '/', best_gamma, '/', best_epsilon)

### Refit with the best parameters (fit on train + validation, evaluate on test)

In [None]:
# Refit with the selected hyper-parameters on train + validation and score the
# result on the held-out test set.
X_train = val_train_data[X_param]
y_train = val_train_data[y_param]  # kept as a one-column DataFrame for later cells
X_test = test_data[X_param]
y_test = test_data[y_param]

# scale data (scaler statistics come from the training rows only)
sc=StandardScaler()
sc.fit(X_train)
X_train_std=sc.transform(X_train)
X_test_std=sc.transform(X_test)

# run model
# The grid search records the placeholder string "None" as gamma for the
# linear kernel. That is not a valid SVR gamma, so only forward gamma when a
# real value was selected.
svr_params = dict(kernel=best_kernel, C=best_C, epsilon=best_epsilon, max_iter=1000000)
if not (isinstance(best_gamma, str) and best_gamma == "None"):
    svr_params['gamma'] = best_gamma
ksvr=SVR(**svr_params)
# SVR expects a 1-D target, so flatten the one-column frame for fitting.
ksvr.fit(X_train_std, y_train.to_numpy().ravel())

# predict model
y_train_pred=ksvr.predict(X_train_std) # fitted values on the training rows
y_test_pred=ksvr.predict(X_test_std) # predictions for the test rows

# get performance metrics on the test set
y_test_true = y_test.to_numpy().ravel()
MAE = metrics.mean_absolute_error(y_test_true,y_test_pred)
MSE = metrics.mean_squared_error(y_test_true,y_test_pred)
RMSE = np.sqrt(MSE)
R2=metrics.r2_score(y_test_true,y_test_pred)

In [None]:
# Test-set metrics, printed in the order MAE, MSE, RMSE, R2.
print(MAE ,MSE, RMSE, R2)

In [None]:
# Inspect the training target used for the final fit.
y_train

In [None]:
# Inspect the dates covered by the train + validation rows.
val_train_data['date']

In [None]:
# Table of actual vs. fitted target values for the train + validation rows,
# carrying the date column (and its index) from val_train_data.
train_viz = val_train_data[['date']].assign(
    y_train_true=y_train.iloc[:, -1].to_numpy(),
    y_train_pred=y_train_pred,
)
train_viz.head()

In [None]:
# Inspect the dates covered by the test rows.
test_data['date']

In [None]:
# Table of actual vs. predicted target values for the test rows. Despite the
# "valid" naming, every column here is built from test_data / y_test.
valid_viz = test_data[['date']].assign(
    y_valid_true=y_test.iloc[:, -1].to_numpy(),
    y_valid_pred=y_test_pred,
)
valid_viz.head()

In [None]:
# Display the actual-vs-predicted table for the test rows.
valid_viz

In [None]:
import numpy as np
import plotly.graph_objects as go

# https://community.plotly.com/t/plotly-colours-list/11730/2

# One line per (frame, value column, colour, legend name), in drawing order:
# train actual, train fitted, test actual, test predicted.
fig = go.Figure(data=[
    go.Scatter(x=frame['date'], y=frame[column], mode='lines',
               line=dict(color=colour), name=legend_name)
    for frame, column, colour, legend_name in (
        (train_viz, 'y_train_true', '#1f77b4', 'y_true'),  # muted blue
        (train_viz, 'y_train_pred', '#ff7f0e', 'y_pred'),  # safety orange
        (valid_viz, 'y_valid_true', '#1f77b4', 'y_true'),  # muted blue
        (valid_viz, 'y_valid_pred', '#bcbd22', 'y_pred'),  # curry yellow-green
    )
])
fig.show()

In [None]:
# Export the Plotly figure as a JPEG in the current working directory. This
# has to run before the matplotlib cell below, which rebinds the name `fig`.
fig.write_image("fig_plotly.jpeg")

In [None]:
# Plot actual values against model output for the training span and the
# test span on a single axis.
plt.rc('font', size=12)
fig, ax = plt.subplots(figsize=(15, 6))

ax.plot(train_viz['date'], train_viz['y_train_true'], color='tab:blue', label='train : actual')
# The fitted curve must come from the model's predictions; plotting the actual
# values a second time would simply draw over the blue line.
ax.plot(train_viz['date'], train_viz['y_train_pred'], color='tab:orange', label='train : fitted')
ax.plot(valid_viz['date'], valid_viz['y_valid_true'], color='tab:green', label='test: actual')
ax.plot(valid_viz['date'], valid_viz['y_valid_pred'], color='tab:red', label='test: predicted')

ax.set_xlabel('Time')
# An empty label list hides the per-date tick labels on the x axis.
ax.set_xticklabels([], rotation =45)
ax.set_ylabel('search amount')
ax.set_title('Time series: SVR')
ax.legend(loc='upper left')