In [35]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")

In [24]:
# Load the source CSV into the working frame.
csv_path = '../../Data/daily_usr_cnt_with_holiday.csv'
df = pd.read_csv(csv_path)

In [25]:
# Keep only the calendar-part columns, the holiday flag and the `cnt` column.
keep_cols = ['year', 'month', 'day', 'weekday', 'weeknum', 'is_holiday', 'cnt']
df = df[keep_cols]

In [26]:
# Preview the first five rows of the reduced frame.
df.head(n=5)

Unnamed: 0,year,month,day,weekday,weeknum,is_holiday,cnt
0,2018,1,1,0,1,1.0,9
1,2018,1,2,1,1,0.0,22
2,2018,1,3,2,1,0.0,23
3,2018,1,4,3,1,0.0,11
4,2018,1,5,4,1,0.0,21


In [27]:
# Order-preserving split (shuffle=False): the trailing 15% of rows form the test set,
# then 1.5/8.5 of what remains is held out as the validation set.
val_train_data, test_data = train_test_split(df, shuffle=False, test_size=0.15)
valid_share = 1.5 / 8.5
train_data, valid_data = train_test_split(
    val_train_data, shuffle=False, test_size=valid_share
)

In [28]:
# Report (rows, columns) for the train, validation and test partitions, in that order.
for split_frame in (train_data, valid_data, test_data):
    print(split_frame.shape)

(510, 7)
(110, 7)
(110, 7)


In [29]:
# Feature columns passed to the model.
X_param = [
    'year', 'month', 'day',
    'weekday', 'weeknum',
    'is_holiday',
]
# Target column, kept as a one-element list.
y_param = ['cnt']

In [43]:
# Hyper-parameter grid for the SVR search.
# Candidate values for C.
c_list = [0.001, 0.01, 0.1, 1, 10, 100]
# Candidate values for gamma: the same values as C, held in a separate list.
g_list = list(c_list)
# Kernels to try.
k_list = [
    'linear',
    'rbf',
    'poly',
    'sigmoid',
]

In [44]:
def grid_search_SVR(train_data, val_data, c_list, g_list, k_list, max_iter=1000000):
    """Fit one SVR per hyper-parameter combination and score it on held-out data.

    Features are taken from the module-level ``X_param`` columns and the target
    from the module-level ``y_param`` columns. Features are standardised with a
    ``StandardScaler`` fitted on ``train_data`` only.

    Parameters
    ----------
    train_data : pandas.DataFrame
        Rows used to fit the scaler and every model.
    val_data : pandas.DataFrame
        Rows used to compute the reported metrics.
    c_list : iterable
        Candidate values for the SVR ``C`` parameter.
    g_list : iterable
        Candidate values for the SVR ``gamma`` parameter. Not used for the
        ``'linear'`` kernel.
    k_list : iterable of str
        Kernel names to try.
    max_iter : int, optional
        Iteration cap passed to every ``SVR``; defaults to the previously
        hard-coded 1000000.

    Returns
    -------
    pandas.DataFrame
        One row per fitted model with columns
        ``['kernel', 'C', 'gamma', 'MAE', 'MSE', 'RMSE', 'R2']``.
        For the ``'linear'`` kernel the ``gamma`` entry is the string ``"None"``.
    """
    # set data
    X_train = train_data[X_param]
    X_val = val_data[X_param]
    # y_param is a list, so the selection is a 2-D frame; flatten it so that
    # SVR.fit receives the 1-D target it expects.
    y_train = train_data[y_param].values.ravel()
    y_val = val_data[y_param].values.ravel()

    # scale data (statistics come from the training features only)
    sc = StandardScaler()
    sc.fit(X_train)
    X_train_std = sc.transform(X_train)
    X_val_std = sc.transform(X_val)

    result_columns = ['kernel', 'C', 'gamma', 'MAE', 'MSE', 'RMSE', 'R2']
    # Rows are collected in a plain list and turned into a frame once at the end:
    # DataFrame.append no longer exists in pandas 2.x and was quadratic anyway.
    records = []

    for k in k_list:
        print(k)
        uses_gamma = k != 'linear'
        if uses_gamma:
            grid = [(c, g) for c in c_list for g in g_list]
        else:
            # gamma is not passed for the linear kernel; "None" is a placeholder
            grid = [(c, "None") for c in c_list]

        for c, g in grid:
            svr_params = {'kernel': k, 'C': c, 'max_iter': max_iter}
            if uses_gamma:
                print("({}, {})".format(c, g))
                svr_params['gamma'] = g

            # train model
            ksvr = SVR(**svr_params)
            ksvr.fit(X_train_std, y_train)

            # predict the held-out targets with the fitted kernel SVR
            y_val_pred = ksvr.predict(X_val_std)

            # get performance metrics
            MAE = metrics.mean_absolute_error(y_val, y_val_pred)
            MSE = metrics.mean_squared_error(y_val, y_val_pred)
            RMSE = np.sqrt(MSE)
            R2 = metrics.r2_score(y_val, y_val_pred)

            # add result
            records.append([k, c, g, MAE, MSE, RMSE, R2])

    return pd.DataFrame(records, columns=result_columns)

In [45]:
# Run the grid search: models are fitted on the training split and scored on the validation split.
svr_result = grid_search_SVR(
    train_data=train_data,
    val_data=valid_data,
    c_list=c_list,
    g_list=g_list,
    k_list=k_list,
)

linear
rbf
(0.001, 0.001)
(0.001, 0.01)
(0.001, 0.1)
(0.001, 1)
(0.001, 10)
(0.001, 100)
(0.01, 0.001)
(0.01, 0.01)
(0.01, 0.1)
(0.01, 1)
(0.01, 10)
(0.01, 100)
(0.1, 0.001)
(0.1, 0.01)
(0.1, 0.1)
(0.1, 1)
(0.1, 10)
(0.1, 100)
(1, 0.001)
(1, 0.01)
(1, 0.1)
(1, 1)
(1, 10)
(1, 100)
(10, 0.001)
(10, 0.01)
(10, 0.1)
(10, 1)
(10, 10)
(10, 100)
(100, 0.001)
(100, 0.01)
(100, 0.1)
(100, 1)
(100, 10)
(100, 100)
poly
(0.001, 0.001)
(0.001, 0.01)
(0.001, 0.1)
(0.001, 1)
(0.001, 10)
(0.001, 100)
(0.01, 0.001)
(0.01, 0.01)
(0.01, 0.1)
(0.01, 1)
(0.01, 10)
(0.01, 100)
(0.1, 0.001)
(0.1, 0.01)
(0.1, 0.1)
(0.1, 1)
(0.1, 10)
(0.1, 100)
(1, 0.001)
(1, 0.01)
(1, 0.1)
(1, 1)
(1, 10)
(1, 100)
(10, 0.001)
(10, 0.01)
(10, 0.1)
(10, 1)
(10, 10)
(10, 100)
(100, 0.001)
(100, 0.01)
(100, 0.1)
(100, 1)
(100, 10)
(100, 100)
sigmoid
(0.001, 0.001)
(0.001, 0.01)
(0.001, 0.1)
(0.001, 1)
(0.001, 10)
(0.001, 100)
(0.01, 0.001)
(0.01, 0.01)
(0.01, 0.1)
(0.01, 1)
(0.01, 10)
(0.01, 100)
(0.1, 0.001)
(0.1, 0.01)
(0.1, 0.1

In [46]:
# Preview the first five rows of the grid-search results.
svr_result.head(n=5)

Unnamed: 0,kernel,C,gamma,MAE,MSE,RMSE,R2
0,linear,0.001,,138.443447,24989.912889,158.081982,-2.704771
1,linear,0.01,,138.395339,25009.288135,158.143252,-2.707643
2,linear,0.1,,135.48363,24306.07164,155.904046,-2.603391
3,linear,1.0,,125.148096,21412.667067,146.330677,-2.174442
4,linear,10.0,,122.723341,20768.620817,144.113222,-2.078961


In [47]:
# Rank the hyper-parameter combinations by validation MAE, lowest first.
svr_result.sort_values('MAE')

Unnamed: 0,kernel,C,gamma,MAE,MSE,RMSE,R2
38,rbf,100.0,0.1,71.787873,1.038026e+04,101.883557,-0.538880
39,rbf,100.0,1,81.723293,1.085272e+04,104.176392,-0.608923
57,poly,0.1,1,83.680875,1.305865e+04,114.274471,-0.935954
74,poly,100.0,0.1,83.680925,1.305867e+04,114.274533,-0.935956
32,rbf,10.0,0.1,90.808947,1.326307e+04,115.165396,-0.966258
...,...,...,...,...,...,...,...
106,sigmoid,10.0,10,412.378427,2.287194e+05,478.246154,-32.907794
107,sigmoid,10.0,100,434.799789,2.781446e+05,527.394202,-40.235121
111,sigmoid,100.0,1,2691.453426,1.150248e+07,3391.530970,-1704.250337
112,sigmoid,100.0,10,3818.119645,2.262025e+07,4756.074738,-3352.466027


In [None]:
# try with best param
# The validation split is named `valid_data` at notebook level; `val_data` only
# exists as a parameter inside grid_search_SVR, so using it here raises NameError.
X_train = train_data[X_param]
y_train = train_data[y_param]
X_test = valid_data[X_param]
y_test = valid_data[y_param]

# scale data: fit on the training features only, then apply to both splits
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)

In [None]:
# import plotly
import plotly.express as px

# `df` was reduced to year/month/day/... columns earlier in the notebook and has
# no 'date' column, so rebuild one from the calendar parts before plotting.
plot_df = df.assign(date=pd.to_datetime(df[['year', 'month', 'day']]))

fig = px.line(plot_df, x='date', y='cnt', title='bike use')
fig.show()

In [None]:
# Line plot of several scores for the rows of `score_total` whose
# 'Hyper-parameter' entry has 'sigmoid' at index 0 and 100 at index 2.
#
# NOTE(review): `plt`, `C` and `score_total` are not imported or defined anywhere
# in the visible notebook, so on a fresh kernel this cell would raise NameError.
# The metric names (Accuracy, F1-score, recall, precision, AUROC) are
# classification scores, while the rest of the notebook evaluates a regressor —
# presumably this cell was carried over from another notebook. Confirm whether
# it should be adapted to `svr_result` or removed.
plt.figure(figsize=(5, 5))
plt.title('Various hyper-parameters',
          fontsize=10)

plt.xlabel("C")
plt.ylabel("Score")

ax = plt.gca()
#ax.set_xlim(0, 100)
# Restrict the visible score range to [0.7, 1].
ax.set_ylim(0.7, 1)

# Tick labels for the x-axis (`C` must be defined elsewhere — TODO confirm).
X_axis = C

# One list per metric, filled from the matching rows of `score_total`.
acc = []
f1 = []
recall = []
precision = []
auroc = []
for i in range(len(score_total)):
    # Keep rows whose hyper-parameter entry starts with 'sigmoid' and has 100 at position 2.
    if (score_total['Hyper-parameter'][i][0] == 'sigmoid') and (score_total['Hyper-parameter'][i][2] == 100):
        acc.append(score_total['Accuracy'][i])
        f1.append(score_total['F1-score'][i])
        recall.append(score_total['recall'][i])
        precision.append(score_total['precision'][i])
        auroc.append(score_total['AUROC'][i])
        
# Five evenly spaced x positions; each metric list must therefore hold five values to plot.
x = np.linspace(0,6,5)
ax.plot(x, acc, color='g',alpha=1, label = 'Accuracy')
ax.plot(x, f1, color='r',alpha=1, label = 'F1-score')
ax.plot(x, recall, color='silver',alpha=1, label = 'recall')
ax.plot(x, precision, color='b',alpha=1, label = 'precision')
ax.plot(x, auroc, color='y',alpha=1, label = 'AUROC')
# Label the x positions with the values in X_axis.
plt.xticks(x,X_axis)
plt.legend()