### Support Vector Regression Task

##### 한국 방문자 수 예측

- date: 날짜 (년-월)
- nation: 방문자의 국가
- visitor: 방문자 수
- growth: 전년 동월 대비 방문객 수 성장률
- share: 해당 월의 전체 방문자 중 해당 국가 비율

##### 년도와 국가를 입력해서 예상 방문자 수를 예측하세요.

In [1]:
import pandas as pd

visitor_df = pd.read_csv('./datasets/korea_visitor.csv')
visitor_df

Unnamed: 0,date,nation,visitor,growth,share,tourism,business,official affairs,studying,others
0,2019-1,China,392814,28.737870,35.555117,320113,2993,138,8793,60777
1,2019-1,Japan,206526,23.606830,18.693468,198805,2233,127,785,4576
2,2019-1,Taiwan,87954,16.003693,7.961057,86393,74,22,180,1285
3,2019-1,Hong Kong,35896,3.533212,3.249086,34653,59,2,90,1092
4,2019-1,Macao,2570,-12.376406,0.232621,2506,2,0,17,45
...,...,...,...,...,...,...,...,...,...,...
955,2020-4,Oceania others,13,-97.239915,0.044195,0,0,0,0,13
956,2020-4,South Africa,22,-98.101812,0.074792,2,0,0,0,20
957,2020-4,Africa others,177,-95.412131,0.601734,11,5,0,11,150
958,2020-4,Stateless,3,-95.312500,0.010199,0,0,0,0,3


In [2]:
visitor_df['year'] = visitor_df['date'].str.split('-').str[0]
visitor_df.drop('date', axis=1, inplace=True)

In [3]:
visitor_df

Unnamed: 0,nation,visitor,growth,share,tourism,business,official affairs,studying,others,year
0,China,392814,28.737870,35.555117,320113,2993,138,8793,60777,2019
1,Japan,206526,23.606830,18.693468,198805,2233,127,785,4576,2019
2,Taiwan,87954,16.003693,7.961057,86393,74,22,180,1285,2019
3,Hong Kong,35896,3.533212,3.249086,34653,59,2,90,1092,2019
4,Macao,2570,-12.376406,0.232621,2506,2,0,17,45,2019
...,...,...,...,...,...,...,...,...,...,...
955,Oceania others,13,-97.239915,0.044195,0,0,0,0,13,2020
956,South Africa,22,-98.101812,0.074792,2,0,0,0,20,2020
957,Africa others,177,-95.412131,0.601734,11,5,0,11,150,2020
958,Stateless,3,-95.312500,0.010199,0,0,0,0,3,2020


In [4]:
from sklearn.preprocessing import LabelEncoder

columns = ['nation']
encoders = []
for column in columns:
    encoder = LabelEncoder()
    category = encoder.fit_transform(visitor_df[column])
    visitor_df[column] = category
    encoders.append(encoder)

In [5]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_squared_log_error

def get_evaluation(y_test, prediction):
    MAE =  mean_absolute_error(y_test, prediction)
    MSE = mean_squared_error(y_test, prediction)
    RMSE = np.sqrt(MSE)
    MSLE = mean_squared_log_error(y_test, prediction)
    RMSLE = np.sqrt(mean_squared_log_error(y_test, prediction))
    R2 = r2_score(y_test, prediction)

    print('MAE: {:.4f}, MSE: {:.4f}, RMSE: {:.4f}, MSLE: {:.4f}, RMSLE: {:.4f}, R2: {:.4f}'.format(MAE, MSE, RMSE, MSLE, RMSLE, R2))

In [6]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

features, targets = visitor_df.iloc[:, :-1], visitor_df.visitor

X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.2, random_state=124)

y_train = np.log1p(y_train)

linear_regression = LinearRegression()
linear_regression.fit(X_train, y_train)

# 기울기(가중치)
print(linear_regression.coef_)
# 절편(상수)
print(linear_regression.intercept_)

prediction = linear_regression.predict(X_test)
print(linear_regression.score(X_test, np.log1p(y_test)))
print(r2_score(np.log1p(y_test), prediction))
get_evaluation(np.log1p(y_test), prediction)

[ 9.82881242e-03  1.10873366e-04  2.73036864e-02  2.89721238e-01
 -1.17550931e-04  4.00844258e-05  5.57338579e-04 -2.85389707e-04
 -8.36090009e-05]
7.367210134587611
0.6152516110026369
0.6152516110026369
MAE: 0.9821, MSE: 1.4049, RMSE: 1.1853, MSLE: 0.0199, RMSLE: 0.1411, R2: 0.6153


In [7]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler

features, targets = visitor_df.iloc[:, :-1], visitor_df.visitor

parmas = {
    'gamma': [0.01, 0.1, 1, 10, 100], 
    'C': [0.01, 0.1, 1, 10, 100], 
    'epsilon': [0, 0.01, 0.1, 1, 10, 100]
}

X_train, X_test, y_train, y_test = train_test_split(features, targets, test_size=0.2, random_state=0)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

grid_svr = GridSearchCV(SVR(kernel='linear'), param_grid=parmas, cv=3, refit=True, return_train_score=True, scoring='r2')


# 로그 변환
y_train = np.log1p(y_train)

grid_svr.fit(X_train, y_train)

prediction = grid_svr.predict(X_test)

# 기울기(가중치)
print(grid_svr.best_estimator_.coef_)

get_evaluation(np.log1p(y_test), prediction)

[[ 0.21136119  0.12095593  0.96441363  0.89289081  0.08643009 -0.00765886
   0.2022096  -0.51400486  0.51161616]]
MAE: 1.0222, MSE: 1.7149, RMSE: 1.3096, MSLE: 0.0242, RMSLE: 0.1556, R2: 0.5127


In [8]:
# DataFrame으로 변환
scores_df = pd.DataFrame(grid_svr.cv_results_)
scores_df[['params', 'mean_test_score', 'rank_test_score', 
           'split0_test_score', 'split1_test_score', 'split2_test_score']].sort_values(by='rank_test_score')

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score
45,"{'C': 0.1, 'epsilon': 1, 'gamma': 0.01}",0.538713,1,0.609845,0.453812,0.552481
47,"{'C': 0.1, 'epsilon': 1, 'gamma': 1}",0.538713,1,0.609845,0.453812,0.552481
48,"{'C': 0.1, 'epsilon': 1, 'gamma': 10}",0.538713,1,0.609845,0.453812,0.552481
49,"{'C': 0.1, 'epsilon': 1, 'gamma': 100}",0.538713,1,0.609845,0.453812,0.552481
46,"{'C': 0.1, 'epsilon': 1, 'gamma': 0.1}",0.538713,1,0.609845,0.453812,0.552481
...,...,...,...,...,...,...
118,"{'C': 10, 'epsilon': 100, 'gamma': 10}",-0.174514,126,-0.026768,-0.231095,-0.265679
119,"{'C': 10, 'epsilon': 100, 'gamma': 100}",-0.174514,126,-0.026768,-0.231095,-0.265679
89,"{'C': 1, 'epsilon': 100, 'gamma': 100}",-0.174514,126,-0.026768,-0.231095,-0.265679
55,"{'C': 0.1, 'epsilon': 100, 'gamma': 0.01}",-0.174514,126,-0.026768,-0.231095,-0.265679
