### AI-10 Practice 1  

#### Import libraries  

In [12]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVR
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.pipeline import Pipeline
from joblib import dump

#### Parameters  

In [13]:
# Relative path of the input CSV file that is loaded into a DataFrame below.
csv_in = '../ai-09/ai-09-practice1.csv'

#### Read CSV file  

In [14]:
# Load the data set; row 0 of the file is used as the column header.
df = pd.read_csv(csv_in, delimiter=',', skiprows=0, header=0)
print(df.shape)
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() only added a stray "None" line to the output.
df.info()
display(df.head())

(350, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350 entries, 0 to 349
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   d1      350 non-null    float64
 1   d2      350 non-null    float64
 2   d3      350 non-null    float64
 3   d4      350 non-null    float64
 4   w       350 non-null    float64
dtypes: float64(5)
memory usage: 13.8 KB
None


Unnamed: 0,d1,d2,d3,d4,w
0,5.18,7.39,-1.68,0.33,37.78
1,3.67,8.26,1.02,-0.17,30.18
2,4.02,8.27,-1.44,-1.59,35.58
3,4.29,6.99,-1.34,0.26,29.61
4,3.45,8.34,-0.36,0.93,28.42


#### Get X and y  

In [15]:
# Explanatory variables: label slice over the columns 'd1' through 'd4' (inclusive).
X = df.loc[:, 'd1':'d4']
# Objective variable: the single column 'w'.
y = df.loc[:, 'w']

# Quick sanity check of both pieces: shape first, then the leading rows.
print('X:', X.shape)
display(X.head())
print('y:', y.shape)
print(y.head())

X: (350, 4)


Unnamed: 0,d1,d2,d3,d4
0,5.18,7.39,-1.68,0.33
1,3.67,8.26,1.02,-0.17
2,4.02,8.27,-1.44,-1.59
3,4.29,6.99,-1.34,0.26
4,3.45,8.34,-0.36,0.93


y: (350,)
0    37.78
1    30.18
2    35.58
3    29.61
4    28.42
Name: w, dtype: float64


#### Make pipeline and set parameters for grid search   

In [16]:
# Two-step model: rescale every feature to the range [0, 1], then fit an RBF-kernel SVR.
pipe = Pipeline(steps=[
    ('scaler', MinMaxScaler(feature_range=(0, 1), copy=True)),
    ('svr', SVR(kernel='rbf')),
])

# Candidate values for each SVR hyperparameter.
C_grid = [1e1, 1e2, 1e3, 1e4]      # C: 10, 100, 1000, 10000
g_grid = ['scale', 0.1, 0.5, 1.0]  # gamma
e_grid = [0.01, 0.1, 1.0, 10.0]    # epsilon

# Hyperparameter settings for the grid search.
# The 'svr__' prefix routes each parameter to the pipeline step named 'svr'.
param_grid = dict(
    svr__C=C_grid,
    svr__gamma=g_grid,
    svr__epsilon=e_grid,
)

#### Preparation of objects for cross validation  

In [17]:
# Two separate shuffled 4-fold splitters, each with its own fixed seed.
# grid_cv: splits used by the grid search.
grid_cv = KFold(
    n_splits=4,
    random_state=13,
    shuffle=True,
)
# gen_cv: splits used for the estimation of generalization performance.
gen_cv = KFold(
    n_splits=4,
    random_state=17,
    shuffle=True,
)

#### Define the grid search for hyper parameters  

In [18]:
# Exhaustive search over param_grid, scoring each candidate by negated MSE
# on the grid_cv folds (higher, i.e. closer to zero, is better).
gs = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    cv=grid_cv,
    scoring='neg_mean_squared_error',
)

#### Estimation of generalization performance  

In [19]:
%%time
nested_score = cross_val_score(gs, X=X, y=y, cv=gen_cv,
                               scoring='neg_mean_squared_error')
print(nested_score)
print(np.sqrt(-nested_score.mean()))
# neg_mean_squared_error: Regression
# accuracy: clasify

[-0.00207529 -0.0013367  -0.00105015 -0.00108188]
0.03722908382895541
CPU times: user 3min 11s, sys: 3.85 ms, total: 3min 11s
Wall time: 3min 11s


**Ans. 0.037** (RMSE from the nested cross-validation output above)  

#### Cross-validation to obtain the model with the best hyperparameter set (best estimator)  
- Note: `gs_best` is already fit to the whole data (X, y) by `gs.fit(X, y)`, because `GridSearchCV` refits the best model on all the data by default (`refit=True`).  

In [20]:
%%time
gs.fit(X, y)
gs_best = gs.best_estimator_

CPU times: user 1min 28s, sys: 0 ns, total: 1min 28s
Wall time: 1min 28s


In [21]:
# Show the best pipeline found by the grid search, including its selected hyperparameters.
print(gs_best)

Pipeline(steps=[('scaler', MinMaxScaler()),
                ('svr', SVR(C=10000.0, epsilon=0.01, gamma=0.1))])


**Ans. C=10000, epsilon=0.01, gamma=0.1**  

#### Save the best predictor  

In [22]:
# The tag names this notebook (AI-10 Practice 1) so the saved file can be traced back to it.
# It was 'ai-10-assign1', which did not match the notebook title or the recorded output file name.
tag = 'ai-10-practice1'
model_file = 'svr_best_{}.joblib'.format(tag)
# Persist the best pipeline to disk; the call's return value is left as the cell's
# last expression so the notebook displays it.
dump(gs_best, model_file)

['svr_best_ai-10-practice1.joblib']