# SVM(Support Vector Machine)

In [1]:
import pandas as pd
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

In [3]:
df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
df['target'] = cancer.target
df.head(3)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0


In [4]:
df.shape

(569, 31)

In [6]:
# y값의 분포
df.target.value_counts()

1    357
0    212
Name: target, dtype: int64

- 표준화

In [7]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
cancer_std = scaler.fit_transform(cancer.data)
df = pd.DataFrame(cancer_std, columns=cancer.feature_names)
df.describe()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,-3.153111e-15,-6.568462e-15,-6.993039e-16,-8.553985e-16,6.081447e-15,-1.136369e-15,-2.997017e-16,1.023981e-15,-1.860648e-15,-1.504752e-15,...,-2.297713e-15,1.742016e-15,-1.198807e-15,6.118909e-16,-5.094929e-15,-2.122887e-15,6.118909e-16,-1.998011e-16,-2.422589e-15,2.497514e-15
std,1.00088,1.00088,1.00088,1.00088,1.00088,1.00088,1.00088,1.00088,1.00088,1.00088,...,1.00088,1.00088,1.00088,1.00088,1.00088,1.00088,1.00088,1.00088,1.00088,1.00088
min,-2.029648,-2.229249,-1.984504,-1.454443,-3.112085,-1.610136,-1.114873,-1.26182,-2.744117,-1.819865,...,-1.726901,-2.223994,-1.693361,-1.222423,-2.682695,-1.443878,-1.305831,-1.745063,-2.16096,-1.601839
25%,-0.6893853,-0.7259631,-0.6919555,-0.6671955,-0.7109628,-0.747086,-0.7437479,-0.7379438,-0.7032397,-0.7226392,...,-0.6749213,-0.7486293,-0.6895783,-0.6421359,-0.6912304,-0.6810833,-0.7565142,-0.7563999,-0.6418637,-0.6919118
50%,-0.2150816,-0.1046362,-0.23598,-0.2951869,-0.03489108,-0.2219405,-0.3422399,-0.3977212,-0.0716265,-0.1782793,...,-0.2690395,-0.04351564,-0.2859802,-0.3411812,-0.04684277,-0.2695009,-0.2182321,-0.2234689,-0.1274095,-0.2164441
75%,0.4693926,0.5841756,0.4996769,0.3635073,0.636199,0.4938569,0.5260619,0.6469351,0.5307792,0.4709834,...,0.5220158,0.6583411,0.540279,0.3575891,0.5975448,0.5396688,0.5311411,0.71251,0.4501382,0.4507624
max,3.971288,4.651889,3.97613,5.250529,4.770911,4.568425,4.243589,3.92793,4.484751,4.910919,...,4.094189,3.885905,4.287337,5.930172,3.955374,5.112877,4.700669,2.685877,6.046041,6.846856


- 표준화된 데이터로 train/test dataset 분리

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    cancer_std, cancer.target, stratify=cancer.target, test_size=0.2, random_state=2022
)

- SVM 모델 생성, 학습, 평가

In [10]:
from sklearn.svm import SVC
svc = SVC(random_state=2022)
svc.fit(X_train, y_train)
svc.score(X_train, y_train)

0.9824175824175824

In [11]:
# 하이퍼 파라메터
svc.get_params()

{'C': 1.0,
 'break_ties': False,
 'cache_size': 200,
 'class_weight': None,
 'coef0': 0.0,
 'decision_function_shape': 'ovr',
 'degree': 3,
 'gamma': 'scale',
 'kernel': 'rbf',
 'max_iter': -1,
 'probability': False,
 'random_state': 2022,
 'shrinking': True,
 'tol': 0.001,
 'verbose': False}

- 최적의 하이퍼 파라메터

In [12]:
params = {'C': [0.1, 1, 10]}

In [13]:
from sklearn.model_selection import GridSearchCV
grid_svc = GridSearchCV(svc, params, scoring='accuracy', cv=5)
grid_svc.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=SVC(random_state=2022),
             param_grid={'C': [0.1, 1, 10]}, scoring='accuracy')

In [14]:
grid_svc.best_params_

{'C': 1}

In [15]:
params = {'C': [0.5, 0.8, 1, 3, 5]}
grid_svc = GridSearchCV(svc, params, scoring='accuracy', cv=5)
grid_svc.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=SVC(random_state=2022),
             param_grid={'C': [0.5, 0.8, 1, 3, 5]}, scoring='accuracy')

In [16]:
grid_svc.best_params_

{'C': 3}

In [17]:
params = {'C': [2, 3, 4]}
grid_svc = GridSearchCV(svc, params, scoring='accuracy', cv=5)
grid_svc.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=SVC(random_state=2022),
             param_grid={'C': [2, 3, 4]}, scoring='accuracy')

In [18]:
grid_svc.best_params_

{'C': 2}

In [19]:
best_svc = grid_svc.best_estimator_
best_svc.score(X_test, y_test)

0.9824561403508771