In [9]:
import pandas as pd
import numpy as np

In [10]:
df = pd.read_csv('penguins_size.csv')

# 데이터 확인한 결과 성별에 결측치가 있는 행을 모두 삭제하면, 결측치가 있는 행은 모두 삭제됨!
# sex columns에 .이 찍힌 행이 하나 있어서 삭제해줌
df = df.dropna(subset=['sex'])
df = df[df['sex'] != '.']
df.shape

(333, 7)

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# 레이블 바꾸기 => 0 1
label_encoder = LabelEncoder()
df['species'] = label_encoder.fit_transform(df['species'])

# one-hot encoding
df_encoded = pd.get_dummies(df, columns=['island', 'sex'],dtype=int)
# 다만, get_dummies는 안쓰는게 좋음.
# test_data에서 새로운 값이 발생한다면, col이 하나 추가되어 에러 발생
# 다른 방법을 사용하는게 좋다
# => 해당 컬럼들 + 혹시라도 들어올 예외값을 위한 컬럼을 만들어놓고 분류하는 로직을 만들어놓는게 좋을듯?

# X, y 분리
X = df_encoded.drop('species', axis=1)
y = df['species']

# Train , Test 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scaling 실행 => 스케일링은 MinMax, 훈련 데이터에 적용한 스케일러는 Test에도 동일하게 적용
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
scaler = MinMaxScaler()

# 스케일링할 columns
columns_to_scale = ['culmen_length_mm', 'culmen_depth_mm', 'flipper_length_mm', 'body_mass_g']

# 해당 컬럼들 스케일링 실행
X_train_to_scale = X_train[columns_to_scale]
X_train_scaled = X_train.copy()
X_train_scaled[columns_to_scale] = scaler.fit_transform(X_train_to_scale)

In [12]:
X_train_scaled.head()

Unnamed: 0,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,island_Biscoe,island_Dream,island_Torgersen,sex_FEMALE,sex_MALE
230,0.32,0.071429,0.711864,0.541667,1,0,0,1,0
84,0.189091,0.559524,0.322034,0.180556,0,1,0,1,0
303,0.650909,0.333333,0.881356,0.736111,1,0,0,0,1
22,0.138182,0.72619,0.288136,0.305556,1,0,0,1,0
29,0.305455,0.690476,0.135593,0.347222,1,0,0,0,1


##### X_test 데이터 처리

In [13]:
# train 데이터에 적용된 scaler을 그래도 사용
X_test_to_scale = X_test[columns_to_scale]
X_test_scaled = X_test.copy()
X_test_scaled[columns_to_scale] = scaler.fit_transform(X_test_to_scale)

In [14]:
X_test_scaled.head()

Unnamed: 0,culmen_length_mm,culmen_depth_mm,flipper_length_mm,body_mass_g,island_Biscoe,island_Dream,island_Torgersen,sex_FEMALE,sex_MALE
30,0.209402,0.435897,0.0,0.114754,0,1,0,1,0
317,0.525641,0.166667,0.846154,0.647541,1,0,0,1,0
79,0.320513,0.74359,0.326923,0.360656,0,0,1,0,1
201,0.649573,0.512821,0.384615,0.254098,0,1,0,1,0
63,0.277778,0.628205,0.269231,0.377049,1,0,0,0,1


# SVM

### 베이스 모델 테스트

In [15]:
from sklearn.svm import SVC
svc = SVC()
svc.fit(X_train_scaled, y_train)

### 평가

In [16]:
from sklearn.metrics import accuracy_score
y_pred = svc.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"정확도: {accuracy}")

정확도: 1.0


### 최적 파라미터를 찾기 위한 Grid Search

In [17]:
from sklearn.model_selection import GridSearchCV

# 파라미터 그리드 구성
param_grid = {
    'C': [0.1, 1, 10],  
    'gamma': [1, 0.1, 0.01], 
    'kernel': ['rbf', 'linear']
}


grid_search = GridSearchCV(SVC(), param_grid, cv =5, refit=True, verbose=3)
# cv => 교차 검증 5번 수행
# refit => 재학습 O
# verbose => 출력 메시지 양 조절

# 그리드 서치 실행
grid_search.fit(X_train_scaled, y_train)

# 최적 매개변수 도출
print("최적의 매개변수:", grid_search.best_params_)

# 그 최적 매개변수로 훈련된 svc로 predict 실행
best_svc = grid_search.best_estimator_
y_pred = best_svc.predict(X_test_scaled)

# 정확도 산출
accuracy = accuracy_score(y_test, y_pred)
print(f"최적 모델 정확도: {accuracy}")

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.815 total time=   0.0s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.792 total time=   0.0s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.811 total time=   0.0s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.830 total time=   0.0s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.792 total time=   0.0s
[CV 1/5] END .....C=0.1, gamma=1, kernel=linear;, score=1.000 total time=   0.0s
[CV 2/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.811 total time=   0.0s
[CV 3/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.811 total time=   0.0s
[CV 4/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.981 total time=   0.0s
[CV 5/5] END .....C=0.1, gamma=1, kernel=linear;, score=0.792 total time=   0.0s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.685 total time=   0.0s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;