In [49]:
# 기본 패키지
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# 이항로지스틱 패키지
from sklearn.linear_model import LogisticRegression # 이항 로지스틱 회귀분석을 위한 패키지
import statsmodels.api as sm
from sklearn.model_selection import train_test_split # 데이터 분할을 위한 패키지
from sklearn.metrics import accuracy_score, confusion_matrix # 분석 결과 확인을 위한 패키지
from sklearn.metrics import roc_curve, roc_auc_score # ROC 커브를 그리기 위한 패키지

# CART
from sklearn import tree # CART 분석을 하기 위한 패키지
from sklearn.tree import DecisionTreeClassifier, export_text #의사결정나무 분류 및 규칙 확인을 위한 패키지
from sklearn.tree import DecisionTreeRegressor # 의사결정나무 회귀
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score # 의사결정나무 분류모형 성능 평가

# 랜덤포레스트
from sklearn.ensemble import RandomForestClassifier # 랜덤 포레스트를 위한 패키지(분류)
from sklearn.ensemble import RandomForestRegressor # 랜덤 포레스트를 위한 패키지(회귀)
from sklearn.metrics import accuracy_score, confusion_matrix # 분석 결과 확인을 위한 패키지

# 서포트 벡터 머신 (SVM)
from sklearn import svm

# k-fold 교차 검증
from sklearn.model_selection import KFold

from IPython.display import display, HTML
import warnings
warnings.filterwarnings(action='ignore')

In [50]:
svm = pd.read_csv('c:/data/pj/food_clean+remove_outlier_수정.csv')
svm.head()

Unnamed: 0,Temp,Humidity,Weight,Defect
0,26.4,42.9,100.190254,0
1,24.3,57.7,98.952307,0
2,23.5,44.3,99.997581,0
3,26.5,44.4,100.536753,0
4,27.2,55.9,98.838464,1


In [51]:
x = svm.iloc[:, 0:-1]
y = svm.iloc[:, -1]
x
#y.head()

Unnamed: 0,Temp,Humidity,Weight
0,26.4,42.9,100.190254
1,24.3,57.7,98.952307
2,23.5,44.3,99.997581
3,26.5,44.4,100.536753
4,27.2,55.9,98.838464
...,...,...,...
391,23.1,58.1,98.575073
392,25.5,53.0,100.031100
393,24.2,46.3,100.401429
394,24.0,50.7,99.249985


In [57]:
## Data split (train_test_split)
# Hold out 30% of the rows as the test set; the fixed random_state keeps the
# split repeatable between runs.
# NOTE(review): the split is not stratified on y — with few positive rows,
# consider passing stratify=y; confirm with the analysis owner.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=1234,
)
# Number of rows in the training target.
y_train.shape[0]

277

In [60]:
# Train the SVM classifier on the training split.

# Imported in this cell so that `svm` refers to the scikit-learn module here.
from sklearn import svm

# Linear kernel; class_weight='balanced' requests class weights adjusted for
# class frequency. fit() returns the fitted estimator itself.
svm_model = svm.SVC(class_weight='balanced', kernel='linear').fit(x_train, y_train)
svm_model

In [61]:
# Predict a class label for every row of the held-out test features.
y_pred = svm_model.predict(X=x_test)
y_pred

array([0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0])

In [62]:
# Model performance check: print the test-set accuracy, then the confusion
# matrix, each item on its own line.
print(
    f'Accuracy: {accuracy_score(y_test, y_pred)}',
    'Confusion Matrix:',
    confusion_matrix(y_test, y_pred),
    sep='\n',
)

Accuracy: 0.7142857142857143
Confusion Matrix:
[[81 26]
 [ 8  4]]


In [63]:
# Classification metrics on the test set: accuracy, precision, recall, F1.
# The scorer functions are the ones imported in the notebook's first cell;
# each is called as scorer(y_test, y_pred).
accuracy, precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (accuracy_score, precision_score, recall_score, f1_score)
)

In [64]:
# Display the accuracy computed from y_test and y_pred.
accuracy

0.7142857142857143

In [65]:
# Display the precision computed from y_test and y_pred.
precision

0.13333333333333333

In [66]:
# Display the recall computed from y_test and y_pred.
recall

0.3333333333333333

In [67]:
# Display the F1 score computed from y_test and y_pred.
f1

0.19047619047619047

In [68]:
# Collect the four evaluation metrics into a two-column summary table
# (one row per metric: its label and its value).
metrics_df = pd.DataFrame(
    [
        ('Accuracy', accuracy),
        ('Precision', precision),
        ('Recall', recall),
        ('F1 Score', f1),
    ],
    columns=['Metric', 'Value'],
)

metrics_df

Unnamed: 0,Metric,Value
0,Accuracy,0.714286
1,Precision,0.133333
2,Recall,0.333333
3,F1 Score,0.190476
