In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import joblib
#데이터 전처리 및 train, test 분할
data = pd.read_csv('https://raw.githubusercontent.com/eunjong147/tech/main/sklearn/testgo.csv')
data = np.round(data, decimals=5)
feature_list = list(data)[:-1]
data_input = data[feature_list].to_numpy()
data_target = data['C'].to_numpy()
train_input, test_input, train_target, test_target = train_test_split(data_input, data_target)

#KNN 알고리즘
from sklearn.neighbors import KNeighborsClassifier
kn = KNeighborsClassifier(n_neighbors=3)
kn.fit(train_input, train_target)
print("KNN : ", kn.score(test_input, test_target))

#Decision Tree 알고리즘
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(random_state=0)
tree.fit(train_input, train_target)
print("Decision Tree : ", tree.score(test_input, test_target))

#Naive Bayes 알고리즘
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(train_input, train_target)
print("Naive Bayes : ", gnb.score(test_input, test_target))

#Random forest 알고리즘
from sklearn.ensemble import RandomForestClassifier
ranf = RandomForestClassifier(max_depth=2, random_state=0)
ranf.fit(train_input, train_target)
print("Random Forest : ", ranf.score(test_input, test_target))

#선형회귀
from sklearn.linear_model import LinearRegression
line = LinearRegression()
line.fit(train_input, train_target)
print("Linear : ", line.score(test_input, test_target))

#SVM(Support Vector Machine 알고리즘
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
#평균을 제거하고 데이터를 단위 분산으로 조정한다. 그러나 이상치가 있다면 평균과 표준편차에 영향을 미쳐 변환된 데이터의 확산은 매우 달라지게 된다.
#따라서 이상치가 있는 경우 균형 잡힌 척도를 보장할 수 없다.
from sklearn.preprocessing import StandardScaler
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(train_input, train_target)
print("SVM1 : " ,clf.score(test_input, test_target))

#SVM알고리즘 2
from sklearn import svm
clf2= svm.SVC(kernel='linear')
clf2.fit(train_input, train_target)
print("SVM2 : ", clf2.score(test_input, test_target))

KNN :  1.0
Decision Tree :  0.9985734664764622
Naive Bayes :  0.9985734664764622
Random Forest :  0.6162624821683309
Linear :  0.9605972177328244
SVM1 :  1.0
SVM2 :  1.0


## 모델 성능 세부 평가

* Accuracy: 모든 분류 건수 중에서 분류기가 몇개의 정답을 맞혔는가 (맞거나 틀리다고 정확히 분류했는가)
* Recall: 맞다고 분류해야 하는 건수 중에서 분류기가 몇개나 제대로 분류했는가
* Precision: 분류기가 맞다고 분류한 건수 중에서 실제로 맞는 건수가 몇개나 되는가
**Recall과 Precision은 상충하는 개념이기 때문에 하나가 높아지면 다른 하나는 낮아진다.
* F1 score: Recall과 Precision의 균형값 (조화평균)

In [36]:
# 평가를 위한 cross_validate를 가져오고, 교차검증을 위한 KFold를 가져옴
from sklearn.model_selection import cross_validate, KFold
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

# 교차검증을 설정합니다.
# n_splits=20의 의미는 전체의 19/20은 훈련, 1/20은 테스트 데이터로 사용한다는 의미입니다.
k_fold = KFold(n_splits=20, random_state=1, shuffle=True)

# 평가지표를 객체로 만들어줍니다.
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score, average = 'micro'),
           'recall' : make_scorer(recall_score, average = 'micro'), 
           'f1_score' : make_scorer(f1_score, average = 'micro'),
          }

knn_result = cross_validate(kn, train_input, train_target, cv=k_fold, scoring=scoring)
tree_result = cross_validate(tree, train_input, train_target, cv=k_fold, scoring=scoring)
NB_result = cross_validate(gnb, train_input, train_target, cv=k_fold, scoring=scoring)
RF_result = cross_validate(ranf, train_input, train_target, cv=k_fold, scoring=scoring)

#선형회귀의 경우 분류척도로 평가할수 없음.
#Line_result = cross_validate(line, train_input, train_target, cv=k_fold, scoring=scoring)

SVM1_result = cross_validate(clf, train_input, train_target, cv=k_fold, scoring=scoring)
SVM2_result = cross_validate(clf2, train_input, train_target, cv=k_fold, scoring=scoring)

In [37]:
print("KNN accuracy : ", knn_result["test_accuracy"].mean())
print("KNN precision : ", knn_result["test_precision"].mean())
print("KNN recall : ", knn_result["test_recall"].mean())
print("KNN f1_score : ", knn_result["test_f1_score"].mean())
print("KNN fit_time : ", knn_result["fit_time"].mean())
print("KNN score_time : ", knn_result["score_time"].mean())
print()
print("Decision Tree accuracy : ", tree_result["test_accuracy"].mean())
print("Decision Tree precision : ", tree_result["test_precision"].mean())
print("Decision Tree recall : ", tree_result["test_recall"].mean())
print("Decision Tree f1_score : ", tree_result["test_f1_score"].mean())
print("Decision Tree fit_time : ", tree_result["fit_time"].mean())
print("Decision Tree score_time : ", tree_result["score_time"].mean())
print()
print("Naive Bayes accuracy : ", NB_result["test_accuracy"].mean())
print("Naive Bayes precision : ", NB_result["test_precision"].mean())
print("Naive Bayes recall : ", NB_result["test_recall"].mean())
print("Naive Bayes f1_score : ", NB_result["test_f1_score"].mean())
print("Naive Bayes fit_time : ", NB_result["fit_time"].mean())
print("Naive Bayes score_time : ", NB_result["score_time"].mean())
print()
print("Random Forest accuracy : ", RF_result["test_accuracy"].mean())
print("Random Forest precision : ", RF_result["test_precision"].mean())
print("Random Forest recall : ", RF_result["test_recall"].mean())
print("Random Forest f1_score : ", RF_result["test_f1_score"].mean())
print("Random Forest fit_time : ", RF_result["fit_time"].mean())
print("Random Forest score_time : ", RF_result["score_time"].mean())
print()
print("SVM1 accuracy : ", SVM1_result["test_accuracy"].mean())
print("SVM1 precision : ", SVM1_result["test_precision"].mean())
print("SVM1 recall : ", SVM1_result["test_recall"].mean())
print("SVM1 f1_score : ", SVM1_result["test_f1_score"].mean())
print("SVM1 fit_time : ", SVM1_result["fit_time"].mean())
print("SVM1 score_time : ", SVM1_result["score_time"].mean())
print()
print("SVM2 accuracy : ", SVM2_result["test_accuracy"].mean())
print("SVM2 precision : ", SVM2_result["test_precision"].mean())
print("SVM2 recall : ", SVM2_result["test_recall"].mean())
print("SVM2 f1_score : ", SVM2_result["test_f1_score"].mean())
print("SVM2 fit_time : ", SVM2_result["fit_time"].mean())
print("SVM2 score_time : ", SVM2_result["score_time"].mean())

KNN accuracy :  0.999052111410602
KNN precision :  0.999052111410602
KNN recall :  0.999052111410602
KNN f1_score :  0.999052111410602
KNN fit_time :  0.017580187320709227
KNN score_time :  0.007234561443328858

Decision Tree accuracy :  0.9966756513926326
Decision Tree precision :  0.9966756513926326
Decision Tree recall :  0.9966756513926326
Decision Tree f1_score :  0.9966756513926326
Decision Tree fit_time :  0.05966956615447998
Decision Tree score_time :  0.002145421504974365

Naive Bayes accuracy :  0.9976235399820306
Naive Bayes precision :  0.9976235399820306
Naive Bayes recall :  0.9976235399820306
Naive Bayes f1_score :  0.9976235399820306
Naive Bayes fit_time :  0.0026941299438476562
Naive Bayes score_time :  0.0025946974754333498

Random Forest accuracy :  0.6091778975741239
Random Forest precision :  0.6091778975741239
Random Forest recall :  0.6091778975741239
Random Forest f1_score :  0.6091778975741239
Random Forest fit_time :  0.27723791599273684
Random Forest score_ti