### Machine_Learning Classification

In [1]:
from sklearn.datasets import load_iris
import pandas as pd

#### data loading

In [2]:
### Data loading
iris = load_iris()
iris_data = iris.data
iris_label = iris.target

iris_df = pd.DataFrame(data = iris_data, columns = iris.feature_names)
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


#### data preprocessing

In [None]:
### Data preprocessing
### Standard Scaler
### 평균0, 분산 1인 값으로 변환시켜 '가우시안 정규 분포'를 가질 수 있도록 변환함
### 서포트 벡터머신(Support Vector Machine), 선형회귀(Linear Regression), 로지스틱 회귀(Logistic Regression)은 
### 데이터가 가우시안 분포를 가지고 있다고 가정하고 구현되었기 때문에 예측성능을 향상시킬 수 있음

In [3]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(iris_df)
iris_scaled = scaler.transform(iris_df)

iris_scaled_df = pd.DataFrame(data=iris_scaled, columns=iris.feature_names)
iris_scaled_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444
2,-1.385353,0.328414,-1.397064,-1.315444
3,-1.506521,0.098217,-1.283389,-1.315444
4,-1.021849,1.249201,-1.340227,-1.315444


In [4]:
iris_scaled_df['label'] = iris_label
iris_scaled_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label
0,-0.900681,1.019004,-1.340227,-1.315444,0
1,-1.143017,-0.131979,-1.340227,-1.315444,0
2,-1.385353,0.328414,-1.397064,-1.315444,0
3,-1.506521,0.098217,-1.283389,-1.315444,0
4,-1.021849,1.249201,-1.340227,-1.315444,0


#### dataset preparing (seperating)

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
X_train_, X_val, y_train_, y_val = train_test_split(iris_scaled, iris_label, test_size = 0.2, random_state = 42, stratify= iris_label )

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_train_, y_train_, test_size = 0.2, random_state = 42, stratify= y_train_ )

In [8]:
print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)
print(X_test.shape, y_test.shape)

(96, 4) (96,)
(30, 4) (30,)
(24, 4) (24,)


#### modeling

In [9]:
from sklearn.tree import DecisionTreeClassifier

In [10]:
dt_clf = DecisionTreeClassifier()
dt_clf.fit(X_train, y_train)
pred = dt_clf.predict(X_val)

#### evaluation

In [11]:
from sklearn.metrics import confusion_matrix, classification_report
# 혼동 행렬 생성
confusion = confusion_matrix(y_val, pred)

print("Confusion Matrix:")
print(confusion)

# 분류 보고서 출력
report = classification_report(y_val, pred)
print("Classification Report:")
print(report)

Confusion Matrix:
[[10  0  0]
 [ 0  9  1]
 [ 0  0 10]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      0.90      0.95        10
           2       0.91      1.00      0.95        10

    accuracy                           0.97        30
   macro avg       0.97      0.97      0.97        30
weighted avg       0.97      0.97      0.97        30



#### final evaluation

In [12]:
model_pred = dt_clf.predict(X_test)

print("Confusion Matrix:")
print(confusion)

# 분류 보고서 출력
report = classification_report(y_test, model_pred)
print("Classification Report:")
print(report)

Confusion Matrix:
[[10  0  0]
 [ 0  9  1]
 [ 0  0 10]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         8
           1       1.00      0.88      0.93         8
           2       0.89      1.00      0.94         8

    accuracy                           0.96        24
   macro avg       0.96      0.96      0.96        24
weighted avg       0.96      0.96      0.96        24



#### K-fold cross validation

In [13]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
import numpy as np

iris = load_iris()
features = iris.data
label = iris.target
dt_clf = DecisionTreeClassifier(random_state=162)

kfold = KFold(n_splits=5)
cv_accuracy = []

n_iter = 0
for train_index, test_index in kfold.split(features):
    X_train, X_test = features[train_index], features[test_index]
    y_train, y_test = label[train_index], label[test_index]

    dt_clf.fit(X_train, y_train)
    pred = dt_clf.predict(X_test)
    
    n_iter +=1

    accuracy = np.round(accuracy_score(y_test, pred), 4)
    cv_accuracy.append(accuracy)
    print('{}번째 Accuracy {}'.format(n_iter, accuracy))

print('평균 Accuracy ', np.round(np.mean(cv_accuracy),3))

1번째 Accuracy 1.0
2번째 Accuracy 0.9667
3번째 Accuracy 0.9
4번째 Accuracy 0.9333
5번째 Accuracy 0.8
평균 Accuracy  0.92


In [14]:
### cross_val_score()
from sklearn.model_selection import cross_val_score

iris = load_iris()
features = iris.data
label = iris.target
dt_clf = DecisionTreeClassifier(random_state=162)

scores = cross_val_score(dt_clf, features, label, scoring='accuracy', cv=3)
print('교차검증', np.round(scores,4))
print('평균검증', np.round(np.mean(scores),4))

교차검증 [0.98 0.94 0.98]
평균검증 0.9667


#### 하이퍼파라미터 튜닝 GridSearchCV

In [15]:
from sklearn.model_selection import GridSearchCV

grid_parameters = {'max_depth':[1, 2, 3], 'min_samples_split':[2, 3]} 
grid_dtree = GridSearchCV(dt_clf, param_grid= grid_parameters, cv=3, refit=True)

grid_dtree.fit(X_train, y_train)
score_df = pd.DataFrame(grid_dtree.cv_results_)
score_df[['params', 'mean_test_score', 'rank_test_score', 'split0_test_score','split1_test_score','split2_test_score']]

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score
0,"{'max_depth': 1, 'min_samples_split': 2}",0.833333,5,0.85,0.825,0.825
1,"{'max_depth': 1, 'min_samples_split': 3}",0.833333,5,0.85,0.825,0.825
2,"{'max_depth': 2, 'min_samples_split': 2}",0.958333,1,1.0,0.95,0.925
3,"{'max_depth': 2, 'min_samples_split': 3}",0.958333,1,1.0,0.95,0.925
4,"{'max_depth': 3, 'min_samples_split': 2}",0.95,3,0.975,0.95,0.925
5,"{'max_depth': 3, 'min_samples_split': 3}",0.95,3,0.975,0.95,0.925


#### evaluation 실습

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report
import numpy as np

# 예제 데이터 생성
np.random.seed(0)
X = np.random.rand(10000, 2)
y = (X[:, 0] + X[:, 1] > 1).astype(int)  # 간단한 결정 경계 생성

# 데이터를 훈련 세트와 테스트 세트로 분할
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 모델 훈련
model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)

# 모델 예측
y_pred = model.predict(X_test)

# 혼동 행렬 생성
confusion = confusion_matrix(y_test, y_pred)

print("Confusion Matrix:")
print(confusion)


Confusion Matrix:
[[993  11]
 [ 10 986]]
