In [30]:
import numpy as np
import pandas as pd

print('numpy version - ', np.__version__)
print('pandas version - ', pd.__version__)


import sklearn
from sklearn.datasets import load_iris, load_breast_cancer

print('sklearn version - ', sklearn.__version__)

import seaborn as sns

# 데이터 분류함수
from sklearn.model_selection import train_test_split, KFold , StratifiedKFold , cross_val_score, cross_validate, GridSearchCV 
from sklearn.tree            import DecisionTreeClassifier
from sklearn.metrics         import accuracy_score

from sklearn.preprocessing   import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler 

numpy version -  1.20.3
pandas version -  1.3.4
sklearn version -  0.24.2


## 스케일링 전 모델 학습 

In [63]:
sample_frm = pd.read_csv('https://raw.githubusercontent.com/rasbt/pattern_classification/master/data/wine_data.csv' , 
                                   header  = None , 
                                   usecols = [0, 2, 13])

sample_frm.columns = ['target', 'x', 'y']
sample_frm_data = sample_frm.drop(['target'], axis=1)


In [64]:
print('01')
x_train, x_test ,y_train, y_test = train_test_split(sample_frm_data,
                                                    sample_frm.target,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=100)

01


In [65]:
x_train.shape, x_test.shape ,y_train.shape, y_test.shape

((142, 2), (36, 2), (142,), (36,))

In [66]:
print('02')

sample_frm_model = DecisionTreeClassifier()
sample_frm_model.fit(x_train, y_train) #학습데이터만

02


DecisionTreeClassifier()

In [67]:
print('03')
y_pred = sample_frm_model.predict(x_test) 

print('04')
print('acc - ',accuracy_score(y_test, y_pred))

03
04
acc -  0.7777777777777778


## 스케일링 후 모델 학습 

In [68]:
print('01')
x_train, x_test ,y_train, y_test = train_test_split(sample_frm_data,
                                                    sample_frm.target,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=100)

01


In [69]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((142, 2), (36, 2), (142,), (36,))

In [70]:
sample_dtc_model = DecisionTreeClassifier()
param = {
    'criterion' : ['gini', 'entropy'],
    'max_depth' : [1,2,3],
    'min_samples_split' : [2,3],
    'splitter' : ['random', 'best']
}



In [71]:
print('GridSearchCV를 이용한 최적의 모델을 만드는 방법 - ')
grid_tree = GridSearchCV(sample_dtc_model,
                         param_grid=param,
                         cv = 5,
                         refit=True
                        )

GridSearchCV를 이용한 최적의 모델을 만드는 방법 - 


In [72]:
grid_tree.fit(x_train, y_train)
grid_tree.cv_results_

{'mean_fit_time': array([0.00140824, 0.0011981 , 0.00081825, 0.00100274, 0.0006021 ,
        0.00100179, 0.00082664, 0.00105987, 0.00099964, 0.0009912 ,
        0.00099502, 0.00079803, 0.00102506, 0.00104885, 0.000809  ,
        0.0008213 , 0.00099077, 0.00069652, 0.00099888, 0.00081415,
        0.00101209, 0.00139985, 0.00102029, 0.00100021]),
 'std_fit_time': array([4.89432790e-04, 4.00797513e-04, 4.11281572e-04, 2.58379748e-05,
        4.91616002e-04, 1.58654264e-05, 4.14244179e-04, 6.20204892e-04,
        6.68432158e-05, 3.81414886e-05, 1.02521098e-05, 3.99315132e-04,
        5.96760513e-05, 8.08525738e-05, 4.06048117e-04, 4.12415336e-04,
        3.57464435e-05, 4.04761856e-04, 2.07128533e-05, 4.09107925e-04,
        2.21295599e-05, 4.89706784e-04, 4.12023176e-05, 4.62310777e-07]),
 'mean_score_time': array([0.00080552, 0.00081611, 0.00080738, 0.00037694, 0.00080934,
        0.00079193, 0.00040116, 0.00060673, 0.00038161, 0.00080338,
        0.00059137, 0.00061069, 0.00054855, 0.00

In [73]:
score_frm = pd.DataFrame(grid_tree.cv_results_)
score_frm[['params', 'rank_test_score']]

Unnamed: 0,params,rank_test_score
0,"{'criterion': 'gini', 'max_depth': 1, 'min_sam...",21
1,"{'criterion': 'gini', 'max_depth': 1, 'min_sam...",13
2,"{'criterion': 'gini', 'max_depth': 1, 'min_sam...",24
3,"{'criterion': 'gini', 'max_depth': 1, 'min_sam...",13
4,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",17
5,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",5
6,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",19
7,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",5
8,"{'criterion': 'gini', 'max_depth': 3, 'min_sam...",7
9,"{'criterion': 'gini', 'max_depth': 3, 'min_sam...",1


In [74]:
print('최적의 파라미터 확인 - ')
print(grid_tree.best_params_)

print('최적의 파라미터의 정확도 - ')
print(grid_tree.best_score_)

최적의 파라미터 확인 - 
{'criterion': 'gini', 'max_depth': 3, 'min_samples_split': 2, 'splitter': 'best'}
최적의 파라미터의 정확도 - 
0.8027093596059114


In [75]:
estimator = grid_tree.best_estimator_
y_pred = estimator.predict(x_test)

print(f'acc - {accuracy_score(y_test, y_pred)} \n ')

acc - 0.75 
 


## 표준화 & 정규화

In [113]:
sample_frm = pd.read_csv('https://raw.githubusercontent.com/rasbt/pattern_classification/master/data/wine_data.csv' , 
                                   header  = None , 
                                   usecols = [0, 2, 13])

sample_frm.columns = ['target', 'x', 'y']

In [None]:
print('01')
x_train, x_test ,y_train, y_test = train_test_split(s_score_frm,
                                                    sample_frm_all.target,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=100)

In [114]:
x_train.shape, x_test.shape ,y_train.shape, y_test.shape

((142, 2), (36, 2), (142,), (36,))

## 표준화

In [116]:
print('학습데이터의 정규화 - ')
scaler = MinMaxScaler()
scaler.fit(x_train)
m_train_scaler = scaler.transform(x_train)

print('테스트데이터의 정규화 - ')
m_test_scaler = scaler.transform(x_test)

학습데이터의 정규화 - 
테스트데이터의 정규화 - 


In [88]:
score_frm = pd.DataFrame(data  = sample_frm,
                        columns=['x', 'y']
                       )

s_scaler = StandardScaler()
s_score_frm = s_scaler.fit_transform(score_frm)
s_score_frm = pd.DataFrame(data=s_score_frm,
                         columns=['x', 'y']
                         )
s_score_frm

표준화


Unnamed: 0,x,y
0,-0.562250,1.013009
1,-0.499413,0.965242
2,0.021231,1.395148
3,-0.346811,2.334574
4,0.227694,-0.037874
...,...,...
173,2.974543,-0.021952
174,1.412609,0.009893
175,1.744744,0.280575
176,0.227694,0.296498


In [103]:
print('01')
x_train, x_test ,y_train, y_test = train_test_split(s_score_frm,
                                                    sample_frm_all.target,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=100)

01


In [104]:
x_train.shape, x_test.shape ,y_train.shape, y_test.shape

((142, 2), (36, 2), (142,), (36,))

In [105]:
print('02')

sample_frm_model = DecisionTreeClassifier()
sample_frm_model.fit(x_train, y_train) #학습데이터만

02


DecisionTreeClassifier()

In [106]:
print('03')
y_pred = sample_frm_model.predict(x_test) 

print('04')
print('acc - ',accuracy_score(y_test, y_pred))

03
04
acc -  0.75


## 정규화

In [117]:
print('01')
x_train, x_test ,y_train, y_test = train_test_split(s_score_frm,
                                                    sample_frm_all.target,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=100)

01


In [118]:
x_train.shape, x_test.shape ,y_train.shape, y_test.shape

((142, 2), (36, 2), (142,), (36,))

In [126]:
print('02 \n 학습데이터의 정규화 - ')
scaler = MinMaxScaler()
scaler.fit(x_train)
m_train_scaler = scaler.transform(x_train)

print('테스트데이터의 정규화 - ')
m_test_scaler = scaler.transform(x_test)

02 
 학습데이터의 정규화 - 
테스트데이터의 정규화 - 


In [127]:
# 주말 해결 오류
sample_frm_model = DecisionTreeClassifier()
sample_frm_model.fit(m_train_scaler, m_test_scaler) #학습데이터만

ValueError: Unknown label type: 'continuous-multioutput'

In [123]:
print('03')
y_pred = sample_frm_model.predict(x_test) 

print('04')
print('acc - ',accuracy_score(y_test, y_pred))

03
04
acc -  0.75
