In [4]:
import numpy as np
import pandas as pd

print('numpy version - ', np.__version__)
print('pandas version - ', pd.__version__)


import sklearn
from sklearn.datasets import load_iris, load_breast_cancer

print('sklearn version - ', sklearn.__version__)

import seaborn as sns

# 데이터 분류함수
from sklearn.model_selection import train_test_split, KFold , StratifiedKFold , cross_val_score, cross_validate, GridSearchCV 
from sklearn.tree            import DecisionTreeClassifier
from sklearn.metrics         import accuracy_score

from sklearn.preprocessing   import LabelEncoder, OneHotEncoder, StandardScaler, MinMaxScaler 

numpy version -  1.20.3
pandas version -  1.3.4
sklearn version -  0.24.2


## 스케일링 전 모델 학습 

In [5]:
sample_frm = pd.read_csv('https://raw.githubusercontent.com/rasbt/pattern_classification/master/data/wine_data.csv' , 
                                   header  = None , 
                                   usecols = [0, 2, 13])

sample_frm.columns = ['target', 'x', 'y']
sample_frm_data = sample_frm.drop(['target'], axis=1)


In [6]:
print('01')
x_train, x_test ,y_train, y_test = train_test_split(sample_frm_data,
                                                    sample_frm.target,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=100)

01


In [7]:
x_train.shape, x_test.shape ,y_train.shape, y_test.shape

((142, 2), (36, 2), (142,), (36,))

In [8]:
print('02')

sample_frm_model = DecisionTreeClassifier()
sample_frm_model.fit(x_train, y_train) #학습데이터만

02


DecisionTreeClassifier()

In [9]:
print('03')
y_pred = sample_frm_model.predict(x_test) 

print('04')
print('acc - ',accuracy_score(y_test, y_pred))

03
04
acc -  0.75


## 스케일링 후 모델 학습 

In [10]:
print('01')
x_train, x_test ,y_train, y_test = train_test_split(sample_frm_data,
                                                    sample_frm.target,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=100)

01


In [11]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((142, 2), (36, 2), (142,), (36,))

In [12]:
sample_dtc_model = DecisionTreeClassifier()
param = {
    'criterion' : ['gini', 'entropy'],
    'max_depth' : [1,2,3],
    'min_samples_split' : [2,3],
    'splitter' : ['random', 'best']
}



In [13]:
print('GridSearchCV를 이용한 최적의 모델을 만드는 방법 - ')
grid_tree = GridSearchCV(sample_dtc_model,
                         param_grid=param,
                         cv = 5,
                         refit=True
                        )

GridSearchCV를 이용한 최적의 모델을 만드는 방법 - 


In [14]:
grid_tree.fit(x_train, y_train)
grid_tree.cv_results_

{'mean_fit_time': array([0.0028048 , 0.00200868, 0.0022471 , 0.00195894, 0.00181274,
        0.00140152, 0.00219984, 0.00159516, 0.00159831, 0.00149431,
        0.00172143, 0.00176091, 0.00124569, 0.00212026, 0.00119247,
        0.00138845, 0.00166521, 0.00159187, 0.00181785, 0.00173855,
        0.00133176, 0.0014142 , 0.00128403, 0.00171785]),
 'std_fit_time': array([9.72792744e-04, 3.27833782e-05, 6.82454815e-04, 5.34400928e-04,
        4.10040183e-04, 4.87235293e-04, 1.16179157e-03, 4.90266301e-04,
        8.03629995e-04, 6.13948990e-04, 6.35365740e-04, 5.42623598e-04,
        3.86057723e-04, 4.80071661e-04, 3.99648445e-04, 5.10615936e-04,
        5.76813664e-04, 4.63509839e-04, 4.15262684e-04, 6.20068909e-04,
        4.44822046e-04, 4.62325817e-04, 3.97354661e-04, 3.97442224e-04]),
 'mean_score_time': array([0.00117097, 0.00085316, 0.00120091, 0.00102367, 0.0012804 ,
        0.00111828, 0.00134606, 0.0010077 , 0.00100322, 0.00099969,
        0.00090156, 0.00123825, 0.00109577, 0.00

In [15]:
score_frm = pd.DataFrame(grid_tree.cv_results_)
score_frm[['params', 'rank_test_score']]

Unnamed: 0,params,rank_test_score
0,"{'criterion': 'gini', 'max_depth': 1, 'min_sam...",21
1,"{'criterion': 'gini', 'max_depth': 1, 'min_sam...",13
2,"{'criterion': 'gini', 'max_depth': 1, 'min_sam...",24
3,"{'criterion': 'gini', 'max_depth': 1, 'min_sam...",13
4,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",19
5,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",6
6,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",20
7,"{'criterion': 'gini', 'max_depth': 2, 'min_sam...",6
8,"{'criterion': 'gini', 'max_depth': 3, 'min_sam...",17
9,"{'criterion': 'gini', 'max_depth': 3, 'min_sam...",1


In [16]:
print('최적의 파라미터 확인 - ')
print(grid_tree.best_params_)

print('최적의 파라미터의 정확도 - ')
print(grid_tree.best_score_)

최적의 파라미터 확인 - 
{'criterion': 'gini', 'max_depth': 3, 'min_samples_split': 2, 'splitter': 'best'}
최적의 파라미터의 정확도 - 
0.8027093596059114


In [17]:
estimator = grid_tree.best_estimator_
y_pred = estimator.predict(x_test)

print(f'acc - {accuracy_score(y_test, y_pred)} \n ')

acc - 0.75 
 


## 표준화 & 정규화

In [18]:
sample_frm = pd.read_csv('https://raw.githubusercontent.com/rasbt/pattern_classification/master/data/wine_data.csv' , 
                                   header  = None , 
                                   usecols = [0, 2, 13])

sample_frm.columns = ['target', 'x', 'y']

In [22]:
print('01')
x_train, x_test ,y_train, y_test = train_test_split(sample_frm,
                                                    sample_frm.target,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=100)

01


In [23]:
x_train.shape, x_test.shape ,y_train.shape, y_test.shape

((142, 3), (36, 3), (142,), (36,))

## 표준화

In [24]:
print('학습데이터의 정규화 - ')
scaler = MinMaxScaler()
scaler.fit(x_train)
m_train_scaler = scaler.transform(x_train)

print('테스트데이터의 정규화 - ')
m_test_scaler = scaler.transform(x_test)

학습데이터의 정규화 - 
테스트데이터의 정규화 - 


In [25]:
score_frm = pd.DataFrame(data  = sample_frm,
                        columns=['x', 'y']
                       )

s_scaler = StandardScaler()
s_score_frm = s_scaler.fit_transform(score_frm)
s_score_frm = pd.DataFrame(data=s_score_frm,
                         columns=['x', 'y']
                         )
s_score_frm

Unnamed: 0,x,y
0,-0.562250,1.013009
1,-0.499413,0.965242
2,0.021231,1.395148
3,-0.346811,2.334574
4,0.227694,-0.037874
...,...,...
173,2.974543,-0.021952
174,1.412609,0.009893
175,1.744744,0.280575
176,0.227694,0.296498


In [28]:
print('01')
x_train, x_test ,y_train, y_test = train_test_split(sample_frm,
                                                    sample_frm.target,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=100)

01


In [29]:
x_train.shape, x_test.shape ,y_train.shape, y_test.shape

((142, 3), (36, 3), (142,), (36,))

In [32]:
print('02 \n 학습데이터의 정규화 - ')
scaler = MinMaxScaler()
scaler.fit(x_train)
m_train_scaler = scaler.transform(x_train)

print('테스트데이터의 정규화 - ')
m_test_scaler = scaler.transform(x_test)

02 
 학습데이터의 정규화 - 
테스트데이터의 정규화 - 


In [33]:

# 주말 해결 오류
sample_frm_model = DecisionTreeClassifier()
sample_frm_model.fit(m_train_scaler, m_test_scaler) #학습데이터만

ValueError: Unknown label type: 'continuous-multioutput'

In [34]:
print('03')
y_pred = sample_frm_model.predict(x_test) 

print('04')
print('acc - ',accuracy_score(y_test, y_pred))

03


AttributeError: 'DecisionTreeClassifier' object has no attribute 'tree_'

## 정규화

In [35]:
print('01')
x_train, x_test ,y_train, y_test = train_test_split(s_score_frm,
                                                    sample_frm_all.target,
                                                    test_size=0.2,
                                                    shuffle=True,
                                                    random_state=100)

01


NameError: name 'sample_frm_all' is not defined

In [36]:
x_train.shape, x_test.shape ,y_train.shape, y_test.shape

((142, 3), (36, 3), (142,), (36,))

In [37]:
print('02 \n 학습데이터의 정규화 - ')
scaler = MinMaxScaler()
scaler.fit(x_train)
m_train_scaler = scaler.transform(x_train)

print('테스트데이터의 정규화 - ')
m_test_scaler = scaler.transform(x_test)

02 
 학습데이터의 정규화 - 
테스트데이터의 정규화 - 


In [38]:
# 주말 해결 오류
sample_frm_model = DecisionTreeClassifier()
sample_frm_model.fit(m_train_scaler, m_test_scaler) #학습데이터만

ValueError: Unknown label type: 'continuous-multioutput'

In [39]:
print('03')
y_pred = sample_frm_model.predict(x_test) 

print('04')
print('acc - ',accuracy_score(y_test, y_pred))

03


AttributeError: 'DecisionTreeClassifier' object has no attribute 'tree_'