# 붓꽃 분류 모델

## 데이터 구성

In [4]:
from sklearn.datasets import load_iris

# Load the bundled iris dataset and collect its top-level keys.
iris = load_iris()
keys = iris.keys()

# Report the container type first, then the keys it exposes.
print('붓꽃 데이터세트 타입 : ', type(iris))
print('붓꽃 데이터세트 키 : ', keys)

붓꽃 데이터세트 타입 :  <class 'sklearn.utils.Bunch'>
붓꽃 데이터세트 키 :  dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])


- feature_names : 독립변수의 변수명
- data : 독립변수 데이터 값
- target : 종속변수 데이터값
- target_names : 종속변수 데이터 값의 의미 ( 0 : setosa, 1 : versicolor, 2 : virginica)

-> 데이터 세트는 dictionary와 유사한 Bunch 타입이므로, iris.data 처럼 속성으로 접근하거나 iris['data'] 처럼 키로 접근하여 데이터를 불러올 수 있다.

In [8]:
print('feature_names : ', iris.feature_names) # feature (column) names
print('target_names : ', iris.target_names) # class names of the target variable

feature_names :  ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
target_names :  ['setosa' 'versicolor' 'virginica']


In [9]:
import pandas as pd

# Build a table with one column per feature, then append the integer
# class ids (the dependent variable) as a 'label' column.
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)
iris_df['label'] = iris.target
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [12]:
iris_df.shape # 150 rows x 5 columns in total

(150, 5)

## 데이터셋 분리

In [13]:
from sklearn.model_selection import train_test_split

# Features (independent variables) and class ids (dependent variable).
iris_data, iris_label = iris.data, iris.target

# Hold out 20% of the samples for testing; random_state pins the shuffle
# so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    iris_data,
    iris_label,
    test_size=0.2,
    random_state=11,
)

# Sample counts of each split: features first, then labels.
print('x_train dataset : ', len(x_train))
print('x_test dataset : ', len(x_test))
print('y_train dataset : ', len(y_train))
print('y_test dataset : ', len(y_test))

x_train dataset :  120
x_test dataset :  30
y_train dataset :  120
y_test dataset :  30


train_test_split으로 train data : test data를 8:2로 분할. <br>
따라서 총 150개 데이터셋 중 120개는 train data, 30개는 test data로 구성됨

## 모델 학습

In [14]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree classifier with a fixed random_state, trained on the
# training split only. The fit call is left as the final expression so the
# notebook displays the fitted estimator.
dt_clf = DecisionTreeClassifier(random_state = 11)
dt_clf.fit(x_train, y_train)

DecisionTreeClassifier(random_state=11)

## 예측 수행 및 평가

In [16]:
from sklearn.metrics import accuracy_score 

# Predict class ids for the held-out samples, then measure the fraction
# that match the true labels.
pred = dt_clf.predict(x_test)
ac_score = accuracy_score(y_true=y_test, y_pred=pred)

print('예측 정확도 : ', ac_score)

예측 정확도 :  0.9333333333333333


# 데이터 전처리

## 레이블 인코딩

In [17]:
from sklearn.preprocessing import LabelEncoder

# Sample categorical values (with repeats) to encode.
items = [
    'TV', '냉장고', '전자레인지', '컴퓨터',
    'TV', '냉장고', '컴퓨터', '컴퓨터',
]

# Learn the distinct classes and map every item to its integer code in one step.
encoder = LabelEncoder()
labels = encoder.fit_transform(items)

print('인코딩 변환값 : ', labels)
print('인코딩 클래스 : ', encoder.classes_)

인코딩 변환값 :  [0 1 2 3 0 1 3 3]
인코딩 클래스 :  ['TV' '냉장고' '전자레인지' '컴퓨터']


In [18]:
# Map integer codes back to the original category strings using the
# already-fitted encoder.
origins = encoder.inverse_transform([0, 1, 2, 3, 0, 1, 3, 3])
print('디코딩 원본값 : ', origins)

디코딩 원본값 :  ['TV' '냉장고' '전자레인지' '컴퓨터' 'TV' '냉장고' '컴퓨터' '컴퓨터']


## 원-핫 인코딩

In [22]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

# Reshape the label array into a single column (n_samples, 1) before encoding.
labels = labels.reshape(-1, 1)

# Learn the categories and encode in one step.
oh_encoder = OneHotEncoder()
oh_labels = oh_encoder.fit_transform(labels)

# toarray() converts the encoded result to a dense array for display.
print('원-핫 인코딩 데이터\n', oh_labels.toarray())
print('원-핫 인코딩 데이터 차원', oh_labels.shape)

원-핫 인코딩 데이터
 [[1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]]
원-핫 인코딩 데이터 차원 (8, 4)


In [23]:
# Shape of the label array after it was reshaped into a single column.
labels.shape

(8, 1)

In [24]:
import pandas as pd

# Wrap the raw item strings in a one-column DataFrame and display it.
item_df = pd.DataFrame({'item' : items})
item_df

Unnamed: 0,item
0,TV
1,냉장고
2,전자레인지
3,컴퓨터
4,TV
5,냉장고
6,컴퓨터
7,컴퓨터


In [25]:
# One-hot encode the string column directly with pandas; each category
# becomes its own 'item_<value>' indicator column.
pd.get_dummies(item_df)

Unnamed: 0,item_TV,item_냉장고,item_전자레인지,item_컴퓨터
0,1,0,0,0
1,0,1,0,0
2,0,0,1,0
3,0,0,0,1
4,1,0,0,0
5,0,1,0,0
6,0,0,0,1
7,0,0,0,1


## 스케일링

In [28]:
from sklearn.datasets import load_iris
import pandas as pd

# Reload the dataset and rebuild the feature-only table (no label column)
# as input for the scaling examples.
iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)

In [29]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature column, then wrap the resulting array back
# into a DataFrame carrying the original column names.
scaler = StandardScaler()
iris_scaled = scaler.fit_transform(iris_df)
iris_df_scaled = pd.DataFrame(iris_scaled, columns=iris.feature_names)

In [30]:
# Display the StandardScaler-transformed feature table.
iris_df_scaled

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444
2,-1.385353,0.328414,-1.397064,-1.315444
3,-1.506521,0.098217,-1.283389,-1.315444
4,-1.021849,1.249201,-1.340227,-1.315444
...,...,...,...,...
145,1.038005,-0.131979,0.819596,1.448832
146,0.553333,-1.282963,0.705921,0.922303
147,0.795669,-0.131979,0.819596,1.053935
148,0.432165,0.788808,0.933271,1.448832


In [31]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature column with MinMaxScaler, then wrap the resulting
# array back into a DataFrame carrying the original column names.
scaler = MinMaxScaler()
iris_scaled = scaler.fit_transform(iris_df)
iris_df_scaled = pd.DataFrame(iris_scaled, columns=iris.feature_names)

In [32]:
# Display the MinMaxScaler-transformed feature table.
iris_df_scaled

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,0.222222,0.625000,0.067797,0.041667
1,0.166667,0.416667,0.067797,0.041667
2,0.111111,0.500000,0.050847,0.041667
3,0.083333,0.458333,0.084746,0.041667
4,0.194444,0.666667,0.067797,0.041667
...,...,...,...,...
145,0.666667,0.416667,0.711864,0.916667
146,0.555556,0.208333,0.677966,0.750000
147,0.611111,0.416667,0.711864,0.791667
148,0.527778,0.583333,0.745763,0.916667


In [None]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

# Train values span 0..10 and test values span 0..5, each as a single column.
train_array = np.arange(0,11).reshape(-1,1)
test_array = np.arange(0,6).reshape(-1,1)

# Fit the scaler on the training data only.
scaler = MinMaxScaler()
scaler.fit(train_array)
train_scaled = scaler.transform(train_array)

print('원본 train_array 데이터 : ', np.round(train_array.reshape(-1),2))
print('scaled train_array 데이터 : ', np.round(train_scaled.reshape(-1),2))

# Reuse the scaler fitted on the training data (transform only, no re-fit)
# so the test values are mapped with the same min/max as the train values.
test_scaled = scaler.transform(test_array)
print('원본 test_array 데이터 : ', np.round(test_array.reshape(-1),2))
# Fixed: the original called the nonexistent `np.rou` with no argument.
print('scaled test_array 데이터 : ', np.round(test_scaled.reshape(-1),2))