### 데이터 전처리
##### 1. 레이블 인코딩

In [1]:
from sklearn.preprocessing import LabelEncoder

In [2]:
# Sample categorical values to encode; the list contains repeated categories.
items = [
    'TV',
    '냉장고',
    '전자렌지',
    '컴퓨터',
    '선풍기', '선풍기',
    '믹서', '믹서',
]

In [3]:
# Create the encoder object
le = LabelEncoder()

In [4]:
# Fit the encoder on the sample values
le.fit(items)

In [5]:
# Run the encoding, i.e. transform each item into its integer label
labels = le.transform(items)
labels

array([0, 1, 4, 5, 3, 3, 2, 2])

In [6]:
# Shorthand: fit and transform in a single call
le2 = LabelEncoder()
labels = le2.fit_transform(items)
labels

array([0, 1, 4, 5, 3, 3, 2, 2])

In [7]:
# How experienced users tend to write it: a one-liner.
# The encoder instance is not bound to a name here, so it cannot be reused afterwards.
labels = LabelEncoder().fit_transform(items)
labels

array([0, 1, 4, 5, 3, 3, 2, 2])

In [8]:
# Map integer labels back to the original category strings
le.inverse_transform([2,3,5,0,1])

array(['믹서', '선풍기', '컴퓨터', 'TV', '냉장고'], dtype='<U4')

##### 2. One-hot encoding

In [9]:
from sklearn.preprocessing import OneHotEncoder

# The encoder is fed a 2-D column: one row per sample, a single feature.
ohe = OneHotEncoder()
oh_labels = ohe.fit_transform(labels[:, None])
# Convert the encoder's result to a dense array for display.
oh_labels.toarray()

array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.]])

In [10]:
from tensorflow.keras.utils import to_categorical
# Keras alternative: one-hot encode the integer labels directly
to_categorical(labels)

array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.]], dtype=float32)

##### 3. 표준화
- 평균 0, 표준편차 1인 가우시안 표준정규분포

In [11]:
from sklearn.datasets import load_iris
# Load the iris dataset; later cells read features from iris.data and labels from iris.target
iris = load_iris()

In [12]:
from sklearn.preprocessing import StandardScaler
# Standardize every feature column. The scaler is fit on ALL rows here, which is
# fine for inspecting the scaled values; for model evaluation, fit the scaler on
# training rows only.
iris_std = StandardScaler().fit_transform(iris.data)

In [13]:
# First five rows of the raw (unscaled) features
iris.data[:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [14]:
# First five rows after standardization
iris_std[:5]

array([[-0.90068117,  1.01900435, -1.34022653, -1.3154443 ],
       [-1.14301691, -0.13197948, -1.34022653, -1.3154443 ],
       [-1.38535265,  0.32841405, -1.39706395, -1.3154443 ],
       [-1.50652052,  0.09821729, -1.2833891 , -1.3154443 ],
       [-1.02184904,  1.24920112, -1.34022653, -1.3154443 ]])

In [15]:
# Mean and standard deviation of the first standardized column
iris_std[:,0].mean(), iris_std[:,0].std()

(-1.4684549872375404e-15, 1.0)

In [16]:
# Mean and standard deviation of each of the four standardized feature columns.
# Transposing yields one row per feature, so no column index is needed.
for feature_values in iris_std.T:
    print(feature_values.mean(), feature_values.std())

-1.4684549872375404e-15 1.0
-1.8237263551175904e-15 1.0000000000000004
-1.6105635343895603e-15 0.9999999999999999
-9.473903143468002e-16 1.0


- 로지스틱 회귀(Logistic Regression)

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [18]:
# Baseline on the unscaled features: stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    stratify=iris.target,
    random_state=2023,
)
# Default LogisticRegression; on the unscaled data this fit reports that the
# solver hit its iteration limit. fit() returns the estimator itself, so ending
# the cell with `lr` displays the same object.
lr = LogisticRegression().fit(X_train, y_train)
lr

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [19]:
# Inspect the model's hyperparameters (the estimator was created with no arguments)
lr.get_params()

{'C': 1.0,
 'class_weight': None,
 'dual': False,
 'fit_intercept': True,
 'intercept_scaling': 1,
 'l1_ratio': None,
 'max_iter': 100,
 'multi_class': 'auto',
 'n_jobs': None,
 'penalty': 'l2',
 'random_state': None,
 'solver': 'lbfgs',
 'tol': 0.0001,
 'verbose': 0,
 'warm_start': False}

In [20]:
# Split the raw features first, then standardize with statistics learned from the
# training rows only. Fitting the scaler on the whole dataset before splitting
# lets test-row statistics leak into the features the model is trained on.
# The split arguments are unchanged, so the same rows land in train and test.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    iris.data, iris.target, stratify=iris.target, test_size=0.2, random_state=2023
)
std_scaler = StandardScaler().fit(X_train_raw)
X_train = std_scaler.transform(X_train_raw)
# The test rows are transformed with the training mean/std, never refit.
X_test = std_scaler.transform(X_test_raw)
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [21]:
# Accuracy of the fitted model on the held-out test split
lr.score(X_test, y_test)

0.9666666666666667

##### 4. 정규화
- 각 피처를 최솟값 0, 최댓값 1 범위로 변환

In [22]:
from sklearn.preprocessing import MinMaxScaler
# Min-max scale every feature column. The scaler is fit on ALL rows here, which is
# fine for inspecting the scaled range; for model evaluation, fit the scaler on
# training rows only.
iris_mm = MinMaxScaler().fit_transform(iris.data)

In [23]:
# Minimum and maximum of each of the four min-max-scaled feature columns.
# Transposing yields one row per feature, so no column index is needed.
for feature_values in iris_mm.T:
    print(feature_values.min(), feature_values.max())

0.0 1.0
0.0 1.0
0.0 1.0
0.0 1.0


In [24]:
# Split the raw features first, then min-max scale with the range learned from the
# training rows only. Fitting the scaler on the whole dataset before splitting
# lets the test rows' min/max leak into the features the model is trained on.
# The split arguments are unchanged, so the same rows land in train and test.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    iris.data, iris.target, stratify=iris.target, test_size=0.2, random_state=2023
)
mm_scaler = MinMaxScaler().fit(X_train_raw)
X_train = mm_scaler.transform(X_train_raw)
# Test rows reuse the training min/max, so their values may fall slightly outside [0, 1].
X_test = mm_scaler.transform(X_test_raw)
lr = LogisticRegression()
lr.fit(X_train, y_train)

In [25]:
# Accuracy of the fitted model on the held-out test split
lr.score(X_test, y_test)

0.9333333333333333