# 데이터 전처리

### 1. 레이블 인코딩

In [1]:
from sklearn.preprocessing import LabelEncoder

In [2]:
# Sample appliance names used as the categorical input; two of them repeat.
items = [
    'TV',
    '냉장고',
    '전자렌지',
    '컴퓨터',
    '선풍기',
    '선풍기',
    '믹서',
    '믹서',
]

In [3]:
# Create the label-encoder object
encoder = LabelEncoder()

In [4]:
# Show the data to the encoder so it can learn from it (a kind of training step)
encoder.fit(items)

LabelEncoder()

In [5]:
# Perform the encoding with the already-fitted encoder and display the result
labels = encoder.transform(items)
labels

array([0, 1, 4, 5, 3, 3, 2, 2])

In [6]:
# In practice, fitting and transforming are done in a single call
labels2 = encoder.fit_transform(items)
labels2

array([0, 1, 4, 5, 3, 3, 2, 2])

In [7]:
# Experienced users create the encoder and call fit_transform in one expression
labels3 = LabelEncoder().fit_transform(items)
labels3

array([0, 1, 4, 5, 3, 3, 2, 2])

### 2. One-hot encoding
- Scikit-Learn: 선택적으로 하면 됨
- Tensorflow/Keras: `categorical_crossentropy` 손실을 사용할 때는 반드시 해야 함 (`sparse_categorical_crossentropy`를 쓰면 정수 레이블을 그대로 사용할 수 있음)

In [8]:
from sklearn.preprocessing import OneHotEncoder

In [10]:
# Reshape the 1-D label array into a single column (one row per label)
labels.reshape(-1,1)

array([[0],
       [1],
       [4],
       [5],
       [3],
       [3],
       [2],
       [2]])

In [11]:
# Build the one-hot encoder and fit it on the label column in one step;
# fit() returns the estimator itself, so the fitted object is what gets bound.
oh_encoder = OneHotEncoder().fit(labels.reshape(-1, 1))
oh_encoder

OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='error', sparse=True)

In [13]:
# Encode the label column with the fitted one-hot encoder
oh_vector = oh_encoder.transform(labels.reshape(-1,1))
# Convert the encoded result to a regular dense array for display
oh_vector.toarray()

array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.]])

### 3. Standard Scaler
- 각 피처를 평균 0, 분산 1이 되도록 변환 (표준화). 분포의 모양 자체가 정규분포 N(0, 1)로 바뀌는 것은 아님

In [14]:
# Load the iris dataset that ships with scikit-learn
from sklearn.datasets import load_iris
iris = load_iris()

In [15]:
import pandas as pd
# Wrap the raw feature matrix in a DataFrame labelled with the feature names
df = pd.DataFrame(iris.data, columns=iris.feature_names)
# Summary statistics of the unscaled features
df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [16]:
from sklearn.preprocessing import StandardScaler
# Learn the per-feature statistics first, then standardize the same data.
scaler = StandardScaler().fit(iris.data)
iris_std = scaler.transform(iris.data)

In [17]:
# Put the standardized values in a DataFrame with the original column names
df2 = pd.DataFrame(iris_std, columns=iris.feature_names)
# Show the first rows of the standardized data
df2.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444
2,-1.385353,0.328414,-1.397064,-1.315444
3,-1.506521,0.098217,-1.283389,-1.315444
4,-1.021849,1.249201,-1.340227,-1.315444


In [18]:
# First rows of the original (unscaled) DataFrame
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [19]:
# Summary of the standardized data: the means are ~0. describe() reports the
# sample standard deviation (ddof=1), which is why std shows ~1.0034 rather
# than exactly 1 (StandardScaler divides by the population standard deviation).
df2.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,-1.690315e-15,-1.84297e-15,-1.698641e-15,-1.409243e-15
std,1.00335,1.00335,1.00335,1.00335
min,-1.870024,-2.433947,-1.567576,-1.447076
25%,-0.9006812,-0.592373,-1.226552,-1.183812
50%,-0.05250608,-0.1319795,0.3364776,0.1325097
75%,0.6745011,0.5586108,0.7627583,0.7906707
max,2.492019,3.090775,1.785832,1.712096


### 4. Min Max Scaler
- 0 ~ 1 사이의 값으로 변환

In [20]:
from sklearn.preprocessing import MinMaxScaler
# Learn each feature's minimum and maximum, then rescale the same data.
# Note: this rebinds `scaler`, replacing the StandardScaler bound earlier.
scaler = MinMaxScaler().fit(iris.data)
iris_scaled = scaler.transform(iris.data)

In [21]:
# Put the min-max scaled values in a DataFrame with the original column names
df3 = pd.DataFrame(iris_scaled, columns=iris.feature_names)
# Show the first rows of the scaled data
df3.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,0.222222,0.625,0.067797,0.041667
1,0.166667,0.416667,0.067797,0.041667
2,0.111111,0.5,0.050847,0.041667
3,0.083333,0.458333,0.084746,0.041667
4,0.194444,0.666667,0.067797,0.041667


In [22]:
# Summary of the min-max scaled data
df3.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,0.428704,0.440556,0.467458,0.458056
std,0.230018,0.181611,0.299203,0.317599
min,0.0,0.0,0.0,0.0
25%,0.222222,0.333333,0.101695,0.083333
50%,0.416667,0.416667,0.567797,0.5
75%,0.583333,0.541667,0.694915,0.708333
max,1.0,1.0,1.0,1.0


### 전처리한 데이터로 로지스틱 회귀 적용

In [23]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Split the RAW features first, then fit the scaler on the training part only.
# Fitting MinMaxScaler on the full dataset before splitting (as was done for
# `iris_scaled`) lets the test rows influence the min/max used for scaling,
# which leaks test-set information into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    iris.data, iris.target, stratify=iris.target,
    test_size=0.2, random_state=2021
)
split_scaler = MinMaxScaler().fit(X_train_raw)
X_train = split_scaler.transform(X_train_raw)
# Test rows are scaled with the training min/max, so a few values may fall
# slightly outside [0, 1]; that is expected.
# NOTE: the resulting accuracy can differ slightly from a run that scaled the
# whole dataset before splitting.
X_test = split_scaler.transform(X_test_raw)

In [24]:
from sklearn.linear_model import LogisticRegression
# Train a logistic-regression classifier on the training split (fit() returns
# the fitted estimator), then evaluate it on the held-out test split.
lrc = LogisticRegression(random_state=2021).fit(X_train, y_train)
lrc.score(X_test, y_test)

0.9