# 데이터 전처리

### 1. 레이블 인코딩

In [1]:
from sklearn.preprocessing import LabelEncoder

In [2]:
# Sample product names used as the categorical values to encode; note that
# the list contains repeated entries.
items = [
    'TV', '냉장고', '전자렌지', '컴퓨터',
    '선풍기', '선풍기', '믹서', '믹서',
]

In [3]:
# Create the label encoder object (not fitted yet).
le = LabelEncoder()

In [4]:
# Fit the encoder on the data so it learns which distinct labels occur.
# fit() returns the encoder itself, which is what the notebook echoes below.
le.fit(items)

LabelEncoder()

In [5]:
# Map every item to the integer code learned during fit(); the bare name on
# the last line makes the notebook display the resulting array.
labels = le.transform(items)
labels

array([0, 1, 4, 5, 3, 3, 2, 2])

In [6]:
# In practice, fitting and transforming are done in one fit_transform() call.
le = LabelEncoder()
labels = le.fit_transform(items)
labels

array([0, 1, 4, 5, 3, 3, 2, 2])

In [7]:
# Most compact form: no intermediate encoder variable. The fitted encoder is
# discarded, so this only suits cases where it is not needed afterwards.
labels = LabelEncoder().fit_transform(items)
labels

array([0, 1, 4, 5, 3, 3, 2, 2])

### 2. One-hot Encoding

In [8]:
from sklearn.preprocessing import OneHotEncoder

In [9]:
# Rebuild the sample items and their integer label codes as the starting
# point for one-hot encoding.
items=['TV','냉장고','전자렌지','컴퓨터','선풍기','선풍기','믹서','믹서']
labels = LabelEncoder().fit_transform(items)

In [10]:
# The label codes form a one-dimensional array (see the shape shown below).
labels.shape

(8,)

In [13]:
# Create a OneHotEncoder, then reshape the 1-D labels into a single-column
# 2-D array (reshape(-1, 1)) before fitting and transforming.
oh = OneHotEncoder()
encoded = oh.fit_transform(labels.reshape(-1,1))

In [14]:
# The transformed result is a sparse matrix, as the repr below shows.
encoded

<8x6 sparse matrix of type '<class 'numpy.float64'>'
	with 8 stored elements in Compressed Sparse Row format>

In [15]:
# Convert the sparse matrix to a dense array to inspect the one-hot rows.
encoded.toarray()

array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.]])

In [17]:
# The TensorFlow/Keras utility produces the same one-hot rows more
# conveniently: it takes the 1-D integer labels directly and returns a dense
# float32 array (see the output below).
from tensorflow.keras.utils import to_categorical
to_categorical(labels)

array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.]], dtype=float32)

### 3. StandardScaler

- 스케일링을 안 한 경우

In [18]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [19]:
# Load the iris dataset and hold out 20% of the rows as a test set. The split
# is stratified on the target, and the fixed seed makes it reproducible.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data,
    iris.target,
    test_size=0.2,
    random_state=2021,
    stratify=iris.target,
)

In [20]:
# Fit logistic regression on the unscaled features. The output below shows
# the solver reaching its iteration limit and suggesting scaling the data.
lrc = LogisticRegression(random_state=2021)
lrc.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=2021, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

- 데이터 표준화 (StandardScaler: 각 특성을 평균 0, 표준편차 1로 변환)

In [22]:
from sklearn.preprocessing import StandardScaler

In [23]:
# Standardize every feature column. Note that the scaler is fitted on the
# full dataset here, which is fine for inspecting the scaled values below.
scaler = StandardScaler()
iris_std = scaler.fit_transform(iris.data)

In [26]:
# Wrap the standardized array in a DataFrame with the original feature names
# and preview the first rows.
import pandas as pd
df = pd.DataFrame(iris_std, columns=iris.feature_names)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444
2,-1.385353,0.328414,-1.397064,-1.315444
3,-1.506521,0.098217,-1.283389,-1.315444
4,-1.021849,1.249201,-1.340227,-1.315444


In [27]:
# Summary statistics: every column's mean is ~0 and its std is close to 1.
df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,-1.690315e-15,-1.84297e-15,-1.698641e-15,-1.409243e-15
std,1.00335,1.00335,1.00335,1.00335
min,-1.870024,-2.433947,-1.567576,-1.447076
25%,-0.9006812,-0.592373,-1.226552,-1.183812
50%,-0.05250608,-0.1319795,0.3364776,0.1325097
75%,0.6745011,0.5586108,0.7627583,0.7906707
max,2.492019,3.090775,1.785832,1.712096


In [28]:
# Split the raw features first, then fit the scaler on the training split
# only. Fitting on the full dataset (as was done for iris_std) leaks
# test-set statistics into training.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, stratify=iris.target, test_size=0.2, random_state=2021
)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)  # learn mean/std from training rows only
X_test = scaler.transform(X_test)  # reuse the training statistics unchanged

In [29]:
# Refit the same model on the standardized split; no convergence message
# appears in the output this time.
lrc = LogisticRegression(random_state=2021)
lrc.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=2021, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

평균 0, 표준편차 1로 표준화한 데이터로 학습하니 수렴 경고 없이 깨끗하게 처리됨 (StandardScaler는 값의 위치와 스케일만 바꿀 뿐, 분포의 모양을 정규분포로 바꾸지는 않음)

### 4. MinMaxScaler

In [30]:
from sklearn.preprocessing import MinMaxScaler

In [31]:
# Rescale every feature column with MinMaxScaler. Note that the scaler is
# fitted on the full dataset here, which is fine for inspecting the values.
mm_scaler = MinMaxScaler()
iris_scaled = mm_scaler.fit_transform(iris.data)

In [32]:
# Wrap the min-max scaled array in a DataFrame with the original feature
# names and preview the first rows.
df = pd.DataFrame(iris_scaled, columns=iris.feature_names)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,0.222222,0.625,0.067797,0.041667
1,0.166667,0.416667,0.067797,0.041667
2,0.111111,0.5,0.050847,0.041667
3,0.083333,0.458333,0.084746,0.041667
4,0.194444,0.666667,0.067797,0.041667


In [33]:
# Summary statistics: every column now has min 0 and max 1.
df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,0.428704,0.440556,0.467458,0.458056
std,0.230018,0.181611,0.299203,0.317599
min,0.0,0.0,0.0,0.0
25%,0.222222,0.333333,0.101695,0.083333
50%,0.416667,0.416667,0.567797,0.5
75%,0.583333,0.541667,0.694915,0.708333
max,1.0,1.0,1.0,1.0


In [34]:
# Split the raw features first, then fit the scaler on the training split
# only. Fitting on the full dataset (as was done for iris_scaled) leaks
# the test set's min/max into training.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, stratify=iris.target, test_size=0.2, random_state=2021
)
mm_scaler = MinMaxScaler()
X_train = mm_scaler.fit_transform(X_train)  # learn min/max from training rows only
X_test = mm_scaler.transform(X_test)  # test values may fall slightly outside [0, 1]

In [35]:
# Refit the same model on the min-max scaled split; the output below shows
# no convergence message.
lrc = LogisticRegression(random_state=2021)
lrc.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=2021, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)