## 4장 5절 데이터 스케일링

### 2. Standard Scaler

In [4]:
import pandas as pd
from sklearn.datasets import load_iris

In [6]:
# Load the bundled iris dataset and wrap its measurement matrix in a DataFrame
# whose columns carry the dataset's own feature names.
iris = load_iris()
df_iris = pd.DataFrame(iris['data'], columns=iris['feature_names'])
df_iris

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [8]:
# Attach the target as a 'class' column, translating the integer codes
# (0, 1, 2) into readable species labels in a single assignment.
df_iris['class'] = pd.Series(iris.target, index=df_iris.index).map(
    {0: 'setosa', 1: 'versicolour', 2: 'virginica'}
)
df_iris

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),class
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [9]:
from sklearn.model_selection import train_test_split

In [10]:
# Separate the label column from the feature columns before splitting.
iris_labels = df_iris['class']
iris_features = df_iris.drop(columns=['class'])

# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions equal in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    iris_features,
    iris_labels,
    test_size=0.2,
    random_state=1004,
    stratify=iris_labels,
)

In [12]:
# Report the shapes of the feature splits first, then the label splits.
for train_tag, train_part, test_tag, test_part in (
    ('X_train: ', X_train, 'X_test: ', X_test),
    ('y_train: ', y_train, 'y_test: ', y_test),
):
    print(train_tag, train_part.shape, test_tag, test_part.shape)

X_train:  (120, 4) X_test:  (30, 4)
y_train:  (120,) y_test:  (30,)


In [14]:
# Count the training rows per class label; with the stratified split above,
# each class should appear equally often.
y_train.value_counts()

class
versicolour    40
virginica      40
setosa         40
Name: count, dtype: int64

In [1]:
from sklearn.preprocessing import StandardScaler

In [15]:
# Fit on the training split only, so the test split is transformed with
# statistics it did not contribute to; then apply the same scaler to both splits.
stdScaler = StandardScaler().fit(X_train)
X_train_sc, X_test_sc = (stdScaler.transform(split) for split in (X_train, X_test))

In [16]:
# Summarise each scaled split with its overall min, max, mean and std
# (computed over every value in the array, not per feature).
print('\t\t(min, max) (mean, std)')
for row_fmt, scaled in (
    ('Train_scaled (%.2f, %.2f) (%.2f, %.2f)', X_train_sc),
    ('Test_scaled (%.2f, %.2f) (%.2f, %.2f)', X_test_sc),
):
    print(row_fmt % (scaled.min(), scaled.max(), scaled.mean(), scaled.std()))

		(min, max) (mean, std)
Train_scaled (-2.37, 3.04) (0.00, 1.00)
Test_scaled (-1.76, 2.48) (-0.01, 0.97)


### 3. Min-Max Scaler

In [18]:
from sklearn.preprocessing import MinMaxScaler

In [19]:
# Fit the min-max scaler on the training split and scale it in one step,
# then reuse the fitted scaler (training-split parameters) for the test split.
MMScaler = MinMaxScaler()
X_train_sc = MMScaler.fit_transform(X_train)
X_test_sc = MMScaler.transform(X_test)

In [20]:
# Summarise each scaled split with its overall min, max, mean and std
# (computed over every value in the array, not per feature).
print('\t\t(min, max) (mean, std)')
for row_fmt, scaled in (
    ('Train_scaled (%.2f, %.2f) (%.2f, %.2f)', X_train_sc),
    ('Test_scaled (%.2f, %.2f) (%.2f, %.2f)', X_test_sc),
):
    print(row_fmt % (scaled.min(), scaled.max(), scaled.mean(), scaled.std()))

		(min, max) (mean, std)
Train_scaled (0.00, 1.00) (0.46, 0.27)
Test_scaled (0.03, 1.06) (0.45, 0.26)


### 4. Max Abs Scaler

In [21]:
from sklearn.preprocessing import MaxAbsScaler

In [22]:
# Fit the max-abs scaler on the training split only, then transform both
# splits with that same fitted scaler.
MAScaler = MaxAbsScaler().fit(X_train)

X_train_sc, X_test_sc = MAScaler.transform(X_train), MAScaler.transform(X_test)

In [23]:
# Summarise each scaled split with its overall min, max, mean and std
# (computed over every value in the array, not per feature).
print('\t\t(min, max) (mean, std)')
for row_fmt, scaled in (
    ('Train_scaled (%.2f, %.2f) (%.2f, %.2f)', X_train_sc),
    ('Test_scaled (%.2f, %.2f) (%.2f, %.2f)', X_test_sc),
):
    print(row_fmt % (scaled.min(), scaled.max(), scaled.mean(), scaled.std()))

		(min, max) (mean, std)
Train_scaled (0.04, 1.00) (0.62, 0.24)
Test_scaled (0.08, 1.03) (0.62, 0.24)


### 5. Robust Scaler

In [24]:
from sklearn.preprocessing import RobustScaler

In [25]:
# Fit the robust scaler on the training split only; the test split is then
# transformed with the parameters learned from the training data.
RuScaler = RobustScaler()
X_train_sc = RuScaler.fit(X_train).transform(X_train)

X_test_sc = RuScaler.transform(X_test)

In [26]:
# Summarise each scaled split with its overall min, max, mean and std
# (computed over every value in the array, not per feature).
print('\t\t(min, max) (mean, std)')
for row_fmt, scaled in (
    ('Train_scaled (%.2f, %.2f) (%.2f, %.2f)', X_train_sc),
    ('Test_scaled (%.2f, %.2f) (%.2f, %.2f)', X_test_sc),
):
    print(row_fmt % (scaled.min(), scaled.max(), scaled.mean(), scaled.std()))

		(min, max) (mean, std)
Train_scaled (-1.90, 2.67) (-0.02, 0.65)
Test_scaled (-1.14, 1.90) (-0.02, 0.62)


### 6. 원본 스케일로 되돌리기 (inverse_transform)

In [27]:
# Preview the first three rows of the scaled training data (at this point the
# RobustScaler output); wrapping the array in a DataFrame gives a tabular display.
pd.DataFrame(X_train_sc).head(3)

Unnamed: 0,0,1,2,3
0,0.846154,0.190476,0.157143,0.133333
1,-0.153846,-0.380952,0.157143,0.466667
2,-0.076923,0.0,-0.042857,-0.066667


In [28]:
# Undo the scaling. X_train_sc was last produced by RuScaler, so the inverse
# must come from that same fitted scaler to recover the original units.
X_original = RuScaler.inverse_transform(X_train_sc)
# Show the first three restored rows for comparison with the scaled preview.
pd.DataFrame(X_original).head(3)

Unnamed: 0,0,1,2,3
0,6.9,3.1,4.9,1.5
1,5.6,2.8,4.9,2.0
2,5.7,3.0,4.2,1.2
