# Feature Scaling
작성일 : 2020-09-29

---

## Feature Scaling
서로 다른 변수의 값 범위를 일정한 수준으로 맞추는 작업

### 표준화 (Standardization)
평균은 0, 표준편차 1인 표준정규분포를 가진 값으로 변환

```
   관측값 - 평균
   ----------
     표준편차
```

In [2]:
import numpy as np
x = np.arange(9, dtype = np.float) - 3
x

array([-3., -2., -1.,  0.,  1.,  2.,  3.,  4.,  5.])

In [3]:
x.shape

(9,)

In [4]:
x.reshape(9,1) # 행의 수를 알고 있을 경우
np.reshape(x,(9,1))

array([[-3.],
       [-2.],
       [-1.],
       [ 0.],
       [ 1.],
       [ 2.],
       [ 3.],
       [ 4.],
       [ 5.]])

In [5]:
x = x.reshape(-1,1) # 열은 고정, 행은 가변일 경우 -1
x

array([[-3.],
       [-2.],
       [-1.],
       [ 0.],
       [ 1.],
       [ 2.],
       [ 3.],
       [ 4.],
       [ 5.]])

In [6]:
x = np.vstack([x,[100]]) # 열 이어붙이기
x

array([[ -3.],
       [ -2.],
       [ -1.],
       [  0.],
       [  1.],
       [  2.],
       [  3.],
       [  4.],
       [  5.],
       [100.]])

In [8]:
import pandas as pd
pd.DataFrame(x).describe()

Unnamed: 0,0
count,10.0
mean,10.9
std,31.412842
min,-3.0
25%,-0.75
50%,1.5
75%,3.75
max,100.0


In [9]:
# 표준화
y = (x - np.mean(x)) / np.std(x)
y

array([[-0.46642982],
       [-0.43287372],
       [-0.39931762],
       [-0.36576152],
       [-0.33220541],
       [-0.29864931],
       [-0.26509321],
       [-0.23153711],
       [-0.197981  ],
       [ 2.98984872]])

In [10]:
pd.DataFrame(y).describe()

Unnamed: 0,0
count,10.0
mean,-4.4408920000000007e-17
std,1.054093
min,-0.4664298
25%,-0.3909286
50%,-0.3154274
75%,-0.2399261
max,2.989849


### 정규화 (Normalization)
최소값과 최대값을 사용해서 0 ~ 1 사이의 데이터로 변환

```
       x - x.min()
    -----------------
    x.max() - x.min()
```

In [11]:
(x - x.min()) / (x.max() - x.min())

array([[0.        ],
       [0.00970874],
       [0.01941748],
       [0.02912621],
       [0.03883495],
       [0.04854369],
       [0.05825243],
       [0.06796117],
       [0.0776699 ],
       [1.        ]])

### Feature Scaling 패키지

#### 표준화

In [12]:
from sklearn.preprocessing import StandardScaler
s = StandardScaler()

In [13]:
r = s.fit_transform(x)
r

array([[-0.46642982],
       [-0.43287372],
       [-0.39931762],
       [-0.36576152],
       [-0.33220541],
       [-0.29864931],
       [-0.26509321],
       [-0.23153711],
       [-0.197981  ],
       [ 2.98984872]])

In [14]:
np.mean(r)

0.0

In [15]:
np.std(r)

1.0

In [16]:
from sklearn.preprocessing import scale
scale(x)

array([[-0.46642982],
       [-0.43287372],
       [-0.39931762],
       [-0.36576152],
       [-0.33220541],
       [-0.29864931],
       [-0.26509321],
       [-0.23153711],
       [-0.197981  ],
       [ 2.98984872]])

In [17]:
np.mean(scale(x))

0.0

In [18]:
np.std(scale(x))

1.0

#### 정규화

In [19]:
from sklearn.preprocessing import MinMaxScaler
m = MinMaxScaler()
m.fit_transform(x)

array([[0.        ],
       [0.00970874],
       [0.01941748],
       [0.02912621],
       [0.03883495],
       [0.04854369],
       [0.05825243],
       [0.06796117],
       [0.0776699 ],
       [1.        ]])

In [20]:
from sklearn.preprocessing import minmax_scale
minmax_scale(x)

array([[0.        ],
       [0.00970874],
       [0.01941748],
       [0.02912621],
       [0.03883495],
       [0.04854369],
       [0.05825243],
       [0.06796117],
       [0.0776699 ],
       [1.        ]])

#### bmi 문제 다시 풀기

In [23]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

bmi = pd.read_csv("c:/data/bmi.csv")
bmi

Unnamed: 0,height,weight,label
0,142,62,fat
1,142,73,fat
2,177,61,normal
3,187,48,thin
4,153,60,fat
...,...,...,...
19995,122,58,fat
19996,193,69,normal
19997,193,37,thin
19998,195,51,thin


In [24]:
x_train = np.array(bmi.iloc[:,0:2])
label = bmi['label']
y = np.array([[178, 71]])

##### 표준화

In [25]:
pd.DataFrame(x_train).describe()

Unnamed: 0,0,1
count,20000.0,20000.0
mean,159.9272,57.535
std,23.342096,13.285259
min,120.0,35.0
25%,140.0,46.0
50%,160.0,58.0
75%,180.0,69.0
max,200.0,80.0


In [26]:
x_train

array([[142,  62],
       [142,  73],
       [177,  61],
       ...,
       [193,  37],
       [195,  51],
       [163,  67]])

In [27]:
x_train_scale = scale(x_train)
x_train_scale

array([[-0.76803935,  0.3360952 ],
       [-0.76803935,  1.16410127],
       [ 0.73143504,  0.26082192],
       ...,
       [ 1.41690904, -1.5457368 ],
       [ 1.5025933 , -0.49191088],
       [ 0.13164528,  0.71246159]])

In [28]:
pd.DataFrame(x_train_scale).describe()

Unnamed: 0,0,1
count,20000.0,20000.0
mean,7.332052000000001e-17,2.467693e-16
std,1.000025,1.000025
min,-1.710566,-1.696283
25%,-0.8537236,-0.8682773
50%,0.003118907,0.03500208
75%,0.8599614,0.8630082
max,1.716804,1.691014


In [29]:
y = np.array([[170, 91]])
y

array([[170,  91]])

In [30]:
y_scale = np.array([[(y[0][0] - 159.927200) / 23.342096, (y[0][1] - 57.535000) / 13.285259]])
y_scale

array([[0.43152937, 2.51895729]])

In [31]:
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(x_train_scale, label)
clf.predict(y_scale)[0]

'fat'

##### 정규화

In [32]:
pd.DataFrame(x_train).describe()

Unnamed: 0,0,1
count,20000.0,20000.0
mean,159.9272,57.535
std,23.342096,13.285259
min,120.0,35.0
25%,140.0,46.0
50%,160.0,58.0
75%,180.0,69.0
max,200.0,80.0


In [33]:
x_train_minmax = minmax_scale(x_train)
x_train_minmax

array([[0.275     , 0.6       ],
       [0.275     , 0.84444444],
       [0.7125    , 0.57777778],
       ...,
       [0.9125    , 0.04444444],
       [0.9375    , 0.35555556],
       [0.5375    , 0.71111111]])

In [34]:
pd.DataFrame(x_train_minmax).describe()

Unnamed: 0,0,1
count,20000.0,20000.0
mean,0.49909,0.500778
std,0.291776,0.295228
min,0.0,0.0
25%,0.25,0.244444
50%,0.5,0.511111
75%,0.75,0.755556
max,1.0,1.0


In [35]:
y_minmax = np.array([[(y[0][0] - 120) / (200 - 120), (y[0][1] - 35) / (80 - 35)]])
y_minmax

array([[0.625     , 1.24444444]])

In [36]:
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(x_train_minmax, label)
clf.predict(y_minmax)[0]

'fat'