## 정규화

In [None]:
# # 데이터 스케일링(Data Scaling)은 각 변수들의 범위 혹은 분포를 같게 만드는 작업
# 1)정규화 (Normalization)
# 2) 표준화 (Standardization)

In [None]:
# A) 정규화 (Normalization)
# 정규화는 데이터의 값들을 [0,1] 범위의 값이 되게 변환하는 것

In [1]:
import pandas as pd
lst = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
df = pd.DataFrame(lst, columns=['A','B'])
df
# A자료  음수 값, B 자료  범위 넚다.

Unnamed: 0,A,B
0,-1.0,2
1,-0.5,6
2,0.0,10
3,1.0,18


In [2]:
# 정규화 수식 =  (X-Min) / (Max-Min)
def normalize(s):
    return (s-s.min())/(s.max()-s.min())
    
# normalize(df['A'].to_list())
dic={}
dic['A']=normalize(df['A'])
dic['B']=normalize(df['B'])
df1 = pd.DataFrame(dic)
df1

Unnamed: 0,A,B
0,0.0,0.0
1,0.25,0.25
2,0.5,0.5
3,1.0,1.0


In [3]:
# 2) MinMaxScaler 이용
# 정규화자료 = df1
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(df)
arr = scaler.transform(df)
df1 = pd.DataFrame(arr, columns=df.columns)
df1

Unnamed: 0,A,B
0,0.0,0.0
1,0.25,0.25
2,0.5,0.5
3,1.0,1.0


In [4]:
scaler.data_min_, scaler.data_max_,scaler.data_range_

(array([-1.,  2.]), array([ 1., 18.]), array([ 2., 16.]))

In [5]:
# 원자료는 그대로 존재하고 있다.
df

Unnamed: 0,A,B
0,-1.0,2
1,-0.5,6
2,0.0,10
3,1.0,18


In [1]:
import pandas as pd
df = pd.read_csv('C:/finance_data/heart.csv')
df

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [2]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(df[['age']])
arr = scaler.transform(df[['age']])
arr
df1 = pd.DataFrame(arr, columns=['age'])
df1

Unnamed: 0,age
0,0.708333
1,0.166667
2,0.250000
3,0.562500
4,0.583333
...,...
298,0.583333
299,0.333333
300,0.812500
301,0.583333


In [3]:
df1['age'].mean()

0.5284653465346535

In [4]:
type(arr)

numpy.ndarray

In [5]:
import numpy as np
np.mean(arr)

0.5284653465346535

## 표준화 (Standardization)

In [None]:
# 표준화는 데이터의 값을 평균 0, 분산 1이 되게 변환하는 것 
# Z점수 정규화

In [6]:
df = pd.DataFrame({
    'A':[1,2,3,4,5,6],
    'B':[101,102,103,104,105,106]
})
df

Unnamed: 0,A,B
0,1,101
1,2,102
2,3,103
3,4,104
4,5,105
5,6,106


In [7]:
# 표준화(z)수식 : (X - 평균) / 표준편차
import numpy as np

def standardization(s):
    return (s-s.mean()) / s.std(ddof=0) #모집단이라고 가정

df1 = pd.DataFrame({
    'A':standardization(df['A']),
    'B':standardization(df['B'])
})
df1

Unnamed: 0,A,B
0,-1.46385,-1.46385
1,-0.87831,-0.87831
2,-0.29277,-0.29277
3,0.29277,0.29277
4,0.87831,0.87831
5,1.46385,1.46385


In [8]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df)
arr = scaler.transform(df)
df1 = pd.DataFrame(arr, columns=df.columns )
df1

Unnamed: 0,A,B
0,-1.46385,-1.46385
1,-0.87831,-0.87831
2,-0.29277,-0.29277
3,0.29277,0.29277
4,0.87831,0.87831
5,1.46385,1.46385


In [9]:
scaler.mean_, scaler.var_

(array([  3.5, 103.5]), array([2.91666667, 2.91666667]))

In [10]:
df.mean(), df.var(ddof=0)

(A      3.5
 B    103.5
 dtype: float64,
 A    2.916667
 B    2.916667
 dtype: float64)

In [None]:
import pandas as pd
df = pd.read_csv('../heart.csv')
df


In [4]:
import pandas as pd
df = pd.read_csv('C:\finance_data/heart.csv')
df

OSError: [Errno 22] Invalid argument: 'C:\x0cinance_data/heart.csv'

In [1]:
df_tgt = df[[ 'age', 'trestbps', 'chol', 'thalach', 'oldpeak']]
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df_tgt)
arr = scaler.transform(df_tgt)
df1 = pd.DataFrame(arr, columns=df_tgt.columns)
df1
df1 = pd.concat([df1,df[['sex','cp','fbs','restecg','exang','slope', 'ca','thal','target']]], axis=1)
df1

NameError: name 'df' is not defined

## RobustScaler

In [11]:
from sklearn.preprocessing import MaxAbsScaler
scaler = MaxAbsScaler()
scaler.fit(df)
arr = scaler.transform(df)