In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from scipy import stats

# ----------------------------------------------------------------
# Load the raw data and keep only the two columns used below.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/NeoWizard(박성호)-Pytorch/data/ozone.csv')
training_data = df[['Temp', 'Ozone']]

# ----------------------------------------------------------------
# Missing values (NA): drop every row that has an NA in either column.
training_data = training_data.dropna(how='any')
# Author's recorded output: (116, 2) -- a lot of rows are lost here, which is not ideal.
print(f'training_data.shape = {training_data.shape}')

# ----------------------------------------------------------------
# Outliers: flag Ozone values whose absolute z-score exceeds the threshold.
zscore_threshold = 2.0  # |z| cut-off (author's note: 2.0 or lower is considered optimal)

# Build the outlier mask once; it is used both to list and to remove the outliers.
ozone = training_data['Ozone']
is_outlier = np.abs(stats.zscore(ozone)) > zscore_threshold

# Show the Ozone values that were flagged.
outliers = ozone[is_outlier]
print(f'outliers = {outliers}')

# Keep only the rows that were not flagged.
training_data = training_data.loc[~is_outlier]
# NOTE(review): the recorded outputs disagree -- (103, 2) was noted here, but
# (109, 1) is noted for the normalized arrays below. The row counts must match,
# so one of the two notes is stale; re-run the cell to confirm.
print(f'training_data.shape = {training_data.shape}')

# ----------------------------------------------------------------
# Min-Max normalization

# A 1-D array has to be reshaped to a single column with reshape(-1, 1).
x_data = training_data['Temp'].values.reshape(-1, 1)
t_data = training_data['Ozone'].values.reshape(-1, 1)

# One scaler per variable, so each one can be inverted independently later.
scaler_x = MinMaxScaler()
scaler_t = MinMaxScaler()

# Fit each scaler on its own column and apply it in the same call.
x_data_norm = scaler_x.fit_transform(x_data)
t_data_norm = scaler_t.fit_transform(t_data)

print(f'type(x_data_norm) = {type(x_data_norm)}')  # recorded: <class 'numpy.ndarray'>
print(f'x_data_norm.shape = {x_data_norm.shape}')  # recorded: (109, 1) -- see NOTE(review) above
print(f't_data_norm.shape = {t_data_norm.shape}')  # recorded: (109, 1) -- see NOTE(review) above

# MinMaxScaler

In [None]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# MinMaxScaler round trip on a single feature: fit, transform, then invert.

# A 1-D array has to be reshaped to a single column with reshape(-1, 1).
data = np.array([1, 5, 7, 2, 9, 11]).reshape(-1, 1)

scaler = MinMaxScaler()
scaler.fit(data)  # learn the scaling parameters from data
scaled_data = scaler.transform(data)  # apply the fitted scaling to the original data
# Map the scaled values back to the original scale. The result is stored under
# its own name so that `data` still holds the untouched input when it is
# printed below; otherwise the "original" shown first would already be the
# round-tripped array and the comparison would be meaningless.
restored_data = scaler.inverse_transform(scaled_data)

banner = '=============================================='
# Printed in order: the real input, its scaled form, and the values recovered
# by inverse_transform (labelled 'original' again, as they should match the input).
for label, shown in (('original', data),
                     ('scaled', scaled_data),
                     ('original', restored_data)):
    print(banner)
    print(label)
    print(banner)
    print(shown)


In [None]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Min-Max scaling of a 2-D numpy array (3 rows x 3 columns).
data = np.array([
    [1, 0.1, 0.3],
    [4, 6.7, 6.0],
    [7, 18.0, 29.0],
])

# Define the scaler; feature_range sets the normalization range to 0~1.
scaler = MinMaxScaler(feature_range=(0, 1))

# Fit the scaler on the array and apply it in a single call.
scaled_data = scaler.fit_transform(data)

# Print the input and the scaled result, each under a banner-framed label.
banner = '=============================================='
for label, shown in (('original', data), ('scaled', scaled_data)):
    print(banner)
    print(label)
    print(banner)
    print(shown)

In [None]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Min-Max scaling of a pandas DataFrame, followed by a lookup of the
# per-column minimum and maximum the scaler recorded while fitting.
data = {
    'A': [10, 20, 30, 40],
    'B': [1, 2, 3, 4],
    'C': [100, 200, 300, 400],
}
df = pd.DataFrame(data)

# Define the scaler; here feature_range sets the normalization range to -1~1.
scaler = MinMaxScaler(feature_range=(-1, 1))

# Apply the scaler to the DataFrame and re-wrap the result in a DataFrame
# that carries the same column labels as df.
scaled_data = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
# Alternative without the re-wrap: scaled_data = scaler.fit_transform(df)

# Print the input and the scaled result, each under a banner-framed label.
banner = '=============================================='
for label, shown in (('original', df), ('scaled', scaled_data)):
    print(banner)
    print(label)
    print(banner)
    print(shown)

# Max / min values that were used to normalize one specific column.
col_name = 'A'  # name of the column to look up
col_pos = df.columns.get_loc(col_name)  # position of that column in the fitted data
col_min = scaler.data_min_[col_pos]
col_max = scaler.data_max_[col_pos]
print(f'col_min = {col_min}')
print(f'col_max = {col_max}')