### 전처리(Preprocessing)
* 원본 raw 데이터의 품질이 좋지 않아서 가공하여 데이터의 품질을 향상시키는 과정

#### 피처 스케일링
#### (1) 표준화(Standardization) : 데이터가 평균으로부터 얼마나 떨어져 있는지를 나타내는 값(Z-score)으로 변환하며, 이 값이 특정 범위를 벗어난 데이터는 이상치(outlier)로 간주하여 제거할 수 있다. StandardScaler 사용, Z = (X - mean())/std(), 변환 후 평균=0, 분산=1 (데이터가 정규분포(가우시안 분포)를 따르면 표준정규분포가 된다)

#### (2) 정규화(Normalization) : 데이터의 범위를 일치시키거나 분포를 유사하게 만들어 주는 기능으로, 데이터의 범위를 0 ~ 1 사이의 값으로 변환한다. MinMaxScaler 사용, Z = (X - min())/(max() - min())

In [2]:
import numpy as np
import pandas as pd

### 표준화(Standardization)

In [25]:
# (1) Standardization with plain NumPy: Z = (X - mean) / std
# Random integers in [0, 1000) arranged as a 3x4 two-dimensional array.
data = np.random.randint(0, 1000, size=(3, 4))
print('raw_data:\n', data)

# axis=0 takes the statistics down each column, so every column is
# standardized independently of the others.
data_stand_np = (data - data.mean(axis=0)) / data.std(axis=0)
print('numpy standardized data:\n', data_stand_np)

raw_data:
 [[627 954 594 246]
 [123  84  57 773]
 [961 403 559 177]]
numpy standardized data:
 [[ 0.16451318  1.31808087  0.77724422 -0.57352879]
 [-1.29868643 -1.102884   -1.4118125   1.40627257]
 [ 1.13417325 -0.21519688  0.63456827 -0.83274377]]


In [30]:
# (2) Standardization with sklearn.preprocessing.StandardScaler.
# fit_transform() is shorthand for fit() followed by transform().

from sklearn.preprocessing import StandardScaler

# One-shot form: fit and transform in a single call.
data_stand_skl = StandardScaler().fit_transform(data)
print('sklearn standardized data:\n', data_stand_skl)

# Equivalent two-step form that keeps the fitted scaler instance around.
# type(sc) is <class 'sklearn.preprocessing._data.StandardScaler'>
sc = StandardScaler().fit(data)  # fit() returns the scaler itself
data_stand_skl = sc.transform(data)
print('sklearn standardized data:\n', data_stand_skl)

sklearn standardized data:
 [[ 0.16451318  1.31808087  0.77724422 -0.57352879]
 [-1.29868643 -1.102884   -1.4118125   1.40627257]
 [ 1.13417325 -0.21519688  0.63456827 -0.83274377]]
sklearn standardized data:
 [[ 0.16451318  1.31808087  0.77724422 -0.57352879]
 [-1.29868643 -1.102884   -1.4118125   1.40627257]
 [ 1.13417325 -0.21519688  0.63456827 -0.83274377]]


In [33]:
# fit_transform() 메서드
from sklearn.base import TransformerMixin

class A(TransformerMixin):
    """Minimal transformer that traces the calls made by fit_transform().

    TransformerMixin supplies fit_transform(), which calls fit() and then
    transform() on the object fit() returns; the prints make that order
    visible.
    """

    def fit(self, X, y=None):
        """Print the input and return self so the call can be chained.

        y is accepted (and ignored) because TransformerMixin.fit_transform()
        forwards it to fit() whenever a target is supplied; without the
        parameter that call would raise TypeError.
        """
        print('fit:', X)
        return self

    def transform(self, X):
        """Print the input and hand it back unchanged."""
        print('transform:', X)
        return X

inst = A()  # create an instance of the class
# The two steps can also be invoked separately:
# inst.fit('x_data')
# inst.transform('y_data')
inst.fit_transform('my_data')  # runs fit() and then transform(), as the printed trace shows

fit: my_data
transform: my_data


'my_data'

In [41]:
# Standardize the contents of a CSV file.
# Relies on pandas (pd) and StandardScaler imported in earlier cells.

df = pd.read_csv('표준화예제데이터.csv')

# Learn each column's statistics, then apply the scaling.
scaler = StandardScaler()
scaled_data = scaler.fit(df).transform(df)

# Wrap the scaled values in a DataFrame that reuses the original column names.
df_scaled_data = pd.DataFrame(data=scaled_data, columns=df.columns)

# Save to a new file without the index column, then preview the first rows.
df_scaled_data.to_csv('표준화 처리된 데이터.csv', index=False)
df_scaled_data.head()

Unnamed: 0,feature1,feature2,feature3
0,0.56142,-1.299977,1.253192
1,0.124558,-0.908903,-0.830076
2,0.369201,-0.511256,1.270381
3,-1.158068,0.967597,-0.737257
4,1.089149,1.592001,-0.627249


### 정규화(Normalization)

In [46]:
# (1) Min-max normalization with plain NumPy, mapping each column to 0 ~ 1:
# Z = (X - min) / (max - min)

# axis=0 takes min/max down each column, so every column is rescaled on its own.
data_minmax_np = (data - data.min(axis=0)) / (data.max(axis=0) - data.min(axis=0))
print('numpy minmax data:\n', data_minmax_np)

numpy minmax data:
 [[0.60143198 1.         1.         0.11577181]
 [0.         0.         0.         1.        ]
 [1.         0.36666667 0.93482309 0.        ]]


In [49]:
# (2) Min-max normalization with sklearn.preprocessing.MinMaxScaler.
# fit_transform() is shorthand for fit() followed by transform().

from sklearn.preprocessing import MinMaxScaler

# Spelled out as the two chained steps; fit() returns the scaler itself.
data_minmax_skl = MinMaxScaler().fit(data).transform(data)
print('sklearn minmax data:\n', data_minmax_skl)

sklearn minmax data:
 [[0.60143198 1.         1.         0.11577181]
 [0.         0.         0.         1.        ]
 [1.         0.36666667 0.93482309 0.        ]]


In [52]:
# Min-max normalize the contents of a CSV file.
# Relies on pandas (pd) and MinMaxScaler imported in earlier cells.

df = pd.read_csv('표준화예제데이터.csv')

# Learn each column's min and max, then apply the rescaling.
scaler = MinMaxScaler()
scaled_data = scaler.fit(df).transform(df)

# Wrap the scaled values in a DataFrame that reuses the original column names.
df_scaled_data = pd.DataFrame(data=scaled_data, columns=df.columns)

# Save to a new file without the index column, then preview the first rows.
df_scaled_data.to_csv('정규화 처리된 데이터.csv', index=False)
df_scaled_data.head()

Unnamed: 0,feature1,feature2,feature3
0,0.692308,0.075884,0.87538
1,0.564103,0.199584,0.261398
2,0.635897,0.325364,0.880446
3,0.187692,0.793139,0.288754
4,0.847179,0.990644,0.321175


In [58]:
# When the minimum is 0, min-max normalization reduces to dividing by the
# maximum: (X - 0) / (max - 0) = X / max.
a = np.arange(256)  # integers 0..255
a / 255

array([0.        , 0.00392157, 0.00784314, 0.01176471, 0.01568627,
       0.01960784, 0.02352941, 0.02745098, 0.03137255, 0.03529412,
       0.03921569, 0.04313725, 0.04705882, 0.05098039, 0.05490196,
       0.05882353, 0.0627451 , 0.06666667, 0.07058824, 0.0745098 ,
       0.07843137, 0.08235294, 0.08627451, 0.09019608, 0.09411765,
       0.09803922, 0.10196078, 0.10588235, 0.10980392, 0.11372549,
       0.11764706, 0.12156863, 0.1254902 , 0.12941176, 0.13333333,
       0.1372549 , 0.14117647, 0.14509804, 0.14901961, 0.15294118,
       0.15686275, 0.16078431, 0.16470588, 0.16862745, 0.17254902,
       0.17647059, 0.18039216, 0.18431373, 0.18823529, 0.19215686,
       0.19607843, 0.2       , 0.20392157, 0.20784314, 0.21176471,
       0.21568627, 0.21960784, 0.22352941, 0.22745098, 0.23137255,
       0.23529412, 0.23921569, 0.24313725, 0.24705882, 0.25098039,
       0.25490196, 0.25882353, 0.2627451 , 0.26666667, 0.27058824,
       0.2745098 , 0.27843137, 0.28235294, 0.28627451, 0.29019

### 레이블 인코딩(Label Encoding)

In [60]:
from sklearn.preprocessing import LabelEncoder

items = ['TV', '냉장고', '전자레인지', '컴퓨터', '선풍기', '믹서', '선풍기', '믹서']

# Keep the fitted encoder in a variable: the following cells read
# encoder.classes_ and call encoder.inverse_transform(), so encoding through
# a throwaway LabelEncoder() instance would leave `encoder` undefined.
encoder = LabelEncoder()
labels = encoder.fit_transform(items)  # same as encoder.fit(items) then encoder.transform(items)
print('encoding 변환 값:', labels)

encoding 변환 값: [0 1 4 5 3 2 3 2]


In [61]:
# classes_ lists the distinct original labels; the position of each label is
# its encoded integer. Requires a fitted LabelEncoder bound to `encoder`.
print('encoding 클래스:',encoder.classes_)

encoding 클래스: ['TV' '냉장고' '믹서' '선풍기' '전자레인지' '컴퓨터']


In [63]:
# inverse_transform() maps encoded integers back to the original labels.
# Requires a fitted LabelEncoder bound to `encoder`.
print('decding 원본 값:', encoder.inverse_transform([0, 1, 4, 5, 3, 2, 3, 2]))

decding 원본 값: ['TV' '냉장고' '전자레인지' '컴퓨터' '선풍기' '믹서' '선풍기' '믹서']


### 결측치 처리 (결측치 제거/대체)

In [90]:
import pandas as pd

df = pd.read_csv('결측치예제데이터.csv')
# Replace every missing value with 0; the result is bound to df2 while df
# keeps referring to the frame that was read.
df2 = df.fillna(value=0)
df2

Unnamed: 0,학생_ID,수학,영어,과학,사회
0,1,85.0,0.0,88.0,0.0
1,2,0.0,75.0,92.0,83.0
2,3,78.0,82.0,0.0,78.0
3,4,92.0,90.0,85.0,88.0
4,5,0.0,85.0,79.0,90.0
5,6,88.0,0.0,94.0,0.0
6,7,76.0,91.0,0.0,85.0
7,8,95.0,88.0,87.0,87.0
8,9,0.0,77.0,91.0,0.0
9,10,89.0,0.0,0.0,92.0


In [67]:
# NOTE(review): the recorded output of this cell is <class 'float'>, which a
# bare `np` cannot produce (it would only display the module). Presumably the
# cell inspected the type of NaN, the value pandas uses for missing entries.
print(type(np.nan))

<class 'float'>


In [80]:
# np.divide does not raise ZeroDivisionError here; it yields the special
# float values nan and inf instead.
print(np.divide(0,0))   # nan  (0/0 is undefined)
print(np.divide(10,0))  # inf  (non-zero value divided by 0)

nan
inf


  print(np.divide(0,0))
  print(np.divide(10,0))


###  피처(feature) 엔지니어링
- 피처 selection  : 여러 피처(X) 중에서 필요한 피처만 선택
- 피처 extraction : 기존 피처를 가공하거나 다른 방법을 사용하여 새로운 피처를 추출해내는 방법