## 데이터 전처리

### 실습 데이터셋 준비

In [1]:
# 관련 라이브러리를 호출합니다.
import pandas as pd

In [2]:
# Download the shared iris CSV file and load it into the DataFrame `df`.
df = pd.read_csv('https://bit.ly/Iris_Data')

In [3]:
# Print a summary of df: index range, column names, non-null counts,
# dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Sepal.Length  150 non-null    float64
 1   Sepal.Width   150 non-null    float64
 2   Petal.Length  150 non-null    float64
 3   Petal.Width   150 non-null    float64
 4   Species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [4]:
# Display the first five rows of df.
df.head(5)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [5]:
# Show descriptive statistics of the continuous (numeric) columns,
# rounded to two decimal places.
# [Note] The standard deviation reported here is computed with n-1 in the
# denominator (sample standard deviation).
df.describe().round(2)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
count,150.0,150.0,150.0,150.0
mean,5.84,3.06,3.76,1.2
std,0.83,0.44,1.77,0.76
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [6]:
# Show descriptive statistics of the categorical (object-dtype) column:
# count, number of unique values, most frequent value and its frequency.
df.describe(include = 'object')

Unnamed: 0,Species
count,150
unique,3
top,setosa
freq,50


In [7]:
# Count how many rows belong to each category of the Species column.
df['Species'].value_counts()

Species
setosa        50
versicolor    50
virginica     50
Name: count, dtype: int64

In [8]:
# Drop the categorical Species column and keep the remaining columns as X.
X = df.drop(columns=['Species'])

### 데이터 표준화

In [9]:
# 관련 라이브러리를 호출합니다.
from sklearn.preprocessing import StandardScaler

In [10]:
# Standardize X column by column: each column is centred on its mean and
# divided by its standard deviation.
scaler = StandardScaler()
scaled = scaler.fit_transform(X)

In [11]:
# Convert the 2-D array returned by the scaler back into a DataFrame.
# Both the column labels and the row index of X are reused so that the scaled
# rows stay aligned with the original data even if X does not carry the
# default 0..n-1 index (e.g. after filtering or a train/test split).
scaled = pd.DataFrame(data = scaled, index = X.index, columns = X.columns)
scaled

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
0,-0.900681,1.019004,-1.340227,-1.315444
1,-1.143017,-0.131979,-1.340227,-1.315444
2,-1.385353,0.328414,-1.397064,-1.315444
3,-1.506521,0.098217,-1.283389,-1.315444
4,-1.021849,1.249201,-1.340227,-1.315444
...,...,...,...,...
145,1.038005,-0.131979,0.819596,1.448832
146,0.553333,-1.282963,0.705921,0.922303
147,0.795669,-0.131979,0.819596,1.053935
148,0.432165,0.788808,0.933271,1.448832


In [12]:
# Check the per-column descriptive statistics of the standardized DataFrame.
# [Note] The means are 0 as expected. The std row shows 1.0034 rather than
# exactly 1 because StandardScaler divides by the population standard
# deviation (denominator n), while describe() reports the sample standard
# deviation (denominator n-1): sqrt(150 / 149) ≈ 1.0034.
scaled.describe().round(4)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
count,150.0,150.0,150.0,150.0
mean,-0.0,-0.0,-0.0,-0.0
std,1.0034,1.0034,1.0034,1.0034
min,-1.87,-2.4339,-1.5676,-1.4471
25%,-0.9007,-0.5924,-1.2266,-1.1838
50%,-0.0525,-0.132,0.3365,0.1325
75%,0.6745,0.5586,0.7628,0.7907
max,2.492,3.0908,1.7858,1.7121


### 최소-최대 정규화

In [13]:
# 관련 라이브러리를 호출합니다.
from sklearn.preprocessing import MinMaxScaler

In [14]:
# Min-max normalize X column by column: each column is rescaled so that its
# minimum maps to 0 and its maximum maps to 1.
scaler = MinMaxScaler()
scaled = scaler.fit_transform(X)

In [15]:
# Convert the 2-D array returned by the scaler back into a DataFrame.
# Both the column labels and the row index of X are reused so that the scaled
# rows stay aligned with the original data even if X does not carry the
# default 0..n-1 index (e.g. after filtering or a train/test split).
scaled = pd.DataFrame(data = scaled, index = X.index, columns = X.columns)
scaled

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
0,0.222222,0.625000,0.067797,0.041667
1,0.166667,0.416667,0.067797,0.041667
2,0.111111,0.500000,0.050847,0.041667
3,0.083333,0.458333,0.084746,0.041667
4,0.194444,0.666667,0.067797,0.041667
...,...,...,...,...
145,0.666667,0.416667,0.711864,0.916667
146,0.555556,0.208333,0.677966,0.750000
147,0.611111,0.416667,0.711864,0.791667
148,0.527778,0.583333,0.745763,0.916667


In [16]:
# Check the per-column descriptive statistics of the min-max normalized
# DataFrame (every column should now have min 0 and max 1).
scaled.describe().round(4)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
count,150.0,150.0,150.0,150.0
mean,0.4287,0.4406,0.4675,0.4581
std,0.23,0.1816,0.2992,0.3176
min,0.0,0.0,0.0,0.0
25%,0.2222,0.3333,0.1017,0.0833
50%,0.4167,0.4167,0.5678,0.5
75%,0.5833,0.5417,0.6949,0.7083
max,1.0,1.0,1.0,1.0


### 로버스트 스케일링

In [17]:
# 관련 라이브러리를 호출합니다.
from sklearn.preprocessing import RobustScaler

In [18]:
# Robust-scale X column by column: each column is centred on its median and
# divided by its interquartile range, which makes the result less sensitive
# to outliers than mean/standard-deviation scaling.
scaler = RobustScaler()
scaled = scaler.fit_transform(X)

In [19]:
# Convert the 2-D array returned by the scaler back into a DataFrame.
# Both the column labels and the row index of X are reused so that the scaled
# rows stay aligned with the original data even if X does not carry the
# default 0..n-1 index (e.g. after filtering or a train/test split).
scaled = pd.DataFrame(data = scaled, index = X.index, columns = X.columns)
scaled

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
0,-0.538462,1.0,-0.842857,-0.733333
1,-0.692308,0.0,-0.842857,-0.733333
2,-0.846154,0.4,-0.871429,-0.733333
3,-0.923077,0.2,-0.814286,-0.733333
4,-0.615385,1.2,-0.842857,-0.733333
...,...,...,...,...
145,0.692308,0.0,0.242857,0.666667
146,0.384615,-1.0,0.185714,0.400000
147,0.538462,0.0,0.242857,0.466667
148,0.307692,0.8,0.300000,0.666667


In [20]:
# Check the per-column descriptive statistics of the robust-scaled DataFrame
# (every column should now have a median of 0).
scaled.describe().round(4)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width
count,150.0,150.0,150.0,150.0
mean,0.0333,0.1147,-0.1691,-0.0671
std,0.637,0.8717,0.5044,0.5082
min,-1.1538,-2.0,-0.9571,-0.8
25%,-0.5385,-0.4,-0.7857,-0.6667
50%,0.0,0.0,0.0,0.0
75%,0.4615,0.6,0.2143,0.3333
max,1.6154,2.8,0.7286,0.8


### 원-핫 인코딩

In [21]:
# One-hot encode the categorical Species column: it is replaced by one 0/1
# integer indicator column per category, and no category is dropped.
pd.get_dummies(df, columns=['Species'], prefix='Species', drop_first=False, dtype=int)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species_setosa,Species_versicolor,Species_virginica
0,5.1,3.5,1.4,0.2,1,0,0
1,4.9,3.0,1.4,0.2,1,0,0
2,4.7,3.2,1.3,0.2,1,0,0
3,4.6,3.1,1.5,0.2,1,0,0
4,5.0,3.6,1.4,0.2,1,0,0
...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,0,0,1
146,6.3,2.5,5.0,1.9,0,0,1
147,6.5,3.0,5.2,2.0,0,0,1
148,6.2,3.4,5.4,2.3,0,0,1


### 레이블 인코딩

In [22]:
# 관련 라이브러리를 호출합니다.
from sklearn.preprocessing import LabelEncoder

In [23]:
# Label-encode the Species column: each category is mapped to an integer code.
# LabelEncoder.fit_transform declares its parameter as `y`, not `X`, so the
# column is passed positionally. Depending on the scikit-learn version, the
# `X=` keyword raises a TypeError; the positional form works on all versions.
le = LabelEncoder()
y = le.fit_transform(df['Species'])

In [24]:
# Wrap the 1-D array of encoded labels in a pandas Series and display it.
y = pd.Series(y)
y

0      0
1      0
2      0
3      0
4      0
      ..
145    2
146    2
147    2
148    2
149    2
Length: 150, dtype: int64

## End of Document