### 1. 결측치 처리
- 제거
- 채우기

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Toy score table: score1, score2 and score4 contain NaN; score3 is complete.
d = {
    'score1': [100, 90, np.nan, 95],
    'score2': [30, np.nan, 45, 56],
    'score3': [52, 40, 80, 98],
    'score4': [np.nan, np.nan, np.nan, 65],
}

In [4]:
# Build the example frame from the dict of columns and preview its first rows.
df = pd.DataFrame(d)
df.head()

Unnamed: 0,score1,score2,score3,score4
0,100.0,30.0,52,
1,90.0,,40,
2,,45.0,80,
3,95.0,56.0,98,65.0


In [5]:
# Print dtype and non-null count per column; a count below 4 means missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   score1  3 non-null      float64
 1   score2  3 non-null      float64
 2   score3  4 non-null      int64  
 3   score4  1 non-null      float64
dtypes: float64(3), int64(1)
memory usage: 256.0 bytes


In [7]:
# Count the missing values in each column.
df.isna().sum()

score1    1
score2    1
score3    0
score4    3
dtype: int64

In [8]:
# Show only the rows whose score1 value is missing.
df.loc[df['score1'].isna()]

Unnamed: 0,score1,score2,score3,score4
2,,45.0,80,


#### 행기준으로 삭제

In [9]:
# Drop every row that contains at least one NaN; only row 3 is complete.
df.dropna()

Unnamed: 0,score1,score2,score3,score4
3,95.0,56.0,98,65.0


In [10]:
# Same result as the default call, with the row axis and the rule spelled out.
df.dropna(axis=0, how='any')
# how='any' -> drop the row if any of its values is missing

Unnamed: 0,score1,score2,score3,score4
3,95.0,56.0,98,65.0


#### 열기준

In [11]:
# Drop every column that contains a NaN; only score3 has no missing values.
df.dropna(axis='columns')

Unnamed: 0,score3
0,52
1,40
2,80
3,98


#### 행의 전체값이 결측인 행을 삭제

In [12]:
# Second toy table: row 1 is entirely NaN and score2 is NaN in every row.
d2 = {
    'score1': [100, np.nan, np.nan, 95],
    'score2': [np.nan] * 4,
    'score3': [52, np.nan, 80, 98],
    'score4': [np.nan] * 3 + [65],
}

In [14]:
# Build the second example frame and display it in full.
df2 = pd.DataFrame(d2)
df2

Unnamed: 0,score1,score2,score3,score4
0,100.0,,52.0,
1,,,,
2,,,80.0,
3,95.0,,98.0,65.0


In [15]:
df2.dropna(how='all')
# how='all' -> drop a row only when every value in it is missing (row 1 here)

Unnamed: 0,score1,score2,score3,score4
0,100.0,,52.0,
2,,,80.0,
3,95.0,,98.0,65.0


#### 임계치 설정해서 제거
- 임계치 = 기준값

In [16]:
# thresh=2 keeps only the rows that have at least two non-missing values.
df2.dropna(thresh=2)  # choosing a sensible threshold is up to the analyst

Unnamed: 0,score1,score2,score3,score4
0,100.0,,52.0,
3,95.0,,98.0,65.0


#### 특정 열 안에서 삭제

In [17]:
# Only score2 and score4 are checked: drop rows with a NaN in either of them.
df.dropna(subset=['score2', 'score4'])

Unnamed: 0,score1,score2,score3,score4
3,95.0,56.0,98,65.0


### 결측치 채우기

#### 특정한 단일값으로 채우기

In [18]:
# Replace every NaN with the constant 0.
df.fillna(0)

Unnamed: 0,score1,score2,score3,score4
0,100.0,30.0,52,0.0
1,90.0,0.0,40,0.0
2,0.0,45.0,80,0.0
3,95.0,56.0,98,65.0


In [19]:
# Forward fill: each NaN takes the most recent non-missing value above it in
# the same column. DataFrame.ffill() replaces fillna(method='pad'), whose
# `method` argument is deprecated since pandas 2.1.
# score4 stays NaN in rows 0-2 because there is no earlier value to carry forward.
df.ffill()

Unnamed: 0,score1,score2,score3,score4
0,100.0,30.0,52,
1,90.0,30.0,40,
2,90.0,45.0,80,
3,95.0,56.0,98,65.0


In [21]:
# Backward fill: each NaN takes the next non-missing value below it in the
# same column. DataFrame.bfill() replaces fillna(method='bfill'), whose
# `method` argument is deprecated since pandas 2.1.
df.bfill()

Unnamed: 0,score1,score2,score3,score4
0,100.0,30.0,52,65.0
1,90.0,45.0,40,65.0
2,95.0,45.0,80,65.0
3,95.0,56.0,98,65.0


#### 결측치를 각 열의 평균값으로 채우기

In [22]:
df.fillna(df.mean())
# df.mean() -> per-column means; each NaN is replaced by the mean of its own column.
# mean() skips NaN, so only the non-missing values are averaged:
# score1 -> (100 + 90 + 95) / 3 = 95
# score4 -> 65 / 1 = 65 (the three NaN entries are excluded from both sum and count)

Unnamed: 0,score1,score2,score3,score4
0,100.0,30.0,52,65.0
1,90.0,43.666667,40,65.0
2,95.0,45.0,80,65.0
3,95.0,56.0,98,65.0


#### 결측치를 각 열의 중앙값, 최솟값, 최댓값으로 채우기

In [23]:
# Fill each NaN with the median of its own column.
df.fillna(df.median())

Unnamed: 0,score1,score2,score3,score4
0,100.0,30.0,52,65.0
1,90.0,45.0,40,65.0
2,95.0,45.0,80,65.0
3,95.0,56.0,98,65.0


In [24]:
# Fill each NaN with the minimum of its own column.
df.fillna(df.min())

Unnamed: 0,score1,score2,score3,score4
0,100.0,30.0,52,65.0
1,90.0,30.0,40,65.0
2,90.0,45.0,80,65.0
3,95.0,56.0,98,65.0


In [27]:
# Fill each NaN with the maximum of its own column.
df.fillna(df.max())

Unnamed: 0,score1,score2,score3,score4
0,100.0,30.0,52,65.0
1,90.0,56.0,40,65.0
2,100.0,45.0,80,65.0
3,95.0,56.0,98,65.0


#### replace() 함수로 결측치 채우기

In [29]:
# Swap every NaN for 10 via replace(); the replacement value may also be a string.
df.replace(np.nan, 10)

Unnamed: 0,score1,score2,score3,score4
0,100.0,30.0,52,10.0
1,90.0,10.0,40,10.0
2,10.0,45.0,80,10.0
3,95.0,56.0,98,65.0


#### interpolate() 함수로 결측치 채우기 
- 선형 방법을 사용해서 결측값 채워줌
- 선형방식은 인덱스를 무시하고 값들을 같은 간격으로 처리함.

In [31]:
# Linear interpolation per column: a NaN between two known values is placed
# evenly between them (score1 row 2 -> 92.5, score2 row 1 -> 37.5).
# Filling runs forward only, so the leading NaNs in score4 remain NaN.
df.interpolate(method='linear', limit_direction='forward')

Unnamed: 0,score1,score2,score3,score4
0,100.0,30.0,52,
1,90.0,37.5,40,
2,92.5,45.0,80,
3,95.0,56.0,98,65.0


In [32]:
# Forward fill: carry the last non-missing value down each column.
# interpolate(method='pad') is deprecated since pandas 2.1; DataFrame.ffill()
# is the supported spelling and gives the same result for this frame.
df.ffill()

Unnamed: 0,score1,score2,score3,score4
0,100.0,30.0,52,
1,90.0,30.0,40,
2,90.0,45.0,80,
3,95.0,56.0,98,65.0


### 2.범주 특성을 원핫 인코딩으로 변환
- 모든 데이터를 0과 1로 변환
- 컴퓨터는 모든 데이터를 이진법으로 처리하기 때문에 레이블 인코딩보다는 원핫 인코딩을 주로 사용한다.

In [34]:
# Load the mushroom dataset from the working directory (rebinds `df`).
# The displayed columns hold single-character category codes, with 'class' first.
df = pd.read_csv('mushrooms.csv')
df

Unnamed: 0,class,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,p,x,s,n,t,p,f,c,n,k,...,s,w,w,p,w,o,p,k,s,u
1,e,x,s,y,t,a,f,c,b,k,...,s,w,w,p,w,o,p,n,n,g
2,e,b,s,w,t,l,f,c,b,n,...,s,w,w,p,w,o,p,n,n,m
3,p,x,y,w,t,p,f,c,n,n,...,s,w,w,p,w,o,p,k,s,u
4,e,x,s,g,f,n,f,w,b,k,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,e,k,s,n,f,n,a,c,b,y,...,s,o,o,p,o,o,p,b,c,l
8120,e,x,s,n,f,n,a,c,b,y,...,s,o,o,p,n,o,p,b,v,l
8121,e,f,s,n,f,n,a,c,b,n,...,s,o,o,p,o,o,p,b,c,l
8122,p,k,y,n,f,y,f,c,n,b,...,k,w,w,p,w,o,e,w,v,l


In [35]:
# One-hot encode every column of the frame, one indicator column per category.
# dtype is pinned to uint8 so the indicators are 0/1 on every pandas version;
# since pandas 2.0 the default dtype is bool, which would display True/False.
one = pd.get_dummies(df, dtype=np.uint8)
one

Unnamed: 0,class_e,class_p,cap-shape_b,cap-shape_c,cap-shape_f,cap-shape_k,cap-shape_s,cap-shape_x,cap-surface_f,cap-surface_g,...,population_s,population_v,population_y,habitat_d,habitat_g,habitat_l,habitat_m,habitat_p,habitat_u,habitat_w
0,0,1,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
1,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,1,0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
4,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,1,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
8120,1,0,0,0,0,0,0,1,0,0,...,0,1,0,0,0,1,0,0,0,0
8121,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
8122,0,1,0,0,0,1,0,0,0,0,...,0,1,0,0,0,1,0,0,0,0


- pandas.get_dummies의 문제점

train 데이터에만 있고 test 데이터에는 없는 카테고리(또는 그 반대)가 있으면, 두 데이터에 get_dummies를 각각 적용했을 때 생성되는 컬럼이 서로 달라져 일관된 원핫 인코딩 컬럼을 만들지 못함

#### sklearn OneHotEncoder 사용

In [46]:
# Features: every column after the first one ('class' is column 0 in this file).
x = df.iloc[:, 1:]
# Target: the 'class' label column.
y = df['class']

In [47]:
# Display the feature frame.
x

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
0,x,s,n,t,p,f,c,n,k,e,...,s,w,w,p,w,o,p,k,s,u
1,x,s,y,t,a,f,c,b,k,e,...,s,w,w,p,w,o,p,n,n,g
2,b,s,w,t,l,f,c,b,n,e,...,s,w,w,p,w,o,p,n,n,m
3,x,y,w,t,p,f,c,n,n,e,...,s,w,w,p,w,o,p,k,s,u
4,x,s,g,f,n,f,w,b,k,t,...,s,w,w,p,w,o,e,n,a,g
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,k,s,n,f,n,a,c,b,y,e,...,s,o,o,p,o,o,p,b,c,l
8120,x,s,n,f,n,a,c,b,y,e,...,s,o,o,p,n,o,p,b,v,l
8121,f,s,n,f,n,a,c,b,n,e,...,s,o,o,p,o,o,p,b,c,l
8122,k,y,n,f,y,f,c,n,b,t,...,k,w,w,p,w,o,e,w,v,l


In [48]:
# Display the target labels.
y

0       p
1       e
2       e
3       p
4       e
       ..
8119    e
8120    e
8121    e
8122    p
8123    e
Name: class, Length: 8124, dtype: object

In [49]:
from sklearn.preprocessing import OneHotEncoder

In [50]:
# Dense one-hot encoding of the single 'cap-shape' column (rebinds `one`).
# `sparse=False` was renamed to `sparse_output=False` in scikit-learn 1.2 and
# the old keyword was removed in 1.4, so the new name is used here.
one = OneHotEncoder(sparse_output=False)
# Double brackets select a one-column DataFrame, i.e. 2-D input for the encoder.
train_cat = one.fit_transform(x[['cap-shape']])

In [51]:
# Display the encoded array: one row per sample, one column per category.
train_cat

array([[0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 1.]])

In [55]:
# Categories learned by the encoder, in the same order as the encoded columns.
one.categories_

[array(['b', 'c', 'f', 'k', 's', 'x'], dtype=object)]

In [59]:
# Wrap the dense one-hot array in a DataFrame with one named column per category.
# index=x.index ties the rows to x explicitly, so the later column-wise concat
# aligns even if x does not carry the default 0..n-1 index.
# NOTE(review): the prefix is 'caps-shape_' while the source column is
# 'cap-shape'; kept unchanged because later cell outputs show these names —
# confirm whether it is a typo.
o = pd.DataFrame(
    train_cat,
    index=x.index,
    columns=[f'caps-shape_{category}' for category in one.categories_[0]],
)

In [60]:
# Replace the raw 'cap-shape' column with its one-hot columns: drop it from x
# and append `o` column-wise (axis=1); rows are matched on the index.
one_x = pd.concat([x.drop(columns=['cap-shape']), o], axis=1)
one_x

Unnamed: 0,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,...,ring-type,spore-print-color,population,habitat,caps-shape_b,caps-shape_c,caps-shape_f,caps-shape_k,caps-shape_s,caps-shape_x
0,s,n,t,p,f,c,n,k,e,e,...,p,k,s,u,0.0,0.0,0.0,0.0,0.0,1.0
1,s,y,t,a,f,c,b,k,e,c,...,p,n,n,g,0.0,0.0,0.0,0.0,0.0,1.0
2,s,w,t,l,f,c,b,n,e,c,...,p,n,n,m,1.0,0.0,0.0,0.0,0.0,0.0
3,y,w,t,p,f,c,n,n,e,e,...,p,k,s,u,0.0,0.0,0.0,0.0,0.0,1.0
4,s,g,f,n,f,w,b,k,t,e,...,e,n,a,g,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,s,n,f,n,a,c,b,y,e,?,...,p,b,c,l,0.0,0.0,0.0,1.0,0.0,0.0
8120,s,n,f,n,a,c,b,y,e,?,...,p,b,v,l,0.0,0.0,0.0,0.0,0.0,1.0
8121,s,n,f,n,a,c,b,n,e,?,...,p,b,c,l,0.0,0.0,1.0,0.0,0.0,0.0
8122,y,n,f,y,f,c,n,b,t,?,...,e,w,v,l,0.0,0.0,0.0,1.0,0.0,0.0


### 범주 특성을 레이블 인코딩으로 변환
- 숫자의 크고 작음에 대한 특성이 작용
- 회귀와 같이 연속된 실수를 다루는 알고리즘에서 숫자 크기에 따른 순서나 중요도로 인식될 수 있어서 잘못된 결과가 나올 수 있다.

In [62]:
# List the distinct category codes present in 'cap-surface'.
one_x['cap-surface'].unique()

array(['s', 'y', 'f', 'g'], dtype=object)

In [63]:
# Frequency of each 'cap-surface' category, most common first.
one_x['cap-surface'].value_counts()

y    3244
s    2556
f    2320
g       4
Name: cap-surface, dtype: int64

In [64]:
# Display the raw 'cap-surface' column before it is encoded.
one_x['cap-surface']

0       s
1       s
2       s
3       y
4       s
       ..
8119    s
8120    s
8121    s
8122    y
8123    s
Name: cap-surface, Length: 8124, dtype: object

In [66]:
# Manual label encoding: map each category code to an integer, numbered in
# descending order of frequency (y=0, s=1, f=2, g=3). The dict covers all four
# codes reported by unique() above. The column is overwritten in place.
one_x['cap-surface'] = one_x['cap-surface'].map({'y':0, 's':1, 'f':2, 'g':3})
one_x

Unnamed: 0,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,...,ring-type,spore-print-color,population,habitat,caps-shape_b,caps-shape_c,caps-shape_f,caps-shape_k,caps-shape_s,caps-shape_x
0,1,n,t,p,f,c,n,k,e,e,...,p,k,s,u,0.0,0.0,0.0,0.0,0.0,1.0
1,1,y,t,a,f,c,b,k,e,c,...,p,n,n,g,0.0,0.0,0.0,0.0,0.0,1.0
2,1,w,t,l,f,c,b,n,e,c,...,p,n,n,m,1.0,0.0,0.0,0.0,0.0,0.0
3,0,w,t,p,f,c,n,n,e,e,...,p,k,s,u,0.0,0.0,0.0,0.0,0.0,1.0
4,1,g,f,n,f,w,b,k,t,e,...,e,n,a,g,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,1,n,f,n,a,c,b,y,e,?,...,p,b,c,l,0.0,0.0,0.0,1.0,0.0,0.0
8120,1,n,f,n,a,c,b,y,e,?,...,p,b,v,l,0.0,0.0,0.0,0.0,0.0,1.0
8121,1,n,f,n,a,c,b,n,e,?,...,p,b,c,l,0.0,0.0,1.0,0.0,0.0,0.0
8122,0,n,f,y,f,c,n,b,t,?,...,e,w,v,l,0.0,0.0,0.0,1.0,0.0,0.0


#### sklearn LabelEncoder

In [67]:
from sklearn.preprocessing import LabelEncoder

In [69]:
# Label-encode 'cap-color': learn the categories and convert the column to
# integer codes in a single step, overwriting the original column.
encoder = LabelEncoder()
one_x['cap-color'] = encoder.fit_transform(one_x['cap-color'])
one_x

Unnamed: 0,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,...,ring-type,spore-print-color,population,habitat,caps-shape_b,caps-shape_c,caps-shape_f,caps-shape_k,caps-shape_s,caps-shape_x
0,1,4,t,p,f,c,n,k,e,e,...,p,k,s,u,0.0,0.0,0.0,0.0,0.0,1.0
1,1,9,t,a,f,c,b,k,e,c,...,p,n,n,g,0.0,0.0,0.0,0.0,0.0,1.0
2,1,8,t,l,f,c,b,n,e,c,...,p,n,n,m,1.0,0.0,0.0,0.0,0.0,0.0
3,0,8,t,p,f,c,n,n,e,e,...,p,k,s,u,0.0,0.0,0.0,0.0,0.0,1.0
4,1,3,f,n,f,w,b,k,t,e,...,e,n,a,g,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8119,1,4,f,n,a,c,b,y,e,?,...,p,b,c,l,0.0,0.0,0.0,1.0,0.0,0.0
8120,1,4,f,n,a,c,b,y,e,?,...,p,b,v,l,0.0,0.0,0.0,0.0,0.0,1.0
8121,1,4,f,n,a,c,b,n,e,?,...,p,b,c,l,0.0,0.0,1.0,0.0,0.0,0.0
8122,0,4,f,y,f,c,n,b,t,?,...,e,w,v,l,0.0,0.0,0.0,1.0,0.0,0.0


#### 데이터 불균형 확인

In [70]:
# Count the samples per class to check how balanced the labels are.
y.value_counts()

e    4208
p    3916
Name: class, dtype: int64

In [71]:
# Split the labels by class with boolean masks: 'e' rows and 'p' rows.
e = y[y.eq('e')]
p = y[y.eq('p')]

In [72]:
# Display the 'e' labels; the original row index is preserved.
e

1       e
2       e
4       e
5       e
6       e
       ..
8115    e
8119    e
8120    e
8121    e
8123    e
Name: class, Length: 4208, dtype: object

In [73]:
# Display the 'p' labels; the original row index is preserved.
p

0       p
3       p
8       p
13      p
17      p
       ..
8114    p
8116    p
8117    p
8118    p
8122    p
Name: class, Length: 3916, dtype: object

In [77]:
# Keep only the first 1000 'e' labels; .iloc makes the slice explicitly
# positional, since the index labels here are non-contiguous integers.
e = e.iloc[:1000]
# The original cell also ran `y = y[:1000]`, but y is overwritten by
# pd.concat([e, p]) in the next cell, so that assignment had no effect and
# has been dropped.
# NOTE(review): if the intent was to cap the 'p' labels as well, slice p here.

In [78]:
# Rebuild y from the truncated 'e' labels followed by all 'p' labels
# (1000 + 3916 = 4916 rows); the original index labels are kept.
# NOTE(review): x is not reduced to the same rows anywhere in the visible
# code — confirm it is realigned (e.g. via y.index) before modelling.
y = pd.concat([e,p], axis=0) # axis=0 -> stack along rows
y

1       e
2       e
4       e
5       e
6       e
       ..
8114    p
8116    p
8117    p
8118    p
8122    p
Name: class, Length: 4916, dtype: object