In [1]:
##### 7. 데이터 정제 및 준비 #####

In [6]:
import pandas as pd
import numpy as np

In [3]:
##### 7.1 누락된 데이터 처리하기 ##### 

In [4]:
# pandas의 모든 기술 통계는 누락된 데이터를 배제하고 처리한다.
# 산술 데이터에 한해 pandas는 누락된 데이터를 실숫값인 NaN으로 처리한다.
# 이를 이용해 누락된 값을 쉽게 찾을 수 있다.

In [7]:
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])

In [8]:
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [9]:
string_data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [12]:
# 파이썬의 내장 None 또한 NA값으로 취급된다.

string_data[0] = None

In [13]:
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

In [14]:
##### 7.1.1 누락된 데이터 골라내기 #####

In [15]:
# dropna 사용이 유용하다.
# null이 아닌 데이터와 색인값만 들어있는 Series를 반환한다.

In [16]:
from numpy import nan as NA

In [17]:
data = pd.Series([1, NA, 3.5, NA, 7])

In [18]:
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [19]:
data[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [24]:
# DataFrame 객체의 경우 모두 NA 값이거나 NA값을 하나라도 포함하고 있는 로우/컬럼을 제외시킬 수 있다.
# dropna는 기본적으로 NA 값을 하나라도 포함하고 있는 로우를 제외시킨다.

# One row per dropna() case demonstrated on this frame: a complete row,
# a partially missing row, an entirely missing row, and a row missing
# only its first value.
data = pd.DataFrame(
    [
        [1.0, 6.5, 3.0],
        [1.0, NA, NA],
        [NA, NA, NA],
        [NA, 6.5, 3.0],
    ]
)

In [25]:
cleaned = data.dropna()

In [26]:
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [28]:
cleaned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [29]:
# 모두 NA 값인 로우만 제외

data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [30]:
# 컬럼을 제외

data[4] = NA

In [31]:
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [32]:
data.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [33]:
# DataFrame의 로우를 제외시키는 방법은 시계열 데이터에서 주로 사용된다.
# NA가 아닌 값이 일정 개수 이상 들어 있는 로우만 남기고 싶으면 thresh 인자에 그 개수를 넘긴다.

# 7 rows x 3 columns of standard-normal samples drawn from NumPy's legacy
# global random state (values differ on every run because no seed is set).
df = pd.DataFrame(np.random.standard_normal(size=(7, 3)))

In [34]:
df.iloc[:4, 1] = NA

In [35]:
df.iloc[:2, 2] = NA

In [36]:
df

Unnamed: 0,0,1,2
0,0.546076,,
1,0.055413,,
2,1.416579,,0.798805
3,0.296703,,1.481489
4,1.081845,0.636531,-0.08305
5,-1.019727,-0.189269,0.691026
6,-0.075309,-1.094764,-0.082042


In [37]:
df.dropna()

Unnamed: 0,0,1,2
4,1.081845,0.636531,-0.08305
5,-1.019727,-0.189269,0.691026
6,-0.075309,-1.094764,-0.082042


In [38]:
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,1.416579,,0.798805
3,0.296703,,1.481489
4,1.081845,0.636531,-0.08305
5,-1.019727,-0.189269,0.691026
6,-0.075309,-1.094764,-0.082042


In [39]:
##### 7.1.2 결측치 채우기 #####

In [40]:
# 누락된 값을 제외시키지 않고 fillna 메서드를 이용해 값 채워넣기

df.fillna(0)

Unnamed: 0,0,1,2
0,0.546076,0.0,0.0
1,0.055413,0.0,0.0
2,1.416579,0.0,0.798805
3,0.296703,0.0,1.481489
4,1.081845,0.636531,-0.08305
5,-1.019727,-0.189269,0.691026
6,-0.075309,-1.094764,-0.082042


In [41]:
# fillna에 사전값을 넘겨 각 컬럼마다 다른값 채우기

df.fillna({1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,0.546076,0.5,0.0
1,0.055413,0.5,0.0
2,1.416579,0.5,0.798805
3,0.296703,0.5,1.481489
4,1.081845,0.636531,-0.08305
5,-1.019727,-0.189269,0.691026
6,-0.075309,-1.094764,-0.082042


In [42]:
# 기존 객체 변경도 가능하다

# fillna(..., inplace=True) modifies df in place and returns None, so there
# is no result worth capturing; assigning it to `_` would also overwrite
# IPython's last-output variable.
df.fillna(0, inplace=True)

In [43]:
df

Unnamed: 0,0,1,2
0,0.546076,0.0,0.0
1,0.055413,0.0,0.0
2,1.416579,0.0,0.798805
3,0.296703,0.0,1.481489
4,1.081845,0.636531,-0.08305
5,-1.019727,-0.189269,0.691026
6,-0.075309,-1.094764,-0.082042


In [45]:
# 재색인에서 사용 가능한 보간 방식(예: ffill)은 결측치를 채울 때도 사용할 수 있다.

# 6 rows x 3 columns of standard-normal samples drawn from NumPy's legacy
# global random state (values differ on every run because no seed is set).
df = pd.DataFrame(np.random.standard_normal(size=(6, 3)))

In [46]:
df.iloc[2:, 1] = NA

In [48]:
df.iloc[4:, 2] = NA

In [49]:
df

Unnamed: 0,0,1,2
0,0.140158,0.621555,0.186419
1,-0.712498,-1.618172,-0.26993
2,0.417868,,-0.040869
3,-0.534361,,0.010112
4,-0.493854,,
5,0.615079,,


In [50]:
# Forward-fill: carry the last valid value down each column into the NA
# cells that follow it. DataFrame.ffill() is the direct replacement for
# fillna(method='ffill'), whose `method` keyword is deprecated since
# pandas 2.1.
df.ffill()

Unnamed: 0,0,1,2
0,0.140158,0.621555,0.186419
1,-0.712498,-1.618172,-0.26993
2,0.417868,-1.618172,-0.040869
3,-0.534361,-1.618172,0.010112
4,-0.493854,-1.618172,0.010112
5,0.615079,-1.618172,0.010112


In [51]:
# Forward-fill, but fill at most 2 consecutive NA cells after each valid
# value; any further NA cells in the same run stay NA.
# DataFrame.ffill(limit=...) replaces fillna(method='ffill', limit=...),
# whose `method` keyword is deprecated since pandas 2.1.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,0.140158,0.621555,0.186419
1,-0.712498,-1.618172,-0.26993
2,0.417868,-1.618172,-0.040869
3,-0.534361,-1.618172,0.010112
4,-0.493854,,0.010112
5,0.615079,,0.010112


In [52]:
# fillna에 Series의 평균값이나 중앙값을 전달해 결측치를 채울 수도 있다.

data = pd.Series([1., NA, 3.5, NA, 7])

In [53]:
data.fillna(data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64