## Handling Missing Data

In [1]:
import pandas as pd
import seaborn as sns
import numpy as np

In [6]:
# Toy 3x4 frame for the missing-data examples, written column by column:
# A has two NaNs, B has one, C is entirely NaN, and D is fully populated.
df = pd.DataFrame({
    'A': [np.nan, 3, np.nan],
    'B': [2, 4, np.nan],
    'C': [np.nan, np.nan, np.nan],
    'D': [0, 1, 5],
})

In [7]:
# Display the frame to see where the missing values (NaN) are.
df

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,,,,5


## Dropna

In [8]:
# IPython introspection (trailing `?`): show the docstring of df.dropna.
df.dropna?

In [16]:
# Column-wise: drop a column only when every value in it is NaN (here, column C).
df.dropna(axis='columns', how='all')

Unnamed: 0,A,B,D
0,,2.0,0
1,3.0,4.0,1
2,,,5


In [19]:
# Column-wise: drop every column that contains at least one NaN (only D survives).
df.dropna(axis='columns', how='any')

Unnamed: 0,D
0,0
1,1
2,5


In [20]:
# Row-wise: drop a row only when all of its values are NaN.
# No row is entirely NaN, so the frame comes back unchanged.
df.dropna(axis='index', how='all')

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,,,,5


In [22]:
# Row-wise: drop every row that contains at least one NaN.
# Every row has a NaN in column C, so the result is empty.
df.dropna(axis='index', how='any')

Unnamed: 0,A,B,C,D


## Fillna

In [23]:
# IPython introspection (trailing `?`): show the docstring of df.fillna.
df.fillna?

In [24]:
# Replace every NaN in the frame with the scalar 0 (returns a new frame).
df.fillna(value=0)

Unnamed: 0,A,B,C,D
0,0.0,2.0,0.0,0
1,3.0,4.0,0.0,1
2,0.0,0.0,0.0,5


In [26]:
# Per-column replacements: each key is a column label and its value is used
# to fill the NaNs of that column only.
values = {
    'A': 0,
    'B': 1,
    'C': 'C',
    'D': 3,
}
df.fillna(values)

Unnamed: 0,A,B,C,D
0,0.0,2.0,C,0
1,3.0,4.0,C,1
2,0.0,1.0,C,5


In [29]:
# Mean of column B; the NaN entry is skipped, leaving (2 + 4) / 2.
fill_na = df.loc[:, 'B'].mean()
fill_na

3.0

In [30]:
# Fill every NaN in the frame with the scalar computed from column B.
# Note: columns A and C receive B's mean as well, not their own.
df.fillna(value=fill_na)

Unnamed: 0,A,B,C,D
0,3.0,2.0,3.0,0
1,3.0,4.0,3.0,1
2,3.0,3.0,3.0,5


In [31]:
# Boolean mask of the same shape as df: True wherever a value is missing.
# (isna is the same method as isnull.)
df.isna()

Unnamed: 0,A,B,C,D
0,True,False,True,False
1,False,False,True,False
2,True,True,True,False


In [32]:
# Number of missing values per column (True counts as 1 when summed).
df.isna().sum()

A    2
B    1
C    3
D    0
dtype: int64

In [33]:
# Number of non-missing values per column (notna is the same method as notnull).
df.notna().sum()

A    1
B    2
C    0
D    3
dtype: int64