### Missing data handling in pandas

In [2]:
import pandas as pd
import numpy as np


In [3]:
df = pd.DataFrame([[np.nan, 2, np.nan, 0], [3, 4, np.nan, 1],
                   [np.nan, np.nan, np.nan, 5]],
                  columns=list('ABCD'))
df


Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,,,,5


### Drop the columns where all elements are nan:


In [4]:
df.dropna(axis=0,how = 'all')


Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,,,,5


### Drop the columns where any of the elements is nan


In [12]:
df.dropna(axis=1,how='any')


Unnamed: 0,D
0,0
1,1
2,5


### Drop the rows where all of the elements are nan (there is no row to drop, so df stays the same):

In [13]:
df.dropna(axis=0,how='all')


Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,,,,5


In [16]:
df1 = pd.DataFrame([[np.nan, 2, np.nan, 0], [3, 4, np.nan, 1], [np.nan, np.nan, np.nan, 5],[3, 4, np.nan, 1], [3, 4, 0, 1]], columns=list('ABCD'))
df1


Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,,,,5
3,3.0,4.0,,1
4,3.0,4.0,0.0,1


In [17]:
df1.dropna(thresh=2)

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
3,3.0,4.0,,1
4,3.0,4.0,0.0,1


In [19]:
df2 = pd.DataFrame([[np.nan, 2, np.nan, np.nan,0], [3, 4, np.nan,np.nan, 1], [np.nan, np.nan, np.nan, 5,6],[3, np.nan,4, np.nan, 1], [3, 4, np.nan, 1,2]], columns=list('ABCDE'))
df2

Unnamed: 0,A,B,C,D,E
0,,2.0,,,0
1,3.0,4.0,,,1
2,,,,5.0,6
3,3.0,,4.0,,1
4,3.0,4.0,,1.0,2


In [22]:
# remove row the row where noumbers Nan is exactly or greate thangiven to thresh = int value
df2.dropna(thresh=3,axis=0) # thresh mean count the numbers of NaN,if numbers of NaN exactly it remove the rows or column according axis



Unnamed: 0,A,B,C,D,E
1,3.0,4.0,,,1
3,3.0,,4.0,,1
4,3.0,4.0,,1.0,2


In [24]:
# remove columns twhere numbers Nan is exactly or greter thangiven to thresh = int value
df2.dropna(thresh=3,axis=1) # thresh mean count the numbers of NaN,if numbers of NaN exactly it remove the rows or column according axis



Unnamed: 0,A,B,E
0,,2.0,0
1,3.0,4.0,1
2,,,6
3,3.0,,1
4,3.0,4.0,2


In [26]:
df3 = pd.DataFrame([[np.nan, 2, np.nan, np.nan,0], [3, 4, np.nan,np.nan, 1], [np.nan, np.nan, np.nan, 5,6],[3, np.nan,4, np.nan, 1], [3, 4, np.nan, 1,2]], columns=list('ABCDE'))
df3

Unnamed: 0,A,B,C,D,E
0,,2.0,,,0
1,3.0,4.0,,,1
2,,,,5.0,6
3,3.0,,4.0,,1
4,3.0,4.0,,1.0,2


In [27]:
# subset

df.dropna(axis=0, subset=['A']) #remove row where if there is any 'NaN' value in column 'A'

Unnamed: 0,A,B,C,D
1,3.0,4.0,,1


In [29]:
df2.dropna(axis=1, subset=[1]) #remove column  if there is any 'NaN' value in index is '1'

Unnamed: 0,A,B,E
0,,2.0,0
1,3.0,4.0,1
2,,,6
3,3.0,,1
4,3.0,4.0,2


### fillna() function in pandas

In [30]:
df4 = pd.DataFrame([[np.nan, 2, np.nan, 0],
                   [3, 4, np.nan, 1],                   [np.nan, np.nan, np.nan, 5],
                    [np.nan, 3, np.nan, 4]],
                    columns=list('ABCD'))
df4

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,,,,5
3,,3.0,,4


In [32]:
## Replace all NaN elements with 0s.
df4.fillna(0)


Unnamed: 0,A,B,C,D
0,0.0,2.0,0.0,0
1,3.0,4.0,0.0,1
2,0.0,0.0,0.0,5
3,0.0,3.0,0.0,4


### We can also propagate non-null values forward or backward.

In [33]:
df4.fillna(method='ffill')

Unnamed: 0,A,B,C,D
0,,2.0,,0
1,3.0,4.0,,1
2,3.0,4.0,,5
3,3.0,3.0,,4


### Replace all NaN elements in column ‘A’, ‘B’, ‘C’, and ‘D’, with 0, 1, 2, and 3 respectively.

In [34]:
values = {'A': 0, 'B': 1, 'C': 2, 'D': 3}
df.fillna(value=values)


Unnamed: 0,A,B,C,D
0,0.0,2.0,2.0,0
1,3.0,4.0,2.0,1
2,0.0,1.0,2.0,5
