## Working with Missing Data in Pandas

In [6]:
import numpy as np
import pandas as pd

from pandas import DataFrame

### Filling missing values using fillna(), replace() and interpolate()

In [7]:
# Build a small ranking table, then blank out selected cells so the
# missing-data helpers used in the following cells have NaNs to work on.
data = dict(
    names=['steve', 'shaun', 'richard', 'sarah', 'supun', 'sandun', 'thisali'],
    age=[21, 25, 43, 22, 31, 20, 18],
    gender=['Male', 'Male', 'Male', 'Female', 'Male', 'Male', 'Female'],
    rank=[2, 1, 4, 5, 3, 7, 6],
)
ranking_df = DataFrame(data)

# Rows 2-4 lose their age, rows 3-5 lose their rank.
ranking_df.iloc[2:5, ranking_df.columns.get_loc('age')] = np.nan
ranking_df.iloc[3:6, ranking_df.columns.get_loc('rank')] = np.nan

# Row 3 becomes missing in every column.
ranking_df.iloc[3, :] = np.nan

ranking_df


Unnamed: 0,names,age,gender,rank
0,steve,21.0,Male,2.0
1,shaun,25.0,Male,1.0
2,richard,,Male,4.0
3,,,,
4,supun,,Male,
5,sandun,20.0,Male,
6,thisali,18.0,Female,6.0


In [8]:
# Boolean frame of the same shape: True wherever a cell is missing.
ranking_df.isna()

Unnamed: 0,names,age,gender,rank
0,False,False,False,False
1,False,False,False,False
2,False,True,False,False
3,True,True,True,True
4,False,True,False,True
5,False,False,False,True
6,False,False,False,False


In [9]:
# Element-wise complement of the missing-value mask: True where a value is present.
ranking_df.notna()

Unnamed: 0,names,age,gender,rank
0,True,True,True,True
1,True,True,True,True
2,True,False,True,True
3,False,False,False,False
4,True,False,True,False
5,True,True,True,False
6,True,True,True,True


In [10]:
# Row mask: True for every row whose 'age' is missing.
bool_series = ranking_df['age'].isna()

In [11]:
# Show the 'age' column on its own (all rows).
ranking_df.loc[:, 'age']

0    21.0
1    25.0
2     NaN
3     NaN
4     NaN
5    20.0
6    18.0
Name: age, dtype: float64

In [12]:
# Keep only the rows flagged by the mask, i.e. rows whose 'age' is missing.
ranking_df.loc[bool_series]

Unnamed: 0,names,age,gender,rank
2,richard,,Male,4.0
3,,,,
4,supun,,Male,


In [13]:
# Replace every missing cell with 0 and return a new frame.
# The text columns receive 0 as well, since the fill is applied frame-wide.
ranking_df.fillna(value=0)

Unnamed: 0,names,age,gender,rank
0,steve,21.0,Male,2.0
1,shaun,25.0,Male,1.0
2,richard,0.0,Male,4.0
3,0,0.0,0,0.0
4,supun,0.0,Male,0.0
5,sandun,20.0,Male,0.0
6,thisali,18.0,Female,6.0


In [14]:
# Forward-fill: each missing cell takes the last valid value above it in the
# same column. DataFrame.ffill() is the supported spelling; passing
# method='pad' to fillna() is deprecated and emits a FutureWarning.
ranking_df.ffill()

  ranking_df.fillna(method='pad')


Unnamed: 0,names,age,gender,rank
0,steve,21.0,Male,2.0
1,shaun,25.0,Male,1.0
2,richard,25.0,Male,4.0
3,richard,25.0,Male,4.0
4,supun,25.0,Male,4.0
5,sandun,20.0,Male,4.0
6,thisali,18.0,Female,6.0


In [15]:
# Backward-fill: each missing cell takes the next valid value below it in the
# same column. DataFrame.bfill() is the supported spelling; passing
# method='bfill' to fillna() is deprecated and emits a FutureWarning.
ranking_df.bfill()

  ranking_df.fillna(method='bfill')


Unnamed: 0,names,age,gender,rank
0,steve,21.0,Male,2.0
1,shaun,25.0,Male,1.0
2,richard,20.0,Male,4.0
3,supun,20.0,Male,6.0
4,supun,20.0,Male,6.0
5,sandun,20.0,Male,6.0
6,thisali,18.0,Female,6.0


In [16]:
# Linear interpolation is only defined for numeric data. Calling
# interpolate() on the whole frame also touches the text columns, which is
# deprecated, so interpolate the numeric columns only and write them back.
# Text columns are left untouched (their missing cells stay missing).
ranking_df.assign(
    **ranking_df.select_dtypes(include='number').interpolate(method='linear')
)

  ranking_df.interpolate(method='linear')


Unnamed: 0,names,age,gender,rank
0,steve,21.0,Male,2.0
1,shaun,25.0,Male,1.0
2,richard,23.75,Male,4.0
3,,22.5,,4.5
4,supun,21.25,Male,5.0
5,sandun,20.0,Male,5.5
6,thisali,18.0,Female,6.0


In [17]:
# Linear interpolation is only defined for numeric data. Calling
# interpolate() on the whole frame also touches the text columns, which is
# deprecated, so interpolate the numeric columns only and write them back.
# Text columns are left untouched (their missing cells stay missing).
ranking_df.assign(
    **ranking_df.select_dtypes(include='number').interpolate(method='linear')
)

  ranking_df.interpolate(method='linear')


Unnamed: 0,names,age,gender,rank
0,steve,21.0,Male,2.0
1,shaun,25.0,Male,1.0
2,richard,23.75,Male,4.0
3,,22.5,,4.5
4,supun,21.25,Male,5.0
5,sandun,20.0,Male,5.5
6,thisali,18.0,Female,6.0


In [18]:
# Linear interpolation is only defined for numeric data. Calling
# interpolate() on the whole frame also touches the text columns, which is
# deprecated, so interpolate the numeric columns only and write them back.
# Text columns are left untouched (their missing cells stay missing).
ranking_df.assign(
    **ranking_df.select_dtypes(include='number').interpolate(method='linear')
)

  ranking_df.interpolate(method='linear')


Unnamed: 0,names,age,gender,rank
0,steve,21.0,Male,2.0
1,shaun,25.0,Male,1.0
2,richard,23.75,Male,4.0
3,,22.5,,4.5
4,supun,21.25,Male,5.0
5,sandun,20.0,Male,5.5
6,thisali,18.0,Female,6.0


In [19]:
# None of the earlier fill/interpolate calls were assigned back to ranking_df,
# so the frame still holds its original NaNs.
ranking_df

Unnamed: 0,names,age,gender,rank
0,steve,21.0,Male,2.0
1,shaun,25.0,Male,1.0
2,richard,,Male,4.0
3,,,,
4,supun,,Male,
5,sandun,20.0,Male,
6,thisali,18.0,Female,6.0


In [20]:
# Display the unmodified frame as the starting point for the dropna() examples below.
ranking_df

Unnamed: 0,names,age,gender,rank
0,steve,21.0,Male,2.0
1,shaun,25.0,Male,1.0
2,richard,,Male,4.0
3,,,,
4,supun,,Male,
5,sandun,20.0,Male,
6,thisali,18.0,Female,6.0


In [21]:
# Drop every row that contains at least one missing value (the defaults, spelled out).
ranking_df.dropna(axis=0, how='any')

Unnamed: 0,names,age,gender,rank
0,steve,21.0,Male,2.0
1,shaun,25.0,Male,1.0
6,thisali,18.0,Female,6.0


In [22]:
# Rows with age strictly above 20; a missing age compares as False and is excluded.
mask = ranking_df['age'].gt(20)
ranking_df.loc[mask]

Unnamed: 0,names,age,gender,rank
0,steve,21.0,Male,2.0
1,shaun,25.0,Male,1.0


In [23]:
# Positional selection: rows 2-4, columns at positions 1 and 2 ('age' and 'gender').
ranking_df.iloc[2:5, [1, 2]]

Unnamed: 0,age,gender
2,,Male
3,,
4,,Male


In [24]:
# Drop only the rows in which every column is missing.
ranking_df.dropna(axis=0, how='all')

Unnamed: 0,names,age,gender,rank
0,steve,21.0,Male,2.0
1,shaun,25.0,Male,1.0
2,richard,,Male,4.0
4,supun,,Male,
5,sandun,20.0,Male,
6,thisali,18.0,Female,6.0


In [25]:
# Drop every column that contains at least one missing value.
# Here every column has one, so only the index remains.
ranking_df.dropna(axis='columns')

0
1
2
3
4
5
6


In [26]:
# A DataFrame has only two axes: 0 (rows) and 1 (columns). Asking for axis=2
# raises ValueError; catch it and show the message so the notebook still runs
# top to bottom instead of stopping at this cell.
try:
    ranking_df.dropna(axis=2)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: No axis named 2 for object type DataFrame