## Working with Missing Data in Pandas

In [1]:
import numpy as np
import pandas as pd

from pandas import DataFrame

### Detecting, filling and dropping missing values using `isnull()`, `fillna()`, `replace()`, `interpolate()` and `dropna()`

In [2]:
# Build a small ranking table, then blank out some entries to get missing data.
# The numeric columns are declared as floats up front: NaN is a float, and
# writing it into an int64 column relies on silent upcasting, which pandas
# deprecated in 2.1 (PDEP-6). The resulting frame is the same either way.
data = {
    "name": ["Steve", "John", "Richard", "Sarah", "Randy", "Micheal", "Julie"],
    "age": [20.0, 22.0, 20.0, 21.0, 24.0, 23.0, 22.0],
    "gender": ["Male", "Male", "Male", "Female", "Male", "Male", "Female"],
    "rank": [2.0, 1.0, 4.0, 5.0, 3.0, 7.0, 6.0],
}

ranking_df = DataFrame(data)
ranking_df.iloc[2:5, 1] = np.nan  # column 1 is "age": missing for rows 2-4
ranking_df.iloc[3:6, 3] = np.nan  # column 3 is "rank": missing for rows 3-5
ranking_df.iloc[3, :] = np.nan    # row 3 is missing in every column

ranking_df

Unnamed: 0,name,age,gender,rank
0,Steve,20.0,Male,2.0
1,John,22.0,Male,1.0
2,Richard,,Male,4.0
3,,,,
4,Randy,,Male,
5,Micheal,23.0,Male,
6,Julie,22.0,Female,6.0


In [None]:
# Boolean mask of the same shape as the frame: True wherever a value is missing.
pd.isnull(ranking_df)

Unnamed: 0,name,age,gender,rank
0,False,False,False,False
1,False,False,False,False
2,False,True,False,False
3,True,True,True,True
4,False,True,False,True
5,False,False,False,True
6,False,False,False,False


In [None]:
# Inverse of the isnull mask: True wherever a value is present.
pd.notnull(ranking_df)

Unnamed: 0,name,age,gender,rank
0,True,True,True,True
1,True,True,True,True
2,True,False,True,True
3,False,False,False,False
4,True,False,True,False
5,True,True,True,False
6,True,True,True,True


In [6]:
# Flag the rows whose "age" is missing, then use the flags to select those rows.
bool_series = ranking_df["age"].isnull()
ranking_df.loc[bool_series]

Unnamed: 0,name,age,gender,rank
2,Richard,,Male,4.0
3,,,,
4,Randy,,Male,


In [7]:
# Replace every missing entry, in every column, with the constant 0.
# Returns a new frame; ranking_df itself is left unchanged.
ranking_df.fillna(value=0)

Unnamed: 0,name,age,gender,rank
0,Steve,20.0,Male,2.0
1,John,22.0,Male,1.0
2,Richard,0.0,Male,4.0
3,0,0.0,0,0.0
4,Randy,0.0,Male,0.0
5,Micheal,23.0,Male,0.0
6,Julie,22.0,Female,6.0


In [8]:
# Forward fill ("pad"): propagate the last valid observation down into each gap.
# fillna(method="pad") is deprecated since pandas 2.1 and emits a FutureWarning;
# ffill() is the supported spelling and yields the same frame.
ranking_df.ffill()

  ranking_df.fillna(method="pad") # pad: propagate last valid observation forward to next valid


Unnamed: 0,name,age,gender,rank
0,Steve,20.0,Male,2.0
1,John,22.0,Male,1.0
2,Richard,22.0,Male,4.0
3,Richard,22.0,Male,4.0
4,Randy,22.0,Male,4.0
5,Micheal,23.0,Male,4.0
6,Julie,22.0,Female,6.0


In [9]:
# Backward fill: use the next valid observation to fill each gap.
# fillna(method="bfill") is deprecated since pandas 2.1 and emits a FutureWarning;
# bfill() is the supported spelling and yields the same frame.
ranking_df.bfill()

  ranking_df.fillna(method="bfill") # bfill: use next valid observation to fill gap


Unnamed: 0,name,age,gender,rank
0,Steve,20.0,Male,2.0
1,John,22.0,Male,1.0
2,Richard,23.0,Male,4.0
3,Randy,23.0,Male,6.0
4,Randy,23.0,Male,6.0
5,Micheal,23.0,Male,6.0
6,Julie,22.0,Female,6.0


In [10]:
# Linear interpolation of the numeric columns only.
# Calling interpolate() on a frame that still contains object columns
# ("name", "gender") is deprecated since pandas 2.1, so the numeric columns are
# selected, interpolated, and written back with assign(). assign() overwrites
# the existing columns in place, so column order and the untouched text columns
# (including their NaNs) are preserved.
ranking_df.assign(
    **ranking_df.select_dtypes(include="number").interpolate(method="linear")
)

  ranking_df.interpolate(method = "linear") # linear interpolation


Unnamed: 0,name,age,gender,rank
0,Steve,20.0,Male,2.0
1,John,22.0,Male,1.0
2,Richard,22.25,Male,4.0
3,,22.5,,4.5
4,Randy,22.75,Male,5.0
5,Micheal,23.0,Male,5.5
6,Julie,22.0,Female,6.0


In [11]:
# Keep only the fully populated rows: a row is dropped if any of its values is missing.
ranking_df.dropna(how="any")

Unnamed: 0,name,age,gender,rank
0,Steve,20.0,Male,2.0
1,John,22.0,Male,1.0
6,Julie,22.0,Female,6.0


In [12]:
# Drop a row only when every one of its values is missing; partially filled rows stay.
ranking_df.dropna(axis="index", how="all")

Unnamed: 0,name,age,gender,rank
0,Steve,20.0,Male,2.0
1,John,22.0,Male,1.0
2,Richard,,Male,4.0
4,Randy,,Male,
5,Micheal,23.0,Male,
6,Julie,22.0,Female,6.0


In [13]:
# Drop every column that contains at least one missing value.
# Row 3 is missing in all columns, so here no column survives and only the index remains.
ranking_df.dropna(axis="columns")

0
1
2
3
4
5
6


In [14]:
# Drop every row that contains at least one missing value.
# axis=0 / "index" is the default, so this matches a bare dropna().
ranking_df.dropna(axis="index")

Unnamed: 0,name,age,gender,rank
0,Steve,20.0,Male,2.0
1,John,22.0,Male,1.0
6,Julie,22.0,Female,6.0
