In [1]:
import numpy as np
from pandas import Series,DataFrame
import pandas as pd

In [2]:
# Dealing with missing data is a very common task when analyzing datasets.
# Start with a small Series that contains a single missing entry (np.nan).
data = pd.Series(['one', 'two', np.nan, 'four'])

In [3]:
# Show the Series; the missing entry is displayed as NaN
data

0     one
1     two
2     NaN
3    four
dtype: object

In [4]:
# Find the missing values: True marks an entry that is NaN
# (isna is the same method as isnull, under its newer name)
data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [5]:
# We can simply drop the NaN; the remaining entries keep their original index labels
data.dropna()

0     one
1     two
3    four
dtype: object

In [6]:
# In a DataFrame we need to be a little more careful!
# Rows 1 and 2 are each missing one value; row 3 is missing all of them.
dframe = pd.DataFrame([
    [1, 2, 3],
    [np.nan, 5, 6],
    [7, np.nan, 9],
    [np.nan, np.nan, np.nan],
])

In [7]:
# Show the DataFrame; missing values appear as empty/NaN cells
dframe

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,,5.0,6.0
2,7.0,,9.0
3,,,


In [8]:
# With its default settings, dropna removes every row that has at least one NaN
clean_dframe = dframe.dropna(axis='index', how='any')

In [9]:
# Show the cleaned DataFrame; only the fully populated row 0 is left
clean_dframe

Unnamed: 0,0,1,2
0,1.0,2.0,3.0


In [10]:
# Note: every row that contained at least one NA was dropped entirely

In [11]:
# We can also choose to drop only the rows in which every value is missing
dframe.dropna(axis='index', how='all')



Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,,5.0,6.0
2,7.0,,9.0


In [12]:
# Or we can drop columns (instead of rows) that contain missing data
dframe.dropna(axis='columns')

# This drops every column, since each column contains at least one NaN

0
1
2
3


In [13]:
# We can also drop based on a threshold of non-missing values.

# Build a second frame whose rows hold 3, 3, 2 and 1 data points
# respectively, so that different thresholds keep different rows.
dframe2 = pd.DataFrame([
    [1, 2, 3, np.nan],
    [2, np.nan, 5, 6],
    [np.nan, 7, np.nan, 9],
    [1, np.nan, np.nan, np.nan],
])

# Show
dframe2

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,2.0,,5.0,6.0
2,,7.0,,9.0
3,1.0,,,


In [14]:
# Drop any rows that don't have at least 2 data points
dframe2.dropna(axis='index', thresh=2)

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,2.0,,5.0,6.0
2,,7.0,,9.0


In [15]:
# Drop any rows that don't have at least 3 data points
dframe2.dropna(axis='index', thresh=3)

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,2.0,,5.0,6.0


In [16]:
# We can also fill every NaN with a single replacement value
dframe2.fillna(value=1)

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,1.0
1,2.0,1.0,5.0,6.0
2,1.0,7.0,1.0,9.0
3,1.0,1.0,1.0,1.0


In [17]:
# We can also fill with a different value for each column:
# the dict keys are column labels, the dict values are the fill values
dframe2.fillna(value={0: 0, 1: 1, 2: 2, 3: 3})

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,3.0
1,2.0,1.0,5.0,6.0
2,0.0,7.0,2.0,9.0
3,1.0,1.0,2.0,3.0


In [34]:
# Note that we still have access to the original dframe2: it still contains its NaN values
dframe2

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,2.0,,5.0,6.0
2,,7.0,,9.0
3,1.0,,,


In [35]:
# If we want to modify the existing object, pass inplace=True
# (this cell displays nothing; the change shows up in dframe2 itself)
dframe2.fillna(value=0, inplace=True)

In [36]:
# Now let's see dframe2: every NaN has been replaced by 0
dframe2

Unnamed: 0,0,1,2,3
0,1,2,3,0
1,2,0,5,6
2,0,7,0,9
3,1,0,0,0


In [None]:
#Awesome! Next we'll learn about Index Hierarchy