Handling Missing Values

In [6]:
import pandas as pd
import numpy as np

In [9]:
# A numeric Series holding one missing entry; np.nan is the missing-value
# marker for floating-point data.
float_data = pd.Series(
    [1.2, -3.5, np.nan, 0],
)
float_data

0    1.2
1   -3.5
2    NaN
3    0.0
dtype: float64

In [10]:
# Boolean mask with True at every position whose value is missing.
float_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [11]:
# A Series of strings in which two entries are missing: one written as
# np.nan and one as Python's None.
string_data = pd.Series(
    ['aardvark', np.nan, None, 'avocado'],
)
string_data

0    aardvark
1         NaN
2        None
3     avocado
dtype: object

In [12]:
# Both the NaN entry and the None entry are reported as missing.
string_data.isna()

0    False
1     True
2     True
3    False
dtype: bool

In [13]:
# With an explicit float64 dtype, the None entry is stored as NaN.
# Note: this rebinds float_data, replacing the Series created earlier.
float_data = pd.Series([1, 2, None], dtype='float64')
float_data

0    1.0
1    2.0
2    NaN
dtype: float64

In [14]:
# The entry that was given as None is flagged as missing.
float_data.isna()

0    False
1    False
2     True
dtype: bool

Filtering Out Missing Data

In [15]:
# Note: despite its name, `df` is a Series in this cell.
df = pd.Series(
    [1, np.nan, 3.5, np.nan, 7],
)
# dropna() returns a new Series without the missing entries; the original
# index labels of the surviving values are kept.
df.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [16]:
# Same result as dropna(), spelled as boolean indexing with the
# "is not missing" mask.
df[df.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [18]:
# 4x3 frame mixing complete rows, partially missing rows, and one row that
# is missing entirely (row 2).
df = pd.DataFrame([
    [1., 6.5, 3.],
    [1., np.nan, np.nan],
    [np.nan, np.nan, np.nan],
    [np.nan, 6.5, 3.],
])
df

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [19]:
# With no arguments, dropna() on a DataFrame removes every row that
# contains at least one missing value.
df.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [20]:
# how='all' removes only the rows in which every value is missing.
df.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [21]:
# Add a column labelled 4 that is entirely NaN. This modifies df in place,
# so re-running the cells above will now show the extra column.
df[4] = np.nan
df

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [22]:
# Same idea applied along the other axis: remove the columns whose values
# are all missing.
df.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [29]:
# 7x3 frame of standard-normal draws. No seed is set in the visible cells,
# so the numbers will presumably differ from run to run.
df = pd.DataFrame(np.random.standard_normal(size=(7, 3)))
# Blank out the first two rows of column 2 and the first four rows of
# column 1, so rows carry different numbers of missing values.
df.iloc[:2, 2] = np.nan
df.iloc[:4, 1] = np.nan
df

Unnamed: 0,0,1,2
0,-0.214354,,
1,0.843438,,
2,-1.530274,,-1.230849
3,0.599069,,-1.389013
4,1.061744,-0.173994,0.60034
5,-1.041055,0.247154,-0.200223
6,0.812119,0.712692,0.990819


In [30]:
# Only the rows with no missing values at all are kept.
df.dropna()

Unnamed: 0,0,1,2
4,1.061744,-0.173994,0.60034
5,-1.041055,0.247154,-0.200223
6,0.812119,0.712692,0.990819


In [31]:
# thresh=2 keeps the rows that have at least two non-missing values.
df.dropna(thresh=2)

Unnamed: 0,0,1,2
2,-1.530274,,-1.230849
3,0.599069,,-1.389013
4,1.061744,-0.173994,0.60034
5,-1.041055,0.247154,-0.200223
6,0.812119,0.712692,0.990819


Filling in Missing Data

In [32]:
# Replace every missing value with the constant 0; a new frame is
# returned, df itself is not reassigned here.
df.fillna(0)

Unnamed: 0,0,1,2
0,-0.214354,0.0,0.0
1,0.843438,0.0,0.0
2,-1.530274,0.0,-1.230849
3,0.599069,0.0,-1.389013
4,1.061744,-0.173994,0.60034
5,-1.041055,0.247154,-0.200223
6,0.812119,0.712692,0.990819


In [33]:
# A dict gives a different fill value per column label: 0.5 for column 1,
# 0 for column 2.
df.fillna(value={1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,-0.214354,0.5,0.0
1,0.843438,0.5,0.0
2,-1.530274,0.5,-1.230849
3,0.599069,0.5,-1.389013
4,1.061744,-0.173994,0.60034
5,-1.041055,0.247154,-0.200223
6,0.812119,0.712692,0.990819


In [38]:
# 6x3 frame of standard-normal draws. No seed is set in the visible cells,
# so the numbers will presumably differ from run to run.
df = pd.DataFrame(np.random.standard_normal(size=(6, 3)))
# Leave trailing gaps: column 2 is missing from row 4 on, column 1 from
# row 2 on.
df.iloc[4:, 2] = np.nan
df.iloc[2:, 1] = np.nan
df

Unnamed: 0,0,1,2
0,-1.807454,-1.60762,1.580259
1,-0.854994,-0.397385,0.96173
2,-1.187047,,-1.386178
3,0.133337,,1.526932
4,-0.753987,,
5,-1.719712,,


In [39]:
# Forward fill: each missing value takes the last valid value above it in
# the same column. ffill() is used because fillna(method='ffill') is
# deprecated in recent pandas releases.
df.ffill()

  df.fillna(method = 'ffill')


Unnamed: 0,0,1,2
0,-1.807454,-1.60762,1.580259
1,-0.854994,-0.397385,0.96173
2,-1.187047,-0.397385,-1.386178
3,0.133337,-0.397385,1.526932
4,-0.753987,-0.397385,1.526932
5,-1.719712,-0.397385,1.526932


In [40]:
# Forward fill, but fill at most two consecutive missing values per gap;
# anything beyond that stays NaN. ffill() is used because
# fillna(method='ffill') is deprecated in recent pandas releases.
df.ffill(limit=2)

  df.fillna(method = 'ffill', limit = 2)


Unnamed: 0,0,1,2
0,-1.807454,-1.60762,1.580259
1,-0.854994,-0.397385,0.96173
2,-1.187047,-0.397385,-1.386178
3,0.133337,-0.397385,1.526932
4,-0.753987,,1.526932
5,-1.719712,,1.526932


In [41]:
# Note: `df` is rebound to a Series here, replacing the DataFrame above.
df = pd.Series(
    [1., np.nan, 3.5, np.nan, 7],
)
df

0    1.0
1    NaN
2    3.5
3    NaN
4    7.0
dtype: float64

In [42]:
# Fill the gaps with the mean of the values that are present; mean()
# ignores the NaN entries, giving (1 + 3.5 + 7) / 3.
df.fillna(df.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64