## Handling Missing Values

In [1]:
import numpy as np
import pandas as pd

In [2]:
# To identify and detect null values
pd.isnull(np.nan)

True

In [3]:
pd.isnull(None)

True

In [4]:
pd.isna(np.nan)

True

In [5]:
pd.isna(None)

True

In [6]:
# To identify not null values
pd.notnull(None)

False

In [7]:
pd.notnull(np.nan)

False

In [8]:
pd.notna(np.nan)

False

In [9]:
pd.notnull(3)

True

In [11]:
# These functions work with series and dataframes
pd.isnull(pd.Series([1, np.nan,7]))

0    False
1     True
2    False
dtype: bool

In [12]:
pd.notnull(pd.Series([1, np.nan,7]))

0     True
1    False
2     True
dtype: bool

In [13]:
pd.isnull(pd.DataFrame({'Column A': [1, np.nan, 7],
                        'Column B': [np.nan, 2, 3],
                        'Column C': [np.nan, 2, np.nan]}))

Unnamed: 0,Column A,Column B,Column C
0,False,True,True
1,True,False,False
2,False,False,True


## Pandas Operations with Missing Values

In [14]:
pd.Series([1,2,np.nan]).count()

2

In [15]:
pd.Series([1,2,np.nan]).sum()

3.0

In [16]:
pd.Series([2,2,np.nan]).mean()

2.0

### Filtering missing data

In [17]:
# Combine boolean selection + pd.isnull to filter nan and null values
s= pd.Series([1,2,3,np.nan,np.nan,4])

In [18]:
pd.notnull(s)

0     True
1     True
2     True
3    False
4    False
5     True
dtype: bool

In [19]:
pd.isnull(s)

0    False
1    False
2    False
3     True
4     True
5    False
dtype: bool

In [21]:
pd.notnull(s).sum()

4

In [22]:
pd.isnull(s).sum()

2

In [23]:
s[pd.notnull(s)]

0    1.0
1    2.0
2    3.0
5    4.0
dtype: float64

In [24]:
# notnull and isnull are both methods of Series and DataFrames
s.isnull()

0    False
1    False
2    False
3     True
4     True
5    False
dtype: bool

In [25]:
s.notnull()

0     True
1     True
2     True
3    False
4    False
5     True
dtype: bool

In [26]:
s[s.notnull()]

0    1.0
1    2.0
2    3.0
5    4.0
dtype: float64

### Dropping null values

In [27]:
# Using the dropna method
s

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    4.0
dtype: float64

In [28]:
s.dropna()

0    1.0
1    2.0
2    3.0
5    4.0
dtype: float64

### Dropping null values on DataFrames

In [29]:
# With DataFrames, you can't drop single values, only entire columns or rows
df= pd.DataFrame({
    'Column A': [1,np.nan,30,np.nan],
    'Column B': [2,8,31,np.nan],
    'Column C': [np.nan,9,32,100],
    'Column D': [5,8,34,110],

})
df

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [30]:
df.shape

(4, 4)

In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Column A  2 non-null      float64
 1   Column B  3 non-null      float64
 2   Column C  3 non-null      float64
 3   Column D  4 non-null      int64  
dtypes: float64(3), int64(1)
memory usage: 256.0 bytes


In [32]:
df.isnull()

Unnamed: 0,Column A,Column B,Column C,Column D
0,False,False,True,False
1,True,False,False,False
2,False,False,False,False
3,True,True,False,False


In [33]:
df.isnull().sum()

Column A    2
Column B    1
Column C    1
Column D    0
dtype: int64

In [34]:
# dropna will drop all the rows in which any null value is present
df.dropna()

Unnamed: 0,Column A,Column B,Column C,Column D
2,30.0,31.0,32.0,34


In [35]:
# Use axis to drop columns containing null values

df.dropna(axis=1) #axis= 'columns' also works

Unnamed: 0,Column D
0,5
1,8
2,34
3,110


In [36]:
# In this case, any row or column that contains at least one null value will be dropped. 
# You can control this behavior with the how parameter. Can be either 'any' or 'all'

df2= pd.DataFrame({
    'Column A': [1, np.nan, 30],
    'Column B': [2, np.nan, 31],
    'Column C': [np.nan, np.nan, 100]
})
df2

Unnamed: 0,Column A,Column B,Column C
0,1.0,2.0,
1,,,
2,30.0,31.0,100.0


In [37]:
df.dropna(how='all')

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [38]:
df.dropna(how='any') #default behaviour

Unnamed: 0,Column A,Column B,Column C,Column D
2,30.0,31.0,32.0,34


In [39]:
# Thresh indicates a threshold (a minimum number) of non-null values for the row/column to be kept
df
df.dropna(thresh=3)

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34


In [40]:
df.dropna(thresh=3, axis='columns')

Unnamed: 0,Column B,Column C,Column D
0,2.0,,5
1,8.0,9.0,8
2,31.0,32.0,34
3,,100.0,110


### Filling null values

Sometimes instead than dropping the null values, we might need to replace them with some other value.
Sometimes a nan can be replaced with a 0, or with the mean of the sample, and other times you can take the closest value. It all depends on the context.

In [41]:
s

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    4.0
dtype: float64

#### Filling nulls withh an arbitrary value

In [42]:
s.fillna(0)

0    1.0
1    2.0
2    3.0
3    0.0
4    0.0
5    4.0
dtype: float64

In [43]:
s.fillna(s.mean())

0    1.0
1    2.0
2    3.0
3    2.5
4    2.5
5    4.0
dtype: float64

In [44]:
s

0    1.0
1    2.0
2    3.0
3    NaN
4    NaN
5    4.0
dtype: float64

#### Filling nulls with contiguous (close) values

The method argument is used to fill null values with values close to that null one.

In [45]:
s.fillna(method='ffill')

0    1.0
1    2.0
2    3.0
3    3.0
4    3.0
5    4.0
dtype: float64

In [46]:
s.fillna(method='bfill')

0    1.0
1    2.0
2    3.0
3    4.0
4    4.0
5    4.0
dtype: float64

This can still leave null values at the extremes of the Series/DataFrame

In [47]:
pd.Series([np.nan, 3, np.nan, 9]).fillna(method='ffill')

0    NaN
1    3.0
2    3.0
3    9.0
dtype: float64

In [48]:
pd.Series([1, np.nan, 3, np.nan, np.nan]).fillna(method='bfill')

0    1.0
1    3.0
2    3.0
3    NaN
4    NaN
dtype: float64

### Filling null values on DataFrames

The fillna method also works on DataFrames. The main differences are that you can specify the axis to use to fill the values (specially for methods) and that you have more control on the values passed.



In [49]:
df

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,,8.0,9.0,8
2,30.0,31.0,32.0,34
3,,,100.0,110


In [50]:
df.fillna({'Column A': 0, 'Column B': 99, 'Column C': df['Column C'].mean()})

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,47.0,5
1,0.0,8.0,9.0,8
2,30.0,31.0,32.0,34
3,0.0,99.0,100.0,110


In [51]:
df.fillna(method='ffill', axis=0)

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,,5
1,1.0,8.0,9.0,8
2,30.0,31.0,32.0,34
3,30.0,31.0,100.0,110


In [52]:
df.fillna(method='ffill', axis=1)

Unnamed: 0,Column A,Column B,Column C,Column D
0,1.0,2.0,2.0,5.0
1,,8.0,9.0,8.0
2,30.0,31.0,32.0,34.0
3,,,100.0,110.0


### Checking if there are NAs

The question is: Does this Series or DataFrame contain any missing value? The answer should be yes or no: True or False. How can you verify it?

#### Example1: Checking the length

If there are missing values, s.dropna() will have less elements than s

In [53]:
s.dropna().count()

4

In [54]:
missing_values= len(s.dropna()) !=len(s)
missing_values

True

There is also a count method that excludes nans from its result

In [55]:
len(s)

6

In [56]:
s.count()

4

In [57]:
missing_values= s.count() !=len(s)
missing_values

True

#### A more Pythonic solution: any

The methods any and all check if there is any True value in a Series or all the values are True.

In [58]:
pd.Series([True, False, False]).any()

True

In [59]:
pd.Series([True, False, False]).all()

False

In [60]:
pd.Series([True, True, True]).all()

True

the isnull() method returned a Boolean Series with True values wherever there was a nan

In [61]:
s.isnull()

0    False
1    False
2    False
3     True
4     True
5    False
dtype: bool

We can use the any method with the boolean array returned

In [62]:
pd.Series([1,np.nan]).isnull().any()

True

In [63]:
pd.Series([1, 2]).isnull().any()

False

In [64]:
s.isnull().any()

True

A more strict version would check only the values of the Series

In [65]:
s.isnull().values

array([False, False, False,  True,  True, False])

In [66]:
s.isnull().values.any()

True