In [1]:
import pandas as pd
import numpy as np
import matplotlib

In [2]:
# Record the library versions this notebook was executed with (numpy, pandas, matplotlib)
tuple(lib.__version__ for lib in (np, pd, matplotlib))

('1.20.3', '1.3.5', '3.5.1')

### Load data

In [3]:
# Toy dataset: three numeric features with NaNs at the start, in the middle and
# at the end of the columns, so each strategy below has something to act on.
df = pd.DataFrame(
    dict(
        feature_1=[np.nan, 3, 6, 9, 12, 15, np.nan],
        feature_2=[100, np.nan, 200, 300, np.nan, np.nan, 600],
        feature_3=[2000, np.nan, 2000, 3000, 4000, 6000, 8000],
    )
)

df

Unnamed: 0,feature_1,feature_2,feature_3
0,,100.0,2000.0
1,3.0,,
2,6.0,200.0,2000.0
3,9.0,300.0,3000.0
4,12.0,,4000.0
5,15.0,,6000.0
6,,600.0,8000.0


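* Alternatives: `isna` (an alias of `isnull`), or `notna` / `notnull` for the inverse mask. Note that `== np.nan` does **not** work, because `NaN` never compares equal to anything, including itself.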

In [4]:
# Boolean mask with the same shape as df: True wherever a value is missing
pd.isnull(df)

Unnamed: 0,feature_1,feature_2,feature_3
0,True,False,False
1,False,True,True
2,False,False,False
3,False,False,False
4,False,True,False
5,False,True,False
6,True,False,False


In [5]:
# Count the missing values in each column (True counts as 1 when summed down the rows)
df.isnull().sum(axis='index')

feature_1    2
feature_2    3
feature_3    1
dtype: int64

### With `fillna`, `method='pad'` (an alias of `ffill`) fills each `nan` with the last valid value before it

In [6]:
# Forward-fill: each NaN takes the last valid value above it, filling at most
# 1 consecutive NaN per gap. Same result as df.fillna(method='pad', limit=1),
# but the `method` argument of fillna is deprecated since pandas 2.1, so the
# dedicated ffill() is used instead.
df.ffill(limit=1)

Unnamed: 0,feature_1,feature_2,feature_3
0,,100.0,2000.0
1,3.0,100.0,2000.0
2,6.0,200.0,2000.0
3,9.0,300.0,3000.0
4,12.0,300.0,4000.0
5,15.0,,6000.0
6,15.0,600.0,8000.0


### `limit` sets the maximum number of consecutive `nan` values to fill

In [7]:
# Forward-fill again, now allowing up to 2 consecutive NaNs per gap to be filled.
# Same result as df.fillna(method='pad', limit=2); ffill() avoids the `method`
# argument of fillna, which is deprecated since pandas 2.1.
df.ffill(limit=2)

Unnamed: 0,feature_1,feature_2,feature_3
0,,100.0,2000.0
1,3.0,100.0,2000.0
2,6.0,200.0,2000.0
3,9.0,300.0,3000.0
4,12.0,300.0,4000.0
5,15.0,300.0,6000.0
6,15.0,600.0,8000.0


* Here we can see that up to two consecutive `nan` values are now filled: rows 4 and 5 of `feature_2` both take the preceding value `300.0`, whereas with `limit=1` row 5 stayed `nan`.

### `nan` backfilling with `bfill`.

In [8]:
# Backward-fill: each NaN takes the next valid value below it; a trailing NaN
# (nothing after it) stays NaN. Same result as df.fillna(method='bfill'), but
# without the `method` argument of fillna, which is deprecated since pandas 2.1.
df.bfill()

Unnamed: 0,feature_1,feature_2,feature_3
0,3.0,100.0,2000.0
1,3.0,200.0,2000.0
2,6.0,200.0,2000.0
3,9.0,300.0,3000.0
4,12.0,600.0,4000.0
5,15.0,600.0,6000.0
6,,600.0,8000.0


In [9]:
# Recommended for ordered (e.g. time-series) data: forward fill only propagates
# earlier observations, so no information from later rows leaks backwards.
# Same result as df.fillna(method='ffill'), but without the `method` argument
# of fillna, which is deprecated since pandas 2.1.
df.ffill()

Unnamed: 0,feature_1,feature_2,feature_3
0,,100.0,2000.0
1,3.0,100.0,2000.0
2,6.0,200.0,2000.0
3,9.0,300.0,3000.0
4,12.0,300.0,4000.0
5,15.0,300.0,6000.0
6,15.0,600.0,8000.0


* `bfill` fills each `NaN` with the next valid value after it (propagating backward), whereas `ffill` fills it with the last valid value before it (propagating forward). If we do not specify `limit`, all consecutive `NaN` values in a gap are filled.

In [10]:
# Drop every row that contains at least one NaN
df.dropna(axis='index')

Unnamed: 0,feature_1,feature_2,feature_3
2,6.0,200.0,2000.0
3,9.0,300.0,3000.0


In [11]:
# Drop every column that contains at least one NaN; here all three columns
# have missing values, so only the index is left.
df.dropna(axis='columns')

0
1
2
3
4
5
6


### Keep only columns which have (roughly) at least 90% non-NaNs

In [12]:
# thresh must be an integer: int() truncates 7 * 0.9 = 6.3 down to 6, so a
# column is kept only if it has at least 6 non-NaN values (6/7 is about 85.7%,
# slightly below a strict 90% cut-off).
df.dropna(axis='columns', thresh=int(len(df) * 0.9))

Unnamed: 0,feature_3
0,2000.0
1,
2,2000.0
3,3000.0
4,4000.0
5,6000.0
6,8000.0


The parameter `thresh=N` requires that a column has at least `N` non-NaN values to survive. Think of this as the minimum amount of valid (non-missing) data you require in a column in order to keep it.

In [13]:
# 90% of the number of rows, before it is truncated to an integer for thresh
len(df) * 0.9

6.3

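* `thresh` must be an integer, so `6.3` is truncated to `6` by `int()`: a column needs at least 6 non-NaN values to survive. Only `feature_3` (6 non-NaN values out of 7) meets this.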

### Fill with the mean

In [14]:
# Impute the missing entries of feature_1 with the mean of its observed values
df['feature_1'].pipe(lambda col: col.fillna(col.mean()))

0     9.0
1     3.0
2     6.0
3     9.0
4    12.0
5    15.0
6     9.0
Name: feature_1, dtype: float64

### Interpolation

In [15]:
# Linear interpolation (the default method): each gap is filled with equally
# spaced values between the neighbouring valid observations.
df['feature_2'].interpolate(method='linear')

0    100.0
1    150.0
2    200.0
3    300.0
4    400.0
5    500.0
6    600.0
Name: feature_2, dtype: float64

### Replace

In [16]:
# Substitute every NaN in the frame with the constant 0
df.replace(to_replace=np.nan, value=0)

Unnamed: 0,feature_1,feature_2,feature_3
0,0.0,100.0,2000.0
1,3.0,0.0,0.0
2,6.0,200.0,2000.0
3,9.0,300.0,3000.0
4,12.0,0.0,4000.0
5,15.0,0.0,6000.0
6,0.0,600.0,8000.0
