In [23]:
import pandas as pd
import numpy as np
from numpy import nan as NA

### Series

In [24]:
# Seven-element Series in which NA (numpy.nan) marks three missing entries;
# the presence of NaN is why the result is stored as float64.
raw_values = [23, NA, 23, NA, 87, 34, NA]
data = pd.Series(raw_values)
data

0    23.0
1     NaN
2    23.0
3     NaN
4    87.0
5    34.0
6     NaN
dtype: float64

In [27]:
# Drop the NaN entries; the remaining values keep their original index labels
data.dropna()

0    23.0
2    23.0
4    87.0
5    34.0
dtype: float64

In [35]:
# Element-wise mask: True where a value is present, False where it is NaN
data.notnull()

0     True
1    False
2     True
3    False
4     True
5     True
6    False
dtype: bool

In [37]:
# Boolean-mask indexing keeps only the entries whose mask is True,
# giving the same result as data.dropna()
data[data.notnull()]

0    23.0
2    23.0
4    87.0
5    34.0
dtype: float64

### Reindex by creating a new index with `reset_index()`

In [40]:
# reset_index() returns a DataFrame: the old labels move into an "index"
# column and a fresh 0..n-1 index is created
data.dropna().reset_index()

Unnamed: 0,index,0
0,0,23.0
1,2,23.0
2,4,87.0
3,5,34.0


#### Drop the original indices

In [41]:
# drop=True discards the old labels instead of keeping them as a column,
# so the result stays a Series
data.dropna().reset_index(drop=True)

0    23.0
1    23.0
2    87.0
3    34.0
dtype: float64

### DataFrame

In [61]:
# 4x4 frame with scattered NaNs: row 0 is complete, row 2 is entirely missing,
# rows 1 and 3 are missing only their column-1 value.
rows = [
    [2, 5, 3, 6],
    [4, NA, 2, -2],
    [NA, NA, NA, NA],
    [4, NA, 1, 4],
]
data = pd.DataFrame(rows)
data

Unnamed: 0,0,1,2,3
0,2.0,5.0,3.0,6.0
1,4.0,,2.0,-2.0
2,,,,
3,4.0,,1.0,4.0


In [62]:
# By default dropna() works along axis=0: every row that contains at least
# one NaN is removed

data.dropna()

Unnamed: 0,0,1,2,3
0,2.0,5.0,3.0,6.0


In [63]:
# Drop a row only if every value in it is NaN
data.dropna(how="all")

Unnamed: 0,0,1,2,3
0,2.0,5.0,3.0,6.0
1,4.0,,2.0,-2.0
3,4.0,,1.0,4.0


In [65]:
# Assign NaN to the whole of column 3 (this modifies `data` itself)
data[3]=NA
data

Unnamed: 0,0,1,2,3
0,2.0,5.0,3.0,
1,4.0,,2.0,
2,,,,
3,4.0,,1.0,


In [70]:
# Drop columns in which every row is NaN (axis=1 selects columns)
data.dropna(axis=1, how="all")

Unnamed: 0,0,1,2
0,2.0,5.0,3.0
1,4.0,,2.0
2,,,
3,4.0,,1.0


#### `thresh` = minimum number of non-NaN values required
Only rows that have at least this many non-NaN values are kept.

In [71]:
# Re-display the frame before filtering: column 3 is entirely NaN at this point
data

Unnamed: 0,0,1,2,3
0,2.0,5.0,3.0,
1,4.0,,2.0,
2,,,,
3,4.0,,1.0,


In [74]:
# thresh=2 keeps rows holding at least two non-NaN values;
# here only the all-NaN row 2 is removed
data.dropna(thresh=2)

Unnamed: 0,0,1,2,3
0,2.0,5.0,3.0,
1,4.0,,2.0,
3,4.0,,1.0,


## Filling missing values

In [76]:
# Fresh 4x4 frame for the fill examples: row 2 is entirely NaN and
# column 1 is missing in rows 1 and 3.
records = [
    [2, 5, 3, 6],
    [4, NA, 2, 7],
    [NA, NA, NA, NA],
    [4, NA, 1, 4],
]
data = pd.DataFrame(records)
data

Unnamed: 0,0,1,2,3
0,2.0,5.0,3.0,6.0
1,4.0,,2.0,7.0
2,,,,
3,4.0,,1.0,4.0


### Fill a value for all NaN

In [77]:
# Replace every NaN with the scalar 10; a new frame is returned and
# `data` itself is left unchanged
data.fillna(10)

Unnamed: 0,0,1,2,3
0,2.0,5.0,3.0,6.0
1,4.0,10.0,2.0,7.0
2,10.0,10.0,10.0,10.0
3,4.0,10.0,1.0,4.0


### Fill different values for each column
`fillna({column label: fill value})` — the dict keys are column labels.

In [81]:
# Map each column label to the value that replaces NaN in that column
per_column_fill = {0: -10, 1: -20, 2: -30, 3: -40}
data.fillna(per_column_fill)

Unnamed: 0,0,1,2,3
0,2.0,5.0,3.0,6.0
1,4.0,-20.0,2.0,7.0
2,-10.0,-20.0,-30.0,-40.0
3,4.0,-20.0,1.0,4.0


### The original data is not modified by the calls above
### Use `inplace=True` to modify the original DataFrame

In [83]:
# Same per-column replacement values, but inplace=True writes them into
# `data` itself; the frame is then displayed to show the change.
replacements = {0: -10, 1: -20, 2: -30, 3: -40}
data.fillna(replacements, inplace=True)
data

Unnamed: 0,0,1,2,3
0,2.0,5.0,3.0,6.0
1,4.0,-20.0,2.0,7.0
2,-10.0,-20.0,-30.0,-40.0
3,4.0,-20.0,1.0,4.0


### Forward and backward filling

In [87]:
# 5x3 block of uniform random numbers (no seed is set in this cell, so the
# values differ between runs). NaNs are planted at the bottom of columns 1
# and 2 before the frame is built, which gives forward fill gaps to work on.
values = np.random.rand(5, 3)
values[2:, 1] = NA  # column 1: rows 2-4 missing
values[3:, 2] = NA  # column 2: rows 3-4 missing
df = pd.DataFrame(values)
df

Unnamed: 0,0,1,2
0,0.828638,0.45402,0.833576
1,0.36291,0.157273,0.383438
2,0.084046,,0.60538
3,0.69944,,
4,0.450244,,


#### Forward fill (carry the last valid value from top to bottom)

In [90]:
# Forward fill: each NaN takes the most recent non-NaN value above it in the
# same column. DataFrame.ffill() is used instead of fillna(method="ffill")
# because the `method` keyword of fillna is deprecated since pandas 2.1.
df.ffill()

Unnamed: 0,0,1,2
0,0.828638,0.45402,0.833576
1,0.36291,0.157273,0.383438
2,0.084046,0.157273,0.60538
3,0.69944,0.157273,0.60538
4,0.450244,0.157273,0.60538


#### Limit how many consecutive NaNs are forward-filled (`limit=`)
Example: fill at most 1 NaN below each valid value

df.fillna(method="ffill", limit=1)

In [92]:
# Forward fill, but fill at most one consecutive NaN below each valid value;
# any further NaNs in the same gap stay missing. DataFrame.ffill() is used
# instead of fillna(method="ffill") because the `method` keyword of fillna
# is deprecated since pandas 2.1.
df.ffill(limit=1)

Unnamed: 0,0,1,2
0,0.828638,0.45402,0.833576
1,0.36291,0.157273,0.383438
2,0.084046,0.157273,0.60538
3,0.69944,,0.60538
4,0.450244,,


### Backward fill

In [95]:
# 5x3 block of uniform random numbers (no seed is set in this cell, so the
# values differ between runs). Rows 2-3 of columns 1 and 2 are blanked out,
# leaving a valid last row for backward fill to pull values from.
values = np.random.rand(5, 3)
values[2:4, 1:3] = NA
df = pd.DataFrame(values)
df

Unnamed: 0,0,1,2
0,0.511408,0.105927,0.676939
1,0.216438,0.167189,0.656775
2,0.13113,,
3,0.492619,,
4,0.113168,0.520893,0.531915


In [99]:
# Backward fill: each NaN takes the next non-NaN value below it in the same
# column. DataFrame.bfill() is used instead of fillna(method="bfill")
# because the `method` keyword of fillna is deprecated since pandas 2.1.
df.bfill()

Unnamed: 0,0,1,2
0,0.511408,0.105927,0.676939
1,0.216438,0.167189,0.656775
2,0.13113,0.520893,0.531915
3,0.492619,0.520893,0.531915
4,0.113168,0.520893,0.531915
