In [2]:
import pandas as pd
import numpy as np

# Detect null values.
# Sample Series mixing a number, NaN, a string and None (dtype is object).

d = pd.Series([1, np.nan, 'hello', None])

# Element-wise null check: returns a boolean Series, True where the entry is NaN or None
d.isnull()


0    False
1     True
2    False
3     True
dtype: bool

In [3]:
d.isna() # same result as d.isnull(): both flag NaN and None as missing

0    False
1     True
2    False
3     True
dtype: bool

In [4]:
d[d.notnull()] # boolean-mask indexing: keep only the entries that are not null

0        1
2    hello
dtype: object

## Drop NULL values

In [5]:
# Dropping null values
# from a pd.Series: dropna() returns a new Series without the null entries
d.dropna() # this will not modify d in place. d will still have 4 rows

0        1
2    hello
dtype: object

In [8]:
# DataFrame example: the constructor takes a list of rows, and every row is
# itself a list -- hence the doubled square brackets.
rows = [
    [1, np.nan, 2],
    [2, 3, 5],
    [np.nan, 4, 6],
]
df = pd.DataFrame(rows)
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [9]:
# For a DataFrame a single value cannot be dropped; an entire row or column must go.
# 1. Default: drop every row that contains any null, keeping only the complete rows
df.dropna()

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [10]:
# 2. Drop every column that contains any null, keeping only the complete columns
df.dropna(axis = 'columns')

Unnamed: 0,2
0,2
1,5
2,6


### Drop with options

In [11]:
# Drop with more precision, based on how many values are missing.
# The default is how='any', which keeps only the complete rows or columns.
# how='all' drops only the columns (or rows) in which ALL values are NA.
# To demonstrate, add a column 3 that is entirely NaN (note: this modifies df in place)
df[3] = np.nan
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [12]:
df.dropna(axis = 'columns', how = 'all') # only column 3 is dropped, since it is the only column where all values are NA

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [14]:
# thresh=3 KEEPS the rows that have at least 3 non-null values and drops the rest
# (here only row 1 has 3 non-null values, because column 3 is all NaN)
df.dropna(axis = 'rows', thresh=3)

Unnamed: 0,0,1,2,3
1,2.0,3.0,5,


## Fill NULL values


In [16]:
# Re-create the sample Series (number, NaN, string, None) for the fill examples
d = pd.Series([1, np.nan, 'hello', None])
d

0        1
1      NaN
2    hello
3     None
dtype: object

In [17]:
# Replace every null entry (both NaN and None) with the constant 0
d.fillna(0)

0        1
1        0
2    hello
3        0
dtype: object

In [19]:
# Forward fill: carry the last valid value forward (backward fill carries the next valid value back).
# Either one can leave values unfilled: ffill has nothing to copy into leading nulls,
# and bfill has nothing to copy into trailing nulls.
# Older spelling of the same operation: d.fillna(method = 'ffill')
d.ffill()

  d.fillna(method = 'ffill')


0        1
1        1
2    hello
3    hello
dtype: object

In [20]:
# Backward fill: each null takes the next valid value; the trailing None has no later value, so it stays null
d.bfill()

0        1
1    hello
2    hello
3     None
dtype: object

In [22]:
# Mean imputation: replace each null with the average of the non-null values.
# This needs numeric data; here the None is stored as NaN, so d2 is float64.
d2 = pd.Series([1, np.nan, 3, None])
d2_mean = d2.mean()
d2.fillna(d2_mean)

0    1.0
1    2.0
2    3.0
3    2.0
dtype: float64