# Missing Data

A few convenient methods for dealing with missing data in pandas:

In [5]:
import numpy as np
import pandas as pd

In [6]:
# Small demo frame: A has one gap, B has two, C is complete.
df = pd.DataFrame(
    {
        'A': [1, 2, np.nan],
        'B': [5, np.nan, np.nan],
        'C': [1, 2, 3],
    }
)

#### Checking if we have missing values

In [7]:
# Boolean mask with the same shape as df: True marks a missing cell
df.isna()

Unnamed: 0,A,B,C
0,False,False,False
1,False,True,False
2,True,True,False


In [8]:
# Summing the boolean mask column-wise gives the number of missing values per column
df.isna().sum()

A    1
B    2
C    0
dtype: int64

#### Removing missing values

In [12]:
# Drop every row that contains at least one missing value
# (axis=0 and how='any' are the defaults, written out for clarity)
df.dropna(axis=0, how='any')

Unnamed: 0,A,B,C
0,1.0,5.0,1


In [13]:
# Drop every column that contains at least one missing value
df.dropna(axis='columns')

Unnamed: 0,C
0,1
1,2
2,3


In [14]:
# Keeping only rows with at least 2 non-missing values
# (thresh is the minimum number of non-NA cells a row needs in order to be kept,
# so row 2, whose only valid value is in column C, is dropped)
df.dropna(thresh=2)

Unnamed: 0,A,B,C
0,1.0,5.0,1
1,2.0,,2


#### Filling missing values

In [15]:
# Setting a value
# Every missing cell is replaced by the same constant. Note that filling
# numeric columns with a string mixes types: in the output below A and B no
# longer print as floats (presumably they become object dtype; confirm with .dtypes).
df.fillna(value='FILL VALUE')

Unnamed: 0,A,B,C
0,1,5,1
1,2,FILL VALUE,2
2,FILL VALUE,FILL VALUE,3


In [17]:
# Fill the gap in column A with that column's mean.
# mean() ignores the NaN, so the fill value is (1 + 2) / 2 = 1.5.
# The result is only displayed here; it is not assigned back to df.
df['A'].fillna(df['A'].mean())

0    1.0
1    2.0
2    1.5
Name: A, dtype: float64