In [1]:
import numpy as np

import pandas as pd
from pandas import Series,DataFrame

# Handling Missing Data in a Series:

In [4]:
# Example Series of strings with a single missing entry (np.nan) at position 2.
data = Series(
    ["one", "two", np.nan, "four"]
)

data

0     one
1     two
2     NaN
3    four
dtype: object

In [5]:
# How to locate NULL values in your dataset:
# isnull() yields a boolean Series that is True wherever the entry is missing (NaN).
data.isnull()

0    False
1    False
2     True
3    False
dtype: bool

In [6]:
# Drop/remove all NULL values. The surviving entries keep their original
# index labels (0, 1, 3), so the resulting index is no longer contiguous.
data.dropna()

0     one
1     two
3    four
dtype: object

# Handling Missing Data in a DataFrame:

In [8]:
# 4x3 example frame: one complete row, two rows with a single NaN each,
# and a final row that is entirely NaN.
dframe = DataFrame(
    [
        [1, 2, 3],
        [np.nan, 5, 6],
        [7, np.nan, 9],
        [np.nan, np.nan, np.nan],
    ]
)

dframe

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,,5.0,6.0
2,7.0,,9.0
3,,,


In [9]:
# With how='any' (the default) a row is removed as soon as it holds a single NaN,
# so only the fully populated row 0 survives.
clean_dframe = dframe.dropna(axis=0, how='any')

clean_dframe

Unnamed: 0,0,1,2
0,1.0,2.0,3.0


In [10]:
# Drop only the rows in which *every* value is NULL; rows that are
# partially filled (rows 1 and 2 here) are kept.
dframe.dropna(axis=0, how='all')

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,,5.0,6.0
2,7.0,,9.0


In [11]:
# Drop columns instead of rows.
# axis='columns' (equivalent to axis=1) applies the NULL check column-wise. Every
# column here holds at least one NULL value, so all columns are dropped and only
# the index remains.
dframe.dropna(axis='columns', how='any')

0
1
2
3


In [12]:
# Short alias for NaN so the literal rows below stay readable.
npn = np.nan

# 4x4 example frame whose rows hold 3, 3, 2 and 1 non-null values respectively.
dframe2 = DataFrame(
    [
        [1, 2, 3, npn],
        [2, npn, 5, 6],
        [npn, 7, npn, 9],
        [1, npn, npn, npn],
    ]
)

dframe2

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,2.0,,5.0,6.0
2,,7.0,,9.0
3,1.0,,,


In [13]:
# thresh sets the minimum number of actual (non-NULL) data points a row must
# contain to be kept; here rows with fewer than 2 such values are dropped.
dframe2.dropna(axis=0, thresh=2)

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,2.0,,5.0,6.0
2,,7.0,,9.0


In [14]:
# Keep only the rows that contain at least 3 non-NULL values; all other rows are dropped.
dframe2.dropna(axis=0, thresh=3)

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,2.0,,5.0,6.0


In [15]:
# Display dframe2 again: the dropna calls above returned new frames and left it unchanged.
dframe2

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,2.0,,5.0,6.0
2,,7.0,,9.0
3,1.0,,,


In [16]:
# Fill every NULL value with one specified value.
# REMEMBER: fillna returns a new DataFrame; 'dframe2' itself is not changed unless
# the result is assigned back to it (or inplace=True is passed).
dframe2.fillna(value=1)

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,1.0
1,2.0,1.0,5.0,6.0
2,1.0,7.0,1.0,9.0
3,1.0,1.0,1.0,1.0


In [19]:
# Display dframe2 again: the NaNs are still present because the fillna result was not assigned.
dframe2

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,2.0,,5.0,6.0
2,,7.0,,9.0
3,1.0,,,


In [18]:
# Fill different columns with different values by passing a dictionary that maps
# each column label to its fill value (column 0 -> 0, column 1 -> 1, ...).
dframe2.fillna(value={0: 0, 1: 1, 2: 2, 3: 3})

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,3.0
1,2.0,1.0,5.0,6.0
2,0.0,7.0,2.0,9.0
3,1.0,1.0,2.0,3.0


In [22]:
# Display dframe2 once more: still unchanged, since the dictionary-based fillna result was not assigned.
dframe2

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,
1,2.0,,5.0,6.0
2,,7.0,,9.0
3,1.0,,,


In [23]:
# Syntax to modify the existing object. Two options:
#   dframe2 = dframe2.fillna(0)        # rebind the name to the filled result
#   dframe2.fillna(0, inplace=True)    # fill the existing DataFrame in place
# The in-place form is used here, so this cell shows no output.
dframe2.fillna(value=0, inplace=True)

In [24]:
# Display dframe2: after the in-place fill, every former NaN now shows as 0.0.
dframe2

Unnamed: 0,0,1,2,3
0,1.0,2.0,3.0,0.0
1,2.0,0.0,5.0,6.0
2,0.0,7.0,0.0,9.0
3,1.0,0.0,0.0,0.0
