# 15. How do I handle missing values in pandas?

In [1]:
import pandas as pd
# Load the UFO reports, using the first CSV column as the row index.
ufo = pd.read_csv('data/uforeports.csv', index_col=0)
# Peek at the last five rows.
ufo.tail(n=5)

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time,Location
18236,Grant Park,,TRIANGLE,IL,12/31/2000 23:00,"Grant Park , IL"
18237,Spirit Lake,,DISK,IA,12/31/2000 23:00,"Spirit Lake , IA"
18238,Eagle River,,,WI,12/31/2000 23:45,"Eagle River , WI"
18239,Eagle River,RED,LIGHT,WI,12/31/2000 23:45,"Eagle River , WI"
18240,Ybor,,OVAL,FL,12/31/2000 23:59,"Ybor , FL"


In [2]:
# 'isnull()' detects missing values: it returns a boolean object of the
# same size as the original, with True wherever a value is NA.
ufo.isnull().tail(n=5)

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time,Location
18236,False,True,False,False,False,False
18237,False,True,False,False,False,False
18238,False,True,True,False,False,False
18239,False,False,False,False,False,False
18240,False,True,False,False,False,False


In [3]:
# 'notnull()' is the opposite of 'isnull()': it returns a boolean object of
# the same size, with True wherever a value is present (not NA).
ufo.notnull().tail(n=5)

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time,Location
18236,True,False,True,True,True,True
18237,True,False,True,True,True,True
18238,True,False,False,True,True,True
18239,True,True,True,True,True,True
18240,True,False,True,True,True,True


In [4]:
# A common way to count how many missing (NaN) values each column has:
# summing the boolean mask column by column counts its True entries.
ufo.isnull().sum(axis=0)

City                  25
Colors Reported    15359
Shape Reported      2644
State                  0
Time                   0
Location              25
dtype: int64

In [5]:
# Filter the data with a boolean Series as the row mask: here we keep only
# the rows where 'City' is unknown (missing) and check how many there are.
ufo.loc[ufo['City'].isnull()].shape

(25, 6)

In [6]:
# What can we do about missing values? First, note the full shape
# (rows, columns) of the data before dropping anything.
ufo.shape

(18241, 6)

In [7]:
# Drop every row that has a missing value ('NaN') in at least one column.
ufo.dropna(axis=0, how='any').shape

(2486, 6)

In [8]:
# Drop only the rows in which all of their columns are missing.
ufo.dropna(axis=0, how='all').shape

(18241, 6)

In [9]:
# Restrict the check to specific columns by passing them as a list to
# 'subset': drop a row if 'City' or 'Shape Reported' (or both) is missing.
ufo.dropna(how='any', subset=['City', 'Shape Reported']).shape

(15576, 6)

In [10]:
# Restricting the check to a 'subset' of columns with how='all': drop a row
# only when both 'City' and 'Shape Reported' are missing.
# Rows are dropped here, never columns.
ufo.dropna(how='all', subset=['City', 'Shape Reported']).shape

(18237, 6)

In [11]:
# BONUS:
# Replace 'NaN' with a specific value using the 'fillna()' method.
# The filled column is assigned back rather than calling
# fillna(inplace=True) on the selection ufo['Shape Reported']: an in-place
# call on a column selection is chained assignment, which raises a
# FutureWarning in pandas 2.2 and no longer modifies 'ufo' under
# Copy-on-Write (the default from pandas 3.0).
# Note: 'VARIOUS' is already an existing value in this column, so its count
# in the output is larger than the number of missing values that were filled.
ufo['Shape Reported'] = ufo['Shape Reported'].fillna(value='VARIOUS')
ufo['Shape Reported'].value_counts(dropna=False)

VARIOUS      2977
LIGHT        2803
DISK         2122
TRIANGLE     1889
OTHER        1402
CIRCLE       1365
SPHERE       1054
FIREBALL     1039
OVAL          845
CIGAR         617
FORMATION     434
RECTANGLE     303
CYLINDER      294
CHEVRON       248
DIAMOND       234
EGG           197
FLASH         188
TEARDROP      119
CONE           60
CROSS          36
DELTA           7
CRESCENT        2
ROUND           2
PYRAMID         1
FLARE           1
HEXAGON         1
DOME            1
Name: Shape Reported, dtype: int64