# Table of Contents
 <p><div class="lev1 toc-item"><a href="#Missing-values" data-toc-modified-id="Missing-values-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Missing values</a></div><div class="lev1 toc-item"><a href="#Count-missing-values" data-toc-modified-id="Count-missing-values-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Count missing values</a></div><div class="lev1 toc-item"><a href="#Fill-missing-values" data-toc-modified-id="Fill-missing-values-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Fill missing values</a></div><div class="lev1 toc-item"><a href="#Interpolate-missing-values" data-toc-modified-id="Interpolate-missing-values-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Interpolate missing values</a></div><div class="lev1 toc-item"><a href="#Drop-missing-values" data-toc-modified-id="Drop-missing-values-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Drop missing values</a></div>

We first saw missing data when we started merging our data in the previous notebook.
Here we will cover what exactly "missing" data is from Python's perspective,
how to check for it, and what happens when you perform calculations with it.

# Missing values

In [1]:
import pandas as pd

In [2]:
# We can use the from ... import notation to import just the parts of a library
# we want instead of importing all of it.
# numpy's missing-value sentinel is `nan`. Older numpy releases also exported the
# capitalised aliases `NaN` and `NAN`, but NumPy 2.0 removed them, so importing
# those names directly fails there. Import the canonical name and bind the other
# two spellings ourselves: they differ only in capitalization and all refer to
# the same notion of "missing".
from numpy import nan

NaN = NAN = nan

In [3]:
# evaluating the imported name shows how a missing value is displayed
nan

nan

In [4]:
from pandas import read_csv

In [5]:
# read_csv was imported by name, so it is called here without the pd. prefix
read_csv('../data/concat_1.csv')

Unnamed: 0,A,B,C,D
0,a0,b0,c0,d0
1,a1,b1,c1,d1
2,a2,b2,c2,d2
3,a3,b3,c3,d3


In [6]:
# a missing value does not compare equal to anything else;
# here the comparison against True evaluates to False
nan == True

False

In [7]:
# ...and a missing value is not equal to False either
NaN == False

False

In [8]:
# not even each other: a missing value is not equal to itself
nan == nan

False

In [9]:
# because == never matches a missing value, we need a dedicated function
# to check for missing-ness
pd.isnull(nan)

True

In [10]:
# the upper-case spelling is detected as missing as well
pd.isnull(NAN)

True

In [11]:
# and so is the mixed-case spelling
pd.isnull(NaN)

True

In [12]:
# an ordinary number is not missing
pd.isnull(42)

False

In [13]:
# pd.notnull asks the opposite question, so a missing value gives False
pd.notnull(NAN)

False

In [14]:
# Here's a dataset with missing values:
# read the ebola country time series from CSV into a DataFrame
ebola = pd.read_csv('../data/ebola_country_timeseries.csv')

In [15]:
# peek at the first rows; the empty entries are missing values
ebola.head()

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali
0,1/5/2015,289,2776.0,,10030.0,,,,,,1786.0,,2977.0,,,,,
1,1/4/2015,288,2775.0,,9780.0,,,,,,1781.0,,2943.0,,,,,
2,1/3/2015,287,2769.0,8166.0,9722.0,,,,,,1767.0,3496.0,2915.0,,,,,
3,1/2/2015,286,,8157.0,,,,,,,,3496.0,,,,,,
4,12/31/2014,284,2730.0,8115.0,9633.0,,,,,,1739.0,3471.0,2827.0,,,,,


# Count missing values

In [16]:
# Note the column of non-null counts: any column whose count is lower than
# the total number of entries contains missing values
ebola.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 122 entries, 0 to 121
Data columns (total 18 columns):
Date                   122 non-null object
Day                    122 non-null int64
Cases_Guinea           93 non-null float64
Cases_Liberia          83 non-null float64
Cases_SierraLeone      87 non-null float64
Cases_Nigeria          38 non-null float64
Cases_Senegal          25 non-null float64
Cases_UnitedStates     18 non-null float64
Cases_Spain            16 non-null float64
Cases_Mali             12 non-null float64
Deaths_Guinea          92 non-null float64
Deaths_Liberia         81 non-null float64
Deaths_SierraLeone     87 non-null float64
Deaths_Nigeria         38 non-null float64
Deaths_Senegal         22 non-null float64
Deaths_UnitedStates    18 non-null float64
Deaths_Spain           16 non-null float64
Deaths_Mali            12 non-null float64
dtypes: float64(16), int64(1), object(1)
memory usage: 17.2+ KB


In [17]:
# frequency counts for one column, with missing values counted too (dropna=False)
# sort=True / ascending=False are the defaults, written out here to make the
# descending-by-count ordering explicit
# NaN is listed first only because it is the most frequent entry in this column;
# missing values are not always going to show up first
ebola['Cases_Guinea'].value_counts(sort=True, ascending=False, dropna=False)

NaN        29
 86.0       3
 495.0      2
 112.0      2
 390.0      2
 506.0      1
 812.0      1
 771.0      1
 648.0      1
 607.0      1
 579.0      1
 543.0      1
 519.0      1
 510.0      1
 2597.0     1
 2769.0     1
 899.0      1
 2571.0     1
 485.0      1
 472.0      1
 460.0      1
 427.0      1
 415.0      1
 861.0      1
 942.0      1
 936.0      1
 1667.0     1
 2706.0     1
 2416.0     1
 2292.0     1
           ..
 1519.0     1
 151.0      1
 1199.0     1
 143.0      1
 127.0      1
 122.0      1
 103.0      1
 49.0       1
 2695.0     1
 2730.0     1
 208.0      1
 218.0      1
 408.0      1
 412.0      1
 413.0      1
 398.0      1
 351.0      1
 344.0      1
 328.0      1
 291.0      1
 281.0      1
 258.0      1
 248.0      1
 233.0      1
 236.0      1
 235.0      1
 231.0      1
 226.0      1
 224.0      1
 2776.0     1
Name: Cases_Guinea, Length: 89, dtype: int64

In [18]:
# assert statement to check the assumption that no values are missing.
# notnull() is True for every non-missing entry, so .all() holds only when the
# column has no NaN at all. (The previous isnull-based check tested the opposite:
# that every value was missing.)
# This column does contain missing values, so the assertion is expected to fail.
assert pd.notnull(ebola['Cases_Guinea']).all()

AssertionError: 

# Fill missing values

In [19]:
# replace every NaN with one specific value (0 here), then show the first 5 rows
ebola.fillna(value=0).head(n=5)

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali
0,1/5/2015,289,2776.0,0.0,10030.0,0.0,0.0,0.0,0.0,0.0,1786.0,0.0,2977.0,0.0,0.0,0.0,0.0,0.0
1,1/4/2015,288,2775.0,0.0,9780.0,0.0,0.0,0.0,0.0,0.0,1781.0,0.0,2943.0,0.0,0.0,0.0,0.0,0.0
2,1/3/2015,287,2769.0,8166.0,9722.0,0.0,0.0,0.0,0.0,0.0,1767.0,3496.0,2915.0,0.0,0.0,0.0,0.0,0.0
3,1/2/2015,286,0.0,8157.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3496.0,0.0,0.0,0.0,0.0,0.0,0.0
4,12/31/2014,284,2730.0,8115.0,9633.0,0.0,0.0,0.0,0.0,0.0,1739.0,3471.0,2827.0,0.0,0.0,0.0,0.0,0.0


In [20]:
# Forward fill missing values: each NaN takes the last non-missing value above
# it in the same column; a NaN with no value above it stays NaN.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
ebola.ffill()

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali
0,1/5/2015,289,2776.0,,10030.0,,,,,,1786.0,,2977.0,,,,,
1,1/4/2015,288,2775.0,,9780.0,,,,,,1781.0,,2943.0,,,,,
2,1/3/2015,287,2769.0,8166.0,9722.0,,,,,,1767.0,3496.0,2915.0,,,,,
3,1/2/2015,286,2769.0,8157.0,9722.0,,,,,,1767.0,3496.0,2915.0,,,,,
4,12/31/2014,284,2730.0,8115.0,9633.0,,,,,,1739.0,3471.0,2827.0,,,,,
5,12/28/2014,281,2706.0,8018.0,9446.0,,,,,,1708.0,3423.0,2758.0,,,,,
6,12/27/2014,280,2695.0,8018.0,9409.0,,,,,,1697.0,3423.0,2732.0,,,,,
7,12/24/2014,277,2630.0,7977.0,9203.0,,,,,,1697.0,3413.0,2655.0,,,,,
8,12/21/2014,273,2597.0,7977.0,9004.0,,,,,,1607.0,3413.0,2582.0,,,,,
9,12/20/2014,272,2571.0,7862.0,8939.0,,,,,,1586.0,3384.0,2556.0,,,,,


In [21]:
# Back fill missing values: each NaN takes the next non-missing value below
# it in the same column.
# DataFrame.bfill() replaces the deprecated fillna(method='bfill') spelling.
ebola.bfill()

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali
0,1/5/2015,289,2776.0,8166.0,10030.0,20.0,1.0,4.0,1.0,7.0,1786.0,3496.0,2977.0,8.0,0.0,1.0,0.0,6.0
1,1/4/2015,288,2775.0,8166.0,9780.0,20.0,1.0,4.0,1.0,7.0,1781.0,3496.0,2943.0,8.0,0.0,1.0,0.0,6.0
2,1/3/2015,287,2769.0,8166.0,9722.0,20.0,1.0,4.0,1.0,7.0,1767.0,3496.0,2915.0,8.0,0.0,1.0,0.0,6.0
3,1/2/2015,286,2730.0,8157.0,9633.0,20.0,1.0,4.0,1.0,7.0,1739.0,3496.0,2827.0,8.0,0.0,1.0,0.0,6.0
4,12/31/2014,284,2730.0,8115.0,9633.0,20.0,1.0,4.0,1.0,7.0,1739.0,3471.0,2827.0,8.0,0.0,1.0,0.0,6.0
5,12/28/2014,281,2706.0,8018.0,9446.0,20.0,1.0,4.0,1.0,7.0,1708.0,3423.0,2758.0,8.0,0.0,1.0,0.0,6.0
6,12/27/2014,280,2695.0,7977.0,9409.0,20.0,1.0,4.0,1.0,7.0,1697.0,3413.0,2732.0,8.0,0.0,1.0,0.0,6.0
7,12/24/2014,277,2630.0,7977.0,9203.0,20.0,1.0,4.0,1.0,7.0,1607.0,3413.0,2655.0,8.0,0.0,1.0,0.0,6.0
8,12/21/2014,273,2597.0,7862.0,9004.0,20.0,1.0,4.0,1.0,7.0,1607.0,3384.0,2582.0,8.0,0.0,1.0,0.0,6.0
9,12/20/2014,272,2571.0,7862.0,8939.0,20.0,1.0,4.0,1.0,7.0,1586.0,3384.0,2556.0,8.0,0.0,1.0,0.0,6.0


# Interpolate missing values

In [22]:
# fill each missing value by linear interpolation between its neighbours in the
# same column ('linear' is the default method, spelled out here for clarity)
ebola.interpolate(method='linear')

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali
0,1/5/2015,289,2776.000000,,10030.0,,,,,,1786.000000,,2977.000000,,,,,
1,1/4/2015,288,2775.000000,,9780.0,,,,,,1781.000000,,2943.000000,,,,,
2,1/3/2015,287,2769.000000,8166.000000,9722.0,,,,,,1767.000000,3496.0,2915.000000,,,,,
3,1/2/2015,286,2749.500000,8157.000000,9677.5,,,,,,1753.000000,3496.0,2871.000000,,,,,
4,12/31/2014,284,2730.000000,8115.000000,9633.0,,,,,,1739.000000,3471.0,2827.000000,,,,,
5,12/28/2014,281,2706.000000,8018.000000,9446.0,,,,,,1708.000000,3423.0,2758.000000,,,,,
6,12/27/2014,280,2695.000000,7997.500000,9409.0,,,,,,1697.000000,3418.0,2732.000000,,,,,
7,12/24/2014,277,2630.000000,7977.000000,9203.0,,,,,,1652.000000,3413.0,2655.000000,,,,,
8,12/21/2014,273,2597.000000,7919.500000,9004.0,,,,,,1607.000000,3398.5,2582.000000,,,,,
9,12/20/2014,272,2571.000000,7862.000000,8939.0,,,,,,1586.000000,3384.0,2556.000000,,,,,


# Drop missing values

In [23]:
# read the same CSV again so this section starts from the data as stored on disk
ebola = pd.read_csv('../data/ebola_country_timeseries.csv')

In [24]:
# drop a single column by name; this returns a new frame and leaves `ebola` as is
ebola.drop(columns='Day')

Unnamed: 0,Date,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali
0,1/5/2015,2776.0,,10030.0,,,,,,1786.0,,2977.0,,,,,
1,1/4/2015,2775.0,,9780.0,,,,,,1781.0,,2943.0,,,,,
2,1/3/2015,2769.0,8166.0,9722.0,,,,,,1767.0,3496.0,2915.0,,,,,
3,1/2/2015,,8157.0,,,,,,,,3496.0,,,,,,
4,12/31/2014,2730.0,8115.0,9633.0,,,,,,1739.0,3471.0,2827.0,,,,,
5,12/28/2014,2706.0,8018.0,9446.0,,,,,,1708.0,3423.0,2758.0,,,,,
6,12/27/2014,2695.0,,9409.0,,,,,,1697.0,,2732.0,,,,,
7,12/24/2014,2630.0,7977.0,9203.0,,,,,,,3413.0,2655.0,,,,,
8,12/21/2014,2597.0,,9004.0,,,,,,1607.0,,2582.0,,,,,
9,12/20/2014,2571.0,7862.0,8939.0,,,,,,1586.0,3384.0,2556.0,,,,,


In [25]:
# keep only the rows that have a value in every column:
# axis=0 drops rows, how='any' drops a row as soon as one entry is missing
# (both are the defaults, written out for clarity)
ebola.dropna(axis=0, how='any')

Unnamed: 0,Date,Day,Cases_Guinea,Cases_Liberia,Cases_SierraLeone,Cases_Nigeria,Cases_Senegal,Cases_UnitedStates,Cases_Spain,Cases_Mali,Deaths_Guinea,Deaths_Liberia,Deaths_SierraLeone,Deaths_Nigeria,Deaths_Senegal,Deaths_UnitedStates,Deaths_Spain,Deaths_Mali
19,11/18/2014,241,2047.0,7082.0,6190.0,20.0,1.0,4.0,1.0,6.0,1214.0,2963.0,1267.0,8.0,0.0,1.0,0.0,6.0


In [26]:
# missing values are skipped when performing calculations
# (skipna=True is the default, written out here to make that visible)
ebola['Cases_Guinea'].mean(skipna=True)

911.0645161290323