https://jakevdp.github.io/PythonDataScienceHandbook/03.04-missing-values.html
# Handling Missing Data

In [2]:
import numpy as np
import pandas as pd

## Python None object
`None` is a Python object, so it can only be used as a missing-value sentinel in arrays of Python objects (dtype `object`).

In [3]:
# A None among the integers makes NumPy store the elements as generic Python objects
vals1 = np.array(
    [1, None, 3, 4],
)
vals1

array([1, None, 3, 4], dtype=object)

In [4]:
# Time the same sum over an array built with dtype 'object' and with dtype 'int'
for dtype in ['object', 'int']:
    print("dtype =", dtype)
    # IPython line magic: benchmarks building the array and summing it for the current dtype
    %timeit np.arange(1E6, dtype=dtype).sum()
    print()

dtype = object
72.1 ms ± 2.59 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)

dtype = int
2.79 ms ± 15.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)



# Missing numerical data
# NaN (Not a Number): a special floating-point value

In [5]:
# With NaN as the missing marker the array keeps a native float dtype,
# so it still supports fast operations
vals2 = np.array(
    [1, np.nan, 3, 4],
)
vals2.dtype

dtype('float64')

In [6]:
# NaN is a bit like a data virus: it infects any other object it touches.
# Regardless of the operation, the result of arithmetic with NaN is another NaN.
1 + np.nan

nan

In [7]:
# Ordinary aggregations propagate the NaN in vals2, so sum, min and max all come back as nan
vals2.sum(), vals2.min(), vals2.max()

(nan, nan, nan)

In [8]:
# NaN-aware aggregations: the nan* variants ignore the missing value
np.nansum(vals2), np.nanmin(vals2), np.nanmax(vals2)

(8.0, 1.0, 4.0)

## NaN and None in Pandas

In [9]:
# In a numeric Series both np.nan and None end up as NaN, and the dtype is float64
pd.Series([1, np.nan, 2, None])

0    1.0
1    NaN
2    2.0
3    NaN
dtype: float64

In [10]:
# Assigning None into an integer Series stores NaN and upcasts the Series to float64.
# NOTE(review): newer pandas releases reportedly warn about (or reject) this implicit
# upcast on assignment; confirm against the installed pandas version.
x = pd.Series(range(2), dtype=int)
x[0] = None
x

0    NaN
1    1.0
dtype: float64

In [None]:
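| Typeclass | Conversion When Storing NAs | NA Sentinel Value |
|-----------|-----------------------------|-------------------|
| floating  | No change                   | `np.nan`          |
| object    | No change                   | `None` or `np.nan` (strings are stored as object) |
| integer   | Cast to `float64`           | `np.nan`          |
| boolean   | Cast to `object`            | `None` or `np.nan` |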

# Operating on Null Values

In [None]:
- `isnull()`: Generate a boolean mask indicating missing values
- `notnull()`: Opposite of `isnull()`
- `dropna()`: Return a filtered version of the data
- `fillna()`: Return a copy of the data with missing values filled or imputed

## Detecting null values

In [17]:
# Mixed-type Series holding both kinds of missing marker; the result has dtype object
data = pd.Series(
    [
        1,        # ordinary value
        np.nan,   # floating-point missing marker
        'hello',  # string value
        None,     # Python missing marker
    ]
)
data

0        1
1      NaN
2    hello
3     None
dtype: object

In [13]:
# Boolean mask: True where the entry is missing (both NaN and None count)
data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [14]:
# Inverse mask: True where the entry is present
data.notnull()

0     True
1    False
2     True
3    False
dtype: bool

In [15]:
# Use the boolean mask as an index to keep only the non-missing entries
data[data.notnull()]

0        1
2    hello
dtype: object

## Dropping null values

In [18]:
# Returns the Series with the NA entries removed (same result as masking with notnull())
data.dropna()

0        1
2    hello
dtype: object

In [21]:
# Returns the Series with every NA entry replaced by 0
data.fillna(0)

0        1
1        0
2    hello
3        0
dtype: object

### Data Frame

In [24]:
# 3x3 frame with one missing value in row 0 and one in row 2
df = pd.DataFrame(
    data=[
        [1, np.nan, 2],   # row 0: column 1 missing
        [2, 3, 5],        # row 1: complete
        [np.nan, 4, 6],   # row 2: column 0 missing
    ]
)
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [25]:
# By default dropna() drops every row that contains at least one NA value
df.dropna()

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [26]:
# axis='columns': drop every column that contains at least one NA value
df.dropna(axis='columns')

Unnamed: 0,2
0,2
1,5
2,6


In [27]:
# Add a column 3 made up entirely of NaN; this mutates df in place,
# so the following cells operate on the four-column frame
df[3] = np.nan
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [28]:
# how='all': drop only the columns in which every value is NA (here, column 3)
df.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [31]:
# thresh=3: keep only the rows that have at least 3 non-NA values
df.dropna(axis='rows', thresh=3)

Unnamed: 0,0,1,2,3
1,2.0,3.0,5,


## Filling null values

In [32]:
# Labelled numeric Series with two missing entries (given once as np.nan, once as None)
data = pd.Series(
    [1, np.nan, 2, None, 3],
    index=['a', 'b', 'c', 'd', 'e'],
)
data

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [33]:
# Replace every missing entry with a single constant value (0)
data.fillna(0)

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [34]:
# forward-fill: propagate the previous valid value forward
# (Series.ffill() replaces fillna(method='ffill'), whose `method` argument is deprecated)
data.ffill()

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

In [35]:
# back-fill: propagate the next valid value backward
# (Series.bfill() replaces fillna(method='bfill'), whose `method` argument is deprecated)
data.bfill()

a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64

In [36]:
# Re-display the frame (it now includes the all-NaN column 3) before filling it
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [37]:
# Forward-fill along each row (axis=1): values propagate left to right, so a NaN in the
# first column stays NaN because nothing precedes it.
# (DataFrame.ffill() replaces fillna(method='ffill'), whose `method` argument is deprecated)
df.ffill(axis=1)

Unnamed: 0,0,1,2,3
0,1.0,1.0,2.0,2.0
1,2.0,3.0,5.0,5.0
2,,4.0,6.0,6.0
