In [1]:
import numpy as np
import pandas as pd

In [4]:
# Mixing None with integers: NumPy falls back to dtype=object for the whole array.
vals1 = np.array([1, None, 3, 4])
vals1

array([1, None, 3, 4], dtype=object)

#### None: Missing Pythonic (Object) Data

* This dtype=object means that the best common type representation NumPy could infer for the contents of the array is that they are Python objects.

In [5]:
# Aggregating an object array that contains None fails, because Python cannot
# add an int and None. The TypeError is the point of this cell, so it is caught
# and printed; that way "Restart & Run All" continues past the demonstration.
try:
    vals1.sum()
except TypeError as err:
    print(f"TypeError: {err}")

TypeError: unsupported operand type(s) for +: 'int' and 'NoneType'

#### NaN : Missing Numerical Data (Not a Number)

* NaN is a special floating-point value recognized by all systems that use the standard IEEE floating-point representation:

In [6]:
# Arithmetic with NaN yields NaN, whatever the other operand is.
(1 + np.nan, 0 * np.nan)

(nan, nan)

In [10]:
# A plain sum is not very useful here: the single NaN makes the whole result NaN.
vals2 = np.array([1, np.nan, 3, 9])
vals2.sum()

nan

In [15]:
# NumPy's nan-prefixed aggregations skip the NaN entries instead of propagating them.
tuple(
    aggregate(vals2)
    for aggregate in (np.nansum, np.nanmin, np.nanmax, np.nanmean, np.nanstd)
)

(13.0, 1.0, 9.0, 4.333333333333333, 3.39934634239519)

#### NaN and None in Pandas 

In [16]:
# Start from a small integer-typed Series (values 0 and 1).
x = pd.Series([0, 1], dtype=int)
x

0    0
1    1
dtype: int64

In [17]:
# Assigning None into the integer Series: the value is stored as NaN and the
# Series is upcast from int64 to float64 (see the output below).
# NOTE(review): newer pandas releases may warn about or reject this implicit
# dtype change on assignment — confirm against the installed version.
x[0] = None
x

0    NaN
1    1.0
dtype: float64

* Pandas automatically converts the None to a NaN value, upcasting the integer Series to float64 in the process.

### Operating on Null Values
* isnull()
    * Generate a Boolean mask indicating missing values
* notnull()
    * Opposite of isnull()
* dropna()
    * Return a filtered version of the data
* fillna()
    * Return a copy of the data with missing values filled or imputed

#### Detecting, Masking Null Values

In [30]:
# Detect nulls: isnull() returns a Boolean mask that is True wherever the Series
# holds a missing value (both NaN and None count as null).
data = pd.Series([1, np.nan, 'hello', None])
data.isnull()

0    False
1     True
2    False
3     True
dtype: bool

In [31]:
# The Boolean mask works as an index: it selects the nulls, while its negation
# (or, equivalently, notnull()) selects the remaining entries.
print(
    data[data.isnull()],
    data[~data.isnull()],
    data[data.notnull()],
    sep='\n\n\n',
)

1     NaN
3    None
dtype: object


0        1
2    hello
dtype: object


0        1
2    hello
dtype: object


#### Dropping null values

* In addition to the masking used before, there are the convenience methods, dropna() (which removes NA values) and fillna() (which fills in NA values).

In [32]:
# On a Series, dropna() returns only the non-null entries.
data.dropna()

0        1
2    hello
dtype: object

In [33]:
# dropna() returned a new Series; `data` itself still contains its nulls.
data

0        1
1      NaN
2    hello
3     None
dtype: object

In [34]:
# A 3x3 DataFrame with two missing entries, used by the dropna/fillna examples.
df = pd.DataFrame(
    [
        [1, np.nan, 2],
        [2, 3, 5],
        [np.nan, 4, 6],
    ]
)
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


* We cannot drop single values from a DataFrame; we can only drop full rows or full columns.

In [35]:
# By default, dropna() drops every row that contains at least one null value,
# so only the row without any nulls is returned.
df.dropna()

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [39]:
# Drop along the other axis instead: axis=1 and axis='columns' are two spellings
# of the same request, removing every column that contains a null.
display(df.dropna(axis=1), df.dropna(axis='columns'))

Unnamed: 0,2
0,2
1,5
2,6


Unnamed: 0,2
0,2
1,5
2,6


#### But I Want Some of That Data!
* But this drops some good data as well; you might rather be interested in dropping rows or columns with all NA values, or a majority of NA values. This can be specified through the how or thresh parameters, which allow fine control of the number of nulls to allow through.

In [41]:
# Add a column labelled 3 that consists entirely of NaN. This modifies `df`
# in place; the cells below operate on the four-column frame.
df[3] = np.nan
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [42]:
# how='all' drops only the columns in which every value is null (here column 3);
# columns with just some nulls are kept.
df.dropna(axis=1, how='all')

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [45]:
# Finer-grained control: thresh sets the minimum number of non-null values a row
# must contain in order to be kept (here at least 3).
df.dropna(axis=0, thresh=3)

Unnamed: 0,0,1,2,3
1,2.0,3.0,5,


* Here the first and last rows have been dropped, because they contain only two non-null values each.
* Only the middle row (index 1) has at least three non-null values, so it is the only row kept.

#### Filling Null Values
* fillna() method provided by Pandas returns a copy of the array with the null values replaced

In [46]:
# Letter-indexed Series with two missing entries (one NaN, one None).
data_null = pd.Series(
    [1, np.nan, 2, None, 3],
    index=['a', 'b', 'c', 'd', 'e'],
)
data_null

a    1.0
b    NaN
c    2.0
d    NaN
e    3.0
dtype: float64

In [47]:
# Replace every null with a single constant value (here 0).
data_null.fillna(0)

a    1.0
b    0.0
c    2.0
d    0.0
e    3.0
dtype: float64

In [48]:
# Forward-fill: each null takes the most recent non-null value that precedes it.
# Series.ffill() is used instead of fillna(method='ffill') because the `method`
# argument of fillna is deprecated in recent pandas releases.
data_null.ffill()

a    1.0
b    1.0
c    2.0
d    2.0
e    3.0
dtype: float64

In [49]:
# Backward-fill, the mirror image: each null takes the next non-null value that
# follows it. Series.bfill() is used instead of fillna(method='bfill') because
# the `method` argument of fillna is deprecated in recent pandas releases.
data_null.bfill()

a    1.0
b    2.0
c    2.0
d    3.0
e    3.0
dtype: float64

#### DataFrames
* The options are similar, but we can also specify an axis along which the fills take place

In [50]:
# Show the frame again (including the all-NaN column 3) before filling it.
df

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,,4.0,6,


In [51]:
# axis=1 fills along each row: a null takes the value from the nearest non-null
# column to its left. The first entry of row 2 has nothing to its left, so it
# stays NaN. DataFrame.ffill() is used instead of the deprecated
# fillna(method='ffill').
df.ffill(axis=1)

Unnamed: 0,0,1,2,3
0,1.0,1.0,2.0,2.0
1,2.0,3.0,5.0,5.0
2,,4.0,6.0,6.0


In [52]:
# axis=0 fills down each column: a null takes the value from the nearest non-null
# row above it. Column 3 has no values at all and the first entry of column 1 has
# nothing above it, so those stay NaN. DataFrame.ffill() is used instead of the
# deprecated fillna(method='ffill').
df.ffill(axis=0)

Unnamed: 0,0,1,2,3
0,1.0,,2,
1,2.0,3.0,5,
2,2.0,4.0,6,
