_My notebook on_
# Python for Data Analysis - Wes McKinney
## Chapter 7 - Data Cleaning and Preparation
### Part 1 - Handling Missing Data

In [3]:
import numpy as np
import pandas as pd

In [6]:
# Sentinel value: a missing entry is stored as NaN (the float `np.nan`),
# and `isnull()` reports, position by position, where that sentinel sits.
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])

# Show the Series itself, then its boolean missing-value mask.
print(string_data, string_data.isnull(), sep='\n')

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object
0    False
1    False
2     True
3    False
dtype: bool


In [7]:
# A plain Python None stored in an object array is reported as missing too
string_data[0] = None

# `isna` is the same check as `isnull`; positions 0 and 2 are now flagged
string_data.isna()

0     True
1    False
2     True
3    False
dtype: bool

Filtering Out Missing Data

In [15]:
# Two equivalent ways to discard the missing entries of a Series:
# `dropna()` and boolean indexing with the "not missing" mask.
data = pd.Series([1, np.nan, 3.5, np.nan, 7])
print(data.dropna(), data[data.notna()], sep='\n')

0    1.0
2    3.5
4    7.0
dtype: float64
0    1.0
2    3.5
4    7.0
dtype: float64


In [25]:
# Dropping missing data from a DataFrame (None entries become NaN)
data = pd.DataFrame([
    [1., 6.5, 3.],
    [1., None, None],
    [None, None, None],
    [None, 6.5, 3.]
])
print(data)

# default behaviour: drop every row that contains at least one NA
print(data.dropna())

# how='all': drop only the rows in which every value is NA
print(data.dropna(how='all'))

# axis=1 applies the same rules to columns. Prepare the frame so that it has
# a column without NA (0), columns with some NA (1, 2) and an all-NA column (4).
data[4] = np.nan
# Single label-based assignment (rows, column). The chained form
# `data.loc[2][0] = 2` writes into a temporary row object and is not
# guaranteed to reach `data` (it never does under pandas Copy-on-Write).
data.loc[[2, 3], 0] = 2
print(data)

# drop every column that contains at least one NA
print(data.dropna(axis=1))

# drop only the columns in which every value is NA
print(data.dropna(axis=1, how='all'))

     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  NaN  NaN  NaN
3  NaN  6.5  3.0
     0    1    2
0  1.0  6.5  3.0
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
3  NaN  6.5  3.0
     0    1    2   4
0  1.0  6.5  3.0 NaN
1  1.0  NaN  NaN NaN
2  2.0  NaN  NaN NaN
3  2.0  6.5  3.0 NaN
     0
0  1.0
1  1.0
2  2.0
3  2.0
     0    1    2
0  1.0  6.5  3.0
1  1.0  NaN  NaN
2  2.0  NaN  NaN
3  2.0  6.5  3.0


In [58]:
# dropna with a threshold: `thresh=k` keeps only the rows that contain at
# least k non-NA values.
# Seed the global NumPy RNG so the random values are the same on every run.
np.random.seed(58)
df = pd.DataFrame(np.random.randn(7, 3))

# Punch holes: row 1 ends up all-NA, row 0 keeps one value,
# rows 2-3 keep two values, rows 4-6 stay complete.
df.iloc[1, 0] = np.nan
df.iloc[:4, 1] = np.nan
df.iloc[:2, 2] = np.nan
print(df)

# default dropna keeps only the complete rows (4, 5, 6)
print(df.dropna())

# thresh=2 additionally keeps rows 2 and 3, which have two non-NA values
print(df.dropna(thresh=2))

          0         1         2
0  0.640214       NaN       NaN
1       NaN       NaN       NaN
2 -0.489657       NaN -1.600531
3 -1.707466       NaN -0.269526
4  0.714067 -0.361530 -0.497628
5 -1.456466 -0.668461  1.609552
6  0.822007  0.433621  1.548612
          0         1         2
4  0.714067 -0.361530 -0.497628
5 -1.456466 -0.668461  1.609552
6  0.822007  0.433621  1.548612
          0         1         2
2 -0.489657       NaN -1.600531
3 -1.707466       NaN -0.269526
4  0.714067 -0.361530 -0.497628
5 -1.456466 -0.668461  1.609552
6  0.822007  0.433621  1.548612


Filling In Missing Data

In [59]:
# fillna: replace the missing values of df (printed first for reference)
print(df)

# one scalar fills every hole in the frame
print(df.fillna(value=0))

# a dict maps column label -> fill value; columns that are not listed
# (column 0 here) keep their NaN
print(df.fillna(value={1: 0.5, 2: 0}))

# inplace=True modifies df itself instead of handing back a new frame
df.fillna(1e-06, inplace=True)
print(df)

          0         1         2
0  0.640214       NaN       NaN
1       NaN       NaN       NaN
2 -0.489657       NaN -1.600531
3 -1.707466       NaN -0.269526
4  0.714067 -0.361530 -0.497628
5 -1.456466 -0.668461  1.609552
6  0.822007  0.433621  1.548612
          0         1         2
0  0.640214  0.000000  0.000000
1  0.000000  0.000000  0.000000
2 -0.489657  0.000000 -1.600531
3 -1.707466  0.000000 -0.269526
4  0.714067 -0.361530 -0.497628
5 -1.456466 -0.668461  1.609552
6  0.822007  0.433621  1.548612
          0         1         2
0  0.640214  0.500000  0.000000
1       NaN  0.500000  0.000000
2 -0.489657  0.500000 -1.600531
3 -1.707466  0.500000 -0.269526
4  0.714067 -0.361530 -0.497628
5 -1.456466 -0.668461  1.609552
6  0.822007  0.433621  1.548612
          0         1         2
0  0.640214  0.000001  0.000001
1  0.000001  0.000001  0.000001
2 -0.489657  0.000001 -1.600531
3 -1.707466  0.000001 -0.269526
4  0.714067 -0.361530 -0.497628
5 -1.456466 -0.668461  1.609552
6  0.822007  0.433621  1.548612

In [64]:
# Filling by propagation: forward fill copies the last valid value downwards.
# Seed the global NumPy RNG so the random values are the same on every run.
np.random.seed(64)
df = pd.DataFrame(np.random.randn(6, 3))
df.iloc[2:, 1] = np.nan   # column 1: rows 2-5 missing
df.iloc[4:, 2] = np.nan   # column 2: rows 4-5 missing
print(df)

# forward fill, use previous valid value
# (`ffill()` replaces the deprecated `fillna(method='ffill')` spelling)
print(df.ffill())

# limited forward fill: at most 2 consecutive NaN are filled per gap,
# so rows 4-5 of column 1 stay NaN
print(df.ffill(limit=2))

          0         1         2
0  0.191802 -0.137067  0.259474
1 -1.867499 -0.202048  0.906629
2 -0.661640       NaN -0.623662
3  0.786682       NaN -0.881758
4 -0.523955       NaN       NaN
5  0.849505       NaN       NaN
          0         1         2
0  0.191802 -0.137067  0.259474
1 -1.867499 -0.202048  0.906629
2 -0.661640 -0.202048 -0.623662
3  0.786682 -0.202048 -0.881758
4 -0.523955 -0.202048 -0.881758
5  0.849505 -0.202048 -0.881758
          0         1         2
0  0.191802 -0.137067  0.259474
1 -1.867499 -0.202048  0.906629
2 -0.661640 -0.202048 -0.623662
3  0.786682 -0.202048 -0.881758
4 -0.523955       NaN -0.881758
5  0.849505       NaN -0.881758


In [65]:
# Fill the gaps with the mean of the observed values
# (the mean is computed over the non-missing entries: 11.5 / 3)
data = pd.Series([1., None, 3.5, None, 7])
data.fillna(value=data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64