# Chapter 2 - Data Preparation Basics
## Segment 2 - Treating missing values

In [8]:
import numpy as np
import pandas as pd 

from pandas import Series, DataFrame

### Figuring out what data is missing

In [9]:
# np.nan is the floating-point marker used below to stand for "no value".
missing = np.nan

# Eight entries, two of them missing (positions 3 and 6). Because NaN is a
# float, the integer literals are stored as float64.
series_obj = Series(
    [10, 20, 30, missing, 50, 60, missing, 80]
)
series_obj

0    10.0
1    20.0
2    30.0
3     NaN
4    50.0
5    60.0
6     NaN
7    80.0
dtype: float64

In [10]:
# Boolean mask with the same index as series_obj: True where the entry is NaN.
series_obj.isnull()

0    False
1    False
2    False
3     True
4    False
5    False
6     True
7    False
dtype: bool

### Filling in for missing values

In [18]:
# Fix the global NumPy seed so the random frame below is reproducible.
np.random.seed(25)

# 6x6 frame of uniform [0, 1) samples; asking for the 2-D shape directly draws
# the same 36 numbers, in the same row-major order, as rand(36).reshape(6, 6).
DF_obj = DataFrame(np.random.rand(6, 6))
DF_obj

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,0.113041
2,0.447031,0.585445,0.161985,0.520719,0.326051,0.699186
3,0.366395,0.836375,0.481343,0.516502,0.383048,0.997541
4,0.514244,0.559053,0.03445,0.71993,0.421004,0.436935
5,0.281701,0.900274,0.669612,0.456069,0.289804,0.525819


In [19]:
# .loc slices by label and includes BOTH endpoints, so this blanks out
# rows 3, 4, 5 of column 0 and rows 1, 2, 3, 4 of column 5.
# Note: DF_obj is modified in place, so any earlier display of it is stale.
DF_obj.loc[3:5, 0] = missing
DF_obj.loc[1:4, 5] = missing
DF_obj

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,
2,0.447031,0.585445,0.161985,0.520719,0.326051,
3,,0.836375,0.481343,0.516502,0.383048,
4,,0.559053,0.03445,0.71993,0.421004,
5,,0.900274,0.669612,0.456069,0.289804,0.525819


In [29]:
# Replace every NaN with the same constant; fillna returns a new frame and
# leaves DF_obj itself untouched.
filled_DF = DF_obj.fillna(value=0)
filled_DF

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,0.0
2,0.447031,0.585445,0.161985,0.520719,0.326051,0.0
3,0.0,0.836375,0.481343,0.516502,0.383048,0.0
4,0.0,0.559053,0.03445,0.71993,0.421004,0.0
5,0.0,0.900274,0.669612,0.456069,0.289804,0.525819


In [30]:
# A dict maps column label -> replacement, so each column gets its own fill
# value; columns not listed in the dict are left as they are.
filled_DF = DF_obj.fillna(
    value={
        0: 0.1,
        5: 1.25,
    }
)
filled_DF

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,1.25
2,0.447031,0.585445,0.161985,0.520719,0.326051,1.25
3,0.1,0.836375,0.481343,0.516502,0.383048,1.25
4,0.1,0.559053,0.03445,0.71993,0.421004,1.25
5,0.1,0.900274,0.669612,0.456069,0.289804,0.525819


In [31]:
# Forward fill: each NaN takes the last valid value above it in the same column.
# DataFrame.ffill() is used instead of fillna(method='ffill') because the
# `method` keyword of fillna is deprecated in recent pandas releases; the
# result is the same.
filled_DF = DF_obj.ffill()
filled_DF

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,0.117376
2,0.447031,0.585445,0.161985,0.520719,0.326051,0.117376
3,0.447031,0.836375,0.481343,0.516502,0.383048,0.117376
4,0.447031,0.559053,0.03445,0.71993,0.421004,0.117376
5,0.447031,0.900274,0.669612,0.456069,0.289804,0.525819


In [34]:
# Backward fill: each NaN takes the next valid value below it in the same column.
# NaNs with no valid value after them (here the tail of column 0) stay NaN.
# DataFrame.bfill() is used instead of fillna(method='bfill') because the
# `method` keyword of fillna is deprecated in recent pandas releases; the
# result is the same.
filled_DF = DF_obj.bfill()
filled_DF

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,0.525819
2,0.447031,0.585445,0.161985,0.520719,0.326051,0.525819
3,,0.836375,0.481343,0.516502,0.383048,0.525819
4,,0.559053,0.03445,0.71993,0.421004,0.525819
5,,0.900274,0.669612,0.456069,0.289804,0.525819


In [38]:
# Linear interpolation down each column (axis=0): interior gaps are filled with
# evenly spaced values between the neighbouring valid entries (see column 5).
# Trailing NaNs have no later neighbour, so they end up repeating the last
# valid value (see column 0 in the output).
filled_DF = DF_obj.interpolate(method='linear', axis=0)
filled_DF

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,0.199064
2,0.447031,0.585445,0.161985,0.520719,0.326051,0.280753
3,0.447031,0.836375,0.481343,0.516502,0.383048,0.362442
4,0.447031,0.559053,0.03445,0.71993,0.421004,0.44413
5,0.447031,0.900274,0.669612,0.456069,0.289804,0.525819


### Counting missing values (as a fraction of the rows in each column)

In [45]:
# Redisplay the frame that still contains the NaNs; the fill cells above wrote
# their results to filled_DF and did not modify DF_obj.
DF_obj

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,
2,0.447031,0.585445,0.161985,0.520719,0.326051,
3,,0.836375,0.481343,0.516502,0.383048,
4,,0.559053,0.03445,0.71993,0.421004,
5,,0.900274,0.669612,0.456069,0.289804,0.525819


In [44]:
# The mean of the boolean NaN mask is (number of NaNs) / (number of rows),
# i.e. the fraction of missing entries in each column.
DF_obj.isnull().mean()

0    0.500000
1    0.000000
2    0.000000
3    0.000000
4    0.000000
5    0.666667
dtype: float64

### Filtering out missing values

In [46]:
# Drop every row (axis=0) that holds at least one NaN (how='any'); these are
# dropna's defaults, written out for clarity. DF_obj itself is not modified.
DF_no_na = DF_obj.dropna(axis=0, how='any')
DF_no_na

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
