# Chapter 2 - Data Preparation Basics
## Segment 2 - Treating missing values

In [1]:
import numpy as np
import pandas as pd 

from pandas import Series, DataFrame

### Figuring out what data is missing

In [2]:
# NaN is the marker used for a missing entry throughout this notebook.
missing = np.nan

# Eight-element Series of labels with gaps at positions 2 and 6.
series_obj = Series(
    ["row1", "row2", missing, "row4", "row5", "row6", missing, "row8"]
)
series_obj

0    row1
1    row2
2     NaN
3    row4
4    row5
5    row6
6     NaN
7    row8
dtype: object

In [3]:
# Boolean mask: True wherever the Series holds a missing value.
series_obj.isna()

0    False
1    False
2     True
3    False
4    False
5    False
6     True
7    False
dtype: bool

### Filling in for missing values

In [5]:
# Fix the seed so the random frame below is reproducible.
np.random.seed(25)

# 6x6 frame of standard-normal draws, filled row by row.
df_obj = DataFrame(np.random.randn(6, 6))
df_obj

Unnamed: 0,0,1,2,3,4,5
0,0.228273,1.02689,-0.839585,-0.591182,-0.956888,-0.222326
1,-0.619915,1.837905,-2.053231,0.868583,-0.920734,-0.232312
2,2.152957,-1.334661,0.07638,-1.246089,1.202272,-1.049942
3,1.05661,-0.419678,2.294842,-2.594487,2.822756,0.680889
4,-1.577693,-1.976254,0.53334,-0.29087,-0.51352,1.982626
5,0.226001,-1.839905,1.607671,0.388292,0.399732,0.405477


In [6]:
# Blank out rows 3-5 of column 0 and rows 1-4 of column 5 so the frame
# has missing values to work with.
df_obj.loc[[3, 4, 5], 0] = missing
df_obj.loc[[1, 2, 3, 4], 5] = missing
df_obj

Unnamed: 0,0,1,2,3,4,5
0,0.228273,1.02689,-0.839585,-0.591182,-0.956888,-0.222326
1,-0.619915,1.837905,-2.053231,0.868583,-0.920734,
2,2.152957,-1.334661,0.07638,-1.246089,1.202272,
3,,-0.419678,2.294842,-2.594487,2.822756,
4,,-1.976254,0.53334,-0.29087,-0.51352,
5,,-1.839905,1.607671,0.388292,0.399732,0.405477


In [7]:
# Replace every missing value with 0; df_obj itself is left untouched.
filled_df = df_obj.fillna(value=0)
filled_df

Unnamed: 0,0,1,2,3,4,5
0,0.228273,1.02689,-0.839585,-0.591182,-0.956888,-0.222326
1,-0.619915,1.837905,-2.053231,0.868583,-0.920734,0.0
2,2.152957,-1.334661,0.07638,-1.246089,1.202272,0.0
3,0.0,-0.419678,2.294842,-2.594487,2.822756,0.0
4,0.0,-1.976254,0.53334,-0.29087,-0.51352,0.0
5,0.0,-1.839905,1.607671,0.388292,0.399732,0.405477


In [8]:
# Per-column fill values: the dict maps a column label to the value that
# replaces missing entries in that column (0.1 for column 0, 1.25 for column 5).
filled_df = df_obj.fillna(value={0: 0.1, 5: 1.25})
filled_df

Unnamed: 0,0,1,2,3,4,5
0,0.228273,1.02689,-0.839585,-0.591182,-0.956888,-0.222326
1,-0.619915,1.837905,-2.053231,0.868583,-0.920734,1.25
2,2.152957,-1.334661,0.07638,-1.246089,1.202272,1.25
3,0.1,-0.419678,2.294842,-2.594487,2.822756,1.25
4,0.1,-1.976254,0.53334,-0.29087,-0.51352,1.25
5,0.1,-1.839905,1.607671,0.388292,0.399732,0.405477


In [10]:
# Fill like neighbors: forward-fill each missing value with the last valid
# value above it in the same column.
# DataFrame.ffill() is used instead of fillna(method="ffill"), whose `method`
# argument is deprecated in recent pandas releases; the result is the same.
fill_df = df_obj.ffill()
fill_df

Unnamed: 0,0,1,2,3,4,5
0,0.228273,1.02689,-0.839585,-0.591182,-0.956888,-0.222326
1,-0.619915,1.837905,-2.053231,0.868583,-0.920734,-0.222326
2,2.152957,-1.334661,0.07638,-1.246089,1.202272,-0.222326
3,2.152957,-0.419678,2.294842,-2.594487,2.822756,-0.222326
4,2.152957,-1.976254,0.53334,-0.29087,-0.51352,-0.222326
5,2.152957,-1.839905,1.607671,0.388292,0.399732,0.405477


### Counting missing values

In [13]:
# Re-seed so this section starts from a reproducible frame.
np.random.seed(25)

# 6x6 frame of uniform [0, 1) draws, filled row by row.
df_obj = DataFrame(np.random.rand(6, 6))

# Knock out rows 3-5 of column 0 and rows 1-4 of column 5.
df_obj.loc[[3, 4, 5], 0] = missing
df_obj.loc[[1, 2, 3, 4], 5] = missing
df_obj

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,
2,0.447031,0.585445,0.161985,0.520719,0.326051,
3,,0.836375,0.481343,0.516502,0.383048,
4,,0.559053,0.03445,0.71993,0.421004,
5,,0.900274,0.669612,0.456069,0.289804,0.525819


In [15]:
# Number of missing values in each column: build the boolean mask, then sum
# down the rows (True counts as 1).
df_obj.isna().sum(axis=0)

0    3
1    0
2    0
3    0
4    0
5    4
dtype: int64

### Filtering out missing values

In [16]:
# Drop every row that contains at least one missing value.
df_no_NaN = df_obj.dropna(axis=0, how="any")
df_no_NaN

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376


In [17]:
# Drop every column that contains at least one missing value.
df_no_NaN = df_obj.dropna(axis="columns")
df_no_NaN

Unnamed: 0,1,2,3,4
0,0.582277,0.278839,0.185911,0.4111
1,0.437611,0.556229,0.36708,0.402366
2,0.585445,0.161985,0.520719,0.326051
3,0.836375,0.481343,0.516502,0.383048
4,0.559053,0.03445,0.71993,0.421004
5,0.900274,0.669612,0.456069,0.289804
