# Python for data science essential training pt. 1
## 2. Data preparation basics

### 2.2 Treating missing values

In [4]:
import numpy as np
import pandas as pd

from pandas import Series, DataFrame

#### Figuring out which data are missing

In [7]:
# NaN marker used to stand in for a missing entry
missing = np.nan

# Series of row labels with the NaN marker placed at positions 2 and 4
series_obj = Series(
    data=['row1', 'row2', missing, 'row4', missing],
)

# display the Series
series_obj

0    row1
1    row2
2     NaN
3    row4
4     NaN
dtype: object

In [9]:
# find out which items are NaN or Null:
# .isnull() gives a boolean Series that is True exactly where an item is missing
series_obj.isnull()

0    False
1    False
2     True
3    False
4     True
dtype: bool

In [14]:
# can use this to get rid of null items: `~` inverts the boolean mask so that
# only the non-null positions are selected.
# (Unary minus on a boolean Series raises a TypeError in NumPy/pandas, so
# `-series_obj.isnull()` cannot be used to invert the mask.)
series_obj[~series_obj.isnull()]

0    row1
1    row2
3    row4
dtype: object

#### Filling in missing values

In [19]:
# seed NumPy's global random generator so the table below is reproducible
np.random.seed(25)

# 6x6 DataFrame of uniform random draws from [0, 1)
df_obj = DataFrame(np.random.rand(6, 6))
df_obj

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,0.113041
2,0.447031,0.585445,0.161985,0.520719,0.326051,0.699186
3,0.366395,0.836375,0.481343,0.516502,0.383048,0.997541
4,0.514244,0.559053,0.03445,0.71993,0.421004,0.436935
5,0.281701,0.900274,0.669612,0.456069,0.289804,0.525819


In [23]:
# use the .loc indexer to select rows & columns by label and set those cells to missing;
# label slices include both endpoints, so 3:5 covers rows 3, 4 and 5 and 1:4 covers rows 1 to 4
df_obj.loc[3:5, 0] = missing
df_obj.loc[1:4, 5] = missing
df_obj

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,
2,0.447031,0.585445,0.161985,0.520719,0.326051,
3,,0.836375,0.481343,0.516502,0.383048,
4,,0.559053,0.03445,0.71993,0.421004,
5,,0.900274,0.669612,0.456069,0.289804,0.525819


In [26]:
# use .fillna() method to fill in NA values with some other value (here 0);
# the filled table is returned as a result and is not assigned back to df_obj
df_obj.fillna(0)

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,0.0
2,0.447031,0.585445,0.161985,0.520719,0.326051,0.0
3,0.0,0.836375,0.481343,0.516502,0.383048,0.0
4,0.0,0.559053,0.03445,0.71993,0.421004,0.0
5,0.0,0.900274,0.669612,0.456069,0.289804,0.525819


In [27]:
# build a new DataFrame from df_obj with a different fill value per column:
# the dict maps a column label to the replacement for that column's NAs
filled_df = df_obj.fillna(value={
    0: 0.25,  # NAs in col 0 become 0.25
    5: 0.60,  # NAs in col 5 become 0.6
})
filled_df

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,0.6
2,0.447031,0.585445,0.161985,0.520719,0.326051,0.6
3,0.25,0.836375,0.481343,0.516502,0.383048,0.6
4,0.25,0.559053,0.03445,0.71993,0.421004,0.6
5,0.25,0.900274,0.669612,0.456069,0.289804,0.525819


In [30]:
# can use 'fill forward' to fill in NAs with the last non-Null value above them in each column.
# .ffill() replaces .fillna(method='ffill'), whose `method` argument is deprecated in current pandas.
df_obj.ffill()

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376
1,0.684969,0.437611,0.556229,0.36708,0.402366,0.117376
2,0.447031,0.585445,0.161985,0.520719,0.326051,0.117376
3,0.447031,0.836375,0.481343,0.516502,0.383048,0.117376
4,0.447031,0.559053,0.03445,0.71993,0.421004,0.117376
5,0.447031,0.900274,0.669612,0.456069,0.289804,0.525819


#### Counting missing values

In [35]:
# append .sum() after .isnull() to count the number of null values in each column
# (the result is indexed by column label)
df_obj.isnull().sum()

0    3
1    0
2    0
3    0
4    0
5    4
dtype: int64

#### Filtering out missing values

In [37]:
# One method is .dropna(); called with no arguments it works row by row
df_obj.dropna()  #any rows with NA values are dropped completely

Unnamed: 0,0,1,2,3,4,5
0,0.870124,0.582277,0.278839,0.185911,0.4111,0.117376


In [41]:
# Can instead drop only the columns that contain NAs, keeping every row
df_obj.dropna(axis=1)  #axis=0 gives row-dropping (default); axis=1 gives column-dropping

Unnamed: 0,1,2,3,4
0,0.582277,0.278839,0.185911,0.4111
1,0.437611,0.556229,0.36708,0.402366
2,0.585445,0.161985,0.520719,0.326051
3,0.836375,0.481343,0.516502,0.383048
4,0.559053,0.03445,0.71993,0.421004
5,0.900274,0.669612,0.456069,0.289804
