### Missing data (Sentinel Values)
- `nan`, `np.nan`, and `None` are called sentinel values: pandas treats each of them as a marker for missing data.

In [7]:
import pandas as pd
from numpy import nan

# A Series of day names in which some entries are missing.
# Both NaN and None are used as the "missing" marker on purpose.
weekdays = pd.Series(
    data=[
        'sunday',
        nan,
        'monday',
        'tuesday',
        None,
        nan,
        'friday',
        'saturday',
    ]
)
# Boolean mask: True wherever the entry is missing (isna is an alias of isnull).
weekdays.isna()

0    False
1     True
2    False
3    False
4     True
5     True
6    False
7    False
dtype: bool

### Series: Remove Missing Data

In [22]:
# Option 1: dropna() returns a new Series without the missing entries.
display(weekdays.dropna())

# Option 2: select with a boolean mask of the non-missing entries
# (notna is an alias of notnull); the result is the same.
display(weekdays.loc[weekdays.notna()])

0      sunday
2      monday
3     tuesday
6      friday
7    saturday
dtype: object

0      sunday
2      monday
3     tuesday
6      friday
7    saturday
dtype: object

### Series: Fill Missing Data

In [19]:
# Forward fill: each missing entry takes the last valid value before it.
# Series.ffill() / Series.bfill() replace fillna(method=...), which pandas
# has deprecated (2.1) and no longer accepts in newer releases.
display(weekdays.ffill())
# Backward fill: each missing entry takes the next valid value after it.
display(weekdays.bfill())


0      sunday
1      sunday
2      monday
3     tuesday
4     tuesday
5     tuesday
6      friday
7    saturday
dtype: object

0      sunday
1      monday
2      monday
3     tuesday
4      friday
5      friday
6      friday
7    saturday
dtype: object

### DataFrame: Remove Rows Missing Data

In [67]:
import numpy as np
# Seed the generator so that Restart & Run All rebuilds the same frame every time.
rng = np.random.default_rng(42)
arr = rng.random((10, 5))

# Blank every value below 0.3, then blank rows 2 and 3 entirely, so the
# frame contains both partially missing and completely missing rows.
arr[arr < 0.3] = nan
arr[2:4] = nan
df = pd.DataFrame(arr)
display(df)

# Default how='any': a row is dropped as soon as one of its values is missing.
display("Remove all rows with missing data", df.dropna())
# how='all': a row is dropped only when every one of its values is missing.
display("Remove rows with all values missing", df.dropna(how='all'))
# thresh=3: a row is kept only if it holds at least 3 non-missing values.
display("Remove rows which has less than 3 values", df.dropna(thresh=3))

Unnamed: 0,0,1,2,3,4
0,0.646477,0.455008,,,0.464423
1,,0.531666,,,0.644894
2,,,,,
3,,,,,
4,0.957424,0.854721,0.863963,,0.440233
5,0.934556,0.37386,0.366724,0.810804,0.853117
6,,0.65467,,0.311088,
7,0.508919,0.479805,0.359499,0.825992,0.454949
8,,0.561319,0.684116,0.782834,0.943531
9,0.634897,0.57955,,0.607956,


'Remove all rows with missing data'

Unnamed: 0,0,1,2,3,4
5,0.934556,0.37386,0.366724,0.810804,0.853117
7,0.508919,0.479805,0.359499,0.825992,0.454949


'Remove rows with all values missing'

Unnamed: 0,0,1,2,3,4
0,0.646477,0.455008,,,0.464423
1,,0.531666,,,0.644894
4,0.957424,0.854721,0.863963,,0.440233
5,0.934556,0.37386,0.366724,0.810804,0.853117
6,,0.65467,,0.311088,
7,0.508919,0.479805,0.359499,0.825992,0.454949
8,,0.561319,0.684116,0.782834,0.943531
9,0.634897,0.57955,,0.607956,


'Remove rows which has less than 3 values'

Unnamed: 0,0,1,2,3,4
0,0.646477,0.455008,,,0.464423
4,0.957424,0.854721,0.863963,,0.440233
5,0.934556,0.37386,0.366724,0.810804,0.853117
7,0.508919,0.479805,0.359499,0.825992,0.454949
8,,0.561319,0.684116,0.782834,0.943531
9,0.634897,0.57955,,0.607956,


### DataFrame: Remove Columns With Missing Data

In [76]:
# dropna() returns a new DataFrame by default; pass inplace=True to modify
# the object it is called on instead. Work on a copy so `df` stays untouched.
df_copy = df.copy()
# Keep only the columns that hold at least 5 non-missing values.
df_copy.dropna(thresh=5, axis='columns', inplace=True)
df_copy

Unnamed: 0,0,1,3,4
0,0.646477,0.455008,,0.464423
1,,0.531666,,0.644894
2,,,,
3,,,,
4,0.957424,0.854721,,0.440233
5,0.934556,0.37386,0.810804,0.853117
6,,0.65467,0.311088,
7,0.508919,0.479805,0.825992,0.454949
8,,0.561319,0.782834,0.943531
9,0.634897,0.57955,0.607956,


### DataFrame: Fill Missing Data

In [77]:
# Replace every missing value with 0; a new frame is returned, `df` is unchanged.
df.fillna(value=0)

Unnamed: 0,0,1,3,4
0,0.646477,0.455008,0.0,0.464423
1,0.0,0.531666,0.0,0.644894
2,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0
4,0.957424,0.854721,0.0,0.440233
5,0.934556,0.37386,0.810804,0.853117
6,0.0,0.65467,0.311088,0.0
7,0.508919,0.479805,0.825992,0.454949
8,0.0,0.561319,0.782834,0.943531
9,0.634897,0.57955,0.607956,0.0


#### Fill a different value for each column

In [78]:
# The dict maps column label -> fill value: column 0 gets 0, column 1 gets -1.
# Columns that are not listed keep their missing values.
df.fillna(value={0: 0, 1: -1})

Unnamed: 0,0,1,3,4
0,0.646477,0.455008,,0.464423
1,0.0,0.531666,,0.644894
2,0.0,-1.0,,
3,0.0,-1.0,,
4,0.957424,0.854721,,0.440233
5,0.934556,0.37386,0.810804,0.853117
6,0.0,0.65467,0.311088,
7,0.508919,0.479805,0.825992,0.454949
8,0.0,0.561319,0.782834,0.943531
9,0.634897,0.57955,0.607956,


#### Forward Fill

In [84]:
# Forward fill down each column. limit=2 fills at most 2 consecutive missing
# values, so longer gaps stay partly missing.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill').
df.ffill(limit=2)

Unnamed: 0,0,1,3,4
0,0.646477,0.455008,,0.464423
1,0.646477,0.531666,,0.644894
2,0.646477,0.531666,,0.644894
3,,0.531666,,0.644894
4,0.957424,0.854721,,0.440233
5,0.934556,0.37386,0.810804,0.853117
6,0.934556,0.65467,0.311088,0.853117
7,0.508919,0.479805,0.825992,0.454949
8,0.508919,0.561319,0.782834,0.943531
9,0.634897,0.57955,0.607956,0.943531


#### Backward Fill

In [85]:
# Backward fill up each column. limit=2 fills at most 2 consecutive missing
# values, so longer gaps stay partly missing.
# DataFrame.bfill() replaces the deprecated fillna(method='bfill').
df.bfill(limit=2)

Unnamed: 0,0,1,3,4
0,0.646477,0.455008,,0.464423
1,,0.531666,,0.644894
2,0.957424,0.854721,,0.440233
3,0.957424,0.854721,0.810804,0.440233
4,0.957424,0.854721,0.810804,0.440233
5,0.934556,0.37386,0.810804,0.853117
6,0.508919,0.65467,0.311088,0.454949
7,0.508919,0.479805,0.825992,0.454949
8,0.634897,0.561319,0.782834,0.943531
9,0.634897,0.57955,0.607956,


#### Fill with Mean or Median

In [92]:
# Replace each missing value with the mean of its own column, then show the
# per-column means that served as fill values.
display(df.fillna(value=df.mean()), df.mean())

Unnamed: 0,0,1,3,4
0,0.646477,0.455008,0.667735,0.464423
1,0.736455,0.531666,0.667735,0.644894
2,0.736455,0.561325,0.667735,0.633524
3,0.736455,0.561325,0.667735,0.633524
4,0.957424,0.854721,0.667735,0.440233
5,0.934556,0.37386,0.810804,0.853117
6,0.736455,0.65467,0.311088,0.633524
7,0.508919,0.479805,0.825992,0.454949
8,0.736455,0.561319,0.782834,0.943531
9,0.634897,0.57955,0.607956,0.633524


0    0.736455
1    0.561325
3    0.667735
4    0.633524
dtype: float64

### Duplicate Data

In [135]:
# Seed the generator so the rows flagged below are the same on every run.
rng = np.random.default_rng(0)

# Five columns of integers in 0-9 plus one column of integers in 0-4,
# seven rows each.
arr1 = rng.integers(0, 10, size=(7, 5))
arr2 = rng.integers(0, 5, size=(7, 1))

arr = np.concatenate((arr1, arr2), axis=1)

dup_df = pd.DataFrame(arr, columns=list('_abcde'))

# Flag rows whose '_' value already appeared in an earlier row.
display(dup_df.duplicated(['_']))

# Drop rows that repeat a '_' value, keeping the last occurrence of each.
display(dup_df.drop_duplicates(['_'], keep='last'))


0    False
1    False
2    False
3     True
4     True
5    False
6    False
dtype: bool

Unnamed: 0,_,a,b,c,d,e
0,2,7,3,5,6,2
3,5,0,8,0,0,0
4,3,3,6,6,7,2
5,6,5,7,8,5,3
6,1,8,7,6,5,1
