# Examples in pandas for processing missing data

In [1]:
# Build sample data
import pandas as pd
import numpy as np
COLS = ['id', 'size', 'days']
# One record per row: [id, size, days]. None marks a missing 'days' value.
# The '003' record is listed twice, so the frame contains a duplicate row.
dat = [
    ['000', 'small', 18],
    ['001', 'big', None],
    ['002', 'mid', None],
    ['003', 'big', 27],
    ['003', 'big', 27],
    ['004', 'small', None],
]
df1 = pd.DataFrame(data=dat, columns=COLS)

print('Initial dataframe', df1, sep='\n')
# df1 stays as the original frame; df is an independent working copy.
df = df1.copy()

Initial dataframe
    id   size  days
0  000  small  18.0
1  001    big   NaN
2  002    mid   NaN
3  003    big  27.0
4  003    big  27.0
5  004  small   NaN


## Drop duplicates and print type info

In [2]:
# Remove rows that are identical to an earlier row, keeping the first occurrence.
# The surviving rows keep their original index labels (the index is not reset).
df = df.drop_duplicates(keep='first')
print(df)


    id   size  days
0  000  small  18.0
1  001    big   NaN
2  002    mid   NaN
3  003    big  27.0
5  004  small   NaN


In [3]:
# Show the Python type of the first element of the 'id' column and of the
# 'days' column of the original frame.
# NOTE(review): the printed labels say "name"/"age", but the columns actually
# inspected are 'id' and 'days'.
first_id = df1['id'][0]
first_days = df1['days'][0]
print('Note that the number values are stored as np.nan types:')
print('type of name objects:', type(first_id))
print('type of age objects:', type(first_days))

Note that the number values are stored as np.nan types:
type of name objects: <class 'str'>
type of age objects: <class 'numpy.float64'>


## Use isnull and notnull in pandas

In [4]:
# isnull: keep only the rows whose 'days' value is missing (NaN).
# The mask is built from the 'days' column alone, not from every column.
filt = df['days'].isna()
print(df.loc[filt])

    id   size  days
1  001    big   NaN
2  002    mid   NaN
5  004  small   NaN


In [5]:
# notnull: keep only the rows whose 'days' value is present, i.e. drop the
# rows where 'days' is NaN. Only the 'days' column is checked.
filt = df['days'].notna()
print(df.loc[filt])

    id   size  days
0  000  small  18.0
3  003    big  27.0


## Fill and Replace dataframe values

In [6]:
# Forward fill within each 'size' group: a missing 'days' value takes the most
# recent earlier value from a row with the same size. A gap that has no earlier
# value in its group is left as NaN.
print('Before:', df, sep='\n')
days_by_size = df.groupby('size')['days']
df['days'] = days_by_size.ffill()
print('After:', df, sep='\n')

Before:
    id   size  days
0  000  small  18.0
1  001    big   NaN
2  002    mid   NaN
3  003    big  27.0
5  004  small   NaN
After:
    id   size  days
0  000  small  18.0
1  001    big   NaN
2  002    mid   NaN
3  003    big  27.0
5  004  small  18.0


In [7]:
# Backward fill within each 'size' group: a missing 'days' value takes the next
# later value from a row with the same size. A gap that has no later value in
# its group is left as NaN.
print('Before:', df, sep='\n')
days_by_size = df.groupby('size')['days']
df['days'] = days_by_size.bfill()
print('After:', df, sep='\n')

Before:
    id   size  days
0  000  small  18.0
1  001    big   NaN
2  002    mid   NaN
3  003    big  27.0
5  004  small  18.0
After:
    id   size  days
0  000  small  18.0
1  001    big  27.0
2  002    mid   NaN
3  003    big  27.0
5  004  small  18.0


In [8]:
# Replace the remaining NaN values in 'days' with the sentinel value -999
df['days'] = df['days'].fillna(value=-999)
print(df)

    id   size   days
0  000  small   18.0
1  001    big   27.0
2  002    mid -999.0
3  003    big   27.0
5  004  small   18.0


In [9]:
# Replace the sentinel value -999 with NaN (the inverse of filling NaN with -999).
# np.nan is used as the replacement instead of None so that 'days' keeps its
# float dtype; replacing with None can turn the column into object dtype
# holding None, depending on the pandas version.
df['days'] = df['days'].replace(-999, np.nan)
print(df)

    id   size  days
0  000  small  18.0
1  001    big  27.0
2  002    mid   NaN
3  003    big  27.0
5  004  small  18.0
