# Examples in pandas for handling missing data

In [0]:
# Build sample data
import pandas as pd
import numpy as np
dat = [['000', 'small', 18], 
       ['001', 'big', None], 
       ['002', 'mid', None], 
       ['003', 'big', 27], 
       ['003', 'big', 27], 
       ['004', 'small', None]]

df1 = pd.DataFrame(dat, columns=['id', 'size', 'days'])

print('Initial dataframe')
print(df1)
df = df1.copy()

Initial dataframe
    id   size  days
0  000  small  18.0
1  001    big   NaN
2  002    mid   NaN
3  003    big  27.0
4  003    big  27.0
5  004  small   NaN


## Print type of each column in dataframe

In [0]:
print(df.dtypes)
print(">> Note that days column was converted to float due to existance of one or more NaN values")

id       object
size     object
days    float64
dtype: object
>> Note that days column was converted to float due to existance of one or more NaN values


## Drop duplicates and print type info

In [0]:
# Drop duplicates
df = df.drop_duplicates()
print(df)

    id   size  days
0  000  small  18.0
1  001    big   NaN
2  002    mid   NaN
3  003    big  27.0
5  004  small   NaN


## Use isnull and notnull in pandas

In [0]:
# isnull: Leave any rows that contain any column with a null value
filt = pd.isnull(df['days']) 
print(df[filt])

    id   size  days
1  001    big   NaN
2  002    mid   NaN
5  004  small   NaN


In [0]:
# notnull: Leave only rows that don't contain null values
filt = pd.notnull(df['days']) 
print(df[filt])

    id   size  days
0  000  small  18.0
3  003    big  27.0


## Fill missing dataframe values

In [0]:
# Relpace NaN values with special value
df['days']=df['days'].fillna(-1)
print(df)

    id   size  days
0  000  small  18.0
1  001    big  -1.0
2  002    mid  -1.0
3  003    big  27.0
5  004  small  -1.0


In [0]:
# Replace special value with NaN value (inverse the above)
df['days'] = df['days'].replace({-1: None})
print(df)

    id   size  days
0  000  small    18
1  001    big  None
2  002    mid  None
3  003    big    27
5  004  small  None
