# Handling missing values

In [33]:
import pandas as pd
import seaborn as sns
import numpy as np

In [14]:
# Load the Titanic example dataset through seaborn, then preview its first five rows.
data = sns.load_dataset("titanic")
data.head(n=5)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


## Manipulating missing values

In [16]:
def count_missing_values(data, drop_zeros=True):
    """Summarise how many values are missing in each column of a DataFrame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are inspected with ``isnull()``.
    drop_zeros : bool, default True
        When True, columns that contain no missing value are left out of the
        result.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``data`` (indexed by column name), sorted by
        descending count, with two columns: ``"Number of missing values"`` and
        ``"Percent"`` (share of rows that are missing, rounded to 2 decimals).
    """
    missing_values = data.isnull().sum().to_frame(name="Number of missing values")
    missing_values["Percent"] = (
        100 * missing_values["Number of missing values"] / data.shape[0]
    ).round(2)
    missing_values = missing_values.sort_values(
        by=["Number of missing values"], ascending=False
    )
    if drop_zeros:
        # Filter on the raw count rather than the rounded percentage: in a large
        # frame a handful of missing values rounds down to 0.00 % and the column
        # would otherwise be hidden even though it does contain NaN.
        missing_values = missing_values[missing_values["Number of missing values"] > 0]
    return missing_values

In [19]:
# Report only the columns that actually contain missing values.
count_missing_values(data, drop_zeros=True)

Unnamed: 0,Number of missing values,Percent
deck,688,77.22
age,177,19.87
embarked,2,0.22
embark_town,2,0.22


## Imputation methods 

### Dropping NaN values

In [31]:
# Discard every row whose 'fare' is missing; NaN in other columns is kept.
# The call returns a new frame, so `data` itself is left unmodified.
data.dropna(subset=['fare'], axis='index', inplace=False)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.2500,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.9250,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1000,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.0500,S,Third,man,True,,Southampton,no,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,0,13.0000,S,Second,man,True,,Southampton,no,True
887,1,1,female,19.0,0,0,30.0000,S,First,woman,False,B,Southampton,yes,True
888,0,3,female,,1,2,23.4500,S,Third,woman,False,,Southampton,no,False
889,1,1,male,26.0,0,0,30.0000,C,First,man,True,C,Cherbourg,yes,True


### Imputation with a fixed value

In [26]:
# A single scalar replaces every NaN in the selected columns. fillna returns a
# new frame, so the result must be bound to a name (or displayed) to be of use.
zero_filled = data[['survived', 'pclass']].fillna(0)

# A dict assigns each column its own replacement value; columns that are not
# listed are left untouched. The keys must be names of existing columns.
# `data` is deliberately not modified so the later cells still see the raw NaN.
per_column_filled = data.fillna(value={'age': 0, 'embark_town': 'Unknown'})
per_column_filled[['age', 'embark_town']].head()

### Imputation using related metrics

In [36]:
# Per-column replacement values derived from the data itself: the mean of
# 'fare' and the median of 'pclass', both computed while ignoring NaN.
# Other NaN-aware metrics that could be used: np.nanstd(), np.nanvar()
metric_fill_values = {
    'fare': np.nanmean(data['fare'], axis=0),
    'pclass': np.nanmedian(data['pclass'], axis=0),
}

# Fill in place: `data` itself is updated and the call returns None.
data.fillna(value=metric_fill_values, inplace=True)

### Backfill imputation

In [35]:
# Missing values are filled from an adjacent valid row of the same column:
# 'bfill'/'backfill' copy the next valid value upwards, 'ffill'/'pad' copy the
# previous valid value downwards. Gaps with no valid value on that side stay NaN.

filling_method='bfill' #All the methods: ‘backfill’, ‘bfill’, ‘pad’, ‘ffill’

# DataFrame.fillna(method=...) is deprecated in recent pandas releases, so the
# dedicated bfill()/ffill() methods are used instead. An unknown method name
# raises KeyError here.
fill_rows = {
    'backfill': data.bfill,
    'bfill': data.bfill,
    'pad': data.ffill,
    'ffill': data.ffill,
}[filling_method]

fill_rows(axis=0).head(3)

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,C,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,C,Southampton,yes,True


### KNN imputation

In [47]:
from sklearn.impute import KNNImputer

# Each missing entry is estimated from the 10 nearest rows, with closer rows
# weighted more heavily (weights="distance").
imputer = KNNImputer(n_neighbors=10, copy=True, weights="distance")

# KNNImputer only works on numeric input, so restrict it to the numeric columns
# (text columns such as 'embark_town' are excluded by this selection).
# To leave out some numeric columns as well, chain
# `.difference([...], sort=False)` onto the selection below.
KNN_imputer_columns = data.select_dtypes(include=np.number).columns

# fit_transform returns a bare NumPy array. Rebuild the frame on data's own
# index so the assignment lines up row for row even when that index is not the
# default 0..n-1 (e.g. after rows were dropped or the frame was filtered).
data[KNN_imputer_columns] = pd.DataFrame(
    data=imputer.fit_transform(data[KNN_imputer_columns]),
    columns=KNN_imputer_columns,
    index=data.index,
)