# Missing Values

#### Installing required packages from requirements.txt

In [None]:
pip install -r requirements.txt

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Remote CSV locations of the two datasets used in this notebook.
TITANIC_TRAIN_DATA_URL = "https://stindatasamples.blob.core.windows.net/ai-ml-data/titanic/train.csv"
HOUSE_PRICES_TRAIN_DATA_URL = "https://stindatasamples.blob.core.windows.net/ai-ml-data/house-prices/train.csv"

# Load the Titanic training set with all of its columns.
titanicDF = pd.read_csv(filepath_or_buffer=TITANIC_TRAIN_DATA_URL)

In [None]:
titanicDF.head()

## Missing values in dataset

1. Finding out all null values in the given data set for each column

In [None]:
titanicDF.isnull().sum()

## Data Missing completely at random (MCAR) 

In [None]:
titanicDF[titanicDF['Embarked'].isnull()]

Possibly the Embarked data is missing completely at random, as there is no sign of dependency on the other columns.

## Missing data not at random

Checking the percentage of missing values in the Cabin column against the Survived column, as there might be an interdependency between survival and missing values in the Cabin column.

In [None]:
# Flag each row: 1 when Cabin is missing, 0 when it is present.
titanicDF['Cabin_null'] = np.where(
    titanicDF['Cabin'].isnull(),
    1,
    0,
)
# The mean of a 0/1 flag per Survived group is the fraction of rows
# in that group whose Cabin value is missing.
titanicDF.groupby(by=['Survived'])['Cabin_null'].mean()

The Cabin column has a higher share of missing values for passengers who did not survive.

## Techniques to handle Missing value 
1. Mean/Median/Mode
2. Random Sample Imputation
3. Capturing NaN values with new features
4. End of distribution imputation
5. Arbitrary Imputation 
6. Frequent categories imputation

### 1. Mean/Median/Mode

When to use: the data is missing completely at random (MCAR)

In [None]:
def impute_nan_mean_median_mode(df, column, value):
    """Add a ``<column>_NonNull`` column with missing entries replaced by *value*.

    The caller chooses the statistic (mean, median, mode, ...) and passes
    it in as *value*. The source column is left untouched; ``df`` is
    modified in place and nothing is returned.
    """
    target_name = f"{column}_NonNull"
    filled_values = df[column].fillna(value)
    df[target_name] = filled_values

In [None]:
impute_nan_mean_median_mode(titanicDF, 'Age', titanicDF['Age'].median())
titanicDF.isnull().mean()

In [None]:
print(titanicDF['Age'].std())
print(titanicDF['Age_NonNull'].std())

In [None]:
%matplotlib inline
fig = plt.figure()
ax = fig.add_subplot(111)
titanicDF['Age'].plot(kind='kde', ax=ax)
titanicDF['Age_NonNull'].plot(kind='kde', ax=ax, color='red')
lines, labels = ax.get_legend_handles_labels()
ax.legend(lines, labels, loc='best')


### 2. Random Sample Imputation

In [None]:
titanicDF = pd.read_csv(TITANIC_TRAIN_DATA_URL, usecols=['Age', 'Fare', 'Survived'])


In [None]:
titanicDF.isnull().mean()

In [None]:
def impute_with_random_sample(df, column):
    """Add a ``<column>_random`` column whose gaps are filled by random draws.

    Every missing entry of ``df[column]`` is replaced, in the new column
    only, by a value sampled from the observed (non-null) entries of the
    same column. The source column is left untouched; ``df`` is modified
    in place and nothing is returned.

    Sampling uses ``random_state=0`` so repeated runs give the same
    result. Values are drawn without replacement when enough observed
    values exist, and with replacement when the column has more missing
    entries than observed ones.
    """
    missing_mask = df[column].isnull()
    observed = df[column].dropna()
    n_missing = int(missing_mask.sum())

    df[column + '_random'] = df[column]
    if n_missing == 0:
        return

    random_samples = observed.sample(
        n_missing,
        replace=n_missing > len(observed),
        random_state=0,  # fixed seed: the drawn values do not change between runs
    )
    # Re-label the draws with the index of the missing rows of *this*
    # column so the .loc assignment below aligns them with the gaps.
    random_samples.index = df.index[missing_mask]
    df.loc[missing_mask, column + '_random'] = random_samples


In [None]:
impute_with_random_sample(titanicDF,'Age')
titanicDF.head()

In [None]:
fig = plt.figure()
ax = fig.add_subplot(111)
titanicDF['Age'].plot(kind='kde', ax=ax)
titanicDF['Age_random'].plot(kind='kde',ax=ax, color='red')
lines, labels = ax.get_legend_handles_labels()
ax.legend(lines, labels, loc='best')

### 3. Capturing NaN values with new features

In [None]:
titanicDF = pd.read_csv(TITANIC_TRAIN_DATA_URL,usecols=['Age', 'Survived'])

In [None]:
def impute_with_newfeature(df, column):
    """Add a ``<column>_NonNull`` indicator column marking missing entries.

    The indicator is 1 where ``df[column]`` is missing and 0 where it is
    present. Note that, despite the ``_NonNull`` suffix, a value of 1
    means the entry IS null. ``df`` is modified in place and nothing is
    returned.
    """
    indicator_name = f"{column}_NonNull"
    is_missing = df[column].isnull()
    df[indicator_name] = np.where(is_missing, 1, 0)

In [None]:
# Record which ages were missing before they are overwritten below.
impute_with_newfeature(titanicDF, 'Age')
# Assign the filled column back instead of calling fillna(inplace=True)
# on the selected column: the chained in-place form is deprecated and
# leaves titanicDF unchanged when pandas Copy-on-Write is active.
titanicDF['Age'] = titanicDF['Age'].fillna(titanicDF['Age'].mean())
titanicDF.head()

### 4. End of distribution imputation

In [None]:
titanicDF = pd.read_csv(TITANIC_TRAIN_DATA_URL, usecols=['Age', 'Fare', 'Survived'])
titanicDF.Age.hist(bins=50)

In [None]:
titanicDF.Age.std()*3 

In [None]:
titanicDF.Age.plot.kde()

In [None]:
# Pass the column by keyword: seaborn 0.12+ no longer accepts it as the
# first positional argument alongside data=.
sns.boxplot(x='Age', data=titanicDF)

In [None]:
def impute_with_end_distribution(df, column, n_std=3):
    """Add a ``<column>_end_distribution`` column filled with a far-tail value.

    Missing entries of ``df[column]`` are replaced, in the new column
    only, by ``mean + n_std * std`` of the column's observed values.
    The source column is left untouched; ``df`` is modified in place
    and nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that holds *column* and receives the new column.
    column : str
        Name of the numeric column to impute.
    n_std : float, default 3
        Number of standard deviations above the mean used as the fill
        value.
    """
    center = df[column].mean()
    spread = df[column].std()
    df[column + '_end_distribution'] = df[column].fillna(center + n_std * spread)

In [None]:
impute_with_end_distribution(titanicDF,'Age')

In [None]:
# Pass the column by keyword: seaborn 0.12+ no longer accepts it as the
# first positional argument alongside data=.
sns.boxplot(x='Age_end_distribution', data=titanicDF)  # outliers are gone

In [None]:
titanicDF['Age_end_distribution'].hist(bins=50) # the resulting data is right-skewed

### 5. Arbitrary Imputation 

In [None]:
titanicDF = pd.read_csv(TITANIC_TRAIN_DATA_URL, usecols = ["Age","Fare","Survived"])

In [None]:
titanicDF.Age.hist(bins=50)

In [None]:
def impute_with_arbitary(df, column, value):
    """Add a ``<column>_arbitary`` column with missing entries set to *value*.

    *value* is an arbitrary constant chosen by the caller. The source
    column is left untouched; ``df`` is modified in place and nothing is
    returned.
    """
    new_column = f"{column}_arbitary"
    replaced = df[column].fillna(value)
    df[new_column] = replaced

In [None]:
impute_with_arbitary(titanicDF, 'Age', 100)

In [None]:
titanicDF.Age_arbitary.hist(bins=50)

### 6. Frequent categories imputation

In [None]:
houseDF = pd.read_csv(HOUSE_PRICES_TRAIN_DATA_URL, usecols=['BsmtQual','FireplaceQu', 'GarageType','SalePrice'])

In [None]:
houseDF.isnull().mean().sort_values(ascending=True)

In [None]:
houseDF.groupby(['BsmtQual'])['BsmtQual'].count().sort_values(ascending=False).plot.bar()

In [None]:
houseDF.FireplaceQu.value_counts().plot.bar()

In [None]:
houseDF.GarageType.value_counts().plot.bar()

In [None]:
def impute_with_most_frequent_category(df, column):
    """Fill missing entries of ``df[column]`` with its most frequent value.

    The column is overwritten in ``df``; nothing is returned. When the
    column has no observed values at all there is nothing to impute
    from, so the column is left unchanged.
    """
    category_counts = df[column].value_counts()
    if category_counts.empty:
        return
    # value_counts() sorts by descending frequency, so the first index
    # label is the most frequent category.
    most_freq_cat = category_counts.index[0]
    # Assign the result back rather than using fillna(inplace=True) on
    # the selected column: the chained in-place form is deprecated and
    # does not update df when pandas Copy-on-Write is active.
    df[column] = df[column].fillna(most_freq_cat)
    # Equivalent: df[column] = df[column].fillna(df[column].mode()[0])
    # (mode() returns a Series, so its first element must be selected).

In [None]:
impute_with_most_frequent_category(houseDF, 'BsmtQual')
houseDF.isnull().mean().sort_values(ascending=True)

### Ordinal Number encoding

In [None]:
import datetime

# Timestamps for today and each of the 14 preceding days, newest first.
date_list = [
    datetime.datetime.today() - datetime.timedelta(days=days_back)
    for days_back in range(15)
]
dateDF = pd.DataFrame(date_list, columns=['Days'])

# Ordinal code for every weekday name, Monday=1 through Sunday=7.
dict_Week = {
    "Monday": 1,
    "Tuesday": 2,
    "Wednesday": 3,
    "Thursday": 4,
    "Friday": 5,
    "Saturday": 6,
    "Sunday": 7,
}

# Weekday name of each date.
dateDF['Week Day'] = dateDF['Days'].dt.day_name()

In [None]:
dateDF['Week Day']= dateDF['Week Day'].map(dict_Week)
dateDF

### Categorical values by frequency count imputation

In [None]:
adultDF=pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', header=None, index_col=None)

In [None]:
adultDF

In [None]:
columns = [1,3,5,6,7,8,9,13]

In [None]:
# Keep only the selected columns. Take an explicit copy so that later
# column assignments act on an independent frame and pandas does not
# emit SettingWithCopyWarning.
adultDF = adultDF[columns].copy()

In [None]:
adultDF.columns = ['Employement', 'Degree', 'Status', 'Designation', 'Family_job', 'Race', 'Sex', 'Country']

In [None]:
# Report how many distinct values (labels) each column holds.
for column in adultDF.columns:
    label_count = len(adultDF[column].unique())
    print(column, ": ", label_count, 'labels')

In [None]:
country_dict = adultDF['Country'].value_counts().to_dict()
adultDF['Country'] = adultDF['Country'].map(country_dict)
adultDF.head()

### Target guided ordinal encoding

In [None]:
titanicDF = pd.read_csv(TITANIC_TRAIN_DATA_URL, usecols=['Cabin', 'Survived'])

In [None]:
# Treat a missing cabin as its own 'Missing' category. Assign the result
# back instead of calling fillna(inplace=True) on the selected column:
# the chained in-place form is deprecated and leaves titanicDF unchanged
# when pandas Copy-on-Write is active.
titanicDF['Cabin'] = titanicDF['Cabin'].fillna('Missing')
titanicDF['Cabin'].unique()
# Keep only the first character of each cabin value for simplicity
titanicDF['Cabin'] = titanicDF['Cabin'].astype(str).str[0]