### Predicting Bank's Term Deposit Subscription - MCAR Test

#### Author: Guansu(Frances) Niu

#### Data Source: https://archive.ics.uci.edu/ml/datasets/Bank+Marketing

In [2]:
# Importing basic libraries:

import numpy as np
import pandas as pd
import scipy.stats as st
import seaborn as sns
from pandas.api.types import CategoricalDtype
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

In [3]:
# Silence warnings so they do not clutter the cell outputs:

import warnings

# Blanket filter first, then the two explicit categories (same insertion order
# as before, so the resulting filter list is unchanged).
warnings.filterwarnings("ignore")
for category in (DeprecationWarning, FutureWarning):
    warnings.filterwarnings(action='ignore', category=category)

In [4]:
# Load the raw data (semicolon-delimited CSV) and preview the first five rows:

df = pd.read_csv(filepath_or_buffer="data/raw data.csv", sep=';')
df.head(n=5)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [1]:
# Feature groups, split by the preprocessing each group receives:

# Nominal categoricals -> one-hot encoded.
onehot_ftrs = ['job', 'marital', 'default', 'housing','loan', 'contact','poutcome']

# Ordered categoricals -> ordinal encoded; one category list per feature, in the
# same order as ordinal_ftrs.
ordinal_ftrs = ['education','month','day_of_week']

# Categories are listed from lowest to highest rank. 'illiterate' is the lowest
# education level, so it comes first (alphabetical order would wrongly place it
# between 'high.school' and 'professional.course'). 'missing' is the placeholder
# the imputer substitutes for 'unknown'; it has no natural rank and is kept last.
ordinal_cats = [['illiterate', 'basic.4y', 'basic.6y', 'basic.9y', 'high.school', 'professional.course',
                'university.degree','missing'],['mar', 'apr', 'may', 'jun', 'jul', 'aug', 'sep','oct',
                'nov','dec'],['mon', 'tue', 'wed', 'thu', 'fri']]

# Numeric features -> standardized.
num_ftrs = ['age','duration','campaign','previous','pdays','emp.var.rate','cons.price.idx','cons.conf.idx', 
                 'euribor3m','nr.employed']

In [12]:
# Onehot encoding:
# 'unknown' entries are first recoded to an explicit 'missing' category, so they
# get their own indicator column instead of being treated as a regular answer.

si = SimpleImputer(missing_values='unknown',strategy='constant',fill_value='missing')

# The encoder's default output is a sparse matrix; densify it with .toarray()
# rather than passing `sparse=False`, whose keyword was renamed to
# `sparse_output` in newer scikit-learn releases.
ohe = OneHotEncoder()
onehot_values = ohe.fit_transform(si.fit_transform(df[onehot_ftrs])).toarray()

# `get_feature_names` was replaced by `get_feature_names_out`; use whichever the
# installed scikit-learn provides. Both yield the same 'x<i>_<category>' names here
# because the encoder is fitted on a plain array.
if hasattr(ohe, "get_feature_names_out"):
    onehot_ftr_names = ohe.get_feature_names_out()
else:
    onehot_ftr_names = ohe.get_feature_names()
df_onehot = pd.DataFrame(data=onehot_values,columns = onehot_ftr_names)

In [13]:
# Ordinal encoding: recode 'unknown' to 'missing', then map every ordered feature
# to the position of its value inside the matching ordinal_cats list.

si2 = SimpleImputer(strategy='constant', fill_value='missing', missing_values='unknown')
oe = OrdinalEncoder(categories=ordinal_cats)
oe_values = oe.fit_transform(
    si2.fit_transform(df[ordinal_ftrs])
)
df_ordinal = pd.DataFrame(oe_values, columns=ordinal_ftrs)

In [14]:
# Standardize the numeric features.
# The value 999 in 'pdays' is recoded to NaN first, so it is handled as a missing
# value rather than as a real number of days.

df['pdays'] = df['pdays'].replace(999, np.nan)
ss = StandardScaler()
num_values = ss.fit_transform(df[num_ftrs])
df_num = pd.DataFrame(num_values, columns=num_ftrs)

In [15]:
# Encode the target column 'y' as integer class labels:

le = LabelEncoder()
df_le = pd.DataFrame({'y': le.fit_transform(df['y'])})

In [16]:
# Assemble the encoded pieces column-wise into a single frame
# (one-hot, ordinal, scaled numeric, then the encoded target):

frames = [df_onehot, df_ordinal, df_num, df_le]
result = pd.concat(frames, axis=1, sort=False)

In [18]:
# Little's MCAR test to check the mechanism behind the missing numerical values:

def checks_input_mcar_tests(data):
    """ Checks whether the input parameter of the MCAR test is usable

    Three conditions are verified, in this order; the first one that fails prints
    an error message and stops the check:

    1. ``data`` is a Pandas DataFrame.
    2. Every column has a float or integer dtype (the test computes means and
       covariances over all columns).
    3. At least one value is missing (NaN); otherwise there is nothing to test.

    Parameters
    ----------
    data:
        The input of the MCAR test specified as 'data'

    Returns
    -------
    bool
        True if input is correct, False otherwise
    """

    if not isinstance(data, pd.DataFrame):
        print("Error: Data should be a Pandas DataFrame")
        return False

    # Use the pandas dtype predicates instead of comparing against the removed
    # NumPy aliases `np.float` / `np.int`; they also accept every float/int width.
    is_float = pd.api.types.is_float_dtype
    is_integer = pd.api.types.is_integer_dtype
    if not all(is_float(dtype) or is_integer(dtype) for dtype in data.dtypes):
        print("Error: Dataset cannot contain other value types than floats and/or integers")
        return False

    if not data.isnull().values.any():
        print("Error: No NaN's in given data")
        return False

    return True

def mcar_test(data):
    """ Implementation of Little's MCAR test

    Rows are grouped by their missing-data pattern (which columns are NaN). For
    each pattern, the means of the observed columns are compared with the overall
    means, weighted by the inverse of the overall covariance of those columns and
    by the number of rows in the pattern. The summed statistic is compared against
    a chi-square distribution whose degrees of freedom are the total number of
    observed columns over all patterns minus the number of columns.

    Parameters
    ----------
    data: Pandas DataFrame
        An incomplete dataset with samples as index and variables as columns.
        The input is not modified.

    Returns
    -------
    p_value: Float
        This value is the outcome of a chi-square statistical test, testing whether the null hypothesis
        'the missingness mechanism of the incomplete dataset is MCAR' can be rejected.

    Raises
    ------
    Exception
        If `checks_input_mcar_tests` rejects the input.
    numpy.linalg.LinAlgError
        If the covariance matrix of the observed columns of a pattern is exactly
        singular (for example when the columns are linearly dependent).
    """

    if not checks_input_mcar_tests(data):
        raise Exception("Input not correct")

    columns = data.columns
    n_var = data.shape[1]

    # mean and covariance estimates from the available values
    # ideally, this is done with a maximum likelihood estimator
    gmean = data.mean()
    gcov = data.cov()

    # set up missing data patterns: each distinct row of the boolean NaN mask is
    # one pattern. Comparing the mask rows directly stays exact for any number of
    # columns (encoding a row as a float sum of powers of two collides once there
    # are more columns than a float mantissa has bits).
    missing = data.isnull().values
    patterns, pattern_ids = np.unique(missing, axis=0, return_inverse=True)
    # flatten: some NumPy versions return the inverse with an extra axis
    pattern_ids = np.asarray(pattern_ids).reshape(-1)

    # calculate statistic and degrees of freedom
    pj = 0
    d2 = 0.0
    for j, pattern in enumerate(patterns):
        observed = columns[~pattern]
        if len(observed) == 0:
            # rows with every value missing carry no information
            continue
        pj += len(observed)
        group = data.loc[pattern_ids == j, observed]
        diff = (group.mean() - gmean[observed]).values
        select_cov = gcov.loc[observed, observed].values
        # diff' * inv(cov) * diff, solved directly instead of forming the inverse
        d2 += len(group) * float(np.dot(diff, np.linalg.solve(select_cov, diff)))

    dof = pj - n_var

    # perform test: the survival function is 1 - cdf without the cancellation
    # error for very small p-values
    p_value = float(st.chi2.sf(d2, dof))

    return p_value

print(mcar_test(result))

# Interpretation: the printed p-value is > 0.05, so the null hypothesis (the values
# are missing completely at random, MCAR) is not rejected.
# More than 79% of the rows have 'pdays' = 999 (recoded to NaN above), so removing
# those rows would discard most of the data; the rows should be kept.
# The missingness is therefore treated as MCAR, and the column 'pdays' can be
# dropped instead.
# NOTE(review): failing to reject the null hypothesis is not proof of MCAR, and a
# p-value of exactly 1.0 is suspicious -- the one-hot columns in `result` look
# linearly dependent, which may make the covariance matrix singular; confirm.

1.0
