# Imputing Categorical Data Using DataWig


# Python Imports

In [1]:
%config IPCompleter.greedy=True
import os
import sys
# Put the parent directory on the import path (presumably where the local
# `scripts` package lives -- confirm against the repository layout).
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
    
import pandas as pd
import numpy as np
from pandas.plotting import scatter_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelBinarizer
pd.set_option('display.max_columns', 125)
import quilt
from scripts.preprocess import percent_missing, align_dataframes, as_dict
from string import Template
import missingno as msno
import impyute
import datawig
from sklearn.metrics import f1_score, classification_report

In [2]:
from quilt.data.avare import homecredit

In [3]:
# Look the table up by key (subscript rather than attribute access), call the
# node to materialise it, and work on a copy of the original data.
table = 'previous_application'
table_node = homecredit[table]
df = table_node().copy()  # DataFrame.copy() is deep by default
df.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,FLAG_LAST_APPL_PER_CONTRACT,NFLAG_LAST_APPL_IN_DAY,RATE_DOWN_PAYMENT,RATE_INTEREST_PRIMARY,RATE_INTEREST_PRIVILEGED,NAME_CASH_LOAN_PURPOSE,NAME_CONTRACT_STATUS,DAYS_DECISION,NAME_PAYMENT_TYPE,CODE_REJECT_REASON,NAME_TYPE_SUITE,NAME_CLIENT_TYPE,NAME_GOODS_CATEGORY,NAME_PORTFOLIO,NAME_PRODUCT_TYPE,CHANNEL_TYPE,SELLERPLACE_AREA,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,2030495,271877,Consumer loans,1730.43,17145.0,17145.0,0.0,17145.0,SATURDAY,15,Y,1,0.0,0.182832,0.867336,XAP,Approved,-73,Cash through the bank,XAP,,Repeater,Mobile,POS,XNA,Country-wide,35,Connectivity,12.0,middle,POS mobile with interest,365243.0,-42.0,300.0,-42.0,-37.0,0.0
1,2802425,108129,Cash loans,25188.615,607500.0,679671.0,,607500.0,THURSDAY,11,Y,1,,,,XNA,Approved,-164,XNA,XAP,Unaccompanied,Repeater,XNA,Cash,x-sell,Contact center,-1,XNA,36.0,low_action,Cash X-Sell: low,365243.0,-134.0,916.0,365243.0,365243.0,1.0
2,2523466,122040,Cash loans,15060.735,112500.0,136444.5,,112500.0,TUESDAY,11,Y,1,,,,XNA,Approved,-301,Cash through the bank,XAP,"Spouse, partner",Repeater,XNA,Cash,x-sell,Credit and cash offices,-1,XNA,12.0,high,Cash X-Sell: high,365243.0,-271.0,59.0,365243.0,365243.0,1.0
3,2819243,176158,Cash loans,47041.335,450000.0,470790.0,,450000.0,MONDAY,7,Y,1,,,,XNA,Approved,-512,Cash through the bank,XAP,,Repeater,XNA,Cash,x-sell,Credit and cash offices,-1,XNA,12.0,middle,Cash X-Sell: middle,365243.0,-482.0,-152.0,-182.0,-177.0,1.0
4,1784265,202054,Cash loans,31924.395,337500.0,404055.0,,337500.0,THURSDAY,9,Y,1,,,,Repairs,Refused,-781,Cash through the bank,HC,,Repeater,XNA,Cash,walk-in,Credit and cash offices,-1,XNA,24.0,high,Cash Street: high,,,,,,


In [4]:
# Columns removed up front: the SK_ID_* keys and the two interest-rate columns
# that the original note treats as empty.
dropcols = ['RATE_INTEREST_PRIVILEGED','RATE_INTEREST_PRIMARY','SK_ID_PREV', 'SK_ID_CURR']

# Sampling parameters: fixed seed so the random subset is reproducible.
seed = 500
numinstances = 1000

# Drop the columns, keep only rows without any null (open question from the
# original author: does datawig do this itself?), then draw the random subset.
df = (
    df
    .drop(columns=dropcols)
    .dropna(axis=0, how='any')
    .sample(n=numinstances, random_state=seed)
)
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 929166 to 1290315
Data columns (total 33 columns):
NAME_CONTRACT_TYPE             1000 non-null object
AMT_ANNUITY                    1000 non-null float64
AMT_APPLICATION                1000 non-null float64
AMT_CREDIT                     1000 non-null float64
AMT_DOWN_PAYMENT               1000 non-null float64
AMT_GOODS_PRICE                1000 non-null float64
WEEKDAY_APPR_PROCESS_START     1000 non-null object
HOUR_APPR_PROCESS_START        1000 non-null int64
FLAG_LAST_APPL_PER_CONTRACT    1000 non-null object
NFLAG_LAST_APPL_IN_DAY         1000 non-null int64
RATE_DOWN_PAYMENT              1000 non-null float64
NAME_CASH_LOAN_PURPOSE         1000 non-null object
NAME_CONTRACT_STATUS           1000 non-null object
DAYS_DECISION                  1000 non-null int64
NAME_PAYMENT_TYPE              1000 non-null object
CODE_REJECT_REASON             1000 non-null object
NAME_TYPE_SUITE                1000 non-null objec

# Preprocessing:  Data Types 


Column types are inferred automatically, but the inferred types may not be what you expect. Think beyond a simple categorical-versus-numerical split:

* decide how to encode the data
* performance: avoid one-hot encoding for high-cardinality columns and for decision-tree-based algorithms.
* algorithm restrictions


Rule of Thumb: 

https://towardsdatascience.com/7-data-types-a-better-way-to-think-about-data-types-for-machine-learning-939fae99a689

## Assign Data Types

In [5]:
# Load the column-description workbook; only the spreadsheet columns at
# positions 2, 3 and 4 are read (shown below as Table, Row and Type).
description = pd.read_excel(
    'data/HomeCredit_columns_description.xlsx',
    sheet_name='Sheet1',
    usecols=[2, 3, 4],
)
description.head()

Unnamed: 0,Table,Row,Type
0,application_train,SK_ID_CURR,numerical
1,application_train,TARGET,categorical
2,application_train,NAME_CONTRACT_TYPE,categorical
3,application_train,CODE_GENDER,categorical
4,application_train,FLAG_OWN_CAR,categorical


In [6]:
# Translate the workbook's type labels into pandas dtype names.
python_cat_dtype = 'object'
python_num_dtype = 'float64'

description = (
    description
    .replace('categorical', python_cat_dtype)
    .replace('numerical', python_num_dtype)
)

# Keep only the description rows that belong to the table named by `table`.
is_current_table = description['Table'] == table
typecols = description[is_current_table]
typecols.head()

Unnamed: 0,Table,Row,Type
173,previous_application,SK_ID_PREV,float64
174,previous_application,SK_ID_CURR,float64
175,previous_application,NAME_CONTRACT_TYPE,object
176,previous_application,AMT_ANNUITY,float64
177,previous_application,AMT_APPLICATION,float64


In [7]:
# One-column frame ('Row') listing the columns currently present in df, so it
# can be joined against the type description.
targetcols = pd.DataFrame({'Row': df.columns.tolist()})
targetcols.head()

Unnamed: 0,Row
0,NAME_CONTRACT_TYPE
1,AMT_ANNUITY
2,AMT_APPLICATION
3,AMT_CREDIT
4,AMT_DOWN_PAYMENT


In [8]:
# Left join on the shared column(s) so every column of df is kept even when the
# description has no entry for it -- we don't know which columns it covers.
targetcols = pd.merge(targetcols, typecols, how='left')
targetcols.head()

Unnamed: 0,Row,Table,Type
0,NAME_CONTRACT_TYPE,previous_application,object
1,AMT_ANNUITY,previous_application,float64
2,AMT_APPLICATION,previous_application,float64
3,AMT_CREDIT,previous_application,float64
4,AMT_DOWN_PAYMENT,previous_application,float64


In [9]:
# Collect the column names for each declared type. Columns without a matching
# description row (NaN Type after the left join) end up in neither list.
is_categorical = targetcols['Type'] == python_cat_dtype
is_numerical = targetcols['Type'] == python_num_dtype

cat = targetcols.loc[is_categorical, 'Row'].tolist()
num = targetcols.loc[is_numerical, 'Row'].tolist()

print(cat)

['NAME_CONTRACT_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'FLAG_LAST_APPL_PER_CONTRACT', 'NFLAG_LAST_APPL_IN_DAY', 'NAME_CASH_LOAN_PURPOSE', 'NAME_CONTRACT_STATUS', 'NAME_PAYMENT_TYPE', 'CODE_REJECT_REASON', 'NAME_TYPE_SUITE', 'NAME_CLIENT_TYPE', 'NAME_GOODS_CATEGORY', 'NAME_PORTFOLIO', 'NAME_PRODUCT_TYPE', 'CHANNEL_TYPE', 'SELLERPLACE_AREA', 'NAME_SELLER_INDUSTRY', 'NAME_YIELD_GROUP', 'PRODUCT_COMBINATION', 'NFLAG_INSURED_ON_APPROVAL']


In [10]:
## batch update types: one astype call driven by a column -> dtype mapping
dtype_by_column = {name: python_cat_dtype for name in cat}
dtype_by_column.update({name: python_num_dtype for name in num})

df = df.astype(dtype_by_column)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 929166 to 1290315
Data columns (total 33 columns):
NAME_CONTRACT_TYPE             1000 non-null object
AMT_ANNUITY                    1000 non-null float64
AMT_APPLICATION                1000 non-null float64
AMT_CREDIT                     1000 non-null float64
AMT_DOWN_PAYMENT               1000 non-null float64
AMT_GOODS_PRICE                1000 non-null float64
WEEKDAY_APPR_PROCESS_START     1000 non-null object
HOUR_APPR_PROCESS_START        1000 non-null float64
FLAG_LAST_APPL_PER_CONTRACT    1000 non-null object
NFLAG_LAST_APPL_IN_DAY         1000 non-null object
RATE_DOWN_PAYMENT              1000 non-null float64
NAME_CASH_LOAN_PURPOSE         1000 non-null object
NAME_CONTRACT_STATUS           1000 non-null object
DAYS_DECISION                  1000 non-null float64
NAME_PAYMENT_TYPE              1000 non-null object
CODE_REJECT_REASON             1000 non-null object
NAME_TYPE_SUITE                1000 non-null 

## Encode Categorical

* Article : Exploring Category Encoders
*  https://towardsdatascience.com/smarter-ways-to-encode-categorical-data-for-machine-learning-part-1-of-3-6dca2f71b159
* Category Encoders Package http://contrib.scikit-learn.org/categorical-encoding/index.html

Take-home message: categoricals should look like strings, not numbers. Preprocess the features accordingly; for example, hours are categorical but look like numbers.

* boolean flags that are integers  
* HOUR_APPR_PROCESS_START : Hour of day => made it numerical (ordinal)
* NFLAG_LAST_APPL_IN_DAY : (0,1) one hot encoded
* SELLERPLACE_AREA : 2097 instances : numeric code represents categorical ***
* NFLAG_INSURED_ON_APPROVAL : (0,1)
* validate nominals: make sure different strings do not stand for the same concept


__Below: a user-defined encoding that maps (hashes) a sequence of integers to strings, which allows me to reverse the encoding.__

In [11]:
# Integer-coded categoricals are turned into strings by prepending a marker,
# so they read as strings and the original code can be recovered by stripping
# the marker again.
prefix = 's_'


def _with_prefix(column, marker):
    """Return `column` as strings starting with `marker`.

    The marker is added only to values that do not already carry it, so
    re-running this cell does not stack prefixes (e.g. 's_s_1').
    """
    as_text = column.astype(str)
    already_marked = as_text.str.startswith(marker)
    return as_text.where(already_marked, marker + as_text)


for coded_col in ['NFLAG_LAST_APPL_IN_DAY', 'SELLERPLACE_AREA', 'NFLAG_INSURED_ON_APPROVAL']:
    df[coded_col] = _with_prefix(df[coded_col], prefix)

# Train Model

In [15]:
# Hold out part of the data for evaluation; the rest is used for fitting.
df_train, df_test = datawig.utils.random_split(df)

# Column to impute and the location where model data and metrics are stored.
output_column = 'PRODUCT_COMBINATION'
output_path = 'imputer_model'

# Every column except the target serves as a predictor.
lst = df.columns.tolist()
lst.remove(output_column)
input_cols = lst

# Set up the SimpleImputer model.
imputer = datawig.SimpleImputer(
    input_columns=input_cols,
    output_column=output_column,
    output_path=output_path,
)

# Fit with the default hyperparameter search; left as the last expression so
# the returned imputer is displayed.
imputer.fit_hpo(train_df=df_train)

2019-05-01 19:50:04,123 [INFO]  Assuming 14 numeric input columns: AMT_ANNUITY, AMT_APPLICATION, AMT_CREDIT, AMT_DOWN_PAYMENT, AMT_GOODS_PRICE, HOUR_APPR_PROCESS_START, RATE_DOWN_PAYMENT, DAYS_DECISION, CNT_PAYMENT, DAYS_FIRST_DRAWING, DAYS_FIRST_DUE, DAYS_LAST_DUE_1ST_VERSION, DAYS_LAST_DUE, DAYS_TERMINATION
2019-05-01 19:50:04,124 [INFO]  Assuming 18 string input columns: NAME_CONTRACT_TYPE, NAME_PAYMENT_TYPE, NAME_CLIENT_TYPE, NAME_SELLER_INDUSTRY, NAME_TYPE_SUITE, NAME_GOODS_CATEGORY, NFLAG_INSURED_ON_APPROVAL, NAME_PORTFOLIO, SELLERPLACE_AREA, NAME_PRODUCT_TYPE, CODE_REJECT_REASON, NFLAG_LAST_APPL_IN_DAY, NAME_CONTRACT_STATUS, WEEKDAY_APPR_PROCESS_START, NAME_YIELD_GROUP, CHANNEL_TYPE, FLAG_LAST_APPL_PER_CONTRACT, NAME_CASH_LOAN_PURPOSE
2019-05-01 19:50:04,126 [INFO]  Assuming 14 numeric input columns: AMT_ANNUITY, AMT_APPLICATION, AMT_CREDIT, AMT_DOWN_PAYMENT, AMT_GOODS_PRICE, HOUR_APPR_PROCESS_START, RATE_DOWN_PAYMENT, DAYS_DECISION, CNT_PAYMENT, DAYS_FIRST_DRAWING, DAYS_FIRST_D

2019-05-01 19:50:04,213 [INFO]  Detected 0 rows with missing labels                         for column PRODUCT_COMBINATION
2019-05-01 19:50:04,215 [INFO]  Dropping 0/640 rows
2019-05-01 19:50:04,220 [INFO]  Detected 0 rows with missing labels                         for column PRODUCT_COMBINATION
2019-05-01 19:50:04,222 [INFO]  Dropping 0/160 rows
2019-05-01 19:50:04,225 [INFO]  Train: 640, Test: 160
2019-05-01 19:50:04,226 [INFO]  Fitting data encoder <class 'datawig.column_encoders.NumericalEncoder'> on columns AMT_ANNUITY and 640 rows of training data with parameters {'input_columns': ['AMT_ANNUITY'], 'output_column': 'AMT_ANNUITY_numeric', 'output_dim': 1, 'normalize': True, 'scaler': None}
2019-05-01 19:50:04,234 [INFO]  Fitting data encoder <class 'datawig.column_encoders.NumericalEncoder'> on columns AMT_APPLICATION and 640 rows of training data with parameters {'input_columns': ['AMT_APPLICATION'], 'output_column': 'AMT_APPLICATION_numeric', 'output_dim': 1, 'normalize': True, 

2019-05-01 19:50:04,440 [INFO]  Concatenating numeric columns ['AMT_DOWN_PAYMENT'] into AMT_DOWN_PAYMENT_numeric
2019-05-01 19:50:04,441 [INFO]  Normalizing with StandardScaler
2019-05-01 19:50:04,447 [INFO]  Data Encoding - Encoded 640 rows of column                         AMT_DOWN_PAYMENT with <class 'datawig.column_encoders.NumericalEncoder'> into                         <class 'numpy.ndarray'> of shape (640, 1)                         and then into shape (640, 1)
2019-05-01 19:50:04,452 [INFO]  Concatenating numeric columns ['AMT_GOODS_PRICE'] into AMT_GOODS_PRICE_numeric
2019-05-01 19:50:04,454 [INFO]  Normalizing with StandardScaler
2019-05-01 19:50:04,459 [INFO]  Data Encoding - Encoded 640 rows of column                         AMT_GOODS_PRICE with <class 'datawig.column_encoders.NumericalEncoder'> into                         <class 'numpy.ndarray'> of shape (640, 1)                         and then into shape (640, 1)
2019-05-01 19:50:04,478 [INFO]  Data Encoding - Encoded 6

2019-05-01 19:50:04,834 [INFO]  Concatenating numeric columns ['DAYS_FIRST_DUE'] into DAYS_FIRST_DUE_numeric
2019-05-01 19:50:04,835 [INFO]  Normalizing with StandardScaler
2019-05-01 19:50:04,841 [INFO]  Data Encoding - Encoded 640 rows of column                         DAYS_FIRST_DUE with <class 'datawig.column_encoders.NumericalEncoder'> into                         <class 'numpy.ndarray'> of shape (640, 1)                         and then into shape (640, 1)
2019-05-01 19:50:04,849 [INFO]  Concatenating numeric columns ['DAYS_LAST_DUE_1ST_VERSION'] into DAYS_LAST_DUE_1ST_VERSION_numeric
2019-05-01 19:50:04,850 [INFO]  Normalizing with StandardScaler
2019-05-01 19:50:04,858 [INFO]  Data Encoding - Encoded 640 rows of column                         DAYS_LAST_DUE_1ST_VERSION with <class 'datawig.column_encoders.NumericalEncoder'> into                         <class 'numpy.ndarray'> of shape (640, 1)                         and then into shape (640, 1)
2019-05-01 19:50:04,863 [INFO]  C

2019-05-01 19:50:05,164 [INFO]  Data Encoding - Encoded 160 rows of column                         NAME_PAYMENT_TYPE with <class 'datawig.column_encoders.BowEncoder'> into                         <class 'scipy.sparse.csr.csr_matrix'> of shape (160, 32768)                         and then into shape (160, 32768)
2019-05-01 19:50:05,171 [INFO]  Data Encoding - Encoded 160 rows of column                         CODE_REJECT_REASON with <class 'datawig.column_encoders.BowEncoder'> into                         <class 'scipy.sparse.csr.csr_matrix'> of shape (160, 32768)                         and then into shape (160, 32768)
2019-05-01 19:50:05,186 [INFO]  Data Encoding - Encoded 160 rows of column                         NAME_TYPE_SUITE with <class 'datawig.column_encoders.BowEncoder'> into                         <class 'scipy.sparse.csr.csr_matrix'> of shape (160, 32768)                         and then into shape (160, 32768)
2019-05-01 19:50:05,197 [INFO]  Data Encoding - Encoded 160 ro

2019-05-01 19:50:14,937 [INFO]  Epoch[0] Validation-cross-entropy=0.590729
2019-05-01 19:50:14,938 [INFO]  Epoch[0] Validation-PRODUCT_COMBINATION-accuracy=0.837500
2019-05-01 19:50:19,050 [INFO]  Epoch[1] Batch [0-20]	Speed: 81.68 samples/sec	cross-entropy=0.419733	PRODUCT_COMBINATION-accuracy=0.869048
2019-05-01 19:50:22,610 [INFO]  Epoch[1] Train-cross-entropy=0.395694
2019-05-01 19:50:22,611 [INFO]  Epoch[1] Train-PRODUCT_COMBINATION-accuracy=0.884375
2019-05-01 19:50:22,612 [INFO]  Epoch[1] Time cost=7.673
2019-05-01 19:50:22,822 [INFO]  Saved checkpoint to "imputer_model0/model-0001.params"
2019-05-01 19:50:24,088 [INFO]  Epoch[1] Validation-cross-entropy=0.376982
2019-05-01 19:50:24,089 [INFO]  Epoch[1] Validation-PRODUCT_COMBINATION-accuracy=0.893750
2019-05-01 19:50:28,576 [INFO]  Epoch[2] Batch [0-20]	Speed: 74.58 samples/sec	cross-entropy=0.269836	PRODUCT_COMBINATION-accuracy=0.913690
2019-05-01 19:50:32,285 [INFO]  Epoch[2] Train-cross-entropy=0.267197
2019-05-01 19:50:32,2

2019-05-01 19:51:54,004 [INFO]  Attribute PRODUCT_COMBINATION, Label: POS household without interest	Reaching 0.9285714285714286 precision / 0.8666666666666667 recall at threshold 0.8788589835166931
2019-05-01 19:51:54,006 [INFO]  Attribute PRODUCT_COMBINATION, Label: Cash Street: high	Reaching 0.75 precision / 0.6 recall at threshold 0.7240794897079468
2019-05-01 19:51:54,009 [INFO]  Attribute PRODUCT_COMBINATION, Label: POS other with interest	Reaching 1.0 precision / 0.0 recall at threshold 0.9943445920944214
2019-05-01 19:51:54,012 [INFO]  Attribute PRODUCT_COMBINATION, Label: Cash X-Sell: high	Reaching 0.0 precision / 0.0 recall at threshold 0.9357686042785645
2019-05-01 19:51:54,015 [INFO]  Attribute PRODUCT_COMBINATION, Label: POS industry without interest	Reaching 0.0 precision / 0.0 recall at threshold 0.2958134710788727
2019-05-01 19:51:54,018 [INFO]  Attribute PRODUCT_COMBINATION, Label: Cash X-Sell: middle	Reaching 1.0 precision / 0.0 recall at threshold 0.7852799892425537


2019-05-01 19:51:54,264 [INFO]  Data Encoding - Encoded 160 rows of column                         CODE_REJECT_REASON with <class 'datawig.column_encoders.BowEncoder'> into                         <class 'scipy.sparse.csr.csr_matrix'> of shape (160, 32768)                         and then into shape (160, 32768)
2019-05-01 19:51:54,275 [INFO]  Data Encoding - Encoded 160 rows of column                         NAME_TYPE_SUITE with <class 'datawig.column_encoders.BowEncoder'> into                         <class 'scipy.sparse.csr.csr_matrix'> of shape (160, 32768)                         and then into shape (160, 32768)
2019-05-01 19:51:54,280 [INFO]  Data Encoding - Encoded 160 rows of column                         NAME_CLIENT_TYPE with <class 'datawig.column_encoders.BowEncoder'> into                         <class 'scipy.sparse.csr.csr_matrix'> of shape (160, 32768)                         and then into shape (160, 32768)
2019-05-01 19:51:54,293 [INFO]  Data Encoding - Encoded 160 row

2019-05-01 19:51:55,816 [INFO]  Concatenating numeric columns ['AMT_DOWN_PAYMENT'] into AMT_DOWN_PAYMENT_numeric
2019-05-01 19:51:55,817 [INFO]  Normalizing with StandardScaler
2019-05-01 19:51:55,822 [INFO]  Data Encoding - Encoded 640 rows of column                         AMT_DOWN_PAYMENT with <class 'datawig.column_encoders.NumericalEncoder'> into                         <class 'numpy.ndarray'> of shape (640, 1)                         and then into shape (640, 1)
2019-05-01 19:51:55,829 [INFO]  Concatenating numeric columns ['AMT_GOODS_PRICE'] into AMT_GOODS_PRICE_numeric
2019-05-01 19:51:55,830 [INFO]  Normalizing with StandardScaler
2019-05-01 19:51:55,833 [INFO]  Data Encoding - Encoded 640 rows of column                         AMT_GOODS_PRICE with <class 'datawig.column_encoders.NumericalEncoder'> into                         <class 'numpy.ndarray'> of shape (640, 1)                         and then into shape (640, 1)
2019-05-01 19:51:55,852 [INFO]  Data Encoding - Encoded 6

2019-05-01 19:51:56,210 [INFO]  Concatenating numeric columns ['DAYS_FIRST_DUE'] into DAYS_FIRST_DUE_numeric
2019-05-01 19:51:56,213 [INFO]  Normalizing with StandardScaler
2019-05-01 19:51:56,217 [INFO]  Data Encoding - Encoded 640 rows of column                         DAYS_FIRST_DUE with <class 'datawig.column_encoders.NumericalEncoder'> into                         <class 'numpy.ndarray'> of shape (640, 1)                         and then into shape (640, 1)
2019-05-01 19:51:56,221 [INFO]  Concatenating numeric columns ['DAYS_LAST_DUE_1ST_VERSION'] into DAYS_LAST_DUE_1ST_VERSION_numeric
2019-05-01 19:51:56,223 [INFO]  Normalizing with StandardScaler
2019-05-01 19:51:56,227 [INFO]  Data Encoding - Encoded 640 rows of column                         DAYS_LAST_DUE_1ST_VERSION with <class 'datawig.column_encoders.NumericalEncoder'> into                         <class 'numpy.ndarray'> of shape (640, 1)                         and then into shape (640, 1)
2019-05-01 19:51:56,234 [INFO]  C

<datawig.simple_imputer.SimpleImputer at 0x123dce208>

# Evaluate Performance

In [17]:
# Impute the held-out rows. The returned frame keeps the original columns
# (including the true label) and adds `<output_column>_imputed`.
predictions = imputer.predict(df_test)

y_true = predictions[output_column]
y_pred = predictions[output_column + '_imputed']

# Weighted F1 for true vs. predicted labels. Previously this value was
# computed but never shown, so report it explicitly.
f1 = f1_score(y_true, y_pred, average='weighted')
print('weighted F1: {:.4f}'.format(f1))

# Per-class breakdown. Classes that are never predicted get precision 0.00 and
# trigger scikit-learn's undefined-metric warning.
print(classification_report(y_true, y_pred))


2019-05-01 19:52:47,614 [INFO]  Data Encoding - Encoded 208 rows of column                         NAME_CONTRACT_TYPE with <class 'datawig.column_encoders.BowEncoder'> into                         <class 'scipy.sparse.csr.csr_matrix'> of shape (208, 32768)                         and then into shape (208, 32768)
2019-05-01 19:52:47,619 [INFO]  Concatenating numeric columns ['AMT_ANNUITY'] into AMT_ANNUITY_numeric
2019-05-01 19:52:47,621 [INFO]  Normalizing with StandardScaler
2019-05-01 19:52:47,635 [INFO]  Data Encoding - Encoded 208 rows of column                         AMT_ANNUITY with <class 'datawig.column_encoders.NumericalEncoder'> into                         <class 'numpy.ndarray'> of shape (208, 1)                         and then into shape (208, 1)
2019-05-01 19:52:47,642 [INFO]  Concatenating numeric columns ['AMT_APPLICATION'] into AMT_APPLICATION_numeric
2019-05-01 19:52:47,644 [INFO]  Normalizing with StandardScaler
2019-05-01 19:52:47,648 [INFO]  Data Encoding - Encod

2019-05-01 19:52:47,886 [INFO]  Data Encoding - Encoded 208 rows of column                         NAME_SELLER_INDUSTRY with <class 'datawig.column_encoders.BowEncoder'> into                         <class 'scipy.sparse.csr.csr_matrix'> of shape (208, 32768)                         and then into shape (208, 32768)
2019-05-01 19:52:47,891 [INFO]  Concatenating numeric columns ['CNT_PAYMENT'] into CNT_PAYMENT_numeric
2019-05-01 19:52:47,893 [INFO]  Normalizing with StandardScaler
2019-05-01 19:52:47,900 [INFO]  Data Encoding - Encoded 208 rows of column                         CNT_PAYMENT with <class 'datawig.column_encoders.NumericalEncoder'> into                         <class 'numpy.ndarray'> of shape (208, 1)                         and then into shape (208, 1)
2019-05-01 19:52:47,909 [INFO]  Data Encoding - Encoded 208 rows of column                         NAME_YIELD_GROUP with <class 'datawig.column_encoders.BowEncoder'> into                         <class 'scipy.sparse.csr.csr_ma

                                precision    recall  f1-score   support

             Cash Street: high       1.00      0.75      0.86         4
             Cash X-Sell: high       0.89      1.00      0.94         8
           Cash X-Sell: middle       1.00      1.00      1.00         1
   POS household with interest       0.86      0.99      0.92        76
POS household without interest       0.95      0.69      0.80        29
    POS industry with interest       0.93      0.96      0.94        26
      POS mobile with interest       0.96      0.98      0.97        49
   POS mobile without interest       0.00      0.00      0.00         2
       POS other with interest       1.00      0.40      0.57         5

                     micro avg       0.91      0.91      0.91       200
                     macro avg       0.84      0.75      0.78       200
                  weighted avg       0.91      0.91      0.90       200



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [20]:
# Peek at the first rows: original columns plus the imputed label and its probability.
predictions.head(n=5)

Unnamed: 0,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,FLAG_LAST_APPL_PER_CONTRACT,NFLAG_LAST_APPL_IN_DAY,RATE_DOWN_PAYMENT,NAME_CASH_LOAN_PURPOSE,NAME_CONTRACT_STATUS,DAYS_DECISION,NAME_PAYMENT_TYPE,CODE_REJECT_REASON,NAME_TYPE_SUITE,NAME_CLIENT_TYPE,NAME_GOODS_CATEGORY,NAME_PORTFOLIO,NAME_PRODUCT_TYPE,CHANNEL_TYPE,SELLERPLACE_AREA,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL,PRODUCT_COMBINATION_imputed,PRODUCT_COMBINATION_imputed_proba
265076,Consumer loans,16529.31,172332.0,141228.0,44595.0,172332.0,MONDAY,10.0,Y,s_1,0.261367,XAP,Approved,-316.0,Cash through the bank,XAP,Unaccompanied,New,Mobile,POS,XNA,Country-wide,s_36,Connectivity,12.0,high,POS mobile with interest,365243.0,-286.0,44.0,-46.0,-44.0,s_1.0,POS mobile with interest,0.994247
1529698,Consumer loans,10223.64,58455.0,57663.0,3510.0,58455.0,SATURDAY,20.0,Y,s_1,0.06249,XAP,Approved,-2586.0,Cash through the bank,XAP,Family,Repeater,Audio/Video,POS,XNA,Country-wide,s_2300,Consumer electronics,6.0,low_normal,POS household without interest,365243.0,-2555.0,-2405.0,-2405.0,-2396.0,s_1.0,POS household with interest,0.542133
788943,Consumer loans,25204.815,129681.0,136530.0,0.0,129681.0,TUESDAY,10.0,Y,s_1,0.0,XAP,Approved,-662.0,Cash through the bank,XAP,Unaccompanied,Refreshed,Consumer Electronics,POS,XNA,Country-wide,s_2000,Consumer electronics,6.0,middle,POS household with interest,365243.0,-631.0,-481.0,-511.0,-503.0,s_0.0,POS household with interest,0.993616
69679,Consumer loans,15988.14,127782.0,125122.5,12780.0,127782.0,SATURDAY,14.0,Y,s_1,0.100931,XAP,Approved,-637.0,XNA,XAP,Unaccompanied,Repeater,Mobile,POS,XNA,Country-wide,s_50,Connectivity,10.0,high,POS mobile with interest,365243.0,-606.0,-336.0,-576.0,-572.0,s_0.0,POS mobile with interest,0.953899
420487,Consumer loans,7208.46,58959.0,57438.0,5899.5,58959.0,SATURDAY,9.0,Y,s_1,0.101442,XAP,Approved,-2323.0,Cash through the bank,XAP,Family,Repeater,Audio/Video,POS,XNA,Stone,s_100,Industry,10.0,high,POS industry with interest,365243.0,-2292.0,-2022.0,-2022.0,-2021.0,s_1.0,POS household with interest,0.480864
