# Imputing Categorical Using DataWig


# Python Imports

In [60]:
%config IPCompleter.greedy=True
import os
import sys
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
    
import pandas as pd
import numpy as np
from pandas.plotting import scatter_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelBinarizer
pd.set_option('display.max_columns', 125)
import quilt
from scripts.preprocess import percent_missing, align_dataframes, as_dict
from string import Template
import missingno as msno
import impyute
import datawig
from sklearn.metrics import f1_score, classification_report

In [49]:
from quilt.data.avare import homecredit

In [50]:
# avoid parens and copy original data
table = 'previous_application'
df = homecredit[table]().copy(deep=True)
df.head()

Unnamed: 0,SK_ID_PREV,SK_ID_CURR,NAME_CONTRACT_TYPE,AMT_ANNUITY,AMT_APPLICATION,AMT_CREDIT,AMT_DOWN_PAYMENT,AMT_GOODS_PRICE,WEEKDAY_APPR_PROCESS_START,HOUR_APPR_PROCESS_START,FLAG_LAST_APPL_PER_CONTRACT,NFLAG_LAST_APPL_IN_DAY,RATE_DOWN_PAYMENT,RATE_INTEREST_PRIMARY,RATE_INTEREST_PRIVILEGED,NAME_CASH_LOAN_PURPOSE,NAME_CONTRACT_STATUS,DAYS_DECISION,NAME_PAYMENT_TYPE,CODE_REJECT_REASON,NAME_TYPE_SUITE,NAME_CLIENT_TYPE,NAME_GOODS_CATEGORY,NAME_PORTFOLIO,NAME_PRODUCT_TYPE,CHANNEL_TYPE,SELLERPLACE_AREA,NAME_SELLER_INDUSTRY,CNT_PAYMENT,NAME_YIELD_GROUP,PRODUCT_COMBINATION,DAYS_FIRST_DRAWING,DAYS_FIRST_DUE,DAYS_LAST_DUE_1ST_VERSION,DAYS_LAST_DUE,DAYS_TERMINATION,NFLAG_INSURED_ON_APPROVAL
0,2030495,271877,Consumer loans,1730.43,17145.0,17145.0,0.0,17145.0,SATURDAY,15,Y,1,0.0,0.182832,0.867336,XAP,Approved,-73,Cash through the bank,XAP,,Repeater,Mobile,POS,XNA,Country-wide,35,Connectivity,12.0,middle,POS mobile with interest,365243.0,-42.0,300.0,-42.0,-37.0,0.0
1,2802425,108129,Cash loans,25188.615,607500.0,679671.0,,607500.0,THURSDAY,11,Y,1,,,,XNA,Approved,-164,XNA,XAP,Unaccompanied,Repeater,XNA,Cash,x-sell,Contact center,-1,XNA,36.0,low_action,Cash X-Sell: low,365243.0,-134.0,916.0,365243.0,365243.0,1.0
2,2523466,122040,Cash loans,15060.735,112500.0,136444.5,,112500.0,TUESDAY,11,Y,1,,,,XNA,Approved,-301,Cash through the bank,XAP,"Spouse, partner",Repeater,XNA,Cash,x-sell,Credit and cash offices,-1,XNA,12.0,high,Cash X-Sell: high,365243.0,-271.0,59.0,365243.0,365243.0,1.0
3,2819243,176158,Cash loans,47041.335,450000.0,470790.0,,450000.0,MONDAY,7,Y,1,,,,XNA,Approved,-512,Cash through the bank,XAP,,Repeater,XNA,Cash,x-sell,Credit and cash offices,-1,XNA,12.0,middle,Cash X-Sell: middle,365243.0,-482.0,-152.0,-182.0,-177.0,1.0
4,1784265,202054,Cash loans,31924.395,337500.0,404055.0,,337500.0,THURSDAY,9,Y,1,,,,Repairs,Refused,-781,Cash through the bank,HC,,Repeater,XNA,Cash,walk-in,Credit and cash offices,-1,XNA,24.0,high,Cash Street: high,,,,,,


In [51]:
# drop keys and empty columns
dropcols = ['RATE_INTEREST_PRIVILEGED','RATE_INTEREST_PRIMARY','SK_ID_PREV', 'SK_ID_CURR']
df.drop(dropcols, axis=1, inplace=True)

# drop rows containing null, also done by datawig?
df.dropna(axis=0, how='any', inplace=True)

# select random instances
seed = 500
numinstances = 1000
df = df.sample(numinstances,random_state=seed)
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 929166 to 1290315
Data columns (total 33 columns):
NAME_CONTRACT_TYPE             1000 non-null object
AMT_ANNUITY                    1000 non-null float64
AMT_APPLICATION                1000 non-null float64
AMT_CREDIT                     1000 non-null float64
AMT_DOWN_PAYMENT               1000 non-null float64
AMT_GOODS_PRICE                1000 non-null float64
WEEKDAY_APPR_PROCESS_START     1000 non-null object
HOUR_APPR_PROCESS_START        1000 non-null int64
FLAG_LAST_APPL_PER_CONTRACT    1000 non-null object
NFLAG_LAST_APPL_IN_DAY         1000 non-null int64
RATE_DOWN_PAYMENT              1000 non-null float64
NAME_CASH_LOAN_PURPOSE         1000 non-null object
NAME_CONTRACT_STATUS           1000 non-null object
DAYS_DECISION                  1000 non-null int64
NAME_PAYMENT_TYPE              1000 non-null object
CODE_REJECT_REASON             1000 non-null object
NAME_TYPE_SUITE                1000 non-null objec

# Preprocessing:  Data Types 


Types are inferred, but this may not be what you expect.  Go beyond either categorical or numerical data type:

* decide how to encode the data
* performance : Avoid OneHot for high cardinality columns and decision tree-based algorithms.
* algorithm restrictions


Rule of Thumb: 

https://towardsdatascience.com/7-data-types-a-better-way-to-think-about-data-types-for-machine-learning-939fae99a689

## Assign Data Types

In [52]:
# assign data types
description = pd.read_excel('data/HomeCredit_columns_description.xlsx', sheet_name='Sheet1',usecols=[2,3,4])
description.head()

Unnamed: 0,Table,Row,Type
0,application_train,SK_ID_CURR,numerical
1,application_train,TARGET,categorical
2,application_train,NAME_CONTRACT_TYPE,categorical
3,application_train,CODE_GENDER,categorical
4,application_train,FLAG_OWN_CAR,categorical


In [53]:
# rename to python types
python_cat_dtype = 'object'
python_num_dtype = 'float64'

description.replace('categorical', python_cat_dtype, inplace=True)
description.replace('numerical', python_num_dtype, inplace=True)

# type cols
typecols = description[(description.Table == table)]
typecols.head()

Unnamed: 0,Table,Row,Type
173,previous_application,SK_ID_PREV,float64
174,previous_application,SK_ID_CURR,float64
175,previous_application,NAME_CONTRACT_TYPE,object
176,previous_application,AMT_ANNUITY,float64
177,previous_application,AMT_APPLICATION,float64


In [54]:
# get target columns 
targetcols = pd.DataFrame(df.columns, columns=['Row'])
targetcols.head()

Unnamed: 0,Row
0,NAME_CONTRACT_TYPE
1,AMT_ANNUITY
2,AMT_APPLICATION
3,AMT_CREDIT
4,AMT_DOWN_PAYMENT


In [55]:
# join , ensure col correct -  we dont know which cols are present in the description
targetcols = targetcols.merge(typecols, how='left')
targetcols.head()

Unnamed: 0,Row,Table,Type
0,NAME_CONTRACT_TYPE,previous_application,object
1,AMT_ANNUITY,previous_application,float64
2,AMT_APPLICATION,previous_application,float64
3,AMT_CREDIT,previous_application,float64
4,AMT_DOWN_PAYMENT,previous_application,float64


In [56]:
# retrieve all columns of same type 
cat = targetcols.loc[(targetcols.Type == python_cat_dtype),'Row'].values.tolist()
num = targetcols.loc[(targetcols.Type == python_num_dtype),'Row'].values.tolist()

print(cat)
#print(num)
#print(len(cat) + len(num))

['NAME_CONTRACT_TYPE', 'WEEKDAY_APPR_PROCESS_START', 'FLAG_LAST_APPL_PER_CONTRACT', 'NFLAG_LAST_APPL_IN_DAY', 'NAME_CASH_LOAN_PURPOSE', 'NAME_CONTRACT_STATUS', 'NAME_PAYMENT_TYPE', 'CODE_REJECT_REASON', 'NAME_TYPE_SUITE', 'NAME_CLIENT_TYPE', 'NAME_GOODS_CATEGORY', 'NAME_PORTFOLIO', 'NAME_PRODUCT_TYPE', 'CHANNEL_TYPE', 'SELLERPLACE_AREA', 'NAME_SELLER_INDUSTRY', 'NAME_YIELD_GROUP', 'PRODUCT_COMBINATION', 'NFLAG_INSURED_ON_APPROVAL']


In [57]:
## batch update types 
df[cat] = df[cat].astype(python_cat_dtype)
df[num] = df[num].astype(python_num_dtype)
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 929166 to 1290315
Data columns (total 33 columns):
NAME_CONTRACT_TYPE             1000 non-null object
AMT_ANNUITY                    1000 non-null float64
AMT_APPLICATION                1000 non-null float64
AMT_CREDIT                     1000 non-null float64
AMT_DOWN_PAYMENT               1000 non-null float64
AMT_GOODS_PRICE                1000 non-null float64
WEEKDAY_APPR_PROCESS_START     1000 non-null object
HOUR_APPR_PROCESS_START        1000 non-null float64
FLAG_LAST_APPL_PER_CONTRACT    1000 non-null object
NFLAG_LAST_APPL_IN_DAY         1000 non-null object
RATE_DOWN_PAYMENT              1000 non-null float64
NAME_CASH_LOAN_PURPOSE         1000 non-null object
NAME_CONTRACT_STATUS           1000 non-null object
DAYS_DECISION                  1000 non-null float64
NAME_PAYMENT_TYPE              1000 non-null object
CODE_REJECT_REASON             1000 non-null object
NAME_TYPE_SUITE                1000 non-null 

## Encode Categorical

* Article : Exploring Category Encoders
*  https://towardsdatascience.com/smarter-ways-to-encode-categorical-data-for-machine-learning-part-1-of-3-6dca2f71b159
* Category Encoders Package http://contrib.scikit-learn.org/categorical-encoding/index.html

Take home message: categoricals should look like strings and not numbers . preprocessing feature should be done: example hours are categorical but look like numbers

* boolean flags that are integers  
* HOUR_APPR_PROCESS_START : Hour of day => made it numerical (ordinal)
* NFLAG_LAST_APPL_IN_DAY : (0,1) one hot encoded
* SELLERPLACE_AREA : 2097 instances : numeric code represents categorical ***
* NFLAG_INSURED_ON_APPROVAL : (0,1)


User defined encoding : allows me to reverse encoding, hash sequence of integers to a string

In [58]:
prefix = 's_'
df['NFLAG_LAST_APPL_IN_DAY'] =  prefix + df['NFLAG_LAST_APPL_IN_DAY'].astype(str) 
df['SELLERPLACE_AREA'] = prefix + df['SELLERPLACE_AREA'].astype(str) 
df['NFLAG_INSURED_ON_APPROVAL'] = prefix +  df['NFLAG_INSURED_ON_APPROVAL'].astype(str) 

# Train Model

In [None]:
# select a portion of the data for evaluation
df_train, df_test = datawig.utils.random_split(df)

input_cols = [*df.columns.values] ## except producte
output_column = 'PRODUCT_COMBINATION'
output_path = 'imputer_model'

# Initialize a SimpleImputer model
imputer = datawig.SimpleImputer(
    input_columns=input_cols,  # columns containing information about the column we want to impute
    output_column='PRODUCT_COMBINATION',  # the column we'd like to impute values for
    output_path=output_path  # stores model data and metrics
)

# Fit an imputer model on the train data
#imputer.fit(train_df=df_train, num_epochs=5)

# Fit an imputer model with default list of hyperparameters
imputer.fit_hpo(train_df=df_train)

# Evaluate Performance

In [59]:
# Impute missing values and return original dataframe with predictions
predictions = imputer.predict(df_test)

# Calculate f1 score for true vs predicted values
f1 = f1_score(predictions[output_column], predictions[output_column+'_imputed'], average='weighted')

# Print overall classification report
print(classification_report(predictions[output_column], predictions[output_column+'_imputed']))


# fit an imputer model with customized hyperparameters
#imputer.fit_hpo(
#    train_df=df_train,
#    num_epochs=100,
#    patience=3,
#    learning_rate_candidates=[1e-3, 3e-4, 1e-4]
#)

2019-05-01 19:08:38,007 [INFO]  Assuming 14 numeric input columns: AMT_ANNUITY, AMT_APPLICATION, AMT_CREDIT, AMT_DOWN_PAYMENT, AMT_GOODS_PRICE, HOUR_APPR_PROCESS_START, RATE_DOWN_PAYMENT, DAYS_DECISION, CNT_PAYMENT, DAYS_FIRST_DRAWING, DAYS_FIRST_DUE, DAYS_LAST_DUE_1ST_VERSION, DAYS_LAST_DUE, DAYS_TERMINATION
2019-05-01 19:08:38,009 [INFO]  Assuming 19 string input columns: NAME_PORTFOLIO, NAME_TYPE_SUITE, NAME_CASH_LOAN_PURPOSE, NAME_CONTRACT_STATUS, NAME_PAYMENT_TYPE, NAME_CLIENT_TYPE, NAME_SELLER_INDUSTRY, NAME_GOODS_CATEGORY, NAME_PRODUCT_TYPE, FLAG_LAST_APPL_PER_CONTRACT, SELLERPLACE_AREA, NAME_YIELD_GROUP, CHANNEL_TYPE, PRODUCT_COMBINATION, NAME_CONTRACT_TYPE, NFLAG_INSURED_ON_APPROVAL, CODE_REJECT_REASON, WEEKDAY_APPR_PROCESS_START, NFLAG_LAST_APPL_IN_DAY
2019-05-01 19:08:38,010 [INFO]  Assuming 14 numeric input columns: AMT_ANNUITY, AMT_APPLICATION, AMT_CREDIT, AMT_DOWN_PAYMENT, AMT_GOODS_PRICE, HOUR_APPR_PROCESS_START, RATE_DOWN_PAYMENT, DAYS_DECISION, CNT_PAYMENT, DAYS_FIRST_

2019-05-01 19:08:38,075 [INFO]  12 most often encountered discrete values:                      ['POS household with interest' 'POS mobile with interest'
 'POS industry with interest' 'POS household without interest'
 'Cash Street: high' 'POS other with interest' 'Cash X-Sell: high'
 'POS industry without interest' 'Cash X-Sell: middle'
 'Cash Street: middle' 'POS mobile without interest' 'Cash X-Sell: low']
2019-05-01 19:08:38,096 [INFO]  Detected 0 rows with missing labels                         for column PRODUCT_COMBINATION
2019-05-01 19:08:38,098 [INFO]  Dropping 0/640 rows
2019-05-01 19:08:38,103 [INFO]  Detected 0 rows with missing labels                         for column PRODUCT_COMBINATION
2019-05-01 19:08:38,106 [INFO]  Dropping 0/160 rows
2019-05-01 19:08:38,109 [INFO]  Train: 640, Test: 160
2019-05-01 19:08:38,110 [INFO]  Fitting data encoder <class 'datawig.column_encoders.NumericalEncoder'> on columns AMT_ANNUITY and 640 rows of training data with parameters {'input_col

2019-05-01 19:08:38,351 [INFO]  Concatenating numeric columns ['AMT_CREDIT'] into AMT_CREDIT_numeric
2019-05-01 19:08:38,352 [INFO]  Normalizing with StandardScaler
2019-05-01 19:08:38,356 [INFO]  Data Encoding - Encoded 640 rows of column                         AMT_CREDIT with <class 'datawig.column_encoders.NumericalEncoder'> into                         <class 'numpy.ndarray'> of shape (640, 1)                         and then into shape (640, 1)
2019-05-01 19:08:38,362 [INFO]  Concatenating numeric columns ['AMT_DOWN_PAYMENT'] into AMT_DOWN_PAYMENT_numeric
2019-05-01 19:08:38,364 [INFO]  Normalizing with StandardScaler
2019-05-01 19:08:38,368 [INFO]  Data Encoding - Encoded 640 rows of column                         AMT_DOWN_PAYMENT with <class 'datawig.column_encoders.NumericalEncoder'> into                         <class 'numpy.ndarray'> of shape (640, 1)                         and then into shape (640, 1)
2019-05-01 19:08:38,373 [INFO]  Concatenating numeric columns ['AMT_GOOD

2019-05-01 19:08:39,089 [INFO]  Data Encoding - Encoded 640 rows of column                         PRODUCT_COMBINATION with <class 'datawig.column_encoders.BowEncoder'> into                         <class 'scipy.sparse.csr.csr_matrix'> of shape (640, 32768)                         and then into shape (640, 32768)
2019-05-01 19:08:39,092 [INFO]  Concatenating numeric columns ['DAYS_FIRST_DRAWING'] into DAYS_FIRST_DRAWING_numeric
2019-05-01 19:08:39,094 [INFO]  Normalizing with StandardScaler
2019-05-01 19:08:39,098 [INFO]  Data Encoding - Encoded 640 rows of column                         DAYS_FIRST_DRAWING with <class 'datawig.column_encoders.NumericalEncoder'> into                         <class 'numpy.ndarray'> of shape (640, 1)                         and then into shape (640, 1)
2019-05-01 19:08:39,102 [INFO]  Concatenating numeric columns ['DAYS_FIRST_DUE'] into DAYS_FIRST_DUE_numeric
2019-05-01 19:08:39,105 [INFO]  Normalizing with StandardScaler
2019-05-01 19:08:39,110 [INFO]  D

2019-05-01 19:08:39,339 [INFO]  Data Encoding - Encoded 160 rows of column                         NAME_CONTRACT_STATUS with <class 'datawig.column_encoders.BowEncoder'> into                         <class 'scipy.sparse.csr.csr_matrix'> of shape (160, 32768)                         and then into shape (160, 32768)
2019-05-01 19:08:39,343 [INFO]  Concatenating numeric columns ['DAYS_DECISION'] into DAYS_DECISION_numeric
2019-05-01 19:08:39,345 [INFO]  Normalizing with StandardScaler
2019-05-01 19:08:39,350 [INFO]  Data Encoding - Encoded 160 rows of column                         DAYS_DECISION with <class 'datawig.column_encoders.NumericalEncoder'> into                         <class 'numpy.ndarray'> of shape (160, 1)                         and then into shape (160, 1)
2019-05-01 19:08:39,366 [INFO]  Data Encoding - Encoded 160 rows of column                         NAME_PAYMENT_TYPE with <class 'datawig.column_encoders.BowEncoder'> into                         <class 'scipy.sparse.csr

2019-05-01 19:08:39,545 [INFO]  Concatenating all 33 latent symbols
2019-05-01 19:08:39,547 [INFO]  Constructing categorical loss for column PRODUCT_COMBINATION and 12 labels
2019-05-01 19:08:39,550 [INFO]  Building output of label <Symbol PRODUCT_COMBINATION> with 13 classes                      (including missing class)
2019-05-01 19:08:39,554 [INFO]  Building output symbols
2019-05-01 19:08:39,570 [INFO]  
2019-05-01 19:08:44,304 [INFO]  Epoch[0] Batch [0-20]	Speed: 76.64 samples/sec	cross-entropy=1.163805	PRODUCT_COMBINATION-accuracy=0.633929
2019-05-01 19:08:48,204 [INFO]  Epoch[0] Train-cross-entropy=0.865654
2019-05-01 19:08:48,205 [INFO]  Epoch[0] Train-PRODUCT_COMBINATION-accuracy=0.737500
2019-05-01 19:08:48,206 [INFO]  Epoch[0] Time cost=8.622
2019-05-01 19:08:48,453 [INFO]  Saved checkpoint to "imputer_model0/model-0000.params"
2019-05-01 19:08:49,788 [INFO]  Epoch[0] Validation-cross-entropy=0.466289
2019-05-01 19:08:49,789 [INFO]  Epoch[0] Validation-PRODUCT_COMBINATION-a

2019-05-01 19:10:43,380 [INFO]  Epoch[12] Time cost=8.139
2019-05-01 19:10:43,592 [INFO]  Saved checkpoint to "imputer_model0/model-0012.params"
2019-05-01 19:10:44,953 [INFO]  Epoch[12] Validation-cross-entropy=0.132220
2019-05-01 19:10:44,955 [INFO]  Epoch[12] Validation-PRODUCT_COMBINATION-accuracy=0.968750
2019-05-01 19:10:49,157 [INFO]  Epoch[13] Batch [0-20]	Speed: 80.16 samples/sec	cross-entropy=0.018635	PRODUCT_COMBINATION-accuracy=1.000000
2019-05-01 19:10:52,957 [INFO]  Epoch[13] Train-cross-entropy=0.019962
2019-05-01 19:10:52,958 [INFO]  Epoch[13] Train-PRODUCT_COMBINATION-accuracy=0.998437
2019-05-01 19:10:52,960 [INFO]  Epoch[13] Time cost=8.004
2019-05-01 19:10:53,167 [INFO]  Saved checkpoint to "imputer_model0/model-0013.params"
2019-05-01 19:10:54,506 [INFO]  Epoch[13] Validation-cross-entropy=0.131783
2019-05-01 19:10:54,507 [INFO]  Epoch[13] Validation-PRODUCT_COMBINATION-accuracy=0.968750
2019-05-01 19:10:58,650 [INFO]  Epoch[14] Batch [0-20]	Speed: 81.26 samples/se

2019-05-01 19:12:07,989 [INFO]  Attribute PRODUCT_COMBINATION, Label: POS mobile without interest	Reaching 0.75 precision / 0.75 recall at threshold 0.007787339389324188
2019-05-01 19:12:07,992 [INFO]  save metrics in imputer_model0/fit-test-metrics.json
2019-05-01 19:12:08,000 [INFO]  Keeping imputer_model0/model-0017.params
2019-05-01 19:12:08,003 [INFO]  Deleting imputer_model0/model-0012.params
2019-05-01 19:12:08,004 [INFO]  Deleting imputer_model0/model-0010.params
2019-05-01 19:12:08,008 [INFO]  Deleting imputer_model0/model-0003.params
2019-05-01 19:12:08,009 [INFO]  Deleting imputer_model0/model-0016.params
2019-05-01 19:12:08,014 [INFO]  Deleting imputer_model0/model-0019.params
2019-05-01 19:12:08,020 [INFO]  Deleting imputer_model0/model-0013.params
2019-05-01 19:12:08,023 [INFO]  Deleting imputer_model0/model-0000.params
2019-05-01 19:12:08,026 [INFO]  Deleting imputer_model0/model-0008.params
2019-05-01 19:12:08,028 [INFO]  Deleting imputer_model0/model-0014.params
2019-0

2019-05-01 19:12:08,215 [INFO]  Data Encoding - Encoded 160 rows of column                         NAME_CLIENT_TYPE with <class 'datawig.column_encoders.BowEncoder'> into                         <class 'scipy.sparse.csr.csr_matrix'> of shape (160, 32768)                         and then into shape (160, 32768)
2019-05-01 19:12:08,223 [INFO]  Data Encoding - Encoded 160 rows of column                         NAME_GOODS_CATEGORY with <class 'datawig.column_encoders.BowEncoder'> into                         <class 'scipy.sparse.csr.csr_matrix'> of shape (160, 32768)                         and then into shape (160, 32768)
2019-05-01 19:12:08,227 [INFO]  Data Encoding - Encoded 160 rows of column                         NAME_PORTFOLIO with <class 'datawig.column_encoders.BowEncoder'> into                         <class 'scipy.sparse.csr.csr_matrix'> of shape (160, 32768)                         and then into shape (160, 32768)
2019-05-01 19:12:08,231 [INFO]  Data Encoding - Encoded 160 row

2019-05-01 19:12:09,799 [INFO]  Normalizing with StandardScaler
2019-05-01 19:12:09,804 [INFO]  Data Encoding - Encoded 640 rows of column                         AMT_DOWN_PAYMENT with <class 'datawig.column_encoders.NumericalEncoder'> into                         <class 'numpy.ndarray'> of shape (640, 1)                         and then into shape (640, 1)
2019-05-01 19:12:09,808 [INFO]  Concatenating numeric columns ['AMT_GOODS_PRICE'] into AMT_GOODS_PRICE_numeric
2019-05-01 19:12:09,809 [INFO]  Normalizing with StandardScaler
2019-05-01 19:12:09,813 [INFO]  Data Encoding - Encoded 640 rows of column                         AMT_GOODS_PRICE with <class 'datawig.column_encoders.NumericalEncoder'> into                         <class 'numpy.ndarray'> of shape (640, 1)                         and then into shape (640, 1)
2019-05-01 19:12:09,827 [INFO]  Data Encoding - Encoded 640 rows of column                         WEEKDAY_APPR_PROCESS_START with <class 'datawig.column_encoders.BowEnco

2019-05-01 19:12:10,163 [INFO]  Data Encoding - Encoded 640 rows of column                         DAYS_FIRST_DRAWING with <class 'datawig.column_encoders.NumericalEncoder'> into                         <class 'numpy.ndarray'> of shape (640, 1)                         and then into shape (640, 1)
2019-05-01 19:12:10,167 [INFO]  Concatenating numeric columns ['DAYS_FIRST_DUE'] into DAYS_FIRST_DUE_numeric
2019-05-01 19:12:10,168 [INFO]  Normalizing with StandardScaler
2019-05-01 19:12:10,172 [INFO]  Data Encoding - Encoded 640 rows of column                         DAYS_FIRST_DUE with <class 'datawig.column_encoders.NumericalEncoder'> into                         <class 'numpy.ndarray'> of shape (640, 1)                         and then into shape (640, 1)
2019-05-01 19:12:10,176 [INFO]  Concatenating numeric columns ['DAYS_LAST_DUE_1ST_VERSION'] into DAYS_LAST_DUE_1ST_VERSION_numeric
2019-05-01 19:12:10,177 [INFO]  Normalizing with StandardScaler
2019-05-01 19:12:10,181 [INFO]  Data Enc

2019-05-01 19:12:16,797 [INFO]  Data Encoding - Encoded 208 rows of column                         NAME_CASH_LOAN_PURPOSE with <class 'datawig.column_encoders.BowEncoder'> into                         <class 'scipy.sparse.csr.csr_matrix'> of shape (208, 32768)                         and then into shape (208, 32768)
2019-05-01 19:12:16,804 [INFO]  Data Encoding - Encoded 208 rows of column                         NAME_CONTRACT_STATUS with <class 'datawig.column_encoders.BowEncoder'> into                         <class 'scipy.sparse.csr.csr_matrix'> of shape (208, 32768)                         and then into shape (208, 32768)
2019-05-01 19:12:16,807 [INFO]  Concatenating numeric columns ['DAYS_DECISION'] into DAYS_DECISION_numeric
2019-05-01 19:12:16,809 [INFO]  Normalizing with StandardScaler
2019-05-01 19:12:16,812 [INFO]  Data Encoding - Encoded 208 rows of column                         DAYS_DECISION with <class 'datawig.column_encoders.NumericalEncoder'> into                      

2019-05-01 19:12:16,981 [INFO]  Label Encoding - Encoded 208 rows of column                             PRODUCT_COMBINATION with <class 'datawig.column_encoders.CategoricalEncoder'> into                             <class 'numpy.ndarray'> of shape (208, 1) and                             then into shape (208, 1)


AttributeError: module 'datawig' has no attribute 'f1_score'

## Missing Numerical

In [162]:
# batch: assign each column, select from description
for c in cols:
    dtype = description[c]
    df[c] = df[c].astype(dtype)
    
df.info(versbose=True)

Unnamed: 0,Table,Row,Type
175,previous_application,NAME_CONTRACT_TYPE,categorical
181,previous_application,WEEKDAY_APPR_PROCESS_START,categorical
182,previous_application,HOUR_APPR_PROCESS_START,categorical
183,previous_application,FLAG_LAST_APPL_PER_CONTRACT,categorical
184,previous_application,NFLAG_LAST_APPL_IN_DAY,categorical


#### Strategy: Impute Probalistic

* Datawig: https://github.com/awslabs/datawig/blob/master/README.md#imputation-of-numerical-columns

In [None]:
# fill in some nulls
 ## hmm does not include missing data in th 
#seed = 200
#nullval = df.sample(frac=0.2,random_state=seed)
#test = df.loc[nullval.index,['PRODUCT_COMBINATION']]
nullval.index  
df.loc[nullval.index,['PRODUCT_COMBINATION']] = np.nan
df.loc[nullval.index,['PRODUCT_COMBINATION']].head()


# test set: counts for each value 
test.PRODUCT_COMBINATION.value_counts()

In [None]:
###### TODO : Assignment of Categoricals is not working!!!

#https://stackoverflow.com/questions/32718639/pandas-filling-nans-in-categorical-data
# update data types : Once you create Categorical Data, you can insert only values in category.
#print('Updating data types')

#table = 'previous_application'

# retriev type from description
#meta = hc_description.loc[hc_description['Table']==table,['Row','Type']]
#dict_types = as_dict(meta)

# set types in data table
#df = df.astype(dict_types)
