In [1]:
#The paper noted that beyond the core points of feature engineering and infill
#The Automunge library contains several other push-button methods
#With the goal to automate the full workflow for tabular data
#For the steps between receipt of tidy data
#And returned sets suitable for machine learning application.

#Here we'll quickly demonstrate a few more of those pushbutton methods.


In [2]:
#Note that for all of the methods demonstrated here
#Any data transformation conducted in the automunge(.) function
#Will be comparably applied to subsequent test data in the postmunge(.) function

In [2]:
#imports
import pandas as pd
import numpy as np

#Import the AutoMunge class by name rather than with a wildcard
#So it is clear where the name comes from and the namespace stays clean
from Automunge import AutoMunge

#Instantiate the class
#The automunge(.) and postmunge(.) calls below are made on this am object
am = AutoMunge()


In [3]:
#The sample data is a common introductory tabular data set
#From the Kaggle competition "House Prices Advanced Regression Techniques"
#Available at:
#https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data

#Read the housing train and test sets
#The csv files are expected in the current working directory
df_train = pd.read_csv(filepath_or_buffer='housing_train.csv')
df_test = pd.read_csv(filepath_or_buffer='housing_test.csv')


In [4]:
#Note this set includes an intended ID column and a label column
#Both are carved out by passing the associated column headers
#To the automunge(.) and postmunge(.) calls below

labels_column, ID_columnlist = 'SalePrice', 'Id'


# 1) feature importance evaluation

In [5]:
#The library includes an automated feature importance evaluation
#Activated here by passing featureselection = True
#Just make sure a label column is designated in the labels_column parameter

#The results are provided in the printouts

#NOTE(review): the original text says the results are also returned
#In a set called featureimportance, but no such object is unpacked here
#Presumably they are carried in postprocess_dict - confirm against the README

(train, train_ID, labels,
 val, val_ID, val_labels,
 test, test_ID, test_labels,
 postprocess_dict) = am.automunge(
    df_train,
    labels_column=labels_column,
    trainID_column=ID_columnlist,
    featureselection=True,
)

#There are a lot of printouts here
#To inspect the results do a 'control-F' search for
#"Feature Importance results:" or "sorted metric results:"

_______________
Begin Feature Importance evaluation

_______________
Begin Automunge

_______________
Begin Postmunge

Postmunge returned test column set: 
['MSSubClass_nmbr', 'LotFrontage_nmbr', 'LotArea_nmbr', 'Street_bnry', 'Alley_bnry', 'Utilities_bnry', 'OverallQual_nmbr', 'OverallCond_nmbr', 'YearBuilt_nmbr', 'YearRemodAdd_nmbr', 'MasVnrArea_nmbr', 'BsmtFinSF1_nmbr', 'BsmtFinSF2_nmbr', 'BsmtUnfSF_nmbr', 'TotalBsmtSF_nmbr', 'CentralAir_bnry', '1stFlrSF_nmbr', '2ndFlrSF_nmbr', 'LowQualFinSF_nmbr', 'GrLivArea_nmbr', 'BsmtFullBath_nmbr', 'BsmtHalfBath_nmbr', 'FullBath_nmbr', 'HalfBath_nmbr', 'BedroomAbvGr_nmbr', 'KitchenAbvGr_nmbr', 'TotRmsAbvGrd_nmbr', 'Fireplaces_nmbr', 'GarageYrBlt_nmbr', 'GarageCars_nmbr', 'GarageArea_nmbr', 'WoodDeckSF_nmbr', 'OpenPorchSF_nmbr', 'EnclosedPorch_nmbr', '3SsnPorch_nmbr', 'ScreenPorch_nmbr', 'PoolArea_nmbr', 'MiscVal_nmbr', 'MoSold_nmbr', 'YrSold_nmbr', 'MSSubClass_NArw', 'MSZoning_NArw', 'MSZoning_1010_0', 'MSZoning_1010_1', 'MSZoning_1010_2', 'Lot


______________________
sorted metric results:

OverallQual
0.05636834153886283

GrLivArea
0.03228616126395545

BsmtFinSF1
0.004536133033733458

TotalBsmtSF
0.00445482418556109

GarageArea
0.002089001288415271

LotArea
0.0020482808072860292

YearBuilt
0.0011663829583584695

BsmtQual
0.0011361977833564962

OverallCond
0.0011057088298978002

1stFlrSF
0.0008446767904971564

KitchenQual
0.0007764180334883797

GarageCars
0.0007475283548743494

FireplaceQu
0.0007205058393088271

YearRemodAdd
0.0006206042904168818

Fireplaces
0.0004990238274421577

Neighborhood
0.00048075208022901883

LotFrontage
0.0004020762932448463

GarageYrBlt
0.0003832400737440178

SaleCondition
0.000359349324379421

CentralAir
0.000302420590103325

TotRmsAbvGrd
0.0002820875645684895

BsmtExposure
0.0002778878650815386

WoodDeckSF
0.00025315328389796665

FullBath
0.0002249929926332328

BsmtFinType1
0.00019289633014041296

LotConfig
0.00017106078825956939

BsmtUnfSF
0.0001662596757986412

GarageFinish
0.000163483399437724

______

versioning serial stamp:
_8.18_970965970804

Automunge returned train column set: 
['MSSubClass_nmbr', 'LotFrontage_nmbr', 'LotArea_nmbr', 'Street_bnry', 'Alley_bnry', 'Utilities_bnry', 'OverallQual_nmbr', 'OverallCond_nmbr', 'YearBuilt_nmbr', 'YearRemodAdd_nmbr', 'MasVnrArea_nmbr', 'BsmtFinSF1_nmbr', 'BsmtFinSF2_nmbr', 'BsmtUnfSF_nmbr', 'TotalBsmtSF_nmbr', 'CentralAir_bnry', '1stFlrSF_nmbr', '2ndFlrSF_nmbr', 'LowQualFinSF_nmbr', 'GrLivArea_nmbr', 'BsmtFullBath_nmbr', 'BsmtHalfBath_nmbr', 'FullBath_nmbr', 'HalfBath_nmbr', 'BedroomAbvGr_nmbr', 'KitchenAbvGr_nmbr', 'TotRmsAbvGrd_nmbr', 'Fireplaces_nmbr', 'GarageYrBlt_nmbr', 'GarageCars_nmbr', 'GarageArea_nmbr', 'WoodDeckSF_nmbr', 'OpenPorchSF_nmbr', 'EnclosedPorch_nmbr', '3SsnPorch_nmbr', 'ScreenPorch_nmbr', 'PoolArea_nmbr', 'MiscVal_nmbr', 'MoSold_nmbr', 'YrSold_nmbr', 'MSSubClass_NArw', 'MSZoning_NArw', 'MSZoning_1010_0', 'MSZoning_1010_1', 'MSZoning_1010_2', 'LotFrontage_NArw', 'LotArea_NArw', 'Street_NArw', 'Alley_NArw', 'Lot

In [8]:
#A comparable method is also available for data subsequently passed
#To the postmunge(.) function by way of the featureeval parameter

#Since the housing test set doesn't include a label column
#We pass the train set for demonstration
#Along with the postprocess_dict populated in the prior cell
#(So the returned "test" sets here are actually derived from df_train)

#The original note describes this version as faster
#Since it does not incur the overhead of the evaluation methods
#NOTE(review): confirm which overhead is skipped relative to automunge(.)

(test, test_ID, test_labels,
 postreports_dict) = am.postmunge(
    postprocess_dict,
    df_train,
    testID_column=ID_columnlist,
    printstatus=True,
    featureeval=True,
)

_______________
Begin Postmunge

_______________
Begin Feature Importance evaluation

_______________
Begin Postmunge

______

processing column: MSSubClass
    root category: nmbr

 returned columns:
['MSSubClass_nmbr', 'MSSubClass_NArw']

______

processing column: MSZoning
    root category: 1010

 returned columns:
['MSZoning_NArw', 'MSZoning_1010_0', 'MSZoning_1010_1', 'MSZoning_1010_2']

______

processing column: LotFrontage
    root category: nmbr

 returned columns:
['LotFrontage_nmbr', 'LotFrontage_NArw']

______

processing column: LotArea
    root category: nmbr

 returned columns:
['LotArea_nmbr', 'LotArea_NArw']

______

processing column: Street
    root category: bnry

 returned columns:
['Street_bnry', 'Street_NArw']

______

processing column: Alley
    root category: bnry

 returned columns:
['Alley_bnry', 'Alley_NArw']

______

processing column: LotShape
    root category: 1010

 returned columns:
['LotShape_NArw', 'LotShape_1010_0', 'LotShape_1010_1', 'LotShape_10

 returned columns:
['GarageFinish_NArw', 'GarageFinish_1010_0', 'GarageFinish_1010_1']

______

processing column: GarageCars
    root category: nmbr

 returned columns:
['GarageCars_nmbr', 'GarageCars_NArw']

______

processing column: GarageArea
    root category: nmbr

 returned columns:
['GarageArea_nmbr', 'GarageArea_NArw']

______

processing column: GarageQual
    root category: 1010

 returned columns:
['GarageQual_NArw', 'GarageQual_1010_0', 'GarageQual_1010_1', 'GarageQual_1010_2']

______

processing column: GarageCond
    root category: 1010

 returned columns:
['GarageCond_NArw', 'GarageCond_1010_0', 'GarageCond_1010_1', 'GarageCond_1010_2']

______

processing column: PavedDrive
    root category: 1010

 returned columns:
['PavedDrive_NArw', 'PavedDrive_1010_0', 'PavedDrive_1010_1']

______

processing column: WoodDeckSF
    root category: nmbr

 returned columns:
['WoodDeckSF_nmbr', 'WoodDeckSF_NArw']

______

processing column: OpenPorchSF
    root category: nmbr

 retu

8.995728552885218e-05


for source column: GarageCond
GarageCond_1010_2
7.846287486956705e-06

GarageCond_1010_0
2.141793385490587e-05

GarageCond_NArw
5.486485823036347e-05

GarageCond_1010_1
5.853807521993826e-05


for source column: PavedDrive
PavedDrive_1010_0
-1.103298107207884e-06

PavedDrive_NArw
5.8518756852588005e-05

PavedDrive_1010_1
5.976528880968779e-05


for source column: WoodDeckSF
WoodDeckSF_NArw
-0.00015199490767436874

WoodDeckSF_nmbr
0.0


for source column: OpenPorchSF
OpenPorchSF_NArw
-9.236972127657506e-05

OpenPorchSF_nmbr
0.0


for source column: EnclosedPorch
EnclosedPorch_NArw
-4.957196360921845e-05

EnclosedPorch_nmbr
0.0


for source column: 3SsnPorch
3SsnPorch_NArw
-1.4750687785558547e-05

3SsnPorch_nmbr
0.0


for source column: ScreenPorch
ScreenPorch_nmbr
0.0

ScreenPorch_NArw
2.7833261230902906e-05


for source column: PoolArea
PoolArea_nmbr
0.0

PoolArea_NArw
0.0


for source column: PoolQC
PoolQC_NArw
0.0

PoolQC_1010_0
0.0

PoolQC_1010_1
0.0


for so

 returned columns:
['Electrical_NArw', 'Electrical_1010_0', 'Electrical_1010_1', 'Electrical_1010_2']

______

processing column: 1stFlrSF
    root category: nmbr

 returned columns:
['1stFlrSF_nmbr', '1stFlrSF_NArw']

______

processing column: 2ndFlrSF
    root category: nmbr

 returned columns:
['2ndFlrSF_nmbr', '2ndFlrSF_NArw']

______

processing column: LowQualFinSF
    root category: nmbr

 returned columns:
['LowQualFinSF_nmbr', 'LowQualFinSF_NArw']

______

processing column: GrLivArea
    root category: nmbr

 returned columns:
['GrLivArea_nmbr', 'GrLivArea_NArw']

______

processing column: BsmtFullBath
    root category: nmbr

 returned columns:
['BsmtFullBath_nmbr', 'BsmtFullBath_NArw']

______

processing column: BsmtHalfBath
    root category: nmbr

 returned columns:
['BsmtHalfBath_nmbr', 'BsmtHalfBath_NArw']

______

processing column: FullBath
    root category: nmbr

 returned columns:
['FullBath_nmbr', 'FullBath_NArw']

______

processing column: HalfBath
    root c

infill to column: HouseStyle_1010_1
     infill type: MLinfill

infill to column: HouseStyle_1010_2
     infill type: MLinfill

infill to column: HouseStyle_1010_3
     infill type: MLinfill

infill to column: KitchenAbvGr_nmbr
     infill type: MLinfill

infill to column: KitchenQual_1010_0
     infill type: MLinfill

infill to column: KitchenQual_1010_1
     infill type: MLinfill

infill to column: KitchenQual_1010_2
     infill type: MLinfill

infill to column: LandContour_1010_0
     infill type: MLinfill

infill to column: LandContour_1010_1
     infill type: MLinfill

infill to column: LandContour_1010_2
     infill type: MLinfill

infill to column: LandSlope_1010_0
     infill type: MLinfill

infill to column: LandSlope_1010_1
     infill type: MLinfill

infill to column: LotArea_nmbr
     infill type: MLinfill

infill to column: LotConfig_1010_0
     infill type: MLinfill

infill to column: LotConfig_1010_1
     infill type: MLinfill

infill to column: LotConfig_1010_2
     inf

# 2) Dimensionality Reduction

In [10]:
#When feature importance is evaluated in an automunge(.) call
#The results of the report can be used to perform
#A kind of dimensionality reduction
#Where columns below some threshold are trimmed

#We can either designate a percent of columns to retain
#With featureselection = 'pct' and featurethreshold = #

#Or alternatively designate a minimum metric score for retention
#With featureselection = 'metric' and featurethreshold = #

#In the prior example dimensionality reduction was not performed
#Since featureselection = True was passed rather than 'pct' or 'metric'

#Here we'll demonstrate retaining the top 90% of columns
#Based on the evaluated feature importance metric

(train, train_ID, labels,
 val, val_ID, val_labels,
 test, test_ID, test_labels,
 postprocess_dict) = am.automunge(
    df_train,
    labels_column=labels_column,
    trainID_column=ID_columnlist,
    featureselection='pct',
    featurethreshold=0.9,
)

#If you want to inspect the results in printouts
#Try a control-F for "Begin feature importance dimensionality reduction"

_______________
Begin Feature Importance evaluation

_______________
Begin Automunge

_______________
Begin Postmunge

Postmunge returned test column set: 
['MSSubClass_nmbr', 'LotFrontage_nmbr', 'LotArea_nmbr', 'Street_bnry', 'Alley_bnry', 'Utilities_bnry', 'OverallQual_nmbr', 'OverallCond_nmbr', 'YearBuilt_nmbr', 'YearRemodAdd_nmbr', 'MasVnrArea_nmbr', 'BsmtFinSF1_nmbr', 'BsmtFinSF2_nmbr', 'BsmtUnfSF_nmbr', 'TotalBsmtSF_nmbr', 'CentralAir_bnry', '1stFlrSF_nmbr', '2ndFlrSF_nmbr', 'LowQualFinSF_nmbr', 'GrLivArea_nmbr', 'BsmtFullBath_nmbr', 'BsmtHalfBath_nmbr', 'FullBath_nmbr', 'HalfBath_nmbr', 'BedroomAbvGr_nmbr', 'KitchenAbvGr_nmbr', 'TotRmsAbvGrd_nmbr', 'Fireplaces_nmbr', 'GarageYrBlt_nmbr', 'GarageCars_nmbr', 'GarageArea_nmbr', 'WoodDeckSF_nmbr', 'OpenPorchSF_nmbr', 'EnclosedPorch_nmbr', '3SsnPorch_nmbr', 'ScreenPorch_nmbr', 'PoolArea_nmbr', 'MiscVal_nmbr', 'MoSold_nmbr', 'YrSold_nmbr', 'MSSubClass_NArw', 'MSZoning_NArw', 'MSZoning_1010_0', 'MSZoning_1010_1', 'MSZoning_1010_2', 'Lot


______________________
sorted metric results:

OverallQual
0.052000379506635275

GrLivArea
0.027696327334070014

TotalBsmtSF
0.003378161693805559

BsmtFinSF1
0.003272481915978287

YearBuilt
0.0020786819208482576

YearRemodAdd
0.0017864341052838961

KitchenQual
0.0015685521574667627

MSSubClass
0.0010797298874232197

GarageArea
0.000837034538546888

Neighborhood
0.0007611533705310647

1stFlrSF
0.0005731001235970146

OverallCond
0.0005182309187093948

OpenPorchSF
0.00046181366645470234

BsmtExposure
0.00035198432743810404

GarageYrBlt
0.00031945043563452735

CentralAir
0.00024317806076845638

MSZoning
0.00020673399541404347

ExterQual
0.0002053430485393637

Foundation
0.00017432397277195033

KitchenAbvGr
0.00016353740725794097

Exterior1st
0.00013432510501987238

BsmtFinType1
0.00012831620369091468

GarageFinish
0.00011508006476157462

BsmtCond
0.00011053968708485407

ScreenPorch
0.00010352415560821449

LotConfig
0.00010164380477950541

Functional
9.462034595031366e-05

BsmtFullBath
9.3

______

versioning serial stamp:
_8.18_337613420920

Automunge returned train column set: 
['MSSubClass_nmbr', 'Street_bnry', 'Alley_bnry', 'Utilities_bnry', 'OverallQual_nmbr', 'OverallCond_nmbr', 'YearBuilt_nmbr', 'YearRemodAdd_nmbr', 'BsmtFinSF1_nmbr', 'BsmtFinSF2_nmbr', 'BsmtUnfSF_nmbr', 'TotalBsmtSF_nmbr', 'CentralAir_bnry', '1stFlrSF_nmbr', 'LowQualFinSF_nmbr', 'GrLivArea_nmbr', 'BsmtFullBath_nmbr', 'BsmtHalfBath_nmbr', 'HalfBath_nmbr', 'BedroomAbvGr_nmbr', 'KitchenAbvGr_nmbr', 'TotRmsAbvGrd_nmbr', 'GarageYrBlt_nmbr', 'GarageArea_nmbr', 'WoodDeckSF_nmbr', 'OpenPorchSF_nmbr', 'EnclosedPorch_nmbr', '3SsnPorch_nmbr', 'ScreenPorch_nmbr', 'PoolArea_nmbr', 'MiscVal_nmbr', 'MoSold_nmbr', 'YrSold_nmbr', 'MSSubClass_NArw', 'MSZoning_NArw', 'MSZoning_1010_0', 'MSZoning_1010_1', 'MSZoning_1010_2', 'Street_NArw', 'Alley_NArw', 'LotShape_NArw', 'LotShape_1010_0', 'LotShape_1010_1', 'LotShape_1010_2', 'LandContour_NArw', 'LandContour_1010_0', 'LandContour_1010_1', 'LandContour_1010_2', 'Utilit

In [11]:
#Shape of the train set returned by the call above
#(The feature importance dimensionality reduction):
train.shape

(1460, 216)

In [12]:
#Another kind of dimensionality reduction is available by PCA
#(Principal Component Analysis)
#Which is built on top of the SciKit Learn PCA library

#The default PCA is kernel PCA for all-positive sets
#(Such as may be prepared by applying mnmx
#as the default numerical encoding)
#Or otherwise the default PCA is sparse PCA

#Here we demonstrate a PCA model consolidating numeric sets
#To 5 returned columns by passing PCAn_components = 5

#Boolean or ordinal encoded columns are meant to be excluded from PCA
#Which is an option in the ML_cmnd parameter
#NOTE(review): ML_cmnd is not passed in this call
#So this presumably relies on the library default - confirm in the README

#If there are any other columns we want to exclude from PCA
#We can designate them as a list of column headers to the PCAexcl parameter

(train, train_ID, labels,
 val, val_ID, val_labels,
 test, test_ID, test_labels,
 postprocess_dict) = am.automunge(
    df_train,
    labels_column=labels_column,
    trainID_column=ID_columnlist,
    PCAn_components=5,
)


_______________
Begin Automunge

______

versioning serial stamp:
_8.18_301633192154

Automunge returned train column set: 
['PCA__0', 'PCA__1', 'PCA__2', 'PCA__3', 'PCA__4', 'WoodDeckSF_NArw', 'BsmtFinType2_1010_0', 'GarageType_1010_0', 'BsmtFinType1_1010_0', 'GarageArea_NArw', 'EnclosedPorch_NArw', 'Condition1_1010_3', 'BsmtExposure_1010_2', 'GarageYrBlt_NArw', 'MoSold_NArw', 'Neighborhood_1010_4', 'Heating_1010_1', 'HouseStyle_1010_3', 'MasVnrArea_NArw', 'Alley_NArw', 'LotShape_1010_0', 'SaleType_1010_2', '1stFlrSF_NArw', 'LandSlope_1010_0', 'Neighborhood_1010_3', 'LotFrontage_NArw', 'Exterior2nd_1010_2', 'GarageCond_1010_0', 'KitchenQual_NArw', 'Condition2_1010_0', 'GarageCond_NArw', 'LotConfig_1010_1', 'Condition2_1010_2', 'BsmtFinType2_1010_1', 'MSZoning_NArw', 'Exterior1st_1010_2', 'Exterior1st_1010_1', 'PoolQC_NArw', 'Street_NArw', 'GarageFinish_1010_1', 'GarageType_1010_1', 'Condition1_1010_2', 'ScreenPorch_NArw', 'ExterQual_NArw', 'BsmtFinSF1_NArw', 'YrSold_NArw', 'BsmtUnfSF_

In [13]:
#Shape of the train set returned by the call above
#(The PCA dimensionality reduction):
train.shape

(1460, 211)

In [14]:
#Another option for dimensionality reduction is the Binary transform
#Which collectively applies a binary transform
#To the set of boolean encoded columns

#Here the Binary transform for boolean encoded columns is combined
#With a PCA transform for numerically encoded columns
#Such as to achieve a significant reduction in the set

#The transform is activated with the Binary parameter
#The original author also advises excl_suffix = True as good practice

(train, train_ID, labels,
 val, val_ID, val_labels,
 test, test_ID, test_labels,
 postprocess_dict) = am.automunge(
    df_train,
    labels_column=labels_column,
    trainID_column=ID_columnlist,
    PCAn_components=5,
    Binary=True,
    excl_suffix=True,
)


_______________
Begin Automunge

______

versioning serial stamp:
_8.18_384444496282

Automunge returned train column set: 
['PCA__0', 'PCA__1', 'PCA__2', 'PCA__3', 'PCA__4', 'Binary__1010_0', 'Binary__1010_1', 'Binary__1010_2', 'Binary__1010_3', 'Binary__1010_4', 'Binary__1010_5', 'Binary__1010_6', 'Binary__1010_7', 'Binary__1010_8', 'Binary__1010_9', 'Binary__1010_10']

Automunge returned ID column set: 
['Id', 'Automunge_index']

Automunge returned label column set: 
['SalePrice_lbnb']

_______________
Automunge Complete



In [15]:
#Shape of the train set returned by the call above
#(The combined Binary and PCA transforms):
train.shape

(1460, 16)

# 3) Preparation for Oversampling

In [16]:
#The preparation for oversampling method
#Evaluates the distribution of label classes
#And duplicates rows corresponding to underrepresented labels
#To (approximately) levelize the distribution of labels
#It is activated by passing TrainLabelFreqLevel = True

#The method evaluates the distribution based on categoric encodings
#For numeric labels, such as in this housing data set
#It also works by aggregating bins from the distribution
#Such as standard deviation bins or bins based on powers of ten
#Available by assigning a numeric label set in assigncat
#To exc3 for standard deviation bins or exc4 for powers of ten
#Where the bins supplement the returned pass-through numeric sets

#Here the label column is assigned to exc4

(train, train_ID, labels,
 val, val_ID, val_labels,
 test, test_ID, test_labels,
 postprocess_dict) = am.automunge(
    df_train,
    labels_column=labels_column,
    trainID_column=ID_columnlist,
    assigncat={'exc4': labels_column},
    TrainLabelFreqLevel=True,
)


_______________
Begin Automunge

______

versioning serial stamp:
_8.18_460605223677

Automunge returned train column set: 
['MSSubClass_nmbr', 'LotFrontage_nmbr', 'LotArea_nmbr', 'Street_bnry', 'Alley_bnry', 'Utilities_bnry', 'OverallQual_nmbr', 'OverallCond_nmbr', 'YearBuilt_nmbr', 'YearRemodAdd_nmbr', 'MasVnrArea_nmbr', 'BsmtFinSF1_nmbr', 'BsmtFinSF2_nmbr', 'BsmtUnfSF_nmbr', 'TotalBsmtSF_nmbr', 'CentralAir_bnry', '1stFlrSF_nmbr', '2ndFlrSF_nmbr', 'LowQualFinSF_nmbr', 'GrLivArea_nmbr', 'BsmtFullBath_nmbr', 'BsmtHalfBath_nmbr', 'FullBath_nmbr', 'HalfBath_nmbr', 'BedroomAbvGr_nmbr', 'KitchenAbvGr_nmbr', 'TotRmsAbvGrd_nmbr', 'Fireplaces_nmbr', 'GarageYrBlt_nmbr', 'GarageCars_nmbr', 'GarageArea_nmbr', 'WoodDeckSF_nmbr', 'OpenPorchSF_nmbr', 'EnclosedPorch_nmbr', '3SsnPorch_nmbr', 'ScreenPorch_nmbr', 'PoolArea_nmbr', 'MiscVal_nmbr', 'MoSold_nmbr', 'YrSold_nmbr', 'MSSubClass_NArw', 'MSZoning_NArw', 'MSZoning_1010_0', 'MSZoning_1010_1', 'MSZoning_1010_2', 'LotFrontage_NArw', 'LotArea_NArw', 

In [17]:
#Shape of the train set returned by the call above
#(The preparation for oversampling):
train.shape

(2714, 242)

# 4) Data Distribution Drift

In [19]:
#When automunge(.) is performed, distribution properties of the train set
#Are collected and saved in the postprocess_dict by default

#So if we later want to compare distribution properties
#Of subsequent test sets
#Against the original train set used to train a model
#We can extract the corresponding properties
#And generate a report in the postmunge(.) function

#Such a drift report can be performed
#In conjunction with postmunge processing
#Or it can be performed independently such as to just generate a report

#Let's demonstrate with the housing test set
#Here we'll just generate a report by passing driftreport = 'report_full'
#In addition to the printouts
#The results are also returned in the object we call postreports_dict

#Note that drift stats are separately collected and reported
#For the source columns as fed to the function
#As well as each intermediate and returned column
#As generated by transformations

(test, test_ID, test_labels,
 postreports_dict) = am.postmunge(
    postprocess_dict,
    df_test,
    testID_column=ID_columnlist,
    driftreport='report_full',
    printstatus=True,
)


_______________
Begin Postmunge

_______________
Preparing Source Column Drift Report:

______
Preparing source column drift report for column: MSSubClass

original drift stats:
{'max': 190.0, 'quantile_99': 190.0, 'quantile_90': 120.0, 'quantile_66': 60.0, 'median': 50.0, 'quantile_33': 20.0, 'quantile_10': 20.0, 'quantile_01': 20.0, 'min': 20.0, 'mean': 56.897260273972606, 'std': 42.30057099381035, 'MAD': 31.282745355601453, 'skew': nan, 'shapiro_W': nan, 'shapiro_p': nan, 'nan_ratio': 0.0}

new drift stats:
{'max': 190, 'quantile_99': 190.0, 'quantile_90': 120.0, 'quantile_66': 60.0, 'median': 50.0, 'quantile_33': 20.0, 'quantile_10': 20.0, 'quantile_01': 20.0, 'min': 20, 'mean': 57.37834132967786, 'std': 42.74687961871836, 'MAD': 32.04530880860025, 'skew': nan, 'shapiro_W': nan, 'shapiro_p': nan, 'nan_ratio': 0.0}

______
Preparing source column drift report for column: MSZoning

original drift stats:
{'unique': array(['RL', 'RM', 'C (all)', 'FV', 'RH'], dtype=object), 'unique_rati

new drift stats:
{'max': 5095, 'quantile_99': 2363.26, 'quantile_90': 1671.0, 'quantile_66': 1254.5600000000002, 'median': 1079.0, 'quantile_33': 936.0, 'quantile_10': 738.0, 'quantile_01': 510.12, 'min': 407, 'mean': 1156.534612748458, 'std': 398.16581959237925, 'MAD': 306.29584235496236, 'skew': nan, 'shapiro_W': nan, 'shapiro_p': nan, 'nan_ratio': 0.0}

______
Preparing source column drift report for column: 2ndFlrSF

original drift stats:
{'max': 2065.0, 'quantile_99': 1418.920000000001, 'quantile_90': 954.2000000000003, 'quantile_66': 591.94, 'median': 0.0, 'quantile_33': 0.0, 'quantile_10': 0.0, 'quantile_01': 0.0, 'min': 0.0, 'mean': 346.99246575342465, 'std': 436.5284358862591, 'MAD': 396.4775492587708, 'skew': nan, 'shapiro_W': nan, 'shapiro_p': nan, 'nan_ratio': 0.0}

new drift stats:
{'max': 1862, 'quantile_99': 1379.9400000000005, 'quantile_90': 908.4000000000001, 'quantile_66': 546.0, 'median': 0.0, 'quantile_33': 0.0, 'quantile_10': 0.0, 'quantile_01': 0.0, 'min': 0, 'mea

______
Preparing drift report for columns derived from: MSSubClass

original returned columns:
['MSSubClass_nmbr', 'MSSubClass_NArw']

new returned columns:
['MSSubClass_nmbr', 'MSSubClass_NArw']

___
derived columns: ['MSSubClass_nmbr']

original automunge normalization parameters:
{'mean': 56.897260273972606, 'std': 42.30057099381035, 'median': 50.0, 'MAD': 31.282745355601453, 'max': 190.0, 'min': 20.0, 'offset': 0, 'multiplier': 1, 'cap': False, 'floor': False, 'abs_zero': True, 'inplace': True, 'suffix': 'nmbr', 'defaultinfill_dict': {'defaultinfill': 'negzeroinfill'}}

new postmunge normalization parameters:
{'mean': 57.37834132967786, 'std': 42.74687961871836, 'median': 50.0, 'MAD': 32.04530880860025, 'max': 190.0, 'min': 20.0, 'offset': 0, 'multiplier': 1, 'cap': False, 'floor': False, 'abs_zero': True, 'inplace': True, 'suffix': 'nmbr', 'defaultinfill_dict': {'defaultinfill': 'negzeroinfill'}}

___
derived columns: ['MSSubClass_NArw']

original automunge normalization parameter

new returned columns:
['Neighborhood_NArw', 'Neighborhood_1010_0', 'Neighborhood_1010_1', 'Neighborhood_1010_2', 'Neighborhood_1010_3', 'Neighborhood_1010_4']

___
derived columns: ['Neighborhood_NArw']

original automunge normalization parameters:
{'pct_NArw': 0.0, 'suffix': 'NArw'}

new postmunge normalization parameters:
{'pct_NArw': 0.0, 'suffix': 'NArw'}

___
derived columns: ['Neighborhood_1010_0', 'Neighborhood_1010_1', 'Neighborhood_1010_2', 'Neighborhood_1010_3', 'Neighborhood_1010_4']

original automunge normalization parameters:
{'random_generator': <class 'numpy.random._pcg64.PCG64'>, 'inplace': True, 'all_activations': False, 'add_activations': False, 'less_activations': False, 'consolidated_activations': False, 'str_convert': True, 'null_activation': True, 'max_zero': False, 'missing_marker': nan, 'inverse_consolidation_translate_dict': {}, 'binary_column_count': 5, 'binary_encoding_dict': {nan: '00000', 'Blmngtn': '00001', 'Blueste': '00010', 'BrDale': '00011', 'BrkSide'

new returned columns:
['Exterior2nd_NArw', 'Exterior2nd_1010_0', 'Exterior2nd_1010_1', 'Exterior2nd_1010_2', 'Exterior2nd_1010_3']

___
derived columns: ['Exterior2nd_NArw']

original automunge normalization parameters:
{'pct_NArw': 0.0, 'suffix': 'NArw'}

new postmunge normalization parameters:
{'pct_NArw': 0.0006854009595613434, 'suffix': 'NArw'}

___
derived columns: ['Exterior2nd_1010_0', 'Exterior2nd_1010_1', 'Exterior2nd_1010_2', 'Exterior2nd_1010_3', 'Exterior2nd_1010_4']

original automunge normalization parameters:
{'random_generator': <class 'numpy.random._pcg64.PCG64'>, 'inplace': True, 'all_activations': False, 'add_activations': False, 'less_activations': False, 'consolidated_activations': False, 'str_convert': True, 'null_activation': True, 'max_zero': False, 'missing_marker': nan, 'inverse_consolidation_translate_dict': {}, 'binary_column_count': 5, 'binary_encoding_dict': {nan: '00000', 'AsbShng': '00001', 'AsphShn': '00010', 'Brk Cmn': '00011', 'BrkFace': '00100', 'CBl

new returned columns:
['BsmtFinType1_NArw', 'BsmtFinType1_1010_0', 'BsmtFinType1_1010_1', 'BsmtFinType1_1010_2']

___
derived columns: ['BsmtFinType1_NArw']

original automunge normalization parameters:
{'pct_NArw': 0.025342465753424658, 'suffix': 'NArw'}

new postmunge normalization parameters:
{'pct_NArw': 0.02878684030157642, 'suffix': 'NArw'}

___
derived columns: ['BsmtFinType1_1010_0', 'BsmtFinType1_1010_1', 'BsmtFinType1_1010_2']

original automunge normalization parameters:
{'random_generator': <class 'numpy.random._pcg64.PCG64'>, 'inplace': True, 'all_activations': False, 'add_activations': False, 'less_activations': False, 'consolidated_activations': False, 'str_convert': True, 'null_activation': True, 'max_zero': False, 'missing_marker': nan, 'inverse_consolidation_translate_dict': {}, 'binary_column_count': 3, 'binary_encoding_dict': {nan: '000', 'ALQ': '001', 'BLQ': '010', 'GLQ': '011', 'LwQ': '100', 'Rec': '101', 'Unf': '110'}, '_1010_activations_dict': {nan: 0.0, 'ALQ': 

new postmunge normalization parameters:
{'pct_NArw': 0.0, 'suffix': 'NArw'}

______
Preparing drift report for columns derived from: BsmtFullBath

original returned columns:
['BsmtFullBath_nmbr', 'BsmtFullBath_NArw']

new returned columns:
['BsmtFullBath_nmbr', 'BsmtFullBath_NArw']

___
derived columns: ['BsmtFullBath_nmbr']

original automunge normalization parameters:
{'mean': 0.42534246575342466, 'std': 0.5189106060897992, 'median': 0.0, 'MAD': 0.4987577406642878, 'max': 3.0, 'min': 0.0, 'offset': 0, 'multiplier': 1, 'cap': False, 'floor': False, 'abs_zero': True, 'inplace': True, 'suffix': 'nmbr', 'defaultinfill_dict': {'defaultinfill': 'negzeroinfill'}}

new postmunge normalization parameters:
{'mean': 0.4344543582704187, 'std': 0.5306475357080688, 'median': 0.0, 'MAD': 0.5063167469754083, 'max': 3.0, 'min': 0.0, 'offset': 0, 'multiplier': 1, 'cap': False, 'floor': False, 'abs_zero': True, 'inplace': True, 'suffix': 'nmbr', 'defaultinfill_dict': {'defaultinfill': 'negzeroinfill'}}

new returned columns:
['GarageFinish_NArw', 'GarageFinish_1010_0', 'GarageFinish_1010_1']

___
derived columns: ['GarageFinish_NArw']

original automunge normalization parameters:
{'pct_NArw': 0.05547945205479452, 'suffix': 'NArw'}

new postmunge normalization parameters:
{'pct_NArw': 0.053461274845784786, 'suffix': 'NArw'}

___
derived columns: ['GarageFinish_1010_0', 'GarageFinish_1010_1']

original automunge normalization parameters:
{'random_generator': <class 'numpy.random._pcg64.PCG64'>, 'inplace': True, 'all_activations': False, 'add_activations': False, 'less_activations': False, 'consolidated_activations': False, 'str_convert': True, 'null_activation': True, 'max_zero': False, 'missing_marker': nan, 'inverse_consolidation_translate_dict': {}, 'binary_column_count': 2, 'binary_encoding_dict': {nan: '00', 'Fin': '01', 'RFn': '10', 'Unf': '11'}, '_1010_activations_dict': {nan: 0.0, 'Fin': 0.2410958904109589, 'RFn': 0.28904109589041094, 'Unf': 0.4143835616438356}, '_1010_columnlis

new postmunge normalization parameters:
{'random_generator': <class 'numpy.random._pcg64.PCG64'>, 'inplace': True, 'all_activations': False, 'add_activations': False, 'less_activations': False, 'consolidated_activations': False, 'str_convert': True, 'null_activation': True, 'max_zero': False, 'missing_marker': nan, 'inverse_consolidation_translate_dict': {}, 'binary_column_count': 3, 'binary_encoding_dict': {nan: '000', 'GdPrv': '001', 'GdWo': '010', 'MnPrv': '011', 'MnWw': '100'}, '_1010_activations_dict': {nan: 0.0, 'GdPrv': 0.04043865661411926, 'GdWo': 0.039753255654557916, 'MnPrv': 0.11788896504455106, 'MnWw': 0.0006854009595613434}, '_1010_columnlist': ['Fence_1010_0', 'Fence_1010_1', 'Fence_1010_2']}

______
Preparing drift report for columns derived from: MiscFeature

original returned columns:
['MiscFeature_NArw', 'MiscFeature_1010_0', 'MiscFeature_1010_1', 'MiscFeature_1010_2']

new returned columns:
['MiscFeature_NArw', 'MiscFeature_1010_0', 'MiscFeature_1010_1']

___
derived

# 5) Processing subsequent data with postmunge(.)

In [20]:
#All of these push-button methods are great
#But the real value of the platform is how easy it is
#To not only initially prepare data for training
#But then consistently and efficiently process
#Streams of data for inference

#All we have to do is pass the postprocess_dict dictionary
#That was populated during the corresponding automunge(.) call
#And subsequent test sets are consistently prepared

(test, test_ID, test_labels,
 postreports_dict) = am.postmunge(
    postprocess_dict,
    df_test,
    testID_column=ID_columnlist,
)

_______________
Begin Postmunge

Postmunge returned test column set: 
['MSSubClass_nmbr', 'LotFrontage_nmbr', 'LotArea_nmbr', 'Street_bnry', 'Alley_bnry', 'Utilities_bnry', 'OverallQual_nmbr', 'OverallCond_nmbr', 'YearBuilt_nmbr', 'YearRemodAdd_nmbr', 'MasVnrArea_nmbr', 'BsmtFinSF1_nmbr', 'BsmtFinSF2_nmbr', 'BsmtUnfSF_nmbr', 'TotalBsmtSF_nmbr', 'CentralAir_bnry', '1stFlrSF_nmbr', '2ndFlrSF_nmbr', 'LowQualFinSF_nmbr', 'GrLivArea_nmbr', 'BsmtFullBath_nmbr', 'BsmtHalfBath_nmbr', 'FullBath_nmbr', 'HalfBath_nmbr', 'BedroomAbvGr_nmbr', 'KitchenAbvGr_nmbr', 'TotRmsAbvGrd_nmbr', 'Fireplaces_nmbr', 'GarageYrBlt_nmbr', 'GarageCars_nmbr', 'GarageArea_nmbr', 'WoodDeckSF_nmbr', 'OpenPorchSF_nmbr', 'EnclosedPorch_nmbr', '3SsnPorch_nmbr', 'ScreenPorch_nmbr', 'PoolArea_nmbr', 'MiscVal_nmbr', 'MoSold_nmbr', 'YrSold_nmbr', 'MSSubClass_NArw', 'MSZoning_NArw', 'MSZoning_1010_0', 'MSZoning_1010_1', 'MSZoning_1010_2', 'LotFrontage_NArw', 'LotArea_NArw', 'Street_NArw', 'Alley_NArw', 'LotShape_NArw', 'LotShap

In [21]:
#Voila

# 6) One more thing

In [22]:
#One more method to demonstrate is ML infill
#Which trains a model for each returned column or set of columns
#To generate automated predictions for missing data infill

#Note that this method can be applied even when a train set does not require infill
#As a precaution against imperfections in subsequent data streams

#The method is currently built on top of SciKit Learn random forest implementations
#And parameters can be passed to the functions by the ML_cmnd parameter if desired
#Here we increase n_estimators to 1000 from the default of 100
#Which is a tradeoff between training time and accuracy

#We can apply ML infill globally with the MLinfill parameter
#Or designate to specific source or derived columns with the assigninfill parameter

(train, train_ID, labels,
 val, val_ID, val_labels,
 test, test_ID, test_labels,
 postprocess_dict) = am.automunge(
    df_train,
    labels_column=labels_column,
    trainID_column=ID_columnlist,
    MLinfill=True,
    ML_cmnd={
        'MLinfill_cmnd': {
            'RandomForestClassifier': {'n_estimators': 1000},
            'RandomForestRegressor': {'n_estimators': 1000},
        },
    },
)

_______________
Begin Automunge

______

versioning serial stamp:
_8.18_118030660932

Automunge returned train column set: 
['MSSubClass_nmbr', 'LotFrontage_nmbr', 'LotArea_nmbr', 'Street_bnry', 'Alley_bnry', 'Utilities_bnry', 'OverallQual_nmbr', 'OverallCond_nmbr', 'YearBuilt_nmbr', 'YearRemodAdd_nmbr', 'MasVnrArea_nmbr', 'BsmtFinSF1_nmbr', 'BsmtFinSF2_nmbr', 'BsmtUnfSF_nmbr', 'TotalBsmtSF_nmbr', 'CentralAir_bnry', '1stFlrSF_nmbr', '2ndFlrSF_nmbr', 'LowQualFinSF_nmbr', 'GrLivArea_nmbr', 'BsmtFullBath_nmbr', 'BsmtHalfBath_nmbr', 'FullBath_nmbr', 'HalfBath_nmbr', 'BedroomAbvGr_nmbr', 'KitchenAbvGr_nmbr', 'TotRmsAbvGrd_nmbr', 'Fireplaces_nmbr', 'GarageYrBlt_nmbr', 'GarageCars_nmbr', 'GarageArea_nmbr', 'WoodDeckSF_nmbr', 'OpenPorchSF_nmbr', 'EnclosedPorch_nmbr', '3SsnPorch_nmbr', 'ScreenPorch_nmbr', 'PoolArea_nmbr', 'MiscVal_nmbr', 'MoSold_nmbr', 'YrSold_nmbr', 'MSSubClass_NArw', 'MSZoning_NArw', 'MSZoning_1010_0', 'MSZoning_1010_1', 'MSZoning_1010_2', 'LotFrontage_NArw', 'LotArea_NArw', 

In [20]:
#God bless