# A Numbers Game - Demonstration Notebook

This notebook will demonstrate material related to discussions in the paper "A Numbers Game: Numeric Encoding Options with Automunge". Let's get right to it.

In [1]:
#Automunge is available for pip install:
# !pip install Automunge

In [2]:
#or to upgrade (we currently roll out upgrades fairly frequently)
#!pip install Automunge --upgrade

In [3]:
#Once installed, run this in the local session to initialize.
#AutoMunge is imported by name rather than with a wildcard import, so it is
#clear where the class comes from and no other names leak into the notebook.
from Automunge import AutoMunge
am = AutoMunge()

In [4]:
#To demonstrate, we'll populate a simple data set

#col1 has mixed positive and negative floats
#col2 has all positive floats
#col3 has all negative floats
#col4 has integers
#col5 is a categoric set
#col6 is a categoric set

import pandas as pd
import numpy as np

df_train = pd.DataFrame({
    'col1': [-0.5, 12, 0, 3.3, -4.2, 2.1, 101, -33],
    'col2': [12, 13.1, 26.2, 111, 2, 1, 2, 5],
    'col3': [-0.33, -48, -11.2, -0.3, -6, -52, -121, -30.2],
    'col4': [5, 1, 5, 3, 1, 5, 11, 12],
    'col5': ['circle', 'square', 'circle', 'square',
             'triangle', 'triangle', 'circle', 'square'],
    'col6': ['on', 'off', 'on', 'on', 'off', 'off', 'on', 'on'],
})

#the test set is simply an independent copy of the train set
df_test = df_train.copy()

# Demonstrations from Paper

## 2 - Normalizations

In [5]:
#Let's demonstrate each of the normalizations from Table 1

#col1-col4 come from the train set; col5 is a second copy of col1
#so that a fifth normalization can be shown alongside the others
df = df_train[['col1', 'col2', 'col3', 'col4']].copy()
df['col5'] = df_train['col1'].copy()

#one normalization category assigned per column
assigncat = {
    'nmbr': 'col1',
    'mnmx': 'col2',
    'mean': 'col3',
    'MAD3': 'col4',
    'lgnm': 'col5',
}

#then we can apply
(train, train_ID, labels,
 val, val_ID, val_labels,
 test, test_ID, test_labels,
 postprocess_dict) = am.automunge(
    df,
    shuffletrain=False,
    NArw_marker=False,
    MLinfill=False,
    assigncat=assigncat,
)

_______________
Begin Automunge

______

versioning serial stamp:
_8.18_567803139356

Automunge returned train column set: 
['col1_nmbr', 'col2_mnmx', 'col3_mean', 'col4_MAD3', 'col5_lgnm_nmbr']

Automunge returned ID column set: 
['Automunge_index']

_______________
Automunge Complete



In [6]:
#here is the data before normalizations
df

Unnamed: 0,col1,col2,col3,col4,col5
0,-0.5,12.0,-0.33,5,-0.5
1,12.0,13.1,-48.0,1,12.0
2,0.0,26.2,-11.2,5,0.0
3,3.3,111.0,-0.3,3,3.3
4,-4.2,2.0,-6.0,1,-4.2
5,2.1,1.0,-52.0,5,2.1
6,101.0,2.0,-121.0,11,101.0
7,-33.0,5.0,-30.2,12,-33.0


In [7]:
#and here is the returned train set after normalizations
train

Unnamed: 0,col1_nmbr,col2_mnmx,col3_mean,col4_MAD3,col5_lgnm_nmbr
0,-0.271396,0.1,0.27588,-2.285714,0.0
1,0.049024,0.11,-0.119066,-3.591837,0.198834
2,-0.258579,0.229091,0.185822,-2.285714,0.0
3,-0.173988,1.0,0.276129,-2.938776,-0.937298
4,-0.366241,0.009091,0.228904,-3.591837,0.0
5,-0.204749,0.0,-0.152206,-2.285714,-1.335068
6,2.330419,0.009091,-0.723871,-0.326531,2.073531
7,-1.10449,0.036364,0.028407,0.0,0.0


In [8]:
#The paper noted that in some cases a floor or cap may be desired
#to maintain a consistent range between train and test data

#Passing parameters for the mnmx transform could for instance be as follows,
#keyed first by transformation category and then by column header:

assignparam = {
    'mnmx': {
        'col2': {'floor': True, 'cap': False},
    },
}
assigncat = {'mnmx': 'col2'}

#then we can apply
(train, train_ID, labels,
 val, val_ID, val_labels,
 test, test_ID, test_labels,
 postprocess_dict) = am.automunge(
    df,
    shuffletrain=False,
    NArw_marker=False,
    MLinfill=False,
    assigncat=assigncat,
    assignparam=assignparam,
)

_______________
Begin Automunge

______

versioning serial stamp:
_8.18_954270113687

Automunge returned train column set: 
['col1_nmbr', 'col2_mnmx', 'col3_nmbr', 'col4_nmbr', 'col5_nmbr']

Automunge returned ID column set: 
['Automunge_index']

_______________
Automunge Complete



In [9]:
#Now let's try retain normalization
#note that based on their respective received ranges
#col1/col2/col3 will have different formulas applied

#col4/col5 are assigned to 'null', so only col1-col3 are normalized here
assigncat = {
    'retn': ['col1', 'col2', 'col3'],
    'null': ['col4', 'col5'],
}

#then we can apply
(train, train_ID, labels,
 val, val_ID, val_labels,
 test, test_ID, test_labels,
 postprocess_dict) = am.automunge(
    df,
    shuffletrain=False,
    NArw_marker=False,
    MLinfill=False,
    assigncat=assigncat,
)

_______________
Begin Automunge

______

versioning serial stamp:
_8.18_543950477465

Automunge returned train column set: 
['col1_retn', 'col2_retn', 'col3_retn']

Automunge returned ID column set: 
['Automunge_index']

_______________
Automunge Complete



In [10]:
#here is the data before normalizations
df[['col1', 'col2', 'col3']]

Unnamed: 0,col1,col2,col3
0,-0.5,12.0,-0.33
1,12.0,13.1,-48.0
2,0.0,26.2,-11.2
3,3.3,111.0,-0.3
4,-4.2,2.0,-6.0
5,2.1,1.0,-52.0
6,101.0,2.0,-121.0
7,-33.0,5.0,-30.2


In [11]:
#and here is the returned train set after normalizations
train

Unnamed: 0,col1_retn,col2_retn,col3_retn
0,-0.003731,0.1,-0.000249
1,0.089552,0.11,-0.395195
2,0.0,0.229091,-0.090307
3,0.024627,1.0,0.0
4,-0.031343,0.009091,-0.047225
5,0.015672,0.0,-0.428335
6,0.753731,0.009091,-1.0
7,-0.246269,0.036364,-0.247722


In [12]:
#column-wise maximum of the returned retn-normalized set
train.max()

col1_retn    0.753731
col2_retn    1.000000
col3_retn    0.000000
dtype: float32

In [13]:
#column-wise minimum of the returned retn-normalized set
train.min()

col1_retn   -0.246269
col2_retn    0.000000
col3_retn   -1.000000
dtype: float32

## 3 - Transformations

In [14]:
#as an example of a populated family tree in a transformdict data structure
#Let's say we want to replace our use of min-max scaling with min-max scaling
#supplemented by power of ten bins, and a marker for applied infill via NArw

#we could populate a transformdict data structure as follows,
#with one entry per family tree primitive for the 'mnmx' root category

transformdict = {
    'mnmx': {
        'parents': [],
        'siblings': [],
        'auntsuncles': ['mnmx', 'pwr2'],
        'cousins': ['NArw'],
        'children': [],
        'niecesnephews': [],
        'coworkers': [],
        'friends': [],
    },
}

#we can just pass this transformdict to an automunge(.) call

## 4 - Bins and Grainings

In [15]:
#Let's demonstrate a few scenarios of populating bins for a numeric set
#we'll use the 'col1' column as our basis,
#copied once per transform to be demonstrated

df = pd.DataFrame({
    header: df_train['col1'].copy()
    for header in ['col1', '1', '2', '3']
})

df

Unnamed: 0,col1,1,2,3
0,-0.5,-0.5,-0.5,-0.5
1,12.0,12.0,12.0,12.0
2,0.0,0.0,0.0,0.0
3,3.3,3.3,3.3,3.3
4,-4.2,-4.2,-4.2,-4.2
5,2.1,2.1,2.1,2.1
6,101.0,101.0,101.0,101.0
7,-33.0,-33.0,-33.0,-33.0


In [16]:
#let's demonstrate the same column with standard deviation bins
#aggregated as one-hot, ordinal, or binary
#('col1' is assigned to excl so the source values are returned for comparison)

assigncat = {
    'bins': '1',
    'bsor': '2',
    'bsbn': '3',
    'excl': 'col1',
}

#then we can apply
(train, train_ID, labels,
 val, val_ID, val_labels,
 test, test_ID, test_labels,
 postprocess_dict) = am.automunge(
    df,
    shuffletrain=False,
    NArw_marker=False,
    MLinfill=False,
    assigncat=assigncat,
)

train

_______________
Begin Automunge

______

versioning serial stamp:
_8.18_562858533635

Automunge returned train column set: 
['col1', '2_bsor', '1_bins_0', '1_bins_1', '1_bins_2', '1_bins_3', '1_bins_4', '1_bins_5', '3_bsbn_1010_0', '3_bsbn_1010_1', '3_bsbn_1010_2']

Automunge returned ID column set: 
['Automunge_index']

_______________
Automunge Complete



Unnamed: 0,col1,2_bsor,1_bins_0,1_bins_1,1_bins_2,1_bins_3,1_bins_4,1_bins_5,3_bsbn_1010_0,3_bsbn_1010_1,3_bsbn_1010_2
0,-0.5,2,0,0,1,0,0,0,0,1,0
1,12.0,3,0,0,0,1,0,0,0,1,1
2,0.0,2,0,0,1,0,0,0,0,1,0
3,3.3,2,0,0,1,0,0,0,0,1,0
4,-4.2,2,0,0,1,0,0,0,0,1,0
5,2.1,2,0,0,1,0,0,0,0,1,0
6,101.0,5,0,0,0,0,0,1,1,0,0
7,-33.0,1,0,1,0,0,0,0,0,0,1


In [17]:
#Here the bins aggregation returns one-hot columns:
# ['1_bins_0', '1_bins_1', '1_bins_2', '1_bins_3', '1_bins_4', '1_bins_5']
#which correspond, in order, to the standard deviation bins
# [s<-2, s-21, s-10, s+01, s+12, s>+2]

#the bsor aggregation returns columns:
# ['2_bsor']

#and the bsbn aggregation returns columns:
# ['3_bsbn_1010_0', '3_bsbn_1010_1', '3_bsbn_1010_2']

#with the source column 'col1' shown for comparison

In [18]:
#The remainder of the bins we'll just demonstrate as ordinal,
#again with one copy of 'col1' per transform

df = pd.DataFrame({
    header: df_train['col1'].copy()
    for header in ['col1', '1', '2', '3', '4']
})

assigncat = {
    'pwor': '1',
    'por2': '2',
    'bnwo': '3',
    'bneo': '4',
    'excl': 'col1',
}

#then we can apply
(train, train_ID, labels,
 val, val_ID, val_labels,
 test, test_ID, test_labels,
 postprocess_dict) = am.automunge(
    df,
    shuffletrain=False,
    NArw_marker=False,
    MLinfill=False,
    assigncat=assigncat,
)

train

_______________
Begin Automunge

______

versioning serial stamp:
_8.18_876317074368

Automunge returned train column set: 
['col1', '1_pwor', '2_por2', '3_bnwo', '4_bneo']

Automunge returned ID column set: 
['Automunge_index']

_______________
Automunge Complete



Unnamed: 0,col1,1_pwor,2_por2,3_bnwo,4_bneo
0,-0.5,0,1,32,1
1,12.0,2,2,44,4
2,0.0,0,0,32,2
3,3.3,3,4,36,3
4,-4.2,0,5,28,0
5,2.1,3,4,35,2
6,101.0,4,6,133,4
7,-33.0,0,7,0,0


In [19]:
#if we want to define some custom bin aggregations
#we can pass parameters to assignparam

df = pd.DataFrame({
    header: df_train['col1'].copy()
    for header in ['col1', '1', '2']
})

assigncat = {
    'bkt3': '1',
    'bkt4': '2',
    'excl': 'col1',
}

#the same bucket boundaries are passed to both transforms,
#here under 'default_assignparam' rather than per column
assignparam = {
    'default_assignparam': {
        'bkt3': {'buckets': [-5, 0, 10, 100]},
        'bkt4': {'buckets': [-5, 0, 10, 100]},
    },
}

#then we can apply
(train, train_ID, labels,
 val, val_ID, val_labels,
 test, test_ID, test_labels,
 postprocess_dict) = am.automunge(
    df,
    shuffletrain=False,
    NArw_marker=False,
    MLinfill=False,
    assigncat=assigncat,
    assignparam=assignparam,
)

train

_______________
Begin Automunge

______

versioning serial stamp:
_8.18_828099729608

Automunge returned train column set: 
['col1', '1_bkt3', '2_bkt4']

Automunge returned ID column set: 
['Automunge_index']

_______________
Automunge Complete



Unnamed: 0,col1,1_bkt3,2_bkt4
0,-0.5,1,0
1,12.0,3,2
2,0.0,1,0
3,3.3,2,1
4,-4.2,1,0
5,2.1,2,1
6,101.0,4,3
7,-33.0,0,3


## 5 - Noise Injection

In [20]:
#To demonstrate the noise injection, we'll show four returned sets:
#the source column, the normalization without noise,
#with partial noise, and with full noise
#first let's start with z-score:

df = pd.DataFrame({
    header: df_train['col1'].copy()
    for header in ['col1', 'NoNoise', 'PartialNoise', 'FullNoise']
})

assigncat = {
    'excl': 'col1',
    'nmbr': 'NoNoise',
    'DPnb': ['PartialNoise', 'FullNoise'],
}

#since noise injection accepts parameters, we'll demonstrate those parameter assignments
#here we'll show the default values for mu and sigma
#and vary the flip_prob parameter based on what ratio of data we want to receive noise
#note that we don't have to set parameters for defaults, just showing as a demonstration

assignparam = {
    'DPnb': {
        'PartialNoise': {'mu': 0, 'sigma': 0.03, 'flip_prob': 0.33},
        'FullNoise': {'mu': 0, 'sigma': 0.03, 'flip_prob': 1.0},
    },
}

#then we can apply
(train, train_ID, labels,
 val, val_ID, val_labels,
 test, test_ID, test_labels,
 postprocess_dict) = am.automunge(
    df,
    shuffletrain=False,
    NArw_marker=False,
    MLinfill=False,
    assigncat=assigncat,
    assignparam=assignparam,
)

train



_______________
Begin Automunge

______

versioning serial stamp:
_8.18_791701090223

Automunge returned train column set: 
['col1', 'NoNoise_nmbr', 'PartialNoise_DPn3_DPnb', 'FullNoise_DPn3_DPnb']

Automunge returned ID column set: 
['Automunge_index']

_______________
Automunge Complete



Unnamed: 0,col1,NoNoise_nmbr,PartialNoise_DPn3_DPnb,FullNoise_DPn3_DPnb
0,-0.5,-0.271396,-0.247817,-0.247099
1,12.0,0.049024,0.049024,0.034895
2,0.0,-0.258579,-0.258579,-0.220058
3,3.3,-0.173988,-0.173988,-0.143679
4,-4.2,-0.366241,-0.366241,-0.362634
5,2.1,-0.204749,-0.204749,-0.144002
6,101.0,2.330419,2.370308,2.278097
7,-33.0,-1.10449,-1.117911,-1.130974


In [21]:
#similarly for min-max

df = pd.DataFrame({
    header: df_train['col1'].copy()
    for header in ['col1', 'NoNoise', 'PartialNoise', 'FullNoise']
})

assigncat = {
    'excl': 'col1',
    'mnmx': 'NoNoise',
    'DPmm': ['PartialNoise', 'FullNoise'],
}

#since noise injection accepts parameters, we'll demonstrate those parameter assignments
#here we'll show the default values for mu and sigma
#and vary the flip_prob parameter based on what ratio of data we want to receive noise
#note that we don't have to set parameters for defaults, just showing as a demonstration

assignparam = {
    'DPmm': {
        'PartialNoise': {'mu': 0, 'sigma': 0.03, 'flip_prob': 0.33},
        'FullNoise': {'mu': 0, 'sigma': 0.03, 'flip_prob': 1.0},
    },
}

#then we can apply
(train, train_ID, labels,
 val, val_ID, val_labels,
 test, test_ID, test_labels,
 postprocess_dict) = am.automunge(
    df,
    shuffletrain=False,
    NArw_marker=False,
    MLinfill=False,
    assigncat=assigncat,
    assignparam=assignparam,
)

train

_______________
Begin Automunge

______

versioning serial stamp:
_8.18_308168153650

Automunge returned train column set: 
['col1', 'NoNoise_mnmx', 'PartialNoise_DPm2_DPmm', 'FullNoise_DPm2_DPmm']

Automunge returned ID column set: 
['Automunge_index']

_______________
Automunge Complete



Unnamed: 0,col1,NoNoise_mnmx,PartialNoise_DPm2_DPmm,FullNoise_DPm2_DPmm
0,-0.5,0.242537,0.222832,0.230314
1,12.0,0.335821,0.315473,0.339806
2,0.0,0.246269,0.246269,0.223442
3,3.3,0.270896,0.270896,0.237744
4,-4.2,0.214925,0.214487,0.190952
5,2.1,0.26194,0.26194,0.247425
6,101.0,1.0,0.986657,1.0
7,-33.0,0.0,0.0,0.000682


In [22]:
#similarly for retain normalization

df = pd.DataFrame({
    header: df_train['col1'].copy()
    for header in ['col1', 'NoNoise', 'PartialNoise', 'FullNoise']
})

assigncat = {
    'excl': 'col1',
    'retn': 'NoNoise',
    'DPrt': ['PartialNoise', 'FullNoise'],
}

#since noise injection accepts parameters, we'll demonstrate those parameter assignments
#here we'll show the default values for mu and sigma
#and vary the flip_prob parameter based on what ratio of data we want to receive noise
#note that we don't have to set parameters for defaults, just showing as a demonstration

assignparam = {
    'DPrt': {
        'PartialNoise': {'mu': 0, 'sigma': 0.03, 'flip_prob': 0.33},
        'FullNoise': {'mu': 0, 'sigma': 0.03, 'flip_prob': 1.0},
    },
}

#then we can apply
(train, train_ID, labels,
 val, val_ID, val_labels,
 test, test_ID, test_labels,
 postprocess_dict) = am.automunge(
    df,
    shuffletrain=False,
    NArw_marker=False,
    MLinfill=False,
    assigncat=assigncat,
    assignparam=assignparam,
)

train

_______________
Begin Automunge

______

versioning serial stamp:
_8.18_775346208689

Automunge returned train column set: 
['col1', 'NoNoise_retn', 'PartialNoise_DPrt', 'FullNoise_DPrt']

Automunge returned ID column set: 
['Automunge_index']

_______________
Automunge Complete



Unnamed: 0,col1,NoNoise_retn,PartialNoise_DPrt,FullNoise_DPrt
0,-0.5,-0.003731,-0.003731,-0.009078
1,12.0,0.089552,0.103787,0.045134
2,0.0,0.0,0.0,-0.009811
3,3.3,0.024627,0.024627,0.044012
4,-4.2,-0.031343,-0.031343,-0.04443
5,2.1,0.015672,0.00577,0.04371
6,101.0,0.753731,0.753731,0.752965
7,-33.0,-0.246269,-0.188624,-0.230926


In [23]:
#notice how on FullNoise_DPrt the maximum and minimum values
#are not perturbed outside of the original range
#this is by design

In [24]:
#the same retain normalization noise demonstration run a second time
#(identical inputs and parameters; the injected noise is sampled anew,
#so the returned noisy columns differ from the previous run)

df = pd.DataFrame({
    header: df_train['col1'].copy()
    for header in ['col1', 'NoNoise', 'PartialNoise', 'FullNoise']
})

assigncat = {
    'excl': 'col1',
    'retn': 'NoNoise',
    'DPrt': ['PartialNoise', 'FullNoise'],
}

#since noise injection accepts parameters, we'll demonstrate those parameter assignments
#here we'll show the default values for mu and sigma
#and vary the flip_prob parameter based on what ratio of data we want to receive noise
#note that we don't have to set parameters for defaults, just showing as a demonstration

assignparam = {
    'DPrt': {
        'PartialNoise': {'mu': 0, 'sigma': 0.03, 'flip_prob': 0.33},
        'FullNoise': {'mu': 0, 'sigma': 0.03, 'flip_prob': 1.0},
    },
}

#then we can apply
(train, train_ID, labels,
 val, val_ID, val_labels,
 test, test_ID, test_labels,
 postprocess_dict) = am.automunge(
    df,
    shuffletrain=False,
    NArw_marker=False,
    MLinfill=False,
    assigncat=assigncat,
    assignparam=assignparam,
)

train

_______________
Begin Automunge

______

versioning serial stamp:
_8.18_600391587665

Automunge returned train column set: 
['col1', 'NoNoise_retn', 'PartialNoise_DPrt', 'FullNoise_DPrt']

Automunge returned ID column set: 
['Automunge_index']

_______________
Automunge Complete



Unnamed: 0,col1,NoNoise_retn,PartialNoise_DPrt,FullNoise_DPrt
0,-0.5,-0.003731,-0.024899,0.030167
1,12.0,0.089552,0.089552,0.085817
2,0.0,0.0,0.0,0.002085
3,3.3,0.024627,0.024806,0.061428
4,-4.2,-0.031343,-0.031343,-0.032867
5,2.1,0.015672,0.052467,0.04065
6,101.0,0.753731,0.753731,0.753731
7,-33.0,-0.246269,-0.246269,-0.207172


In [25]:
#now we'll demonstrate the categoric noise injectors:

#bnry transform is for two-value categoric sets
#noise just flips the value between 0 and 1

df = pd.DataFrame({
    header: df_train['col6'].copy()
    for header in ['col6', 'NoNoise', 'PartialNoise', 'FullNoise']
})

assigncat = {
    'excl': 'col6',
    'bnry': 'NoNoise',
    'DPbn': ['PartialNoise', 'FullNoise'],
}

#since noise injection accepts parameters, we'll demonstrate those parameter assignments
#here we'll vary the flip_prob parameter based on what ratio of data we want to receive noise

assignparam = {
    'DPbn': {
        'PartialNoise': {'flip_prob': 0.33},
        'FullNoise': {'flip_prob': 1.0},
    },
}

#then we can apply
(train, train_ID, labels,
 val, val_ID, val_labels,
 test, test_ID, test_labels,
 postprocess_dict) = am.automunge(
    df,
    shuffletrain=False,
    NArw_marker=False,
    MLinfill=False,
    assigncat=assigncat,
    assignparam=assignparam,
)

train

_______________
Begin Automunge

______

versioning serial stamp:
_8.18_306482632533

Automunge returned train column set: 
['col6', 'NoNoise_bnry', 'PartialNoise_DPb2_DPbn', 'FullNoise_DPb2_DPbn']

Automunge returned ID column set: 
['Automunge_index']

_______________
Automunge Complete



Unnamed: 0,col6,NoNoise_bnry,PartialNoise_DPb2_DPbn,FullNoise_DPb2_DPbn
0,on,1,1,1
1,off,0,0,1
2,on,1,1,1
3,on,1,1,0
4,off,0,0,0
5,off,0,1,1
6,on,1,1,0
7,on,1,1,0


In [26]:
#ord3 transform is for ordinal encoding of categoric sets
#noise triggers a uniform random selection between categories
#including the possibility of retention

df = pd.DataFrame({
    header: df_train['col5'].copy()
    for header in ['col5', 'NoNoise', 'PartialNoise', 'FullNoise']
})

assigncat = {
    'excl': 'col5',
    'ord3': 'NoNoise',
    'DPod': ['PartialNoise', 'FullNoise'],
}

#since noise injection accepts parameters, we'll demonstrate those parameter assignments
#here we'll vary the flip_prob parameter based on what ratio of data we want to receive noise

assignparam = {
    'DPod': {
        'PartialNoise': {'flip_prob': 0.33},
        'FullNoise': {'flip_prob': 1.0},
    },
}

#then we can apply
(train, train_ID, labels,
 val, val_ID, val_labels,
 test, test_ID, test_labels,
 postprocess_dict) = am.automunge(
    df,
    shuffletrain=False,
    NArw_marker=False,
    MLinfill=False,
    assigncat=assigncat,
    assignparam=assignparam,
)

train

_______________
Begin Automunge

______

versioning serial stamp:
_8.18_369819381411

Automunge returned train column set: 
['col5', 'NoNoise_ord3', 'PartialNoise_DPo4_DPod', 'FullNoise_DPo4_DPod']

Automunge returned ID column set: 
['Automunge_index']

_______________
Automunge Complete



Unnamed: 0,col5,NoNoise_ord3,PartialNoise_DPo4_DPod,FullNoise_DPo4_DPod
0,circle,1,1,3
1,square,2,2,2
2,circle,1,1,2
3,square,2,2,3
4,triangle,3,1,2
5,triangle,3,3,1
6,circle,1,1,3
7,square,2,2,2


In [27]:
#onht transform is for one hot encoding of categoric sets
#noise triggers a uniform random selection between categories
#including the possibility of retention

#note that onht is similar to the text transform but integer encodes the column headers

df = pd.DataFrame({
    header: df_train['col5'].copy()
    for header in ['col5', 'NoNoise', 'PartialNoise', 'FullNoise']
})

assigncat = {
    'excl': 'col5',
    'onht': 'NoNoise',
    'DPoh': ['PartialNoise', 'FullNoise'],
}

#since noise injection accepts parameters, we'll demonstrate those parameter assignments
#here we'll vary the flip_prob parameter based on what ratio of data we want to receive noise

assignparam = {
    'DPoh': {
        'PartialNoise': {'flip_prob': 0.33},
        'FullNoise': {'flip_prob': 1.0},
    },
}

#then we can apply
(train, train_ID, labels,
 val, val_ID, val_labels,
 test, test_ID, test_labels,
 postprocess_dict) = am.automunge(
    df,
    shuffletrain=False,
    NArw_marker=False,
    MLinfill=False,
    assigncat=assigncat,
    assignparam=assignparam,
)

train

_______________
Begin Automunge

______

versioning serial stamp:
_8.18_218393099729

Automunge returned train column set: 
['col5', 'NoNoise_onht_0', 'NoNoise_onht_1', 'NoNoise_onht_2', 'PartialNoise_DPo5_0_DPoh', 'PartialNoise_DPo5_1_DPoh', 'PartialNoise_DPo5_2_DPoh', 'FullNoise_DPo5_0_DPoh', 'FullNoise_DPo5_1_DPoh', 'FullNoise_DPo5_2_DPoh']

Automunge returned ID column set: 
['Automunge_index']

_______________
Automunge Complete



Unnamed: 0,col5,NoNoise_onht_0,NoNoise_onht_1,NoNoise_onht_2,PartialNoise_DPo5_0_DPoh,PartialNoise_DPo5_1_DPoh,PartialNoise_DPo5_2_DPoh,FullNoise_DPo5_0_DPoh,FullNoise_DPo5_1_DPoh,FullNoise_DPo5_2_DPoh
0,circle,1,0,0,1,0,0,0,0,1
1,square,0,1,0,0,1,0,1,0,0
2,circle,1,0,0,1,0,0,0,0,1
3,square,0,1,0,0,1,0,1,0,0
4,triangle,0,0,1,0,1,0,1,0,0
5,triangle,0,0,1,1,0,0,0,1,0
6,circle,1,0,0,1,0,0,1,0,0
7,square,0,1,0,0,1,0,1,0,0


In [28]:
#1010 transform is for binary encoding of categoric sets
#noise triggers a uniform random selection between categories
#including the possibility of retention
#where a category may be represented by a set of activations

df = pd.DataFrame({
    header: df_train['col5'].copy()
    for header in ['col5', 'NoNoise', 'PartialNoise', 'FullNoise']
})

assigncat = {
    'excl': 'col5',
    '1010': 'NoNoise',
    'DP10': ['PartialNoise', 'FullNoise'],
}

#since noise injection accepts parameters, we'll demonstrate those parameter assignments
#here we'll vary the flip_prob parameter based on what ratio of data we want to receive noise

assignparam = {
    'DP10': {
        'PartialNoise': {'flip_prob': 0.33},
        'FullNoise': {'flip_prob': 1.0},
    },
}

#then we can apply
(train, train_ID, labels,
 val, val_ID, val_labels,
 test, test_ID, test_labels,
 postprocess_dict) = am.automunge(
    df,
    shuffletrain=False,
    NArw_marker=False,
    MLinfill=False,
    assigncat=assigncat,
    assignparam=assignparam,
)

train

_______________
Begin Automunge

______

versioning serial stamp:
_8.18_148333363207

Automunge returned train column set: 
['col5', 'NoNoise_1010_0', 'NoNoise_1010_1', 'PartialNoise_DPo6_0_DP10', 'PartialNoise_DPo6_1_DP10', 'FullNoise_DPo6_0_DP10', 'FullNoise_DPo6_1_DP10']

Automunge returned ID column set: 
['Automunge_index']

_______________
Automunge Complete



Unnamed: 0,col5,NoNoise_1010_0,NoNoise_1010_1,PartialNoise_DPo6_0_DP10,PartialNoise_DPo6_1_DP10,FullNoise_DPo6_0_DP10,FullNoise_DPo6_1_DP10
0,circle,0,1,0,1,1,1
1,square,1,0,1,0,0,1
2,circle,0,1,0,1,1,1
3,square,1,0,1,0,0,1
4,triangle,1,1,1,1,1,0
5,triangle,1,1,1,1,1,0
6,circle,0,1,0,1,1,0
7,square,1,0,1,0,1,0


## 6 - Sequential Data

In [29]:
#dxdt transform is for sequential data delta between time steps
#where data includes a retn normalization, as well as a dxdt with retn normalization
#here we'll show d2dt which also includes a set with two tiers of dxdt applied

df = pd.DataFrame({
    header: df_train['col4'].copy()
    for header in ['col4', '1']
})

assigncat = {
    'excl': 'col4',
    'd2dt': '1',
}

#then we can apply
(train, train_ID, labels,
 val, val_ID, val_labels,
 test, test_ID, test_labels,
 postprocess_dict) = am.automunge(
    df,
    shuffletrain=False,
    NArw_marker=False,
    MLinfill=False,
    assigncat=assigncat,
)

train

_______________
Begin Automunge

______

versioning serial stamp:
_8.18_634735310548

Automunge returned train column set: 
['col4', '1_retn', '1_d2dt_retn', '1_d2dt_dxdt_retn']

Automunge returned ID column set: 
['Automunge_index']

_______________
Automunge Complete



Unnamed: 0,col4,1_retn,1_d2dt_retn,1_d2dt_dxdt_retn
0,5,0.363636,-0.4,0.0
1,1,0.0,-0.4,0.0
2,5,0.363636,0.4,0.571429
3,3,0.181818,-0.2,-0.428571
4,1,0.0,-0.2,0.0
5,5,0.363636,0.4,0.428571
6,11,0.909091,0.6,0.142857
7,12,1.0,0.1,-0.357143


In [30]:
#the paper noted a variation on the dxdt transforms to return averages
#between sets of points such as to smooth or denoise data
#this is available with the dxd2 family,
#shown here with the d2d2 root category, which has a similar number of tiers
#to the last demonstration
#note that as with the dxdt family, the dxd2 family accepts a 'periods' parameter
#for the number of time steps; here we'll demonstrate with periods as 2
#note that further variations are also available without the downstream normalizations on the output

df = pd.DataFrame({
    header: df_train['col4'].copy()
    for header in ['col4', '1']
})

assigncat = {
    'excl': 'col4',
    'd2d2': '1',
}

# NOTE(review): only d2d2 is assigned in this cell, so the 'd2dt' entry below
# looks unused here — confirm whether it was intended (e.g. for 'dxd2') or can be dropped
assignparam = {
    'd2d2': {'1': {'periods': 2}},
    'd2dt': {'1': {'periods': 2}},
}

#then we can apply
(train, train_ID, labels,
 val, val_ID, val_labels,
 test, test_ID, test_labels,
 postprocess_dict) = am.automunge(
    df,
    shuffletrain=False,
    NArw_marker=False,
    MLinfill=False,
    assigncat=assigncat,
    assignparam=assignparam,
)

train


_______________
Begin Automunge

______

versioning serial stamp:
_8.18_757749414190

Automunge returned train column set: 
['col4', '1_retn', '1_d2d2_retn', '1_d2d2_dxd2_retn']

Automunge returned ID column set: 
['Automunge_index']

_______________
Automunge Complete



Unnamed: 0,col4,1_retn,1_d2d2_retn,1_d2d2_dxd2_retn
0,5,0.363636,0.105263,0.0
1,1,0.0,0.105263,0.0
2,5,0.363636,0.105263,0.0
3,3,0.181818,0.105263,0.0
4,1,0.0,-0.105263,-0.097561
5,5,0.363636,-0.105263,-0.195122
6,11,0.909091,0.631579,0.243902
7,12,1.0,0.894737,0.804878


## 7 - Integer Sets

In [31]:
#the ntgr family of transforms are intended for received integer sets
#and encode the data in multiple configurations
#here we'll demonstrate the ntgr root category
#which returns a retn normalization, 1010 binary transform, ordl ordinal encoding
#and ord4 as ord3_mnmx which is a scaled metric for frequency of redundant entries
# NOTE(review): that last set appears to be returned under the header '1_ntgr_mnmx' — confirm

df = pd.DataFrame({
    header: df_train['col4'].copy()
    for header in ['col4', '1']
})

assigncat = {
    'excl': 'col4',
    'ntgr': '1',
}

#then we can apply
(train, train_ID, labels,
 val, val_ID, val_labels,
 test, test_ID, test_labels,
 postprocess_dict) = am.automunge(
    df,
    shuffletrain=False,
    NArw_marker=False,
    MLinfill=False,
    assigncat=assigncat,
)

train

_______________
Begin Automunge

______

versioning serial stamp:
_8.18_501618949461

Automunge returned train column set: 
['col4', '1_ordl', '1_ntgr_mnmx', '1_retn', '1_1010_0', '1_1010_1', '1_1010_2']

Automunge returned ID column set: 
['Automunge_index']

_______________
Automunge Complete



Unnamed: 0,col4,1_ordl,1_ntgr_mnmx,1_retn,1_1010_0,1_1010_1,1_1010_2
0,5,5,0.0,0.363636,1,0,1
1,1,1,0.25,0.0,0,0,1
2,5,5,0.0,0.363636,1,0,1
3,3,4,1.0,0.181818,1,0,0
4,1,1,0.25,0.0,0,0,1
5,5,5,0.0,0.363636,1,0,1
6,11,2,0.5,0.909091,0,1,0
7,12,3,0.75,1.0,0,1,1


And of course for each of these examples, the returned postprocess_dict can be used as a key to prepare additional data on a consistent basis.

In [32]:
#prepare additional data consistently with the preceding automunge(.) call,
#using its returned postprocess_dict as the key
#(the "additional" data here is just a copy of the frame passed to that call)
df_test = df.copy()

(test, test_ID, test_labels,
 postreports_dict) = am.postmunge(postprocess_dict, df_test)

_______________
Begin Postmunge

Postmunge returned test column set: 
['col4', '1_ordl', '1_ntgr_mnmx', '1_retn', '1_1010_0', '1_1010_1', '1_1010_2']

_______________
Postmunge returned ID column set: 
['Automunge_index']

_______________
Postmunge Complete



In [33]:
#Best regards