# String Theory - Demonstration Notebook

This notebook will demonstrate material related to discussions in the paper "String Theory: Parsed Categoric Encodings with Automunge". Let's get right to it.

In [1]:

#Automunge is available for pip install:
!pip install Automunge



In [2]:
#or to upgrade (we currently roll out upgrades fairly frequently)
#!pip install Automunge --upgrade

In [3]:
#Once installed, run this in local session to initialize
from Automunge import *
am = AutoMunge()

In [4]:
#To demonstrate, we'll populate a simple dataframe consistent with the examples in the paper.
import pandas as pd
import numpy as np

df_train = \
pd.DataFrame({'column1':['circle','circle','circle','square','square','triangle',1234,np.nan,np.nan], \
              'column2':['yes','yes','yes','yes','no','no','no',np.nan,np.nan], \
              'address':['1234 North Peterson St Orlando, FL 32714',
                         '2345 South Anderson St Altamonte Springs, FL 32715',
                         '3456 South Peterson St Maitland, FL 32789',
                         '4567 North Peterson St Orlando, FL 32714',
                         '5678 Avenue St Orlando, FL 32714',
                         '6789 South Peterson St Maitland, FL 32789',
                         '5858 North Other St Altamonte Springs, FL 32715',
                         None,
                         'Orlando, FL']})

#for a test set we'll just copy the train set
df_test = df_train.copy()

In [5]:
#The returned postprocess_dict should be saved such as with pickle

In [6]:
#I find it helps to just copy and paste the full range of parameters for reference

train, train_ID, labels, \
val, val_ID, val_labels, \
test, test_ID, test_labels, \
postprocess_dict = \
am.automunge(df_train, df_test = False,
  labels_column = False, trainID_column = False, 
  testID_column = False,
  valpercent=0.0, floatprecision = 32, cat_type = False, 
  shuffletrain = True, noise_augment = 0,
  dupl_rows = False, TrainLabelFreqLevel = False, 
  powertransform = False, binstransform = False,
  MLinfill = True, infilliterate=1, randomseed = False, 
  eval_ratio = .5,
  numbercategoryheuristic = 255, pandasoutput = 'dataframe', 
  NArw_marker = True,
  featureselection = False, featurethreshold = 0., 
  inplace = False, orig_headers = False,
  Binary = False, PCAn_components = False, PCAexcl = [], 
  excl_suffix = False,
  ML_cmnd = {'autoML_type':'randomforest',
             'MLinfill_cmnd':{'RandomForestClassifier':{}, 
                              'RandomForestRegressor':{}},
             'PCA_type':'default',
             'PCA_cmnd':{}},
  assigncat = {'1010':[], 'onht':[], 'ordl':[], 
               'bnry':[], 'hash':[], 'hsh2':[],
               'DP10':[], 'DPoh':[], 'DPod':[], 
               'DPbn':[], 'DPhs':[], 'DPh2':[],
               'nmbr':[], 'mnmx':[], 'retn':[], 
               'DPnb':[], 'DPmm':[], 'DPrt':[],
               'bins':[], 'pwr2':[], 'bnep':[], 
               'bsor':[], 'por2':[], 'bneo':[],
               'ntgr':[], 'srch':[], 'or19':[], 
               'tlbn':[], 'excl':[], 'exc2':[]},
  assignparam = {'global_assignparam'  : {'(parameter)': 42},
                 'default_assignparam' : {'(category)' : 
                                          {'(parameter)' : 42}},
                          '(category)' : {'(column)'   : 
                                          {'(parameter)' : 42}}},
  assigninfill = {'stdrdinfill':[], 'MLinfill':[], 
                  'zeroinfill':[], 'oneinfill':[],
                  'adjinfill':[], 'meaninfill':[], 
                  'medianinfill':[], 'negzeroinfill':[],
                  'modeinfill':[], 'lcinfill':[], 
                  'naninfill':[]},
  assignnan = {'categories':{}, 'columns':{}, 'global':[]},
  transformdict = {}, processdict = {}, evalcat = False, 
  ppd_append = False,  entropy_seeds = False, 
  random_generator = False, sampling_dict = False,
  privacy_encode = False, encrypt_key = False, 
  printstatus = 'summary', logger = {})

_______________
Begin Automunge

______

versioning serial stamp:
_8.19_692727428295

Automunge returned train column set: 
['column2_bnry', 'column1_NArw', 'column1_1010_0', 'column1_1010_1', 'column1_1010_2', 'column2_NArw', 'address_NArw', 'address_hash_0', 'address_hash_1', 'address_hash_2', 'address_hash_3', 'address_hash_4', 'address_hash_5', 'address_hash_6', 'address_hash_7']

Automunge returned ID column set: 
['Automunge_index']

_______________
Automunge Complete



In [7]:
#or for postmunge(.) with full range of parameters:
test, test_ID, test_labels, \
postreports_dict = \
am.postmunge(postprocess_dict, df_test,
  testID_column = False,
  pandasoutput = 'dataframe', printstatus = 'summary',
  dupl_rows = False, TrainLabelFreqLevel = False,
  featureeval = False, traindata = False, noise_augment = 0,
  driftreport = False, inversion = False,
  returnedsets = True, shuffletrain = False,
  entropy_seeds = False, random_generator = False, 
  sampling_dict = False, randomseed = False, 
  encrypt_key = False, logger = {})

_______________
Begin Postmunge

Postmunge returned test column set: 
['column2_bnry', 'column1_NArw', 'column1_1010_0', 'column1_1010_1', 'column1_1010_2', 'column2_NArw', 'address_NArw', 'address_hash_0', 'address_hash_1', 'address_hash_2', 'address_hash_3', 'address_hash_4', 'address_hash_5', 'address_hash_6', 'address_hash_7']

_______________
Postmunge returned ID column set: 
['Automunge_index']

_______________
Postmunge Complete



In [8]:
#Where the function returns numpy arrays of the encoded data
#we could alternatively return pandas dataframes by passing pandasoutput=True
#which we'll apply for below examples
#and turning off default shuffling and printouts

# Demonstrations from Paper

## Categoric Encodings - Figure 1

In [9]:
#'text' (one-hot encoding)

#we'll just apply to one column at a time for demonstrations
df = pd.DataFrame(df_train['column1'].copy())
#and a copy of that column for comparison
df['source_column'] = df['column1'].copy()


train, train_ID, labels, \
val, val_ID, val_labels, \
test, test_ID, test_labels, \
postprocess_dict = \
am.automunge(
  df, \
  shuffletrain = False, \
  printstatus = False)

#show the returned train set
train


Unnamed: 0,column1_NArw,column1_1010_0,column1_1010_1,column1_1010_2,source_column_NArw,source_column_1010_0,source_column_1010_1,source_column_1010_2
0,0,0,1,0,0,0,1,0
1,0,0,1,0,0,0,1,0
2,0,0,1,0,0,0,1,0
3,0,0,1,1,0,0,1,1
4,0,0,1,1,0,0,1,1
5,0,1,0,0,0,1,0,0
6,0,0,0,1,0,0,0,1
7,1,0,0,0,1,0,0,0
8,1,0,0,0,1,0,0,0


In [10]:
#'1010' (binary encoding)

df = pd.DataFrame(df_train['column1'].copy())
df['source_column'] = df['column1'].copy()


train, train_ID, labels, \
val, val_ID, val_labels, \
test, test_ID, test_labels, \
postprocess_dict = \
am.automunge(
  df, \
  shuffletrain = False, \
  printstatus = False)

#show the returned train set
train

Unnamed: 0,column1_NArw,column1_1010_0,column1_1010_1,column1_1010_2,source_column_NArw,source_column_1010_0,source_column_1010_1,source_column_1010_2
0,0,0,1,0,0,0,1,0
1,0,0,1,0,0,0,1,0
2,0,0,1,0,0,0,1,0
3,0,0,1,1,0,0,1,1
4,0,0,1,1,0,0,1,1
5,0,1,0,0,0,1,0,0
6,0,0,0,1,0,0,0,1
7,1,0,0,0,1,0,0,0
8,1,0,0,0,1,0,0,0


In [11]:
#'ordl' (ordinal alphabetical)

df = pd.DataFrame(df_train['column1'].copy())
df['source_column'] = df['column1'].copy()


train, train_ID, labels, \
val, val_ID, val_labels, \
test, test_ID, test_labels, \
postprocess_dict = \
am.automunge(
  df, \
  shuffletrain = False, \
  printstatus = False)

#show the returned train set
train

Unnamed: 0,column1_NArw,column1_1010_0,column1_1010_1,column1_1010_2,source_column_NArw,source_column_1010_0,source_column_1010_1,source_column_1010_2
0,0,0,1,0,0,0,1,0
1,0,0,1,0,0,0,1,0
2,0,0,1,0,0,0,1,0
3,0,0,1,1,0,0,1,1
4,0,0,1,1,0,0,1,1
5,0,1,0,0,0,1,0,0
6,0,0,0,1,0,0,0,1
7,1,0,0,0,1,0,0,0
8,1,0,0,0,1,0,0,0


In [12]:
#'ord3' (ordinal by frequency)

df = pd.DataFrame(df_train['column1'].copy())
df['source_column'] = df['column1'].copy()


train, train_ID, labels, \
val, val_ID, val_labels, \
test, test_ID, test_labels, \
postprocess_dict = \
am.automunge(
  df, \
  shuffletrain = False, \
  printstatus = False)

#show the returned train set
train

Unnamed: 0,column1_NArw,column1_1010_0,column1_1010_1,column1_1010_2,source_column_NArw,source_column_1010_0,source_column_1010_1,source_column_1010_2
0,0,0,1,0,0,0,1,0
1,0,0,1,0,0,0,1,0
2,0,0,1,0,0,0,1,0
3,0,0,1,1,0,0,1,1
4,0,0,1,1,0,0,1,1
5,0,1,0,0,0,1,0,0
6,0,0,0,1,0,0,0,1
7,1,0,0,0,1,0,0,0
8,1,0,0,0,1,0,0,0


In [13]:
#'bnry' (boolean)


df = pd.DataFrame(df_train['column2'].copy())
df['source_column'] = df['column2'].copy()


train, train_ID, labels, \
val, val_ID, val_labels, \
test, test_ID, test_labels, \
postprocess_dict = \
am.automunge(
  df, \
  shuffletrain = False, \
  printstatus = False)

#show the returned train set
train

Unnamed: 0,column2_bnry,source_column_bnry,column2_NArw,source_column_NArw
0,1,1,0,0
1,1,1,0,0
2,1,1,0,0
3,1,1,0,0
4,0,0,0,0
5,0,0,0,0
6,0,0,0,0
7,1,1,1,1
8,1,1,1,1


## String Parsing - Figure 2

In [14]:
#'splt' (string overlap identification)

df = pd.DataFrame(df_train['address'].copy())
df['source_column'] = df['address'].copy()


train, train_ID, labels, \
val, val_ID, val_labels, \
test, test_ID, test_labels, \
postprocess_dict = \
am.automunge(
  df, \
  shuffletrain = False, \
  printstatus = False)

#show the returned train set
train

Unnamed: 0,address_NArw,address_hash_0,address_hash_1,address_hash_2,address_hash_3,address_hash_4,address_hash_5,address_hash_6,address_hash_7,source_column_NArw,source_column_hash_0,source_column_hash_1,source_column_hash_2,source_column_hash_3,source_column_hash_4,source_column_hash_5,source_column_hash_6,source_column_hash_7
0,0,12,7,22,39,40,1,9,0,0,12,7,22,39,40,1,9,0
1,0,44,27,36,39,12,44,1,41,0,44,27,36,39,12,44,1,41
2,0,4,27,22,39,25,1,4,0,0,4,27,22,39,25,1,4,0
3,0,11,7,22,39,40,1,9,0,0,11,7,22,39,40,1,9,0
4,0,11,13,39,40,1,9,0,0,0,11,13,39,40,1,9,0,0
5,0,13,27,22,39,25,1,4,0,0,13,27,22,39,25,1,4,0
6,0,8,7,13,39,12,44,1,41,0,8,7,13,39,12,44,1,41
7,1,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0
8,0,40,1,0,0,0,0,0,0,0,40,1,0,0,0,0,0,0


In [15]:
#'splt' (string overlap identification)

df = pd.DataFrame(df_train['address'].copy())
df['source_column'] = df['address'].copy()

#the paper noted also that parameters may be passed to string parsing functions
#to exclude spaces and special characters
#such as to promote single word activations
#here is a demonstration (was not shown in Figure 2)

#Here we pass 'space_and_punctuation' parameter as False to exclude special characters from overlaps

train, train_ID, labels, \
val, val_ID, val_labels, \
test, test_ID, test_labels, \
postprocess_dict = \
am.automunge(
  df, \
  shuffletrain = False, \
  assigncat = {'splt':['address'], 'excl':['source_column']}, \
  assignparam = {'splt' : {'address' : {'space_and_punctuation':False}}}, \
  printstatus = False)

#show the returned train set
train

Unnamed: 0,source_column,address_NArw,address_splt_Altamonte,address_splt_Peterson,address_splt_Maitland,address_splt_Orlando,address_splt_Springs,address_splt_North,address_splt_32714,address_splt_South,address_splt_32715,address_splt_32789
0,"1234 North Peterson St Orlando, FL 32714",0,0,1,0,1,0,1,1,0,0,0
1,"2345 South Anderson St Altamonte Springs, FL 3...",0,1,0,0,0,1,0,0,1,1,0
2,"3456 South Peterson St Maitland, FL 32789",0,0,1,1,0,0,0,0,1,0,1
3,"4567 North Peterson St Orlando, FL 32714",0,0,1,0,1,0,1,1,0,0,0
4,"5678 Avenue St Orlando, FL 32714",0,0,0,0,1,0,0,1,0,0,0
5,"6789 South Peterson St Maitland, FL 32789",0,0,1,1,0,0,0,0,1,0,1
6,"5858 North Other St Altamonte Springs, FL 32715",0,1,0,0,0,1,1,0,0,1,0
7,,1,0,0,0,0,0,0,0,0,0,0
8,"Orlando, FL",0,0,0,0,1,0,0,0,0,0,0


In [16]:
#'splt' (string overlap identification)

df = pd.DataFrame(df_train['address'].copy())
df['source_column'] = df['address'].copy()

#the paper noted also that the parsing inspection is incremented from max entry length
#down to a configurable minimum length for overlap detection threshold
#here is a demonstration (was not shown in Figure 2)

#Here we pass 'misplit' parameter as 32 such as to filter out overlaps below this threshold

train, train_ID, labels, \
val, val_ID, val_labels, \
test, test_ID, test_labels, \
postprocess_dict = \
am.automunge(
  df, \
  shuffletrain = False, \
  assigncat = {'splt':['address'], 'excl':['source_column']}, \
  assignparam = {'splt' : {'address' : {'minsplit':32}}}, \
  printstatus = False)

#show the returned train set
train

Unnamed: 0,source_column,address_NArw,"address_splt_ South Peterson St Maitland, FL 32789","address_splt_ North Peterson St Orlando, FL 32714"
0,"1234 North Peterson St Orlando, FL 32714",0,0,1
1,"2345 South Anderson St Altamonte Springs, FL 3...",0,0,0
2,"3456 South Peterson St Maitland, FL 32789",0,1,0
3,"4567 North Peterson St Orlando, FL 32714",0,0,1
4,"5678 Avenue St Orlando, FL 32714",0,0,0
5,"6789 South Peterson St Maitland, FL 32789",0,1,0
6,"5858 North Other St Altamonte Springs, FL 32715",0,0,0
7,,1,0,0
8,"Orlando, FL",0,0,0


In [17]:
#'spl2' (string overlap ordinal)

df = pd.DataFrame(df_train['address'].copy())
df['source_column'] = df['address'].copy()


train, train_ID, labels, \
val, val_ID, val_labels, \
test, test_ID, test_labels, \
postprocess_dict = \
am.automunge(
  df, \
  shuffletrain = False, \
  assigncat = {'spl2':['address'], 'excl':['source_column']}, \
  printstatus = False)

#show the returned train set
train

Unnamed: 0,source_column,address_NArw,address_spl2_ord3
0,"1234 North Peterson St Orlando, FL 32714",0,1
1,"2345 South Anderson St Altamonte Springs, FL 3...",0,3
2,"3456 South Peterson St Maitland, FL 32789",0,2
3,"4567 North Peterson St Orlando, FL 32714",0,1
4,"5678 Avenue St Orlando, FL 32714",0,4
5,"6789 South Peterson St Maitland, FL 32789",0,2
6,"5858 North Other St Altamonte Springs, FL 32715",0,3
7,,1,6
8,"Orlando, FL",0,5


In [18]:
#'spl5' (spl2 w/ excluded non-overlaps)


df = pd.DataFrame(df_train['address'].copy())
df['source_column'] = df['address'].copy()


train, train_ID, labels, \
val, val_ID, val_labels, \
test, test_ID, test_labels, \
postprocess_dict = \
am.automunge(
  df, \
  shuffletrain = False, \
  assigncat = {'spl5':['address'], 'excl':['source_column']}, \
  printstatus = False)

#show the returned train set
train

Unnamed: 0,source_column,address_NArw,address_spl5_ord3
0,"1234 North Peterson St Orlando, FL 32714",0,2
1,"2345 South Anderson St Altamonte Springs, FL 3...",0,4
2,"3456 South Peterson St Maitland, FL 32789",0,3
3,"4567 North Peterson St Orlando, FL 32714",0,2
4,"5678 Avenue St Orlando, FL 32714",0,1
5,"6789 South Peterson St Maitland, FL 32789",0,3
6,"5858 North Other St Altamonte Springs, FL 32715",0,4
7,,1,1
8,"Orlando, FL",0,1


In [19]:
#'sp15' (string overlap with allowed concurrent activations)


df = pd.DataFrame(df_train['address'].copy())
df['source_column'] = df['address'].copy()


train, train_ID, labels, \
val, val_ID, val_labels, \
test, test_ID, test_labels, \
postprocess_dict = \
am.automunge(
  df, \
  shuffletrain = False, \
  assigncat = {'sp15':['address'], 'excl':['source_column']}, \
  printstatus = False)

#show the returned train set
train

Unnamed: 0,source_column,address_NArw,"address_sp15_ South Peterson St Maitland, FL 32789","address_sp15_ North Peterson St Orlando, FL 32714","address_sp15_ St Altamonte Springs, FL 32715","address_sp15_ St Orlando, FL 32714","address_sp15_Orlando, FL",address_sp15_erson St,address_sp15_ South,address_sp15_ North
0,"1234 North Peterson St Orlando, FL 32714",0,0,1,0,1,1,1,0,1
1,"2345 South Anderson St Altamonte Springs, FL 3...",0,0,0,1,0,0,1,1,0
2,"3456 South Peterson St Maitland, FL 32789",0,1,0,0,0,0,1,1,0
3,"4567 North Peterson St Orlando, FL 32714",0,0,1,0,1,1,1,0,1
4,"5678 Avenue St Orlando, FL 32714",0,0,0,0,1,1,0,0,0
5,"6789 South Peterson St Maitland, FL 32789",0,1,0,0,0,0,1,1,0
6,"5858 North Other St Altamonte Springs, FL 32715",0,0,0,1,0,0,0,0,1
7,,1,0,0,0,0,0,0,0,0
8,"Orlando, FL",0,0,0,0,0,1,0,0,0


A variation on string parsing with concurrent activations is available to reduce the dimensionality of the returned set, by way of a binary encoding consolidation of the set of activations. For example if sp15 returned 8 columns, and two of the rows had the same set of activations, the binary consolidation would assign a distinct activation set for those two rows represented in a reduced number of columns, here we see that taking place for rows 0/3 and 2/5.

In [20]:
#'sp19' (string overlap with allowed concurrent activations, binary consolidated activations)


df = pd.DataFrame(df_train['address'].copy())
df['source_column'] = df['address'].copy()


train, train_ID, labels, \
val, val_ID, val_labels, \
test, test_ID, test_labels, \
postprocess_dict = \
am.automunge(
  df, \
  shuffletrain = False, \
  assigncat = {'sp19':['address'], 'excl':['source_column']}, \
  printstatus = False)

#show the returned train set
train

Unnamed: 0,source_column,address_NArw,address_sp19_0,address_sp19_1,address_sp19_2
0,"1234 North Peterson St Orlando, FL 32714",0,1,0,1
1,"2345 South Anderson St Altamonte Springs, FL 3...",0,1,0,0
2,"3456 South Peterson St Maitland, FL 32789",0,1,1,0
3,"4567 North Peterson St Orlando, FL 32714",0,1,0,1
4,"5678 Avenue St Orlando, FL 32714",0,0,1,0
5,"6789 South Peterson St Maitland, FL 32789",0,1,1,0
6,"5858 North Other St Altamonte Springs, FL 32715",0,0,1,1
7,,1,0,0,0
8,"Orlando, FL",0,0,0,1


Another variation is available with the "sbst" transforms - sbst is sort of a simpler version of string parsing in which instead of comparing string character subsets of entries to string character subsets of other entries, the sbst string parsing only compares string character subsets of entries to complete character representations of other entries, such as to identify presence of overlaps between complete entries and subsets of other entries. Here is a demonstration, which for this example will identify the entry "Orlando, FL" present in as a subset of some of the other entries:

In [21]:
#'sbst' (string overlap with complete entries)


df = pd.DataFrame(df_train['address'].copy())
df['source_column'] = df['address'].copy()


train, train_ID, labels, \
val, val_ID, val_labels, \
test, test_ID, test_labels, \
postprocess_dict = \
am.automunge(
  df, \
  shuffletrain = False, \
  assigncat = {'sbst':['address'], 'excl':['source_column']}, \
  printstatus = False)

#show the returned train set
train

Unnamed: 0,source_column,address_NArw,"address_sbst_Orlando, FL"
0,"1234 North Peterson St Orlando, FL 32714",0,1
1,"2345 South Anderson St Altamonte Springs, FL 3...",0,0
2,"3456 South Peterson St Maitland, FL 32789",0,0
3,"4567 North Peterson St Orlando, FL 32714",0,1
4,"5678 Avenue St Orlando, FL 32714",0,1
5,"6789 South Peterson St Maitland, FL 32789",0,0
6,"5858 North Other St Altamonte Springs, FL 32715",0,0
7,,1,0
8,"Orlando, FL",0,1


## Parsing Unbounded Sets - Figure 3

In [22]:
#'nmcm' (string parse for number, commas ok)

df = pd.DataFrame(df_train['address'].copy())
df['source_column'] = df['address'].copy()


train, train_ID, labels, \
val, val_ID, val_labels, \
test, test_ID, test_labels, \
postprocess_dict = \
am.automunge(
  df, \
  shuffletrain = False, \
  assigncat = {'nmcm':['address'], 'excl':['source_column']}, \
  printstatus = False)

#show the returned train set
train

Unnamed: 0,source_column,address_NArw,address_nmcm
0,"1234 North Peterson St Orlando, FL 32714",0,32714.0
1,"2345 South Anderson St Altamonte Springs, FL 3...",0,32715.0
2,"3456 South Peterson St Maitland, FL 32789",0,32789.0
3,"4567 North Peterson St Orlando, FL 32714",0,32714.0
4,"5678 Avenue St Orlando, FL 32714",0,32714.0
5,"6789 South Peterson St Maitland, FL 32789",0,32789.0
6,"5858 North Other St Altamonte Springs, FL 32715",0,32715.0
7,,1,32735.714844
8,"Orlando, FL",1,32735.714844


In [23]:
#'nmc2' (nmcm with z-score)

df = pd.DataFrame(df_train['address'].copy())
df['source_column'] = df['address'].copy()


train, train_ID, labels, \
val, val_ID, val_labels, \
test, test_ID, test_labels, \
postprocess_dict = \
am.automunge(
  df, \
  shuffletrain = False, \
  assigncat = {'nmc2':['address'], 'excl':['source_column']}, \
  printstatus = False)

#show the returned train set
train

Unnamed: 0,source_column,address_NArw,address_nmc2_nmbr
0,"1234 North Peterson St Orlando, FL 32714",0,-0.68876
1,"2345 South Anderson St Altamonte Springs, FL 3...",0,-0.657041
2,"3456 South Peterson St Maitland, FL 32789",0,1.690181
3,"4567 North Peterson St Orlando, FL 32714",0,-0.68876
4,"5678 Avenue St Orlando, FL 32714",0,-0.68876
5,"6789 South Peterson St Maitland, FL 32789",0,1.690181
6,"5858 North Other St Altamonte Springs, FL 32715",0,-0.657041
7,,1,0.0
8,"Orlando, FL",1,0.0


In [24]:
#'nmc3' (nmcm with min-max)

df = pd.DataFrame(df_train['address'].copy())
df['source_column'] = df['address'].copy()


train, train_ID, labels, \
val, val_ID, val_labels, \
test, test_ID, test_labels, \
postprocess_dict = \
am.automunge(
  df, \
  shuffletrain = False, \
  assigncat = {'nmc3':['address'], 'excl':['source_column']}, \
  printstatus = False)

#show the returned train set
train

Unnamed: 0,source_column,address_NArw,address_nmc3_mnmx
0,"1234 North Peterson St Orlando, FL 32714",0,0.0
1,"2345 South Anderson St Altamonte Springs, FL 3...",0,0.013333
2,"3456 South Peterson St Maitland, FL 32789",0,1.0
3,"4567 North Peterson St Orlando, FL 32714",0,0.0
4,"5678 Avenue St Orlando, FL 32714",0,0.0
5,"6789 South Peterson St Maitland, FL 32789",0,1.0
6,"5858 North Other St Altamonte Springs, FL 32715",0,0.013333
7,,1,0.289524
8,"Orlando, FL",1,0.289524


In [25]:
#'strn' (string extraction of non-numeric characters)

#this was noted in paper and not shown in figure
#strn is similar to nmcm but extracts longest length non-numeric character set

#by default it is followed with an ord3 ordinal for numeric encoding

df = pd.DataFrame(df_train['address'].copy())
df['source_column'] = df['address'].copy()


train, train_ID, labels, \
val, val_ID, val_labels, \
test, test_ID, test_labels, \
postprocess_dict = \
am.automunge(
  df, \
  shuffletrain = False, \
  assigncat = {'strn':['address'], 'excl':['source_column']}, \
  printstatus = False)

#show the returned train set
train

Unnamed: 0,source_column,address_NArw,address_strn_ord3
0,"1234 North Peterson St Orlando, FL 32714",0,1
1,"2345 South Anderson St Altamonte Springs, FL 3...",0,5
2,"3456 South Peterson St Maitland, FL 32789",0,2
3,"4567 North Peterson St Orlando, FL 32714",0,1
4,"5678 Avenue St Orlando, FL 32714",0,3
5,"6789 South Peterson St Maitland, FL 32789",0,2
6,"5858 North Other St Altamonte Springs, FL 32715",0,4
7,,1,7
8,"Orlando, FL",0,6


In [26]:
#'srch' (categoric string search)

df = pd.DataFrame(df_train['address'].copy())
df['source_column'] = df['address'].copy()

#note that the srch transform accepts parameter 'search'
#as a list of search terms


train, train_ID, labels, \
val, val_ID, val_labels, \
test, test_ID, test_labels, \
postprocess_dict = \
am.automunge(
  df, \
  shuffletrain = False, \
  assigncat = {'srch':['address'], 'excl':['source_column']}, \
  assignparam = {'srch' : {'address'   : {'search' : ['Maitland', 'Orlando', 'Altamonte Springs']}}}, \
  printstatus = False)

#show the returned train set
train

Unnamed: 0,source_column,address_NArw,address_srch_Maitland,address_srch_Orlando,address_srch_Altamonte Springs
0,"1234 North Peterson St Orlando, FL 32714",0,0,1,0
1,"2345 South Anderson St Altamonte Springs, FL 3...",0,0,0,1
2,"3456 South Peterson St Maitland, FL 32789",0,1,0,0
3,"4567 North Peterson St Orlando, FL 32714",0,0,1,0
4,"5678 Avenue St Orlando, FL 32714",0,0,1,0
5,"6789 South Peterson St Maitland, FL 32789",0,1,0,0
6,"5858 North Other St Altamonte Springs, FL 32715",0,0,0,1
7,,1,0,0,0
8,"Orlando, FL",0,0,1,0


In [27]:
#'srch' (categoric string search)

#the paper noted potential to aggregate search terms into a common activation
#this is achieved by passing parameter search with embedded lists of terms to be aggregated

#Here we'll demonstrate by aggregating ['Maitland', 'Orlando'] into common activation
#in context of pass search parameter as [['Maitland', 'Orlando'], 'Altamonte Springs']

df = pd.DataFrame(df_train['address'].copy())
df['source_column'] = df['address'].copy()


train, train_ID, labels, \
val, val_ID, val_labels, \
test, test_ID, test_labels, \
postprocess_dict = \
am.automunge(
  df, \
  shuffletrain = False, \
  assigncat = {'srch':['address'], 'excl':['source_column']}, \
  assignparam = {'srch' : {'address'   : {'search' : [['Maitland', 'Orlando'], 'Altamonte Springs']}}}, \
  printstatus = False)

#show the returned train set
train

Unnamed: 0,source_column,address_NArw,address_srch_Orlando,address_srch_Altamonte Springs
0,"1234 North Peterson St Orlando, FL 32714",0,1,0
1,"2345 South Anderson St Altamonte Springs, FL 3...",0,0,1
2,"3456 South Peterson St Maitland, FL 32789",0,1,0
3,"4567 North Peterson St Orlando, FL 32714",0,1,0
4,"5678 Avenue St Orlando, FL 32714",0,1,0
5,"6789 South Peterson St Maitland, FL 32789",0,1,0
6,"5858 North Other St Altamonte Springs, FL 32715",0,0,1
7,,1,0,0
8,"Orlando, FL",0,1,0


In [28]:
#'src2' (categoric string search)

#the paper also noted a variation with potential improved efficiency based on 
#added assumptions of whether the target set has a narrow range of entries

#This is available with the src2 transform, which returns comparable
#activations as srch (just a different suffix appender)


df = pd.DataFrame(df_train['address'].copy())
df['source_column'] = df['address'].copy()


train, train_ID, labels, \
val, val_ID, val_labels, \
test, test_ID, test_labels, \
postprocess_dict = \
am.automunge(
  df, \
  shuffletrain = False, \
  assigncat = {'src2':['address'], 'excl':['source_column']}, \
  assignparam = {'src2' : {'address'   : {'search' : ['Maitland', 'Orlando', 'Altamonte Springs']}}}, \
  printstatus = False)

#show the returned train set
train

Unnamed: 0,source_column,address_NArw,address_src2_Maitland,address_src2_Orlando,address_src2_Altamonte Springs
0,"1234 North Peterson St Orlando, FL 32714",0,0,1,0
1,"2345 South Anderson St Altamonte Springs, FL 3...",0,0,0,1
2,"3456 South Peterson St Maitland, FL 32789",0,1,0,0
3,"4567 North Peterson St Orlando, FL 32714",0,0,1,0
4,"5678 Avenue St Orlando, FL 32714",0,0,1,0
5,"6789 South Peterson St Maitland, FL 32789",0,1,0,0
6,"5858 North Other St Altamonte Springs, FL 32715",0,0,0,1
7,,1,0,0,0
8,"Orlando, FL",0,0,1,0


In [29]:
#'src4' (categoric string search, ordinal encoding)

#the convention is that for cases of multiple activations to same row
#entries toward end of search parameter list take precendence

df = pd.DataFrame(df_train['address'].copy())
df['source_column'] = df['address'].copy()

#note that the src4 transform accepts parameter 'search'
#as a list of search terms


train, train_ID, labels, \
val, val_ID, val_labels, \
test, test_ID, test_labels, \
postprocess_dict = \
am.automunge(
  df, \
  shuffletrain = False, \
  assigncat = {'src4':['address'], 'excl':['source_column']}, \
  assignparam = {'src4' : {'address'   : {'search' : ['Maitland', 'Orlando', 'Altamonte Springs']}}}, \
  printstatus = False)

#show the returned train set
train

Unnamed: 0,address_src4,source_column,address_NArw
0,2,"1234 North Peterson St Orlando, FL 32714",0
1,3,"2345 South Anderson St Altamonte Springs, FL 3...",0
2,1,"3456 South Peterson St Maitland, FL 32789",0
3,2,"4567 North Peterson St Orlando, FL 32714",0
4,2,"5678 Avenue St Orlando, FL 32714",0
5,1,"6789 South Peterson St Maitland, FL 32789",0
6,3,"5858 North Other St Altamonte Springs, FL 32715",0
7,0,,1
8,2,"Orlando, FL",0


## Family Tree Aggregations - Figure 5

In [30]:
#'or19' (a family tree of transformations)

df = pd.DataFrame(df_train['address'].copy())
df['source_column'] = df['address'].copy()

#here we'll also activate the NArw_marker parameter
#to be consistent with the demonstrations in paper

train, train_ID, labels, \
val, val_ID, val_labels, \
test, test_ID, test_labels, \
postprocess_dict = \
am.automunge(
  df, \
  shuffletrain = False, \
  assigncat = {'or19':['address'], 'excl':['source_column']}, \
  printstatus = False)

#show the returned train set
train

Unnamed: 0,source_column,address_NArw,address_or19_sp13_ord3,address_or19_sp13_sp10_ord3,address_or19_nmc8_nmbr,address_or19_1010_0,address_or19_1010_1,address_or19_1010_2,address_or19_1010_3
0,"1234 North Peterson St Orlando, FL 32714",0,1,2,-0.68876,0,0,0,1
1,"2345 South Anderson St Altamonte Springs, FL 3...",0,3,1,-0.657041,0,0,1,0
2,"3456 South Peterson St Maitland, FL 32789",0,2,3,1.690181,0,0,1,1
3,"4567 North Peterson St Orlando, FL 32714",0,1,2,-0.68876,0,1,0,0
4,"5678 Avenue St Orlando, FL 32714",0,4,2,-0.68876,0,1,0,1
5,"6789 South Peterson St Maitland, FL 32789",0,2,3,1.690181,0,1,1,1
6,"5858 North Other St Altamonte Springs, FL 32715",0,3,1,-0.657041,0,1,1,0
7,,1,6,1,0.0,0,0,0,0
8,"Orlando, FL",0,5,1,0.0,1,0,0,0


In [31]:
#here we'll validate or19 by running same data through postmunge(.)

#the comparison between the returned sets demonstrates consistency of transforms
#these type of comparisons between functions applied to different kinds of data
#are one of the ways we validate

test, test_ID, test_labels, \
postreports_dict = \
am.postmunge(postprocess_dict, df, \
             pandasoutput = True, printstatus = False)

del test['source_column']
del train['source_column']

test == train

Unnamed: 0,address_NArw,address_or19_sp13_ord3,address_or19_sp13_sp10_ord3,address_or19_nmc8_nmbr,address_or19_1010_0,address_or19_1010_1,address_or19_1010_2,address_or19_1010_3
0,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True
5,True,True,True,True,True,True,True,True
6,True,True,True,True,True,True,True,True
7,True,True,True,True,True,True,True,True
8,True,True,True,True,True,True,True,True


In [32]:
#Note that if we want to recover the original form of data from the returned sets
#we can run an inversion operation with the postmunge(.) function

#although some of the transformations, such as UPCS, may not have full info recovery 
#(such as here with case configuration)

df_invert, recovered_list, inversion_info_dict = \
am.postmunge(postprocess_dict, test, inversion='test', \
             pandasoutput=True, printstatus=False)

df_invert['address']

0             1234 NORTH PETERSON ST ORLANDO, FL 32714
1    2345 SOUTH ANDERSON ST ALTAMONTE SPRINGS, FL 3...
2            3456 SOUTH PETERSON ST MAITLAND, FL 32789
3             4567 NORTH PETERSON ST ORLANDO, FL 32714
4                     5678 AVENUE ST ORLANDO, FL 32714
5            6789 SOUTH PETERSON ST MAITLAND, FL 32789
6      5858 NORTH OTHER ST ALTAMONTE SPRINGS, FL 32715
7                                                  NaN
8                                          ORLANDO, FL
Name: address, dtype: object

In [33]:
#the paper noted that the UPCS (upper case conversion) could be turned off by parameter
#parameters are passed to a transformation functions by assignparam using
#the associated transformation category populated in the family tree of the root category
#that is associated with the transfomration funciton
#for the case of the 'or19' root category, the transformation category
#associated with the UPCS transfomation function is 'or19'
#but note this is due to the 'or19' entered as a transformation category in the family
#tree, not due to the root category of same name

#Here we'll demonstrate turning off UPCS

#'or19' (a family tree of transformations)

df = pd.DataFrame(df_train['address'].copy())
df['source_column'] = df['address'].copy()

#here we'll also activate the NArw_marker parameter
#to be consistent with the demonstrations in paper

train, train_ID, labels, \
val, val_ID, val_labels, \
test, test_ID, test_labels, \
postprocess_dict = \
am.automunge(
  df, \
  shuffletrain = False, \
  assigncat = {'or19':['address'], 'excl':['source_column']}, \
  assignparam = {'or19' : {'address' : {'activate' : False}}}, \
  printstatus = False)

#show the returned train set
train

Unnamed: 0,source_column,address_NArw,address_or19_sp13_ord3,address_or19_sp13_sp10_ord3,address_or19_nmc8_nmbr,address_or19_1010_0,address_or19_1010_1,address_or19_1010_2,address_or19_1010_3
0,"1234 North Peterson St Orlando, FL 32714",0,1,2,-0.68876,0,0,0,1
1,"2345 South Anderson St Altamonte Springs, FL 3...",0,3,1,-0.657041,0,0,1,0
2,"3456 South Peterson St Maitland, FL 32789",0,2,3,1.690181,0,0,1,1
3,"4567 North Peterson St Orlando, FL 32714",0,1,2,-0.68876,0,1,0,0
4,"5678 Avenue St Orlando, FL 32714",0,4,2,-0.68876,0,1,0,1
5,"6789 South Peterson St Maitland, FL 32789",0,2,3,1.690181,0,1,1,1
6,"5858 North Other St Altamonte Springs, FL 32715",0,3,1,-0.657041,0,1,1,0
7,,1,6,1,0.0,0,0,0,0
8,"Orlando, FL",0,5,1,0.0,1,0,0,0


In [34]:
#As an admittedly low scale demonstration of processing time between train and test sets
#Let's run a quick comparison on this small data set
#Here we're interested in relative performance

import timeit

In [35]:
#'or19' (a family tree of transformations)

df = pd.DataFrame(df_train['address'].copy())
df['source_column'] = df['address'].copy()

start_time = timeit.default_timer()

train, train_ID, labels, \
val, val_ID, val_labels, \
test, test_ID, test_labels, \
postprocess_dict = \
am.automunge(
  df, \
  shuffletrain = False, \
  assigncat = {'or19':['address'], 'excl':['source_column']}, \
  printstatus = False)

print()
print("time elapsed:")
print(timeit.default_timer() - start_time)


time elapsed:
0.19930649600000017


In [36]:
#corresponding postmunge application

start_time = timeit.default_timer()

test, test_ID, test_labels, \
postreports_dict \
= am.postmunge(postprocess_dict, df, \
               pandasoutput = True, \
               printstatus = False)

print()
print("time elapsed:")
print(timeit.default_timer() - start_time)


time elapsed:
0.04989512499999993


Thanks!