# Experiments Pipeline Demonstration

This notebook first assembles an "experiments_dict" describing the property variations for each numbered experiment scenario. Upon completion of each experiment scenario, the training pipeline saves the cumulative experiments_dict to an external file, populated with a log of the performance metrics recorded across repetitions.

A spreadsheet "04 - Experiment scenarios key.xlsx" is provided separately, which may serve as a key between the numbered experiment scenarios and their associated property variations.

The populated experiments_dict entries are then aggregated to generate the Figures in a separate notebook, "5 - Experiments - generation of plots for figures".

In [1]:
import pandas as pd
import numpy as np

from Automunge import *
am = AutoMunge()

from catboost import CatBoostClassifier
from catboost import CatBoostRegressor

from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

import datetime as dt
from copy import deepcopy
import pickle
import statistics

In [2]:
#notebook_number offsets the scenario numbering used as experiments_dict keys
#(the numbering in the experiments_dict cell starts from notebook_number * 480)
#uncomment one of the alternatives below to generate a different block of numbers
notebook_number = 0
# notebook_number = 1
# notebook_number = 2
# notebook_number = 3
# notebook_number = 4
# notebook_number = 5

In [3]:
#infill types iterated over when assembling experiments_dict
assigninfill_scenarios = ['stdrdinfill', 'adjinfill', 'modeinfill', 'MLinfill']

#one ML_cmnd specification per autoML library
ML_cmnd_scenarios = [
  {'ML_cmnd' : {'autoML_type' : library}}
  for library in ('randomforest', 'catboost', 'autogluon')
]

#injection ratios iterated over when assembling experiments_dict
#(the first ratio is the integer 0, the last is the float 1.)
assignnan_missingatrandom_scenarios = [
  {'inject_ratio' : ratio}
  for ratio in (0, 0.1, 0.33, 0.66, 1.)
]

target_missingatrandom_columns = ['numeric_column', 'categoric_column']

#consecutive [min, max] ranges spanning 0 through 1
minmax_range_notatrandom_numericcolumn = [
  [0, 0.33], [0.33, 0.67], [0.67, 1.0],
]

# entry_ratio_notatrandom_categoriccolumn = \
# [
#   {'topentry' : 0.},
#   {'topentry' : 0.33},
#   {'topentry' : 0.66},
#   {'topentry' : 1.},
# ]

#scenarios are assembled both without and with the NArw marker
NArw_marker_scenario = [False, True]

In [4]:
#dataset names iterated over when assembling experiments_dict
Datasets = ['Boston_Housing', 'AllstateClaimsSeverity', 'IEEE-CIS']


#feature columns retained from the housing data
Top_fifteen_Housing = [
  'OverallQual', 'GrLivArea', 'TotalBsmtSF', 'BsmtFinSF1', 'OverallCond',
  'YearBuilt', 'LotArea', 'GarageCars', 'GarageType', '2ndFlrSF',
  '1stFlrSF', 'GarageArea', 'YearRemodAdd', 'OpenPorchSF', 'BsmtUnfSF',
]

#columns targeted for missing data injections
Housing_numeric_target   = 'GrLivArea'
Housing_categoric_target = 'OverallQual'

#ranked bins of the numeric target, listed by their tlbn suffix number
Housing_numeric_bin_ranking = [
  'GrLivArea_tlbn_' + str(bin_number)
  for bin_number in (8, 0, 6, 5, 4, 7, 3, 2, 1)
]

#ranked entries of the categoric target
Housing_categoric_entry_ranking = [
  'OverallQual_' + str(entry) + '.0'
  for entry in (7, 8, 5, 4, 6, 9, 3, 10, 2, 1)
]


#feature columns retained from the Allstate data
Top_fifteen_Allstate = [
  'cat80', 'cat12', 'cont7', 'cat101', 'cat100',
  'cat79', 'cont2', 'cat81', 'cont14', 'cat57',
  'cat53', 'cat2', 'cat72', 'cat111', 'cont12',
]

#columns targeted for missing data injections
Allstate_numeric_target   = 'cont7'
Allstate_categoric_target = 'cat80'

#ranked bins of the numeric target, listed by their tlbn suffix number
Allstate_numeric_bin_ranking = [
  'cont7_tlbn_' + str(bin_number)
  for bin_number in (8, 7, 6, 0, 5, 2, 3, 4, 1)
]

#ranked entries of the categoric target
Allstate_categoric_entry_ranking = [
  'cat80_' + entry for entry in ('B', 'D', 'C', 'A')
]

#each key maps to a single-entry list, in the same order as the ranking above
#(presumably the keys are the metric values behind the ranking - TODO confirm)
Allstate_metric2_key_categoric = dict(zip(
  (
    0.018307124232432304,
    0.08329083961923034,
    0.09096580710519919,
    0.09403209895288578,
  ),
  ([entry] for entry in Allstate_categoric_entry_ranking),
))

#each key maps to a single-bin list, in the same order as the ranking above
Allstate_metric2_key_numeric = dict(zip(
  (
    0.00756453760699094,
    0.022622084019138344,
    0.02600484958623228,
    0.026589563628192647,
    0.026828957070519688,
    0.028168605903439725,
    0.02822993233999005,
    0.028499276919530803,
    0.02901271933636529,
  ),
  ([bin_name] for bin_name in Allstate_numeric_bin_ranking),
))

#the IEEE scenarios were not conducted due to time constraints

#feature columns retained from the IEEE data
Top_fifteen_IEEE = [
  'card6', 'C13', 'C14', 'C1', 'V283',
  'V294', 'P_emaildomain', 'card2', 'V317', 'TransactionAmt',
  'card1', 'D15', 'C11', 'D2', 'V315',
]

#columns targeted for missing data injections
IEEE_numeric_target   = 'TransactionAmt'
IEEE_categoric_target = 'card6'

#ranked bins of the numeric target, listed by their tlbn suffix number
IEEE_numeric_bin_ranking = [
  'TransactionAmt_tlbn_' + str(bin_number)
  for bin_number in (8, 0, 7, 5, 1, 3, 2, 4, 6)
]

#ranked entries of the categoric target
IEEE_categoric_entry_ranking = [
  'card6_' + entry for entry in ('credit', 'debit', 'charge card')
]

#the final key is shared by two entries, so this one is spelled out in full
#(presumably the keys are the metric values behind the ranking - TODO confirm)
IEEE_metric2_key_categoric = {
  0.001312358180648121  : ['card6_credit'],
  0.0022183086666440754 : ['card6_debit'],
  0.003776204829478047  : ['card6_charge card', 'card6_debit or credit'],
}

#each key maps to a single-bin list, in the same order as the bin ranking above
IEEE_metric2_key_numeric = dict(zip(
  (
    0.002294510109391301,
    0.0025231144376333114,
    0.0031581264605275994,
    0.003166593287499575,
    0.0032258610763029605,
    0.003259728384190641,
    0.0032681952111626167,
    0.0033020625190502972,
    0.0033697971348256583,
  ),
  ([bin_name] for bin_name in IEEE_numeric_bin_ranking),
))

In [5]:
#experiments_dict is a numbered aggregation of each experiment scenario
#for a summary of various scenarios with associated scenario number
#please see separate file "04 - Experiment scenarios key.xlsx"

experiments_dict = {}

#settings that differ between datasets, keyed by the names listed in Datasets
#NOTE(review): the housing top_categoric_bin is a bare entry value (7) while the
#other two use column-prefixed strings - confirm which form entry_ratio expects
dataset_settings = {
  'Boston_Housing' : {
    #alternate local path: 'E:/Benchmark_datasets/Housing/housing_train.csv'
    'path'              : 'housing_train.csv',
    'numeric_target'    : Housing_numeric_target,
    'categoric_target'  : Housing_categoric_target,
    'top_numeric_bin'   : 9,
    'top_categoric_bin' : 7,
    'topfifteen'        : Top_fifteen_Housing,
    'labels_column'     : 'SalePrice',
    'trainID_column'    : 'Id',
    'modeltype'         : 'regression',
  },
  'AllstateClaimsSeverity' : {
    #alternate local path: 'E:/Benchmark_datasets/AllstateClaimsSeverity/train.csv'
    'path'              : 'allstate_train.csv',
    'numeric_target'    : Allstate_numeric_target,
    'categoric_target'  : Allstate_categoric_target,
    'top_numeric_bin'   : 9,
    'top_categoric_bin' : 'cat80_B',
    'topfifteen'        : Top_fifteen_Allstate,
    'labels_column'     : 'loss',
    'trainID_column'    : 'id',
    'modeltype'         : 'regression',
  },
  'IEEE-CIS' : {
    'path'              : 'E:/Benchmark_datasets/IEEE-CISFraudDetection/train_transaction.csv',
    'numeric_target'    : IEEE_numeric_target,
    'categoric_target'  : IEEE_categoric_target,
    'top_numeric_bin'   : 9,
    'top_categoric_bin' : 'card6_credit',
    'topfifteen'        : Top_fifteen_IEEE,
    'labels_column'     : 'isFraud',
    #trainID_column could alternatively be False
    'trainID_column'    : "TransactionID",
    'modeltype'         : 'classification',
  },
}

#scenario numbers for this notebook start from a multiple of 480
i = notebook_number * 480

for NArw_marker in NArw_marker_scenario:

  for dataset in Datasets:

    #a dataset name without settings leaves the prior assignments in place
    settings = dataset_settings.get(dataset)
    if settings is not None:
      path              = settings['path']
      numeric_target    = settings['numeric_target']
      categoric_target  = settings['categoric_target']
      top_numeric_bin   = settings['top_numeric_bin']
      top_categoric_bin = settings['top_categoric_bin']
      topfifteen        = settings['topfifteen']
      labels_column     = settings['labels_column']
      trainID_column    = settings['trainID_column']
      modeltype         = settings['modeltype']

    for assigninfill_scenario in assigninfill_scenarios:

      for targetcolumn in (numeric_target, categoric_target):

        for nantype in ('missingatrandom', 'categoric', 'numeric'):

          #only the ML infill scenario specifies an autoML library
          ml_cmnd = {'autoML_type':'catboost'} if assigninfill_scenario == 'MLinfill' else {}

          assigninfill = {assigninfill_scenario : targetcolumn}

          for injectratio in assignnan_missingatrandom_scenarios:

            ratio = injectratio['inject_ratio']

            #assignnan stays False when nantype does not apply to this target column
            if nantype == 'missingatrandom':
              assignnan = {'injections' : {targetcolumn : injectratio}}

            elif nantype == 'categoric':
              assignnan = False
              if targetcolumn == categoric_target:
                assignnan = {'injections' : {targetcolumn : {'entry_ratio' : {top_categoric_bin : ratio}}}}

            elif nantype == 'numeric':
              assignnan = False
              if targetcolumn == numeric_target:
                #injections are confined to one ninth of the min-max range, ending at top_numeric_bin / 9
                range_floor   = 1/9*(top_numeric_bin-1)
                range_ceiling = 1/9*(top_numeric_bin)
                assignnan = {'injections' : {targetcolumn : {'minmax_range' : {'ratio'  : ratio,
                                                                               'ranges' : [[range_floor, range_ceiling]]}}}}

            #inapplicable combinations are not numbered
            if assignnan is False:
              continue

            experiments_dict[i] = {
              'NArw_marker'  : NArw_marker,
              'dataset'      : dataset,
              'path'         : path,
              'modeltype'    : modeltype,
              'topfifteen'   : topfifteen,
              'labels_column': labels_column,
              'trainID_column': trainID_column,
              'targetcolumn' : targetcolumn,
              'assigninfill_scenario' : assigninfill_scenario,
              'nantype'      : nantype,
              'injectratio'  : ratio,
              'ML_cmnd'      : ml_cmnd,
              'assignnan'    : assignnan,
              'assigninfill' : assigninfill,
            }

            i += 1
            


In [9]:
# for experiment in experiments_dict:
#first batch are for experiments without NArw
#other two batches are scenarios of ML infill with NArw
#note these scenario numbers only line up with the experiments_dict keys when
#notebook_number = 0, since the key numbering starts from notebook_number * 480
experiment_numbers = list(range(160)) + list(range(300, 320, 1)) + list(range(380, 400, 1))

#this is the number of repetitions for each experiment scenario
repetitions = 25

for experiment in experiment_numbers:

  #same dict object as stored in experiments_dict, so updates below are recorded there
  scenario = experiments_dict[experiment]

  print('_______________')
  print()
  print('experiment: ', experiment)
  print(scenario['dataset'])
  print()

  #import data, dropping malformed rows with a warning
  #the error_bad_lines keyword was removed in pandas 2.0 in favor of on_bad_lines,
  #so the current keyword is tried first with a fallback for older pandas versions
  try:
    df_train = pd.read_csv(scenario['path'], on_bad_lines='warn')
  except TypeError:
    df_train = pd.read_csv(scenario['path'], error_bad_lines=False)

  #labels and ID sets designation
  labels_column = scenario['labels_column']
  trainID_column = scenario['trainID_column']

  #take top fifteen columns
  df_train = df_train[scenario['topfifteen'] + [labels_column, trainID_column]]

  #only assigncat is for classification convert to string for catboost
  if scenario['modeltype'] == 'classification':
    assigncat = {'lbos' : labels_column}
  else:
    assigncat = {}

  #this way NArw_marker only included on target column
  #(rebuilt for each experiment since the NArw entries are appended in place)
  transformdict = {'numeric' : {'auntsuncles' : ['nmbr']},
                   'categoric' : {'auntsuncles' : ['1010']},
                  }

  if scenario['NArw_marker'] is True:
    transformdict['numeric']['auntsuncles'].append('NArw')
    transformdict['categoric']['auntsuncles'].append('NArw')

  processdict = {'numeric' : {'functionpointer' : 'nmbr'},
                 'categoric' : {'functionpointer' : '1010'},
                }

  #assign the target column to the custom 'numeric' or 'categoric' category defined above
  targetcolumn = scenario['targetcolumn']
  if targetcolumn in [Housing_numeric_target, Allstate_numeric_target, IEEE_numeric_target]:
    assigncat.update({'numeric' : targetcolumn})
  elif targetcolumn in [Housing_categoric_target, Allstate_categoric_target, IEEE_categoric_target]:
    assigncat.update({'categoric' : targetcolumn})

  #deep copies so the entries stored in experiments_dict are not shared with automunge
  ML_cmnd      = deepcopy(scenario['ML_cmnd'])
  assignnan    = deepcopy(scenario['assignnan'])
  assigninfill = deepcopy(scenario['assigninfill'])

  #device '0' is for macbook, a prior setup used GPU '1'
  #GPU training was previously enabled by passing task_type='GPU' to the catboost models and by
  #ML_cmnd.update({'MLinfill_cmnd' : {'catboost_classifier_model' : {'task_type' : 'GPU', 'devices' : GPU_device },
  #                                   'catboost_regressor_model'  : {'task_type' : 'GPU', 'devices' : GPU_device }}})
  GPU_device = '0'

  print('ML_cmnd')
  print(ML_cmnd)
  print()
  print('assignnan')
  print(assignnan)
  print()
  print('assigninfill')
  print(assigninfill)
  print()

  score_log = []

  for j in range(repetitions):

    print()
    print('repetition number: ', j)

    #now prepare the data
    train, trainID, labels, \
    validation1, validationID1, validationlabels1, \
    test, testID, testlabels, \
    postprocess_dict \
    = am.automunge(df_train,
                   labels_column = labels_column,
                   trainID_column = trainID_column,
                   valpercent=0.25,
                   assigncat = assigncat,
                   ML_cmnd = ML_cmnd,
                   assignnan = assignnan,
                   assigninfill = assigninfill,
                   transformdict = transformdict,
                   processdict = processdict,
                   printstatus = False
                  )

    #catboost accepts categoric features designation
    columntype_report = postprocess_dict['columntype_report']
    categorical_features_indices = \
    columntype_report['boolean'] + columntype_report['ordinal'] \
    + columntype_report['onehot'] + columntype_report['binary']

    #now train our model and access the metric score on validation data
    if scenario['modeltype'] == 'classification':

      #(accuracy_score on model.predict output was a prior alternative)
      metric = 'roc_auc_score'

      model = CatBoostClassifier(devices = GPU_device,)

      model.fit(train,
                labels,
                eval_set=(validation1, validationlabels1),
                cat_features= categorical_features_indices,
                verbose=False,
               )

      #evaluate results on validation
      inferred_labels = model.predict_proba(validation1)

      #roc_auc_score expects the probability of the greater label, which is the second column
      #(the first column is the probability of label 0 and would report 1 - AUC)
      score = roc_auc_score(validationlabels1.astype(int).to_numpy().ravel(), inferred_labels[:,1])

    elif scenario['modeltype'] == 'regression':

      metric = 'rmse'

      model = CatBoostRegressor(devices = GPU_device,)

      model.fit(train,
                labels,
                eval_set=(validation1, validationlabels1),
                cat_features= categorical_features_indices,
                verbose=False,
               )

      #evaluate results on validation
      inferred_labels = model.predict(validation1)

      #root taken explicitly since the squared=False keyword was removed from recent scikit-learn
      score = np.sqrt(mean_squared_error(validationlabels1.to_numpy().ravel(), inferred_labels))

    else:
      #without a score the aggregation below could not be computed
      raise ValueError('unrecognized modeltype: ' + str(scenario['modeltype']))

    score_log.append(score)

    print('score =  ', score)

  #now aggregate the repetitions
  scoremean = sum(score_log) / len(score_log)
  scorestdev = statistics.stdev(score_log)

  print()
  print('scoremean = ', scoremean)
  print('scorestdev = ', scorestdev)
  print()

  scenario.update({'metric' : metric,
                   'score'  : scoremean,
                   'score_log' : score_log,
                   'scorestdev' : scorestdev,
                  })

  #the cumulative experiments_dict is saved after each scenario, in a file numbered by that scenario
  experiments_dict_filename = 'infill_experiments_gpusetting4_' + str(experiment) + '.pickle'

  with open(experiments_dict_filename, 'wb') as handle:
    pickle.dump(experiments_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)


_______________

experiment:  70
Boston_Housing

ML_cmnd
{'autoML_type': 'catboost'}

assignnan
{'injections': {'OverallQual': {'inject_ratio': 0}}}

assigninfill
{'MLinfill': 'OverallQual'}


repetition number:  0
score =   25303.80283291794

repetition number:  1
score =   25231.313300102604

repetition number:  2
score =   34408.8233657513

repetition number:  3
score =   31020.98580027178

repetition number:  4
score =   27274.04134110233

repetition number:  5
score =   29186.28026477407

repetition number:  6
score =   23784.70425667061

repetition number:  7
score =   26426.83100551273

repetition number:  8
score =   23446.85757181771

repetition number:  9
score =   24485.030671937075

repetition number:  10
score =   29739.745818439344

repetition number:  11
score =   24184.442476168828

repetition number:  12
score =   27416.46196476224

repetition number:  13
score =   26513.96999856017

repetition number:  14
score =   29334.66955769958

repetition number:  15
score =   2

score =   25521.92770602078

repetition number:  5
score =   23365.119199088844

repetition number:  6
score =   29952.18913587778

repetition number:  7
score =   28154.16117572973

repetition number:  8
score =   33290.50686501203

repetition number:  9
score =   27261.973171426496

repetition number:  10
score =   21308.27842921409

repetition number:  11
score =   34690.880991600854

repetition number:  12
score =   23902.671711297964

repetition number:  13
score =   24123.265264082394

repetition number:  14
score =   23174.39250742695

repetition number:  15
score =   24875.22354560946

repetition number:  16
score =   24243.165081907642

repetition number:  17
score =   26354.751654580516

repetition number:  18
score =   25674.309271456204

repetition number:  19
score =   31253.51448415184

repetition number:  20
score =   27319.556140544326

repetition number:  21
score =   24850.99555577586

repetition number:  22
score =   23609.245556439964

repetition number:  23
score =