# Preprocessing pipeline 2 - bond graph feature models

This notebook starts from the output of step 10 (`10_network_featureEng.ipynb`) and prepares the data to be fed into the new bond-graph-featured models for prediction.

In [1]:
import numpy as np
import pandas as pd
import pickle
import datetime
from scripts_ml.preprocessing_pipeline import *

## Importing data and main settings

In [2]:
# --- Input data -------------------------------------------------------------
datafolder = "../data/"
inputfilename = '04_instrumentsdf_bondgraph.pkl'

# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
df = pd.read_pickle(datafolder + inputfilename)

# --- Feature selection ------------------------------------------------------
# feat_str is intentionally empty: each currency is enforced as its own single
# column and listed in feat_quant instead (needed for the timeseq step).
feat_str = []
feat_quant = [
    # one indicator column per currency
    'currency_Schweizer Franken', 'currency_Euro', 'currency_US-Dollar', 'currency_Britisches Pfund',
    # instrument-level columns
    'has_purchase', 'dd_value_date',
    # cd_* statistics
    'cd_lent_c', 'cd_repaid_c', 'cd_impaired1_c', 'cd_pastdue90_c', 'cd_trend_a',
    # c_* statistics
    'c_lent_c', 'c_repaid_c', 'c_impaired1_c', 'c_pastdue90_c', 'c_trend_a',
    # d_* statistics
    'd_repaid_c', 'd_impaired1_c', 'd_pastdue90_c', 'd_trend_a', 'd_we_payment_share',
    # bond graph features - impairment
    'flow_shock_imp1', 'imp_c_node_eff', 'imp_energy', 'imp_d_node_flow',
    # bond graph features - pastdue90
    'flow_shock_p90', 'p90_c_node_eff', 'p90_energy', 'p90_d_node_flow',
    # bond graph features - pastdue180
    # NOTE(review): 'p180_c_node_eff' is not listed although the imp/p90
    # counterparts are - confirm this omission is intentional.
    'flow_shock_p180', 'p180_d_node_flow', 'p180_energy',
]
feat_exp = ['invoice_amount', 'purchase_amount']
feat_date = ['invoice_date']

# --- Settings ---------------------------------------------------------------
# targets and pfixes are index-aligned: pfixes[i] is the file prefix for targets[i].
targets = ['has_impairment1', 'is_pastdue90', 'is_pastdue180']

pfixes = ['imp_bg_', 'p90_bg_', 'p180_bg_']

# datafolder already ends with '/', so do not add another separator here
# (the previous concatenation produced "../data//preproc_traintest/").
output_path = datafolder + "preproc_traintest/"

In [3]:
# Quick visual sanity check of the loaded frame (first five rows).
df.head(n=5)

Unnamed: 0_level_0,customer_id,customer_name_1,debtor_id,debtor_name_1,invoice_number,invoice_date,due_date,invoice_amount,purchase_amount,purchase_amount_open,...,p90_c_node_eff,p90_node_flow,p90_energy,flow_shock_p90,p180_edge_flow,p180_d_node_flow,p180_c_node_eff,p180_node_flow,p180_energy,flow_shock_p180
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2744:79/231,2004008,Castillo GmbH,79,Sana Hyannis Sarl,2744,2013-07-23,2013-08-02,913.7,0.0,0.0,...,239912.0,0.0,0.0,1.0,0.0,0.0,239912.0,0.0,0.0,1.0
2861:79/232,2004008,Castillo GmbH,79,Sana Hyannis Sarl,2861,2013-07-30,2013-08-09,2233.45,0.0,0.0,...,239912.0,0.0,0.0,1.0,0.0,0.0,239912.0,0.0,0.0,1.0
2932:79/233,2004008,Castillo GmbH,79,Sana Hyannis Sarl,2932,2013-08-06,2013-08-16,1370.5,0.0,0.0,...,239912.0,0.0,0.0,1.0,0.0,0.0,239912.0,0.0,0.0,1.0
1472:489/688,2004009,Orpheus Wyandotte Supply LLC,489,Isfahan SA,1472,2013-08-13,2013-08-23,9195.1,0.0,0.0,...,234247.0,1.665866,366437.206374,36.0,0.0,0.0,234247.0,0.54333,119515.16508,1.0
2042:512/645,2004009,Orpheus Wyandotte Supply LLC,512,Aldrich Chloe GmbH,2042,2013-08-13,2013-08-23,4594.6,0.0,0.0,...,234247.0,1.665866,366437.206374,36.0,0.0,0.0,234247.0,0.54333,119515.16508,1.0


In [4]:
# (disabled) Drop all instruments that are not due yet, since they cannot be labelled.
#print("{:} instruments that are not due yet, dropping...".format(sum(~df.is_due)))
#df=df.loc[df.is_due, :]
#print("{:} instruments remaining".format(df.shape[0]))

## Pipelines

### Generating preprocessed data for impairment, pastdue90 and pastdue180 credit events - Shuffle mode

In [None]:
# Shuffle mode: run the preprocessing pipeline once per target, writing each
# result to output_path under the prefix that matches the target's position.
for t, target_name in enumerate(targets):
    y_train, X_train, y_test, X_test, feature_labels = preprocessing_pipeline(
        df, feat_str, feat_quant, feat_exp, feat_date, target_name, 'invoice_date',
        'enriched_shuffle',
        # fractional split; absolute alternative: int(df.shape[0]*.80) / int(df.shape[0]*.20)-1
        trainsize=.8, testsize=.2,
        save_to_file=True, outputpath=output_path, prefix=pfixes[t],
        decompose_currency=True,
    )

### Generating preprocessed data for impairment, pastdue90 and pastdue180 credit events - Timewise mode

In [None]:
# Timewise mode: run the preprocessing pipeline once per target with a
# time-based train/test split. Only the test cutoff date depends on the target,
# so the pipeline call is written once instead of being duplicated per branch.
for t, target_name in enumerate(targets):
    # Select the cutoff by target name (not by list position) so that
    # reordering `targets` cannot silently attach the wrong date.
    # NOTE(review): pastdue180 uses an earlier cutoff - presumably so its
    # labels have time to mature; confirm.
    if target_name == 'is_pastdue180':
        tdate = datetime.datetime(2018, 2, 20)
    else:
        tdate = datetime.datetime(2018, 4, 30)

    y_train, X_train, y_test, X_test, feature_labels = preprocessing_pipeline(
        df, feat_str, feat_quant, feat_exp, feat_date, target_name,
        'invoice_date', 'enriched_time', timewise=True, testdate=tdate,
        save_to_file=True, outputpath=output_path, prefix=pfixes[t],
        decompose_currency=True,
    )

### Generating preprocessed data for impairment, pastdue90 and pastdue180 credit events - Sequential Timewise mode

In [5]:
# Rolling-window sizes for the sequential validation folds.
# Previously tried values are kept for reference.
train_window = 26000  # 16000, 30000, 24000
test_window = 2000  # 4000, 6000, 6000

# The sequential time split works on a different input dataset.
# NOTE: this rebinds `inputfilename` and `df`; re-running earlier cells after
# this point will operate on this dataset, not on the one loaded at the top.
inputfilename = '03_instrumentsdf_deg1stats.pkl'
timeseq_input_path = datafolder + inputfilename
df = pd.read_pickle(timeseq_input_path)

# One settings dict per credit-event type, in the order
# impairment, pastdue90, pastdue180. All three share the same keys.
bond_graph_settings = [
    # --- impairment ---
    {
        'col_to_calc_effort': 'purchase_amount',
        'effort_col': 'imp_edge_eff',
        'flow_col': 'imp_edge_flow',
        'col_to_calc_flow': 'total_impairment',
        'col_ratio_flow': 'invoice_amount',
        'col_mult_flow': None,
        'node_flow_col': 'imp_node_flow',
        'energy_col': 'imp_energy',
        'c_node_eff_col': 'imp_c_node_eff',
        'd_node_flow_col': 'imp_d_node_flow',
        'shock_col': 'flow_shock_imp1',
        'red_coeff': 10**6,
    },

    # --- pastdue90 ---
    {
        'col_to_calc_effort': 'cd_lent_c',  # alternative tried: 'payment_date_mismatch'
        'effort_col': 'p90_edge_eff',
        'flow_col': 'p90_edge_flow',
        'col_to_calc_flow': 'payment_date_mismatch',  # alternative tried: 'cd_pastdue90_r'
        'col_ratio_flow': None,
        'col_mult_flow': 'cd_pastdue90_r',
        'node_flow_col': 'p90_node_flow',
        'energy_col': 'p90_energy',
        'c_node_eff_col': 'p90_c_node_eff',
        'd_node_flow_col': 'p90_d_node_flow',
        'shock_col': 'flow_shock_p90',
        'red_coeff': 10**10,  # alternative tried: 10**4
    },

    # --- pastdue180 ---
    {
        'col_to_calc_effort': 'cd_lent_c',  # alternative tried: 'payment_date_mismatch'
        'effort_col': 'p180_edge_eff',
        'flow_col': 'p180_edge_flow',
        'col_to_calc_flow': 'payment_date_mismatch',
        'col_ratio_flow': None,
        'col_mult_flow': 'cd_pastdue180_r',
        'node_flow_col': 'p180_node_flow',
        'energy_col': 'p180_energy',
        'c_node_eff_col': 'p180_c_node_eff',
        'd_node_flow_col': 'p180_d_node_flow',
        'shock_col': 'flow_shock_p180',
        'red_coeff': 10**10,  # alternative tried: 10**5
    },
]

# Sequential timewise mode: run the rolling-window pipeline once per target.
# Only the test cutoff date depends on the target, so the (long) pipeline call
# is written once instead of being duplicated in an if/else.
#
# Return signature of preproc_pipeline_timeseq depends on the prep flags:
#   validation_prep_only=True (only validation folds):
#       y_valid_train, X_valid_train, y_valid_test, X_valid_test, feature_labels, folds_idx
#   validation_prep_only=False and train_test_prep_only=False (train/test and validation folds):
#       y_train, X_train, y_test, X_test, feature_labels,
#       y_valid_train, X_valid_train, y_valid_test, X_valid_test, folds_idx
#   train_test_prep_only=True (only train/test):
#       y_train, X_train, y_test, X_test, feature_labels
for t, target_name in enumerate(targets):
    # pastdue180 uses an earlier test cutoff than the other targets.
    if target_name == 'is_pastdue180':
        tdate = datetime.datetime(2018, 2, 20)
    else:
        tdate = datetime.datetime(2018, 4, 30)

    (y_valid_train, X_valid_train,
     y_valid_test, X_valid_test,
     feature_labels, folds_idx) = preproc_pipeline_timeseq(
        df, feat_str, feat_quant, feat_exp, feat_date, target_name,
        'invoice_date', 'enriched_time_seq', bond_graph_settings,
        testdate=tdate,
        train_window=train_window, test_window=test_window,
        use_previous_whole_bg=True,
        whole_network_with_bg_file_path="../data/04_instrumentsdf_bondgraph2.pkl",
        save_to_file=True, outputpath=output_path, prefix=pfixes[t],
        export_whole_network=False,
        #whole_network_output_path="../data/04_instrumentsdf_bondgraph2.pkl",
        decompose_currency=True,
        validation_prep_only=True,
        train_test_prep_only=False,
    )

Decomposing currency column to multiple columns with boolean values...
---------MACRO TRAIN SPLIT-----------
2201 instruments that are not due yet, dropping...
57619 instruments remaining
Splitting train and test sets by time, test cutoff: 2018-04-30 00:00:00...
  46101(80.0%) train, 11518(20.0%) test
---------Sequential validation splits-----------
Preparing fold 0 with 26101 train observations and 2000 test observations, starti=2101...
---------Train test for validation fold 0-----------
---------Adding bond graph features 1 of 3 to TRAIN SET for fold 0-----------
Calculating effort and flow for starting dataset with shape (26101, 114)...
Starting bg features - dataset shape: (26101, 116)
Creating the undirected graph of the whole dataset network...
Adding effort and flow feature to the dataset...
Dataset shape after effort and flow features: (26101, 120)
Isolating components and creating directed graphs...
Looking for hybrid nodes...
Modelling the flow...
Creating flow graphs...
104

In [None]:
#trainx = []
#testx = []
#for count, train, test, boh in rolling_window(df.shape[0], 12000, 3000):
#    if count==5:
#        trainx=train
#        testx=test

In [12]:
# `testx` was previously only available as leftover kernel state: its sole
# definition lives in a commented-out cell, so this cell raised NameError on
# Restart & Run All. Rebuild it here from that disabled cell's logic.
# NOTE(review): the rolling_window signature and its 4-tuple yield are taken
# from the commented-out code - confirm against scripts_ml.
trainx, testx = [], []
for count, train, test, _extra in rolling_window(df.shape[0], 12000, 3000):
    if count == 5:  # inspect the window with count 5, as in the disabled cell
        trainx, testx = train, test

df.iloc[testx].shape

(3000, 110)