# Preprocessing pipeline - benchmark models

This notebook starts from the output of step 4 and prepares the data for use as input to the benchmark prediction models.

In [1]:
import numpy as np
import pandas as pd
import pickle
import datetime
from preprocessing_pipeline import *

## Importing data and removing unnecessary instruments

In [2]:
# Input location, relative to the directory the notebook is run from.
datafolder = "../data/"
# NOTE(review): the notebook intro says the input is the output of step 4, but this
# file name is prefixed 03_ -- confirm this is the intended input file.
inputfilename = '03_instrumentsdf_deg1stats.pkl'

# Feature selection: column names handed to preprocessing_pipeline, one list per
# feature group (how each group is transformed is defined inside the pipeline).
feat_str = ['currency']
feat_quant = [
    'has_purchase', 'dd_value_date',
    # cd_* statistics
    'cd_lent_c', 'cd_repaid_c', 'cd_impaired1_c', 'cd_pastdue90_c', 'cd_trend_a',
    # c_* statistics
    'c_lent_c', 'c_repaid_c', 'c_impaired1_c', 'c_pastdue90_c', 'c_trend_a',
    # d_* statistics
    # Fix: this entry used to be a second 'cd_lent_c', which duplicated a column and
    # left the d_* group without its *_lent_c counterpart.
    # Presumably 'd_lent_c' exists in the input frame -- verify against df.columns.
    'd_lent_c', 'd_repaid_c', 'd_impaired1_c', 'd_pastdue90_c', 'd_trend_a',
    'd_we_payment_share',
]
feat_exp = ['invoice_amount', 'purchase_amount']
feat_date = ['invoice_date']

In [3]:
# Read the pickled instruments table and preview its first rows.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
input_path = datafolder + inputfilename
df = pd.read_pickle(input_path)
df.head()

Unnamed: 0_level_0,customer_id,customer_name_1,debtor_id,debtor_name_1,invoice_number,invoice_date,due_date,invoice_amount,purchase_amount,purchase_amount_open,...,c_pastdue90_c,c_pastdue180_c,c_trend_a,c_we_payment_share,c_pd_mismatch_mean,c_pd_mismatch_std,c_repaid_r,c_impaired1_r,c_pastdue90_r,c_pastdue180_r
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2744:79/231,2004008,Castillo GmbH,79,Sana Hyannis Sarl,2744,2013-07-23,2013-08-02,913.7,0.0,0.0,...,0.0,0.0,0.0,,,,,,,
2861:79/232,2004008,Castillo GmbH,79,Sana Hyannis Sarl,2861,2013-07-30,2013-08-09,2233.45,0.0,0.0,...,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0
2932:79/233,2004008,Castillo GmbH,79,Sana Hyannis Sarl,2932,2013-08-06,2013-08-16,1370.5,0.0,0.0,...,0.0,0.0,7.185198,,,,0.0,0.0,0.0,0.0
1472:489/688,2004009,Orpheus Wyandotte Supply LLC,489,Isfahan SA,1472,2013-08-13,2013-08-23,9195.1,0.0,0.0,...,0.0,0.0,0.0,,,,,,,
2042:512/645,2004009,Orpheus Wyandotte Supply LLC,512,Aldrich Chloe GmbH,2042,2013-08-13,2013-08-23,4594.6,0.0,0.0,...,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0


In [4]:
# Instruments that are not due yet cannot be labelled, so only the due ones are kept.
due_mask = df.is_due
print("{:} instruments that are not due yet, dropping...".format((~due_mask).sum()))
df = df.loc[due_mask, :]
print("{:} instruments remaining".format(len(df)))

2201 instruments that are not due yet, dropping...
57619 instruments remaining


## Pipeline

In [5]:
# Label columns; the pipeline below is run once per entry.
targets = [
    'has_impairment1',
    'is_pastdue90',
    'is_pastdue180',
]

### Generating preprocessed data for impairment, pastdue90 and pastdue180 credit events - Shuffle mode

In [6]:
# File-name prefixes, index-aligned with `targets` (each target is saved under its own prefix).
pfixes = ['imp_', 'p90_', 'p180_']
# Folder the pipeline writes the train/test sets to.
# Fix: datafolder already ends with "/", so appending "/preproc_traintest/" produced a
# doubled separator ("../data//preproc_traintest/"); strip it before joining.
output_path = datafolder.rstrip("/") + "/preproc_traintest/"

In [7]:
# Shuffle mode: request roughly 80% of the rows for training and 20% (minus one) for testing.
n_instruments = df.shape[0]
shuffle_trainsize = int(n_instruments*.80)
shuffle_testsize = int(n_instruments*.20)-1

for target_col, file_prefix in zip(targets, pfixes):
    # Each run is written to output_path; the names below only keep the results
    # of the last target once the loop has finished.
    y_train, X_train, y_test, X_test, feature_labels = preprocessing_pipeline(
        df, feat_str, feat_quant, feat_exp, feat_date, target_col, 'invoice_date',
        trainsize=shuffle_trainsize, testsize=shuffle_testsize,
        save_to_file=True, outputfolder=output_path, prefix=file_prefix)

Sampling 46095 for train and 11522 for test sets by shuffling...
Running the pipeline, target feature is has_impairment1...
Train y: 46095 total, 968 class_1 observations (2.10%) > 0
pipeline fit_transform for train set...
Test y: 11522 total, 227 class_1 observations (1.97%) > 0
pipeline transform only for test set...
Saving with file name prefix shuffle_imp_ and postfix _19072_750...
...done.
Sampling 46095 for train and 11522 for test sets by shuffling...
Running the pipeline, target feature is is_pastdue90...
Train y: 46095 total, 3344 class_1 observations (7.25%) > 0
pipeline fit_transform for train set...
Test y: 11522 total, 850 class_1 observations (7.38%) > 0
pipeline transform only for test set...
Saving with file name prefix shuffle_p90_ and postfix _19072_750...
...done.
Sampling 46095 for train and 11522 for test sets by shuffling...
Running the pipeline, target feature is is_pastdue180...
Train y: 46095 total, 2865 class_1 observations (6.22%) > 0
pipeline fit_transform f

### Generating preprocessed data for impairment, pastdue90 and pastdue180 credit events - Timewise mode

In [8]:
# Cutoff date handed to the pipeline as `testdate` for the time-based split.
tdate = datetime.datetime(year=2018, month=4, day=20)

In [9]:
# Timewise mode: the pipeline splits train and test around the `tdate` cutoff instead of shuffling.
# NOTE(review): the logged class-1 rates differ sharply between train and test in this mode
# (e.g. is_pastdue90: 8.68% train vs 1.97% test) -- recently due instruments may not yet have
# had time to reach the past-due thresholds; confirm the labels are not censored.
for target_col, file_prefix in zip(targets, pfixes):
    # Each run is written to output_path; the names below only keep the results
    # of the last target once the loop has finished.
    y_train, X_train, y_test, X_test, feature_labels = preprocessing_pipeline(
        df, feat_str, feat_quant, feat_exp, feat_date, target_col, 'invoice_date',
        timewise=True, testdate=tdate,
        save_to_file=True, outputfolder=output_path, prefix=file_prefix)

Splitting train and test sets by time, test cutoff: 2018-04-20 00:00:00...
  45577(79.1%) train, 12042(20.9%) test
Running the pipeline, target feature is has_impairment1...
Train y: 45577 total, 695 class_1 observations (1.52%) > 0
pipeline fit_transform for train set...
Test y: 12042 total, 500 class_1 observations (4.15%) > 0
pipeline transform only for test set...
Saving with file name prefix time_2018-04-20_imp_ and postfix _19072_750...
...done.
Splitting train and test sets by time, test cutoff: 2018-04-20 00:00:00...
  45577(79.1%) train, 12042(20.9%) test
Running the pipeline, target feature is is_pastdue90...
Train y: 45577 total, 3958 class_1 observations (8.68%) > 0
pipeline fit_transform for train set...
Test y: 12042 total, 237 class_1 observations (1.97%) > 0
pipeline transform only for test set...
Saving with file name prefix time_2018-04-20_p90_ and postfix _19072_750...
...done.
Splitting train and test sets by time, test cutoff: 2018-04-20 00:00:00...
  45577(79.1%) 