# Preprocessing pipeline - benchmark models

This notebook starts from the output of step 4 and prepares the data to be fed into the benchmark models for prediction.

In [1]:
import numpy as np
import pandas as pd
import pickle
import datetime
from preprocessing_pipeline import *

## Importing data and cleaning unnecessary instruments

In [2]:
# Input location: folder and file name of the pickled instruments dataframe.
datafolder = "../data/"
inputfilename = '03_instrumentsdf_deg1stats.pkl'

# Feature selection: the four lists below are handed, in this order, to
# preprocessing_pipeline(feat_str, feat_quant, feat_exp, feat_date).
feat_str = ['currency']
# The quantitative features repeat the same statistics for the cd_, c_ and d_ prefixes.
# Fix: the d_ group used to start with a second 'cd_lent_c' (duplicated column),
# which left 'd_lent_c' out of the selection.
# NOTE(review): confirm that 'd_lent_c' exists in the input dataframe.
feat_quant = [
    'has_purchase', 'dd_value_date',
    'cd_lent_c', 'cd_repaid_c', 'cd_impaired1_c', 'cd_pastdue90_c', 'cd_trend_a',
    'c_lent_c', 'c_repaid_c', 'c_impaired1_c', 'c_pastdue90_c', 'c_trend_a',
    'd_lent_c', 'd_repaid_c', 'd_impaired1_c', 'd_pastdue90_c', 'd_trend_a',
    'd_we_payment_share',
]
feat_exp = ['invoice_amount', 'purchase_amount']
feat_date = ['invoice_date']

In [3]:
# Load the pickled instruments dataframe and preview its first rows.
input_path = datafolder + inputfilename
df = pd.read_pickle(input_path)
df.head()

Unnamed: 0_level_0,customer_id,customer_name_1,debtor_id,debtor_name_1,invoice_number,invoice_date,due_date,invoice_amount,purchase_amount,purchase_amount_open,...,c_pastdue90_c,c_pastdue180_c,c_trend_a,c_we_payment_share,c_pd_mismatch_mean,c_pd_mismatch_std,c_repaid_r,c_impaired1_r,c_pastdue90_r,c_pastdue180_r
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2744:79/231,2004008,Castillo GmbH,79,Sana Hyannis Sarl,2744,2013-07-23,2013-08-02,913.7,0.0,0.0,...,0.0,0.0,0.0,,,,,,,
2861:79/232,2004008,Castillo GmbH,79,Sana Hyannis Sarl,2861,2013-07-30,2013-08-09,2233.45,0.0,0.0,...,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0
2932:79/233,2004008,Castillo GmbH,79,Sana Hyannis Sarl,2932,2013-08-06,2013-08-16,1370.5,0.0,0.0,...,0.0,0.0,7.185198,,,,0.0,0.0,0.0,0.0
1472:489/688,2004009,Orpheus Wyandotte Supply LLC,489,Isfahan SA,1472,2013-08-13,2013-08-23,9195.1,0.0,0.0,...,0.0,0.0,0.0,,,,,,,
2042:512/645,2004009,Orpheus Wyandotte Supply LLC,512,Aldrich Chloe GmbH,2042,2013-08-13,2013-08-23,4594.6,0.0,0.0,...,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0


In [4]:
# Drop all instruments that are not due yet, since they can't be labelled.
not_due_count = (~df.is_due).sum()
print("{:} instruments that are not due yet, dropping...".format(not_due_count))
# .copy() makes `ri` an independent dataframe instead of a slice of `df`, so the
# column that the train-test split cell adds to it does not raise pandas'
# SettingWithCopyWarning (and can never write through to `df`).
ri = df.loc[df.is_due, :].copy()
print("{:} instruments remaining".format(ri.shape[0]))

2201 instruments that are not due yet, dropping...
57619 instruments remaining


## Pipeline

In [7]:
# Build the preprocessing pipeline from the four feature lists selected above.
preproc_pipeline = preprocessing_pipeline(feat_str, feat_quant, feat_exp, feat_date)

# Fix: split the filtered dataframe `ri` (due instruments only) instead of the raw
# `df`; instruments that are not due yet cannot be labelled and were dropped earlier.
trainsize = int(ri.shape[0]*.80)
testsize = int(ri.shape[0]*.20)-1
# NOTE(review): the result of this shuffle split is overwritten by the time-based
# split right below - decide which of the two is intended and drop the other.
train_all, test_all = shuffle_train_test(ri, trainsize, testsize, 'invoice_date')

testdate = pd.to_datetime('2018-09-01', yearfirst=True)
train_all, test_all = time_train_test(ri, trainsize, testsize, 'invoice_date', testdate)

# NOTE(review): `target_feature` is not assigned anywhere in this notebook (a later
# cell defines `targetfeature`, without underscore). Presumably it is expected to
# come from the star import or from a deleted cell - confirm, otherwise this line
# raises NameError on a fresh kernel.
y_train, X_train, y_test, X_test, feature_labels = transform_train_test(train_all, test_all, preproc_pipeline, target_feature)

## Train-test split (shuffle or time-based)

In [8]:
# Split training and test sets ('shuffle' or 'time' mode).

# Absolute row counts for the shuffle split: 80% of the rows for training and
# 20% minus one row for testing.
trainsize = int(ri.shape[0]*.80)
testsize = int(ri.shape[0]*.20)-1

#for time splitting
TestDate = pd.to_datetime('2018-09-01', yearfirst=True)
control_feature = 'invoice_date'

#for shuffle splitting
testset_control_feature = 'invoice_date'

split_mode = 'shuffle' #other option is 'time'

if split_mode == 'shuffle':
    print("Sampling {:} for train and {:} for test sets by shuffling...".format(trainsize, testsize))

    # Reset to a positional index first: reset_index returns a new dataframe, so
    # adding the helper column below does not raise SettingWithCopyWarning.
    ri = ri.reset_index(drop=True)
    ri["invoice_date_year"] = ri[testset_control_feature].apply(lambda x: x.year)

    split = StratifiedShuffleSplit(n_splits=1,
                                   train_size=trainsize,
                                   test_size=testsize,
                                   random_state=42)

    # Stratify on the invoice year so that train and test sets keep the same
    # distribution of invoice years; n_splits=1 yields exactly one index pair.
    train_index, test_index = next(split.split(ri, ri.invoice_date_year))
    train_all = ri.iloc[train_index]
    test_all = ri.iloc[test_index]

elif split_mode == 'time':
    print("Splitting train and test sets by time, test cutoff: {:}...".format(TestDate))
    test_all  = ri.loc[ri[control_feature] >= TestDate]
    train_all = ri.loc[ri[control_feature] <  TestDate]
    print("  {:}({:.1f}%) train, {:}({:.1f}%) test".format(train_all.shape[0], 100*train_all.shape[0]/ri.shape[0],
                                                            test_all.shape[0],   100*test_all.shape[0]/ri.shape[0]))

else:
    # Fail loudly: silently skipping the split would leave train_all / test_all
    # undefined, or stale from an earlier cell.
    raise ValueError("Unknown split_mode {!r}; expected 'shuffle' or 'time'".format(split_mode))

Sampling 46095 for train and 11522 for test sets by shuffling...


## Transform data depending on target feature

In [9]:
# Target feature used as the label in the cells below.
# It could be 'has_impairment1', 'is_pastdue90' or 'is_pastdue180'.
targetfeature = 'has_impairment1'

### Train set

In [10]:
print("Running the pipeline, target feature is {:}...".format(targetfeature))

# Training labels: take the target column out of the training dataframe.
y_train = train_all[targetfeature].copy().values
n_train = y_train.shape[0]
n_train_class1 = sum(y_train>0)
print("Train y: {:} total, {:} class_1 observations ({:.2f}%) > 0".format(n_train, n_train_class1, n_train_class1/n_train*100))

# Training features: fit the preprocessing pipeline on the training set and
# transform it in the same call.
print("pipeline fit_transform for train set...")
X_train = preproc_pipeline.fit_transform(train_all)

Running the pipeline, target feature is has_impairment1...
Train y: 46095 total, 968 class_1 observations (2.10%) > 0
pipeline fit_transform for train set...


### Test set

In [11]:
# Test labels: take the target column out of the test dataframe.
y_test = test_all[targetfeature].copy().values
n_test = y_test.shape[0]
n_test_class1 = sum(y_test>0)
print("Test y: {:} total, {:} class_1 observations ({:.2f}%) > 0".format(n_test, n_test_class1, n_test_class1/n_test*100))

# Test features: only transform (no fit) with the pipeline.
# This will be a problem if new categories are encountered here.
print("pipeline transform only for test set...")
X_test = preproc_pipeline.transform(test_all)

Test y: 11522 total, 227 class_1 observations (1.97%) > 0
pipeline transform only for test set...


## Output

In [12]:
# Feature labels exposed by the pipeline through its transformed_names_ attribute;
# kept together for charting and saved with the train/test data below.
feature_labels = preproc_pipeline.transformed_names_

In [16]:
# Creating the reference for the output files: "<yymmdd>_<HHMM>" prefix + run postfix.
# Fix: take a single timestamp and format it with strftime, so that every part is
# zero-padded (day, hour and minute were not, which made prefixes ambiguous and
# badly sortable) and the parts cannot straddle a minute/day boundary.
now = datetime.datetime.now()
year = now.strftime('%y')
month = now.strftime('%m')
day = now.strftime('%d')

prefix = year+month+day+'_'+now.strftime('%H%M')
# NOTE(review): the postfix is hard-coded; keep it in sync with split_mode and
# targetfeature when either of them changes.
postfix = '_shuffle_imp1'
outputfolder = datafolder+'/preproc_traintest/'


def _dump_pickle(obj, name, **dump_kwargs):
    """Pickle `obj` to outputfolder + prefix + name + postfix + '.pkl'.

    The file is opened in a `with` block so the handle is closed (and the data
    flushed) even if pickling fails. Extra keyword arguments are forwarded to
    pickle.dump.
    """
    with open(outputfolder+prefix+name+postfix+'.pkl', "wb") as outfile:
        pickle.dump(obj, outfile, **dump_kwargs)


#saving training and test sets
print("Saving with file name postfix {:}...".format(postfix))
_dump_pickle([X_train, y_train, feature_labels], "_traindata", protocol=4)
_dump_pickle([X_test, y_test, feature_labels], "_testdata", protocol=4)
_dump_pickle(preproc_pipeline, "_preproc_pipeline")
_dump_pickle(feature_labels, "_feature_labels")
print("...done.")

Saving with file name postfix _shuffle_imp1...
...done.
