In [3]:
## Pre-processing pipelines for GoFactoring receivables status prediction
# Uses transactions aggregated into instruments, and the pre-processing developed in GF_analysis4.ipynb (@@ TODO: move it into a .py module)
import numpy as np
import pandas as pd
import pickle
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import Imputer, LabelBinarizer, StandardScaler, OneHotEncoder
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn_pandas import DataFrameMapper, gen_features
import itertools
import sys
from os import environ

In [38]:
#from work
# Per-user Dropbox folder holding the input data. USERNAME is the Windows variable;
# USER is tried as a fallback so this cell does not die with a KeyError on other platforms
# (a wrong folder then surfaces as a file-not-found error when the data is loaded).
user = environ.get("USERNAME") or environ.get("USER", "")
datafolder= 'C:/Users/{:}/Tradeteq Dropbox/Tradeteq Team/Clients/#GoFactoring/data analysis/'.format(user)
inputfilename = '09272018_instruments2.pkl'

#feature selection
# categorical (string) features
feat_str = ['currency']

# quantitative features, grouped by prefix: cd_*, c_*, d_*
# NOTE(review): the d_* group previously started with a second 'cd_lent_c', which fed the same
# column into the model twice; 'd_lent_c' matches the pattern of the other groups - confirm the
# column exists in the instruments table.
feat_quant = ['has_purchase', 'dd_value_date',
              'cd_lent_c', 'cd_repaid_c', 'cd_impaired1_c', 'cd_pastdue90_c', 'cd_trend_a',
              'c_lent_c', 'c_repaid_c', 'c_impaired1_c', 'c_pastdue90_c', 'c_trend_a',
              'd_lent_c', 'd_repaid_c', 'd_impaired1_c', 'd_pastdue90_c', 'd_trend_a',
              'd_we_payment_share']
# features that are log-scaled before standardisation
feat_exp = ['invoice_amount', 'purchase_amount']
# date features, converted to day ordinals
feat_date = ['invoice_date']

In [6]:
# Load the instruments table from the configured folder and preview the first rows.
# NOTE(review): unpickling executes arbitrary code - only load files from a trusted source.
input_path = datafolder + inputfilename
ri = pd.read_pickle(input_path)
ri.head()

Unnamed: 0_level_0,customer_id,customer_name_1,debtor_id,debtor_name_1,invoice_number,invoice_date,due_date,invoice_amount,purchase_amount,purchase_amount_open,...,c_pastdue180_c,c_trend_a,c_we_payment_share,c_pd_mismatch_mean,c_pd_mismatch_std,c_repaid_r,c_impaired1_r,c_impaired2_r,c_pastdue90_r,c_pastdue180_r
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2744:79/231,2004008,jobs united GmbH,79,Quadroni Linard,2744,2013-07-23,2013-08-02,913.7,0.0,0.0,...,0.0,0.0,,,,,,,,
2861:79/232,2004008,jobs united GmbH,79,Quadroni Linard,2861,2013-07-30,2013-08-09,2233.45,0.0,0.0,...,0.0,0.0,,,,0.0,0.0,0.0,0.0,0.0
2932:79/233,2004008,jobs united GmbH,79,Quadroni Linard,2932,2013-08-06,2013-08-16,1370.5,0.0,0.0,...,0.0,7.185198,,,,0.0,0.0,0.0,0.0,0.0
1472:489/688,2004009,PM Personal GmbH,489,Style Interiors,1472,2013-08-13,2013-08-23,9195.1,0.0,0.0,...,0.0,0.0,,,,,,,,
2042:512/645,2004009,PM Personal GmbH,512,Elektropartner AG,2042,2013-08-13,2013-08-23,4594.6,0.0,0.0,...,0.0,0.0,,,,0.0,0.0,0.0,0.0,0.0


In [7]:
def _as_column_selectors(names):
    """Wrap each bare column name in a one-element list; leave already-wrapped entries alone.

    The pipelines below select columns as ``['col']`` rather than ``'col'``
    (per the sklearn-pandas docs a list selector hands the transformer a 2-D array).
    Skipping entries that are already lists makes this cell safe to re-run:
    the previous version nested the lists one level deeper on every execution.
    """
    return [[name] if isinstance(name, str) else list(name) for name in names]


feat_str = _as_column_selectors(feat_str)
feat_quant = _as_column_selectors(feat_quant)
feat_exp = _as_column_selectors(feat_exp)
feat_date = _as_column_selectors(feat_date)

In [10]:
#utils
#convert datetimes to float
class Date2Num(BaseEstimator, TransformerMixin):
    """Stateless transformer mapping date-like values to float day ordinals.

    Each non-missing entry is converted with ``pd.Timestamp(value).toordinal()``,
    so any time-of-day component is discarded. Entries flagged by ``pd.isnull``
    are returned as NaN.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return a float array with the shape of ``X`` holding the day ordinals."""
        present = ~pd.isnull(X)
        # start from an all-NaN array so missing entries stay NaN
        ordinals = np.full(X.shape, np.nan)
        ordinals[present] = [
            float(pd.Timestamp(value).toordinal())
            for value in X[present]
        ]
        return ordinals

#nan replacer
class ReplaceImputer(BaseEstimator, TransformerMixin):
    """Stateless transformer that overwrites NaN entries with a fixed value.

    Parameters
    ----------
    replacewith : value written wherever the input is NaN (default 999).
    """

    def __init__(self, replacewith=999):
        self.replacewith = replacewith

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with every NaN replaced; ``X`` itself is not modified."""
        filled = X.copy()
        missing = np.isnan(filled)
        filled[missing] = self.replacewith
        return filled

#a log scaler to apply to some quant features
class LogScaler(BaseEstimator, TransformerMixin):
    """Stateless transformer taking the natural log of the non-NaN entries.

    Values below ``ZeroNegReplace`` (zeros and negatives included) are raised to
    ``ZeroNegReplace`` first so the logarithm is always defined. NaN entries are
    passed through unchanged. The result is a float32 array.

    Parameters
    ----------
    ZeroNegReplace : floor applied before taking the log (default 1e-5).
    """

    def __init__(self, ZeroNegReplace=1e-5):
        self.ZeroNegReplace = ZeroNegReplace

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, X):
        """Return the floored, log-transformed float32 copy of ``X``."""
        logged = np.float32(X.copy())
        present = ~np.isnan(logged)
        # boolean indexing yields a copy, so the floor is applied to that copy
        # and the result is written back into place afterwards
        observed = logged[present]
        observed[observed < self.ZeroNegReplace] = self.ZeroNegReplace
        logged[present] = np.log(observed)
        return logged

#cap the outliers greater than M std
class CapOutliers(BaseEstimator, TransformerMixin):
    """Clamp values lying more than ``Maxstd`` standard deviations from the mean.

    ``fit`` stores the NaN-ignoring mean and standard deviation computed over the
    whole input (a single pair of scalars, no per-column axis is given).
    ``transform`` moves every non-NaN value outside ``mean +/- Maxstd * std`` onto
    the nearer boundary; NaN entries are left as they are. The result is float32.

    Parameters
    ----------
    Maxstd : half-width of the allowed band, in standard deviations (default 3.).
    """

    def __init__(self, Maxstd=3.):
        self.Maxstd = Maxstd

    def fit(self, X, y=None):
        """Record the mean and standard deviation of ``X``, ignoring NaNs."""
        self.mean = np.nanmean(X)
        self.std = np.nanstd(X)
        return self

    def transform(self, X):
        """Return a float32 copy of ``X`` with out-of-band values capped."""
        capped = np.float32(X.copy())
        present = ~np.isnan(capped)
        observed = capped[present]
        limit = self.Maxstd * self.std
        too_far = np.abs(observed - self.mean) > limit
        # the sign of the deviation selects the upper or the lower boundary
        observed[too_far] = self.mean + limit * np.sign(observed[too_far] - self.mean)
        capped[present] = observed
        return capped

In [15]:
#pipelines
# Step specifications shared by the per-column transformer chains below.
# NOTE(review): sklearn.preprocessing.Imputer was removed in later scikit-learn releases in
# favour of sklearn.impute.SimpleImputer; this notebook needs a version that still ships it.
_impute_mean = {'class': Imputer, 'strategy': "mean"}
_cap_4std = {'class': CapOutliers, 'Maxstd': 4}
_standardize = {'class': StandardScaler}

# dates: ordinal conversion -> cap -> impute -> standardise
trans_date = gen_features(
    columns=feat_date,
    classes=[{'class': Date2Num}, _cap_4std, _impute_mean, _standardize],
)

# quantitative features: impute -> cap -> standardise
# NOTE(review): this chain imputes before capping, the other two cap before imputing - confirm intended.
trans_quant = gen_features(
    columns=feat_quant,
    classes=[_impute_mean, _cap_4std, _standardize],
)

# amounts: log-scale -> cap -> impute -> standardise
trans_exp = gen_features(
    columns=feat_exp,
    classes=[{'class': LogScaler, 'ZeroNegReplace': 1e-3}, _cap_4std, _impute_mean, _standardize],
)

# string features: label binarisation
trans_str = gen_features(
    columns=feat_str,
    classes=[LabelBinarizer],
)

# the concatenation order fixes the order of the output columns
all_feature_defs = trans_quant + trans_exp + trans_str + trans_date
preproc_pipeline = DataFrameMapper(all_feature_defs)

In [16]:
# Display the assembled mapper so the per-column transformer chains can be inspected.
preproc_pipeline

DataFrameMapper(default=False, df_out=False,
        features=[(['has_purchase'], [Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0), CapOutliers(Maxstd=4), StandardScaler(copy=True, with_mean=True, with_std=True)]), (['dd_value_date'], [Imputer(axis=0, copy=True, missing_values='NaN', strategy='mean', verbose=0), CapOutlier...lues='NaN', strategy='mean', verbose=0), StandardScaler(copy=True, with_mean=True, with_std=True)])],
        input_df=False, sparse=False)

In [17]:
#drop all instruments that are not due yet
# Keep only the rows flagged as due; report how many rows are removed and how many remain.
due_mask = ri.is_due
print("{:} instruments that are not due yet, dropping...".format((~due_mask).sum()))
ri = ri.loc[due_mask, :]
print("{:} instruments remaining".format(len(ri)))

2201 instruments that are not due yet, dropping...
57619 instruments remaining


In [36]:
# split training and test sets ('shuffle' and 'time' mode)

# NOTE(review): trainsize + testsize is smaller than the number of rows, so in 'shuffle' mode
# a few instruments end up in neither set. Kept as-is to reproduce the existing split.
trainsize = int(ri.shape[0]*.80)
testsize = int(ri.shape[0]*.20)-1

#for time splitting
TestDate = pd.to_datetime('2018-09-01', yearfirst=True)
control_feature = 'invoice_date'

#for shuffle splitting
testset_control_feature = 'invoice_date'

split_mode = 'shuffle' #other option is 'time'

if split_mode == 'shuffle':
    print("Sampling {:} for train and {:} for test sets by shuffling...".format(trainsize, testsize))

    # stratification variable: the invoice year, so both sets keep the same year mix
    ri["invoice_date_year"] = ri[testset_control_feature].apply(lambda x: x.year)

    split = StratifiedShuffleSplit(n_splits=1,
                                    train_size = trainsize,
                                    test_size = testsize,
                                    random_state=42)

    # the splitter returns positional indices; after the reset they are also valid labels for .loc
    ri = ri.reset_index(drop=True)

    # n_splits=1, so take the single (train, test) pair directly instead of looping
    train_index, test_index = next(split.split(ri, ri.invoice_date_year))
    train_all = ri.loc[train_index]
    test_all = ri.loc[test_index]

elif split_mode == 'time':
    print("Splitting train and test sets by time, test cutoff: {:}...".format(TestDate))
    # everything dated on or after the cutoff is held out for testing
    test_all  = ri.loc[ri[control_feature] >= TestDate]
    train_all = ri.loc[ri[control_feature] <  TestDate]
    print("  {:}({:.1f}%) train, {:}({:.1f}%) test".format(train_all.shape[0], 100*train_all.shape[0]/ri.shape[0],
                                                            test_all.shape[0],   100*test_all.shape[0]/ri.shape[0]))

else:
    # fail loudly: otherwise train_all/test_all stay undefined, or stale from an earlier run of this cell
    raise ValueError("Unknown split_mode {!r}; expected 'shuffle' or 'time'".format(split_mode))

Sampling 46095 for train and 11522 for test sets by shuffling...


In [44]:
# Name of the column used as the prediction target (label) in the cells below.
# Candidates listed by the author: 'has_impairment1', 'is_pastdue90', 'is_pastdue180'.
targetfeature = 'has_impairment1'

In [40]:
print("Running the pipeline, target feature is {:}...".format(targetfeature))

# Training set: pull the labels out as a detached numpy array and report the class balance.
y_train = train_all[targetfeature].copy().values
n_train = y_train.shape[0]
n_train_positive = (y_train > 0).sum()
print("Train y: {:} total, {:} ({:.2f}%) > 0".format(n_train, n_train_positive, n_train_positive/n_train*100))

# Fit the preprocessing on the training rows and transform them in one pass.
print("pipeline fit_transform for train set...")
X_train = preproc_pipeline.fit_transform(train_all)

Running the pipeline, target feature is has_impairment1...
Train y: 46095 total, 968 (2.10%) > 0
pipeline fit_transform for train set...


In [42]:
# Test set: pull the labels out as a detached numpy array and report the class balance.
y_test = test_all[targetfeature].copy().values
n_test = y_test.shape[0]
n_test_positive = (y_test > 0).sum()
print("Test y: {:} total, {:} ({:.2f}%) > 0".format(n_test, n_test_positive, n_test_positive/n_test*100))

# Apply the preprocessing to the test rows with transform only, so it is not re-fitted here.
# Author's caveat: will be a problem if new categories are encountered here.
print("pipeline transform only for test set...")
X_test = preproc_pipeline.transform(test_all)

Test y: 11522 total, 227 (1.97%) > 0
pipeline transform only for test set...


In [43]:
#group the category labels together for charting
# Names of the mapper's output columns, taken from the mapper after it has been run above;
# they are stored next to the transformed arrays when the data sets are saved.
feature_labels = preproc_pipeline.transformed_names_

In [None]:
# suffix appended to every output file name
postfix = '_imp1'

# target folder for the output files; '' means the current working directory
outputfolder = ''


def _dump_pickle(obj, path, **dump_kwargs):
    """Pickle ``obj`` to ``path``; the ``with`` block guarantees the file is flushed and closed."""
    with open(path, "wb") as fh:
        pickle.dump(obj, fh, **dump_kwargs)


#saving training and test sets
print("Saving with file name postfix {:}...".format(postfix))
# the previous version passed bare open(...) handles to pickle.dump and never closed them
_dump_pickle([X_train, y_train, feature_labels], outputfolder+"traindata" + postfix, protocol=4)
_dump_pickle([X_test, y_test, feature_labels], outputfolder+"testdata" + postfix, protocol=4)
# the fitted mapper and the label list are written with the default pickle protocol, as before
_dump_pickle(preproc_pipeline, outputfolder+"preproc_pipeline" + postfix)
_dump_pickle(feature_labels, outputfolder+"feature_labels" + postfix)
print("...done.")