# Instruments related features

This notebook starts from the file generated at step 00 (notebook '00_analysis_instMapping.ipynb') and adds features that will be useful for the predictions.

## Data import

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime

In [2]:
#Instruments dataset
#location and name of the pickle file to import
datafolder = "../data/"
filename = "01_instrumentsdf.pkl"

#reference date on which data are received - all due dates after report date are related to open instruments
ReportDate = datetime.datetime(year=2018, month=9, day=28)

#load the instruments dataframe and preview its first rows
inst = pd.read_pickle(datafolder + filename)
inst.head()

Unnamed: 0_level_0,customer_id,customer_name_1,debtor_id,debtor_name_1,invoice_number,invoice_date,due_date,invoice_amount,purchase_amount,purchase_amount_open,...,value_date,currency,purchase_examination,prosecution,deduction_amount,payment_amount,payment_date,document_status,uid,ttype
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
!17M529/1K:0350001/41405,2004035,Suffolk Armata Sarl,350001,Lombardy LLC,!17M529/1K,2017-10-27,2017-11-26,7263.27,7263.27,0.0,...,2017-10-27,Schweizer Franken,,Nein,,"[nan, nan, nan, nan, nan]","[NaT, NaT, NaT, NaT, NaT]",storniert,!17M529/1K:0350001/41405,"[0, 3, 2, 2, 7]"
.4078:62811/42717,2004022,Cooperative Inventors Corporation,62811,Haitian Deane Ltd,.4078,2017-11-07,2017-12-07,824.2,824.2,0.0,...,2017-11-07,Schweizer Franken,,Nein,0.0,"[824.2, 824.2]","[2017-12-11 00:00:00, 2017-12-11 00:00:00]",historisch,.4078:62811/42717,"[0, 1]"
0000138939:002/3266,002-1001,Universal Billies Limited,2,Sherrill Grayson & Son Ltd,0000138939,2016-03-18,2016-06-16,55566.0,55566.0,0.0,...,2016-03-18,US-Dollar,,Nein,0.0,"[55566.0, 55566.0]","[2016-09-14 00:00:00, 2016-09-14 00:00:00]",historisch,0000138939:002/3266,"[0, 1]"
0000140268:002/5158,002-1001,Universal Billies Limited,2,Sherrill Grayson & Son Ltd,0000140268,2016-05-26,2016-09-23,54595.8,54595.8,0.0,...,2016-05-26,US-Dollar,,Nein,0.0,"[54595.8, 54595.8]","[2016-11-10 00:00:00, 2016-11-10 00:00:00]",historisch,0000140268:002/5158,"[0, 1]"
0000140699:002/7114,002-1001,Universal Billies Limited,2,Sherrill Grayson & Son Ltd,0000140699,2016-06-22,2016-10-20,60150.0,60150.0,0.0,...,2016-06-22,US-Dollar,,Nein,0.0,"[50644.5, 557.89, 8947.61, 50644.5, 557.89, 89...","[2017-02-17 00:00:00, 2017-03-28 00:00:00, 201...",historisch,0000140699:002/7114,"[0, 0, 0, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5]"


In [3]:
#number of rows (instruments) and columns of the loaded dataframe
inst.shape

(66593, 37)

### 1. Add some new fields to instruments

In [4]:
#add some fields to instruments
#number of entries in posting_date for each instrument
inst["nrecords"] = inst.posting_date.apply(len)
#transaction types without multiples; sorted so that two instruments with the same
#set of types always get the same tuple (tuple(set(x)) alone depends on insertion order)
inst["ttypeset"] = inst.ttype.apply(lambda x: tuple(sorted(set(x))))

### 2. Instruments count, bad instruments removal, count of past due instruments

In [5]:
#count different types of instruments
print("Instruments total: ", inst.shape[0])

#bad instruments: transaction type 4 or 16 present
#@@note the hardcoded transaction type codes - bulk confirmation and nans need to be removed
badi = inst.ttypeset.apply(lambda x: (4 in x) or (16 in x)).astype(bool)
print("bulk_purchase or nan in transaction type: {:}".format(sum(badi)))

#index of good instruments, kept as a boolean Series so the mask is never a plain list
ii = ~badi
#note: the threshold actually applied is 0.009, not 0 as the printed message suggests
print("Removing {:} instruments with invoice_amount<0...".format(sum(inst.invoice_amount<=0.009)))
inst = inst[ii & (inst.invoice_amount>0.009)].copy()

print("remaining ", inst.shape[0])
print("Non zero discharge_amount: ", inst[inst.discharge_amount>0].shape[0])
print("Non zero deducted_amount: ", inst[inst.deduction_amount>0].shape[0])
print("Non zero deducted or discharge_amount: ", 
      inst[(inst.discharge_amount>0) | (inst.deduction_amount>0)].shape[0])

#deeming the following to be the only repayment transaction types:
#"incoming payments", "Subsequent assignment of the payment", "Cancel invoice client", 
#"Cancellation invoice", "Credit entry"
repayment_types = {1, 5, 6, 7, 9}
#True where none of the repayment transaction types is present (computed once, reused below)
no_repayment = inst.ttypeset.apply(lambda x: repayment_types.isdisjoint(x)).astype(bool)
#whole days between due date and report date (positive when the due date is in the past)
days_past_due = (ReportDate - inst.due_date).dt.days

dayst1 = 90
dayst2 = 180
for dayst in (dayst1, dayst2):
    print("Past due more than {:} days and no repayments: {:}".format(
        dayst, int(((days_past_due > dayst) & no_repayment).sum())))



Instruments total:  66593
bulk_purchase or nan in transaction type: 6770
Removing 3 instruments with invoice_amount<0...
remaining  59820
Non zero discharge_amount:  2696
Non zero deducted_amount:  1232
Non zero deducted or discharge_amount:  3916
Past due more than 90 days and no repayments: 3977
Past due more than 180 days and no repayments: 3396


### 3. impairments and past due

Definitions used:
impairment1: any non-zero discharge or deducted amount  
pastdueXX: instrument is due more than XX days prior to report date and no transactions of type 4,5, or 10 recorded  

  
Note1: it is not clear whether some other transaction types should also be considered repayments  
Note2: instruments with partial repayments will not be flagged. The current problem is that we do not understand how paid amounts are aggregated between repayments

In [6]:
#replace nan with 0
def _xor0(x):
    """
    Return 0. when the scalar x is nan, x itself otherwise.
    """
    return 0. if np.isnan(x) else x

#element-wise version of _xor0. otypes is fixed to float so that the output dtype does not
#depend on the first element and so that empty inputs return an empty array instead of raising
xor0 = np.vectorize(_xor0, otypes=[float])

In [9]:
impthr = 0.009 #threshold for impairments

def _pastdue_flag(frame, min_days, report_date=ReportDate):
    """
    Flag instruments that are more than min_days days past their due date.

    An instrument is flagged when either
    - its document_status is "offen" and report_date is more than min_days days after due_date, or
    - its document_status is not "offen", has_prosecution is True and last_payment_date is more
      than min_days days after due_date.

    frame: dataframe with columns due_date, last_payment_date, document_status, has_prosecution
    min_days: number of whole days that has to be exceeded
    report_date: reference date used for open instruments (defaults to ReportDate)
    Returns a boolean Series aligned with frame.
    """
    is_open = frame.document_status == "offen"
    days_at_report = (report_date - frame.due_date).dt.days
    days_at_last_payment = (frame.last_payment_date - frame.due_date).dt.days
    open_late = is_open & (days_at_report > min_days)
    closed_late = ~is_open & (days_at_last_payment > min_days) & frame.has_prosecution
    return open_late | closed_late

#define the discharge loss as difference between invoice_amount and discharge amount...
inst["discharge_loss"] = xor0(inst.invoice_amount - inst.discharge_amount)
inst.loc[pd.isnull(inst.discharge_amount), "discharge_loss"] = 0. #...but it is 0 for NaN discharge_amount

#define the presence of impairment1 as deduction_amount above the impairment threshold
inst["has_impairment1"] = inst.deduction_amount > impthr

#instruments with prosecution (needed by the past due flags below)
inst["has_prosecution"] = inst.prosecution == "Ja"

#past due flags: open instruments past the due date at report date, or closed instruments
#with prosecution whose last payment came after the due date
inst["is_pastdue"] = _pastdue_flag(inst, 0)

#same as above, more than 30 days past the due date
inst["is_pastdue30"] = _pastdue_flag(inst, 30)

#same as above, more than 90 days past the due date
inst["is_pastdue90"] = _pastdue_flag(inst, 90)

#same as above, more than 180 days past the due date
inst["is_pastdue180"] = _pastdue_flag(inst, 180)

#amount of the last payment for a certain instrument (nan, hence 0, if there is no payment entry)
inst["last_payment_amount"] = xor0(inst.payment_amount.apply(lambda x: x[-1] if len(x) else np.nan))

#sum of all the distinct entries for a single instrument
inst["total_repayment"] = xor0(inst.payment_amount.apply(lambda x: sum(set(x)))) #sum of distinct entries

#sum of discharge_loss and deduction_amount
inst["total_impairment"] = xor0(inst.discharge_loss) + xor0(inst.deduction_amount)

#field indicating if an instrument is open or not
inst["is_open"] = inst.document_status == "offen"

#sort instruments dataset by invoice date and debtor id
inst = inst.sort_values(by=["invoice_date", "debtor_id"], ascending=[True, True])

#WEEKEND COUNTS (weekend payments highlight is apparently useful for fraud detection)
#nan if all dates are null; otherwise the fraction of non-null dates with weekday()==5 or 6
def we_share(lst):
    """
    This function returns the share of weekend dates among the non-null dates in lst.
    Null entries are ignored. The result is nan when lst holds no non-null date,
    and 0.0 when there are dates but none of them falls on a weekend.
    """
    valid_dates = [d for d in lst if not pd.isnull(d)]
    if not valid_dates:
        return np.nan
    #weekday() is 5 for Saturday and 6 for Sunday
    weekend_hits = sum(1 for d in valid_dates if d.weekday() > 4)
    return weekend_hits / len(valid_dates)

#share of weekend payments among the payment dates of each instrument
inst["we_payment_share"] = inst.payment_date.apply(we_share)
print("Weekend payment shares: {:}".format(inst.we_payment_share.value_counts()))

#True when the instrument has a purchase amount (if not, the client is not involved in repayment)
inst["has_purchase"] = inst.purchase_amount > 0.009

#True when the instrument has a deduction amount
inst["has_deduction"] = inst.deduction_amount > 0.009

#True when the due date is before the report date
inst["is_due"] = inst.due_date < ReportDate

#True when the instrument has a discharge amount
inst["has_discharge"] = inst.discharge_amount > 0.001

Weekend payment shares: 0.000000    47431
1.000000      165
0.500000        5
0.200000        1
0.111111        1
Name: we_payment_share, dtype: int64


### 4. Unexplained cash flows

In [10]:
#unexplained cashflows - instruments that are not open and whose invoice_amount
#is not matched by total_repayment+total_impairment
#(likely problems with the total_impairment definition)
explained_amount = inst.total_repayment + inst.total_impairment
inst["cash_check"] = inst.invoice_amount - explained_amount
nrp = ~inst.is_open & (inst.cash_check > 0.01)
inst["unexpl"] = nrp

In [13]:
#preview of the first rows, restricted to these columns and shown one column per row
feature_cols = [
    'nrecords', 'ttypeset', 'discharge_loss', 'has_impairment1', 'has_prosecution',
    'is_open', 'is_pastdue', 'is_pastdue30', 'is_pastdue90', 'is_pastdue180',
    'last_payment_amount', 'total_repayment', 'total_impairment', 'we_payment_share',
    'has_purchase', 'has_deduction', 'is_due', 'has_discharge', 'cash_check', 'unexpl',
]
inst[feature_cols].head().T

uid,2744:79/231,2861:79/232,2932:79/233,1472:489/688,2042:512/645
nrecords,2,2,2,2,4
ttypeset,"(0, 6)","(0, 6)","(0, 5)","(0, 6)","(0, 5, 6)"
discharge_loss,0,0,0,0,164.35
has_impairment1,False,False,False,False,False
has_prosecution,False,False,False,False,True
is_open,False,False,False,False,False
is_pastdue,False,False,False,False,True
is_pastdue30,False,False,False,False,True
is_pastdue90,False,False,False,False,True
is_pastdue180,False,False,False,False,True


In [8]:
#write the instruments dataframe, including the features added above, to the data folder
filename = "02_instrumentsdf_2.pkl"
output_path = datafolder + filename
inst.to_pickle(output_path)