# Network feature engineering - node/edge stats

This notebook starts from the file generated at step 01 (notebook '01_instrumentsFeatures.ipynb') and performs feature engineering by adding node/edge statistics to each instrument.

## Data import

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
import os
import itertools

In [2]:
# Instruments dataset: build the input path from the current Windows user name.
user = os.environ["USERNAME"]

# --- location when running from home ---
datafolder = "C:/Users/{}/Dropbox/University/MscDataScience_Birkbeck/thesis_project/data/".format(user)
filename = "instrumentsdf_2.pkl"

# --- location when running from work (kept for reference) ---
#filename = "181109_instruments_merged.pkl"
#datafolder = "C:/Users/{}/Tradeteq Dropbox/Tradeteq Team/Clients/#GoFactoring/data analysis/".format(user)

# NOTE(review): unpickling can execute arbitrary code; only load files you trust.
inst = pd.read_pickle(datafolder + filename)
inst.head()

Unnamed: 0_level_0,customer_id,customer_name_1,debtor_id,debtor_name_1,invoice_number,invoice_date,due_date,invoice_amount,purchase_amount,purchase_amount_open,...,total_repayment,total_impairment,is_open,we_payment_share,has_purchase,has_deduction,is_due,has_discharge,cash_check,unexpl
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2744:79/231,2004008,Castillo GmbH,79,Sana Hyannis Sarl,2744,2013-07-23,2013-08-02,913.7,0.0,0.0,...,0.0,0.0,False,,False,False,True,True,913.7,True
2861:79/232,2004008,Castillo GmbH,79,Sana Hyannis Sarl,2861,2013-07-30,2013-08-09,2233.45,0.0,0.0,...,0.0,0.0,False,,False,False,True,True,2233.45,True
2932:79/233,2004008,Castillo GmbH,79,Sana Hyannis Sarl,2932,2013-08-06,2013-08-16,1370.5,0.0,0.0,...,1370.5,0.0,False,1.0,False,False,True,False,0.0,False
1472:489/688,2004009,Orpheus Wyandotte Supply LLC,489,Isfahan SA,1472,2013-08-13,2013-08-23,9195.1,0.0,0.0,...,0.0,0.0,False,,False,False,True,True,9195.1,True
2042:512/645,2004009,Orpheus Wyandotte Supply LLC,512,Aldrich Chloe GmbH,2042,2013-08-13,2013-08-23,4594.6,0.0,0.0,...,164.35,164.35,False,1.0,False,False,True,True,4265.9,True


In [3]:
# Number of rows (one per instrument) in the loaded frame.
print("{:} instruments".format(len(inst)))

59820 instruments


In [4]:
# Count instruments whose status is "offen" and instruments without any recorded payment date.
n_open = (inst.document_status == "offen").sum()
n_unpaid = inst.last_payment_date.isnull().sum()
print("{:} open, {:} with no payments".format(n_open, n_unpaid))

8213 open, 12181 with no payments


### 1. Retrieving first and last posting date and payment date mismatch

In [5]:
# Each posting_date entry is reduced with min/max, so it is expected to hold
# a collection of dates per instrument -- TODO confirm against step 01.
posting_dates = inst["posting_date"]
inst["first_posting_date"] = posting_dates.apply(lambda dates: min(dates))
inst["last_posting_date"] = posting_dates.apply(lambda dates: max(dates))

# Days between the last payment and the due date (positive when paid after the due date).
inst["payment_date_mismatch"] = (inst["last_payment_date"] - inst["due_date"]).dt.days

### 2. Isolating date columns and calculating offsets from invoice_date

In [6]:
# Date columns for which an offset from invoice_date is computed.
# invoice_date itself is left out because it is the reference
# (original note: it is almost always the earliest date).
datecol = [
    'due_date',
    #"invoice_date", #almost always the earliest
    'discharge_date', 'input_date', 'creation_date',
    'debt_collection_date', 'last_payment_date', 'reminder_date',
    'cancellation_date', 'value_date',
    'first_posting_date', 'last_posting_date',
]

# For every date column add "dd_<column>": whole days elapsed since invoice_date.
# The vectorised .dt.days accessor is used (as for payment_date_mismatch) instead
# of a per-element apply; missing dates still come out as NaN.
for c in datecol:
    inst["dd_" + c] = (inst[c] - inst["invoice_date"]).dt.days

In [7]:
# Preview the freshly added day-offset columns.
offset_columns = ["dd_" + name for name in datecol]
inst[offset_columns].head()

Unnamed: 0_level_0,dd_due_date,dd_discharge_date,dd_input_date,dd_creation_date,dd_debt_collection_date,dd_last_payment_date,dd_reminder_date,dd_cancellation_date,dd_value_date,dd_first_posting_date,dd_last_posting_date
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2744:79/231,10,973.0,921,921,,,48.0,,10,921,973
2861:79/232,10,966.0,914,914,,,55.0,,10,914,966
2932:79/233,10,,907,907,,907.0,48.0,,10,907,959
1472:489/688,10,952.0,900,900,,,69.0,,10,900,952
2042:512/645,10,952.0,900,900,,900.0,125.0,,10,900,952


In [8]:
# Display the column index of the frame after the feature columns added so far.
inst.columns

Index(['customer_id', 'customer_name_1', 'debtor_id', 'debtor_name_1',
       'invoice_number', 'invoice_date', 'due_date', 'invoice_amount',
       'purchase_amount', 'purchase_amount_open', 'discharge_type',
       'discharge_amount', 'discharge_date', 'posting_date',
       'transaction_type', 'document_stack_id', 'booking_text', 'input_date',
       'creation_date', 'factoring_type', 'debt_collection_date',
       'last_payment_date', 'reminder_date', 'test_feature_tested',
       'test_characteristic_id', 'bill_id', 'cancellation_date', 'value_date',
       'currency', 'purchase_examination', 'prosecution', 'deduction_amount',
       'payment_amount', 'payment_date', 'document_status', 'uid', 'ttype',
       'nrecords', 'ttypeset', 'discharge_loss', 'has_impairment1',
       'has_prosecution', 'is_pastdue', 'is_pastdue30', 'is_pastdue90',
       'is_pastdue180', 'last_payment_amount', 'total_repayment',
       'total_impairment', 'is_open', 'we_payment_share', 'has_purchase',
    

### 3. Marking up instruments with buyer/seller relationship properties known at instrument inception

For each instrument, statistics are added and classified according to the entity they describe.  
There are three types of statistics, each identified by a column-name prefix:  
- edge stats, referring to a customer/debtor pair - these have the prefix 'cd'
- node stats referring to a customer - these have the prefix 'c'
- node stats referring to a debtor - these have the prefix 'd'

In [11]:
#UTILS

from scipy import stats

def series_trend(s, applylog=True):
    """
    Estimate the trend of a numeric series as the slope of a least-squares line.

    The values are regressed against their position 0, 1, ..., len(s)-1.
    Intended for invoice_amount in the current dataset, to describe how the
    size of the transactions evolves over time.

    Parameters
    ----------
    s : array-like of numbers
        The series to analyse; needs at least two values for the regression.
    applylog : bool, default True
        When True the slope is compressed with a signed logarithm,
        sign(slope) * log(1 + |slope|); when False the raw slope is returned.

    Returns
    -------
    float
        The (optionally log-compressed) slope; 0 for a flat series.
    """
    x = np.arange(len(s))
    slope = stats.linregress(x, s).slope
    if not applylog:
        return slope
    # log1p keeps the transform monotonic and sign-preserving: a plain
    # log(|slope|) is negative for |slope| < 1, which would report a gently
    # rising series as falling (and vice versa). It also maps 0 to 0 without
    # needing a special case.
    return np.sign(slope) * np.log1p(np.abs(slope))

# Main reference date of each instrument, used to establish whether earlier
# instruments had been repaid on time or were past due at that moment.
decision_date_col = "value_date"


def add_node_stats(inst, igroup, idx, id, ii, prefix):
    """
    Compute the history statistics of one instrument and store them in ``inst``.

    inst: instruments dataframe (expected to be sorted by invoice_date); it is
          modified in place, one cell per statistic in row ``id``
    igroup: the group of instruments the statistics are computed over (all the
            instruments of one customer/debtor pair, one customer or one debtor)
    idx: position of the instrument inside igroup
    id: index label of the instrument in inst
    ii: the row of the instrument (as yielded by iterrows)
    prefix: string prepended to every column name written (e.g. "cd_")

    NOTE(review): the "repaid" mask is evaluated over the whole group, so it can
    include the current row and rows after position idx when their
    last_payment_date precedes this instrument's decision date -- confirm this
    is intended.
    """
    def put(name, value):
        # Every statistic goes to row `id`, column `<prefix><name>`.
        inst.loc[id, prefix + name] = value

    # number of instruments that precede this one in the group
    put("lent_c", idx)

    cutoff = ii[decision_date_col]

    # instruments paid before the decision date and no longer open
    settled = (igroup["last_payment_date"] < cutoff) & ~igroup["is_open"]
    put("repaid_c", sum(settled))

    # impaired instruments among the settled ones
    put("impaired1_c", sum(igroup.loc[settled, "has_impairment1"]))

    # overdue counters look only at the instruments preceding this one and
    # require the due date to be old enough for the flag to be known
    earlier = igroup.index[:idx]
    overdue_specs = (
        ("pastdue90_c", "is_pastdue90", 90),
        ("pastdue180_c", "is_pastdue180", 180),
    )
    for column, flag, days in overdue_specs:
        matured = igroup.loc[earlier, "due_date"] < cutoff - datetime.timedelta(days)
        put(column, sum(matured & igroup.loc[earlier, flag]))

    # trend of the invoiced amount; needs at least two earlier instruments
    if idx < 2:
        put("trend_a", 0)
    else:
        put("trend_a", series_trend(igroup.loc[earlier, "invoice_amount"]))

    # average weekend-payment share over the settled instruments
    put("we_payment_share", igroup.loc[settled, "we_payment_share"].mean())

    # mean / standard deviation of payment_date_mismatch over the settled instruments
    mismatch = igroup.loc[settled, "payment_date_mismatch"]
    put("pd_mismatch_mean", mismatch.mean())
    put("pd_mismatch_std", mismatch.std())

### 3.1 Adding buyer/seller pair attributes (cd)

In [13]:
# Edge statistics: one history per customer/debtor (seller/buyer) pair.
print("Adding buyer/seller pair attributes...")
prefix = "cd_" #stands for customer/debtor
g_cb = inst.groupby(["customer_name_1", "debtor_name_1"])
for (customer, debtor), igroup in g_cb:
    # idx is the position of the instrument inside its group; the statistics
    # assume the rows are ordered by invoice_date -- TODO confirm inst is sorted upstream
    for idx, (id, ii) in enumerate(igroup.iterrows()):
        add_node_stats(inst, igroup, idx, id, ii, prefix)

# Ratio columns: each counter divided by the number of instruments lent before.
# The first instrument of every group has lent_c == 0; the zero denominator is
# masked so the ratio is NaN ("no history yet") rather than inf when the
# numerator happens to be non-zero.
lent_before = inst[prefix + "lent_c"].replace(0, np.nan)
cl = [prefix+"repaid_", prefix+"impaired1_",
      prefix+"pastdue90_", prefix+"pastdue180_"]
for c in cl:
    inst[c+"r"] = inst[c+"c"] / lent_before

Adding buyer/seller pair attributes...


### 3.2 Adding buyer attributes (d)

In [14]:
# Node statistics for the buyer (debtor): one history per debtor.
print("Adding buyer attributes...")
prefix = "d_"
g_b = inst.groupby(["debtor_name_1"])
for _, igroup in g_b:
    # idx is the position of the instrument inside its group; the statistics
    # assume the rows are ordered by invoice_date -- TODO confirm inst is sorted upstream
    for idx, (id, ii) in enumerate(igroup.iterrows()):
        add_node_stats(inst, igroup, idx, id, ii, prefix)

# Ratio columns: each counter divided by the number of instruments lent before.
# The first instrument of every group has lent_c == 0; the zero denominator is
# masked so the ratio is NaN ("no history yet") rather than inf when the
# numerator happens to be non-zero.
lent_before = inst[prefix + "lent_c"].replace(0, np.nan)
cl = [prefix+"repaid_", prefix+"impaired1_",
      prefix+"pastdue90_", prefix+"pastdue180_"]
for c in cl:
    inst[c+"r"] = inst[c+"c"] / lent_before

Adding buyer attributes...


### 3.3 Adding seller attributes (c)

In [15]:
# Node statistics for the seller (customer): one history per customer.
print("Adding seller attributes...")
prefix = "c_"
g_b = inst.groupby(["customer_name_1"])
for _, igroup in g_b:
    # idx is the position of the instrument inside its group; the statistics
    # assume the rows are ordered by invoice_date -- TODO confirm inst is sorted upstream
    for idx, (id, ii) in enumerate(igroup.iterrows()):
        add_node_stats(inst, igroup, idx, id, ii, prefix)

# Ratio columns: each counter divided by the number of instruments lent before.
# The first instrument of every group has lent_c == 0; the zero denominator is
# masked so the ratio is NaN ("no history yet") rather than inf when the
# numerator happens to be non-zero.
lent_before = inst[prefix + "lent_c"].replace(0, np.nan)
cl = [prefix+"repaid_", prefix+"impaired1_",
      prefix+"pastdue90_", prefix+"pastdue180_"]
for c in cl:
    inst[c+"r"] = inst[c+"c"] / lent_before

Adding seller attributes...


In [16]:
# Persist the enriched instruments frame in the same data folder as the input.
outputfile = 'instrumentsdf_deg1stats.pkl'
output_path = datafolder + outputfile
inst.to_pickle(output_path)