# Trades feature engineering - Degree 1 buyers-sellers data

This notebook starts from the file generated at step 01 (notebook '01_instrumentsFeatures.ipynb') and performs feature engineering, adding useful node/edge stats retrieved with the notebook '02_buyers_sellers_df.ipynb'.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
import itertools

from scripts_preproc.features_utils import *

## Data import

In [2]:
# Folder and file name of the pickled instruments frame used as input.
datafolder = "../data/"
filename = "02_instrumentsdf_2.pkl"

# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# produced by this project's own pipeline.
inst = pd.read_pickle("{}{}".format(datafolder, filename))
inst.head()

Unnamed: 0_level_0,customer_id,customer_name_1,debtor_id,debtor_name_1,invoice_number,invoice_date,due_date,invoice_amount,purchase_amount,purchase_amount_open,...,total_repayment,total_impairment,is_open,we_payment_share,has_purchase,has_deduction,is_due,has_discharge,cash_check,unexpl
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2744:79/231,2004008,Castillo GmbH,79,Sana Hyannis Sarl,2744,2013-07-23,2013-08-02,913.7,0.0,0.0,...,0.0,0.0,False,,False,False,True,True,913.7,True
2861:79/232,2004008,Castillo GmbH,79,Sana Hyannis Sarl,2861,2013-07-30,2013-08-09,2233.45,0.0,0.0,...,0.0,0.0,False,,False,False,True,True,2233.45,True
2932:79/233,2004008,Castillo GmbH,79,Sana Hyannis Sarl,2932,2013-08-06,2013-08-16,1370.5,0.0,0.0,...,1370.5,0.0,False,1.0,False,False,True,False,0.0,False
1472:489/688,2004009,Orpheus Wyandotte Supply LLC,489,Isfahan SA,1472,2013-08-13,2013-08-23,9195.1,0.0,0.0,...,0.0,0.0,False,,False,False,True,True,9195.1,True
2042:512/645,2004009,Orpheus Wyandotte Supply LLC,512,Aldrich Chloe GmbH,2042,2013-08-13,2013-08-23,4594.6,0.0,0.0,...,164.35,164.35,False,1.0,False,False,True,True,4265.9,True


In [3]:
# Report how many instruments (rows) were loaded.
print("{:} instruments".format(len(inst)))

59820 instruments


In [4]:
# Count instruments whose document_status is "offen" and instruments that have
# no last_payment_date. Series.sum() counts the True values in one vectorized
# pass instead of iterating the boolean Series with the builtin sum().
n_open = (inst["document_status"] == "offen").sum()
n_unpaid = inst["last_payment_date"].isnull().sum()
print("{:} open, {:} with no payments".format(n_open, n_unpaid))

8213 open, 12181 with no payments


### 1. Retrieving first and last posting date and payment date mismatch

In [5]:
# Every posting_date entry holds several dates; keep the earliest and the latest.
posting_dates = inst["posting_date"]
inst["first_posting_date"] = posting_dates.apply(min)
inst["last_posting_date"] = posting_dates.apply(max)

# Whole days between the due date and the last payment
# (positive when the last payment came after the due date).
payment_delay = inst["last_payment_date"] - inst["due_date"]
inst["payment_date_mismatch"] = payment_delay.dt.days

### 2. Isolating date columns and calculating offsets from invoice_date

In [6]:
# Date columns that are converted into a day offset from invoice_date.
# invoice_date itself is excluded because it is the reference date.
datecol = [
    'due_date',
    #"invoice_date", #almost always the earliest
    'discharge_date', 'input_date', 'creation_date',
    'debt_collection_date', 'last_payment_date', 'reminder_date',
    'cancellation_date', 'value_date',
    'first_posting_date', 'last_posting_date',
]

# "dd_<column>" = whole days elapsed between invoice_date and <column>.
# The vectorized .dt.days accessor (already used for payment_date_mismatch)
# replaces a per-element Python lambda; missing dates come out as NaN.
for c in datecol:
    inst["dd_" + c] = (inst[c] - inst["invoice_date"]).dt.days

In [7]:
# Preview the day-offset features for the first rows, one instrument per column.
dd_columns = ["dd_" + name for name in datecol]
inst[dd_columns].head().T

uid,2744:79/231,2861:79/232,2932:79/233,1472:489/688,2042:512/645
dd_due_date,10.0,10.0,10.0,10.0,10.0
dd_discharge_date,973.0,966.0,,952.0,952.0
dd_input_date,921.0,914.0,907.0,900.0,900.0
dd_creation_date,921.0,914.0,907.0,900.0,900.0
dd_debt_collection_date,,,,,
dd_last_payment_date,,,907.0,,900.0
dd_reminder_date,48.0,55.0,48.0,69.0,125.0
dd_cancellation_date,,,,,
dd_value_date,10.0,10.0,10.0,10.0,10.0
dd_first_posting_date,921.0,914.0,907.0,900.0,900.0


In [8]:
# List every column currently present on the instruments frame.
inst.columns

Index(['customer_id', 'customer_name_1', 'debtor_id', 'debtor_name_1',
       'invoice_number', 'invoice_date', 'due_date', 'invoice_amount',
       'purchase_amount', 'purchase_amount_open', 'discharge_type',
       'discharge_amount', 'discharge_date', 'posting_date',
       'transaction_type', 'document_stack_id', 'booking_text', 'input_date',
       'creation_date', 'factoring_type', 'debt_collection_date',
       'last_payment_date', 'reminder_date', 'test_feature_tested',
       'test_characteristic_id', 'bill_id', 'cancellation_date', 'value_date',
       'currency', 'purchase_examination', 'prosecution', 'deduction_amount',
       'payment_amount', 'payment_date', 'document_status', 'uid', 'ttype',
       'nrecords', 'ttypeset', 'discharge_loss', 'has_impairment1',
       'has_prosecution', 'is_pastdue', 'is_pastdue30', 'is_pastdue90',
       'is_pastdue180', 'last_payment_amount', 'total_repayment',
       'total_impairment', 'is_open', 'we_payment_share', 'has_purchase',
    

### 3. Marking up instruments with buyer/seller relationship properties known at instrument inception

For each instrument, stats will be added and classified depending on their nature.  
There are 3 types of stats, each named with a matching prefix:  
- edge stats, referring to a customer/debtor pair - prefixed with 'cd'
- node stats, referring to a customer - prefixed with 'c'
- node stats, referring to a debtor - prefixed with 'd'

### 3.1 Adding buyer/seller pair attributes (cd)

In [13]:
#Adding buyer/seller pair attributes
# Edge stats: instruments are grouped per customer/debtor pair and the
# resulting columns carry the "cd_" prefix.
print("Adding buyer/seller pair attributes...")
prefix = "cd_" #stands for customer/debtor
g_cb = inst.groupby(["customer_name_1", "debtor_name_1"])
for (customer, debtor), igroup in g_cb:
    # Visit the instruments of this pair in the group's row order.
    # NOTE(review): the original comment states the rows are already sorted by
    # invoice_date; no sort is visible in this notebook - confirm upstream.
    # The row label is named row_id so the builtin id() is not shadowed at
    # notebook scope.
    for idx, (row_id, ii) in enumerate(igroup.iterrows()):
        # add_node_stats comes from scripts_preproc.features_utils; presumably
        # it writes the prefixed "<stat>_c" columns into inst - TODO confirm.
        add_node_stats(inst, igroup, idx, row_id, ii, prefix)

#Adding the ratio columns for the previously calculated stats
# Each "<stat>_r" column is the "<stat>_c" column divided by "lent_c".
cl = [prefix + stat
      for stat in ("repaid_", "impaired1_", "pastdue90_", "pastdue180_")]
for c in cl:
    inst[c + "r"] = inst[c + "c"] / inst[prefix + "lent_c"]

Adding buyer/seller pair attributes...


### 3.2 Adding buyer attributes (d)

In [14]:
# Node stats for buyers: instruments are grouped per debtor and the resulting
# columns carry the "d_" prefix.
print("Adding buyer attributes...")
#buyer attributes
prefix = "d_"
# A scalar grouper yields the same groups as a one-element list, without the
# tuple-key behaviour newer pandas applies to list groupers.
g_b = inst.groupby("debtor_name_1")
for _debtor, igroup in g_b:
    # Visit the instruments of this debtor in the group's row order.
    # NOTE(review): the original comment states the rows are already sorted by
    # invoice_date; no sort is visible in this notebook - confirm upstream.
    # row_id avoids shadowing the builtin id(); _debtor avoids overwriting the
    # interactive "_" variable.
    for idx, (row_id, ii) in enumerate(igroup.iterrows()):
        # add_node_stats comes from scripts_preproc.features_utils; presumably
        # it writes the prefixed "<stat>_c" columns into inst - TODO confirm.
        add_node_stats(inst, igroup, idx, row_id, ii, prefix)
#add the ratio columns
# Each "<stat>_r" column is the "<stat>_c" column divided by "lent_c".
cl = [prefix + stat
      for stat in ("repaid_", "impaired1_", "pastdue90_", "pastdue180_")]
for c in cl:
    inst[c + "r"] = inst[c + "c"] / inst[prefix + "lent_c"]

Adding buyer attributes...


### 3.3 Adding seller attributes (c)

In [15]:
#Adding seller attributes
# Node stats for sellers: instruments are grouped per customer and the
# resulting columns carry the "c_" prefix.
print("Adding seller attributes...")
prefix = "c_"
# A scalar grouper yields the same groups as a one-element list, without the
# tuple-key behaviour newer pandas applies to list groupers.
g_b = inst.groupby("customer_name_1")
for _customer, igroup in g_b:
    # Visit the instruments of this customer in the group's row order.
    # NOTE(review): the original comment states the rows are already sorted by
    # invoice_date; no sort is visible in this notebook - confirm upstream.
    # row_id avoids shadowing the builtin id(); _customer avoids overwriting
    # the interactive "_" variable.
    for idx, (row_id, ii) in enumerate(igroup.iterrows()):
        # add_node_stats comes from scripts_preproc.features_utils; presumably
        # it writes the prefixed "<stat>_c" columns into inst - TODO confirm.
        add_node_stats(inst, igroup, idx, row_id, ii, prefix)
#add the ratio columns
# Each "<stat>_r" column is the "<stat>_c" column divided by "lent_c".
cl = [prefix + stat
      for stat in ("repaid_", "impaired1_", "pastdue90_", "pastdue180_")]
for c in cl:
    inst[c + "r"] = inst[c + "c"] / inst[prefix + "lent_c"]

Adding seller attributes...


In [16]:
# Persist the instruments frame, now carrying the cd_/d_/c_ stats, as a pickle.
outputfile = '03_instrumentsdf_deg1stats.pkl'
inst.to_pickle("{}{}".format(datafolder, outputfile))

In [10]:
# Reload the saved degree-1 stats frame from disk for inspection.
# NOTE(review): relies on `datafolder` already being defined in the kernel.
d = pd.read_pickle(datafolder+'03_instrumentsdf_deg1stats.pkl')

In [16]:
# Customer/debtor pair stats of the last five instruments, one instrument per
# column. The anchored regex keeps only columns that start with "cd_", in the
# same way the "d_" and "c_" previews select their columns; a plain substring
# test would also pick up any column that merely contains "cd_".
d.filter(regex="^cd_").tail(5).transpose()

uid,2017-3634:29/47558,2017-3635:29/47567,2017-3636:29/47559,2017-3637:29/47560,2017-3622:69/47546
cd_lent_c,431.0,432.0,433.0,434.0,801.0
cd_repaid_c,315.0,315.0,315.0,315.0,532.0
cd_impaired1_c,0.0,0.0,0.0,0.0,0.0
cd_pastdue90_c,3.0,3.0,3.0,3.0,1.0
cd_pastdue180_c,3.0,3.0,3.0,3.0,0.0
cd_trend_a,-1.443147,-1.444563,-1.442165,-1.439885,-3.244175
cd_we_payment_share,0.0,0.0,0.0,0.0,0.0
cd_pd_mismatch_mean,-17.298413,-17.298413,-17.298413,-17.298413,74.847744
cd_pd_mismatch_std,30.917393,30.917393,30.917393,30.917393,38.671066
cd_repaid_r,0.730858,0.729167,0.727483,0.725806,0.66417


In [33]:
# Columns starting with "d_" for the last rows of the frame,
# one instrument per column.
d.filter(regex="^d_").tail().T

uid,2017-3634:29/47558,2017-3635:29/47567,2017-3636:29/47559,2017-3637:29/47560,2017-3622:69/47546
d_lent_c,431.0,432.0,433.0,434.0,801.0
d_repaid_c,315.0,315.0,315.0,315.0,532.0
d_impaired1_c,0.0,0.0,0.0,0.0,0.0
d_pastdue90_c,3.0,3.0,3.0,3.0,1.0
d_pastdue180_c,3.0,3.0,3.0,3.0,0.0
d_trend_a,-1.443147,-1.444563,-1.442165,-1.439885,-3.244175
d_we_payment_share,0.0,0.0,0.0,0.0,0.0
d_pd_mismatch_mean,-17.298413,-17.298413,-17.298413,-17.298413,74.847744
d_pd_mismatch_std,30.917393,30.917393,30.917393,30.917393,38.671066
d_repaid_r,0.730858,0.729167,0.727483,0.725806,0.66417


In [34]:
# Columns starting with "c_" for the last rows of the frame,
# one instrument per column.
d.filter(regex="^c_").tail().T

uid,2017-3634:29/47558,2017-3635:29/47567,2017-3636:29/47559,2017-3637:29/47560,2017-3622:69/47546
c_lent_c,3801.0,3802.0,3803.0,3804.0,3805.0
c_repaid_c,2988.0,2988.0,2988.0,2988.0,2988.0
c_impaired1_c,62.0,62.0,62.0,62.0,62.0
c_pastdue90_c,269.0,269.0,269.0,269.0,269.0
c_pastdue180_c,202.0,202.0,202.0,202.0,202.0
c_trend_a,-1.471188,-1.473267,-1.474443,-1.475649,-1.472599
c_we_payment_share,0.0,0.0,0.0,0.0,0.0
c_pd_mismatch_mean,46.00502,46.00502,46.00502,46.00502,46.00502
c_pd_mismatch_std,97.133022,97.133022,97.133022,97.133022,97.133022
c_repaid_r,0.786109,0.785902,0.785696,0.785489,0.785283
