# Time snapshots for network analysis

This notebook creates time snapshots of the given portfolio, in order to study how network structure could influence impairments/overdues diffusion.
In the previous steps, impairments and overdues has been calculated using as report date the date in which the data was received.
Using snapshots, it is possible to perform this analysis overtime observing diffusion.

## Data import

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
import os

from features_utils import *

In [2]:
#Instruments dataset
#import data
user = os.environ["USERNAME"]

#from home
#filename = "instrumentsdf.pkl"
#datafolder = "C:/Users/{}/Dropbox/University/MscDataScience_Birkbeck/thesis_project/data/".format(user)

#from work
filename = "09272018_instruments.pkl"
datafolder = "C:/Users/{}/Tradeteq Dropbox/Tradeteq Team/Clients/#GoFactoring/data analysis/".format(user)

inst = pd.read_pickle(datafolder+filename)
inst.head().transpose()

uid,2744:79/231,2861:79/232,2932:79/233,1472:489/688,2042:512/645
customer_id,2004008,2004008,2004008,2004009,2004009
customer_name_1,jobs united GmbH,jobs united GmbH,jobs united GmbH,PM Personal GmbH,PM Personal GmbH
debtor_id,79,79,79,489,512
debtor_name_1,Quadroni Linard,Quadroni Linard,Quadroni Linard,Style Interiors,Elektropartner AG
invoice_number,2744,2861,2932,1472,2042
invoice_date,2013-07-23 00:00:00,2013-07-30 00:00:00,2013-08-06 00:00:00,2013-08-13 00:00:00,2013-08-13 00:00:00
due_date,2013-08-02 00:00:00,2013-08-09 00:00:00,2013-08-16 00:00:00,2013-08-23 00:00:00,2013-08-23 00:00:00
invoice_amount,913.7,2233.45,1370.5,9195.1,4594.6
purchase_amount,0,0,0,0,0
purchase_amount_open,0,0,0,0,0


In [3]:
inst[['invoice_date','debtor_id']]

Unnamed: 0_level_0,invoice_date,debtor_id
uid,Unnamed: 1_level_1,Unnamed: 2_level_1
2744:79/231,2013-07-23,79
2861:79/232,2013-07-30,79
2932:79/233,2013-08-06,79
1472:489/688,2013-08-13,489
2042:512/645,2013-08-13,512
2998:79/234,2013-08-13,79
3043:506/229,2013-08-20,506
3098:79/235,2013-08-20,79
1533:489/689,2013-08-27,489
1603:527/651,2013-09-03,527


In [4]:
inst[inst['has_prosecution']][[inst.columns[c] for c in range(len(inst.columns)) if c<50]].transpose()

uid,2042:512/645,2043:512/646,2044:512/647,2045:512/648,2046:512/649,2047:512/650,1063:INTER715/11390,1108:717/1153,1109:717/1154,1110:717/1155,...,101516:101790/62383,101517:101790/62384,101518:101790/62385,101659:101786/62958,101660:101786/62959,101685:101790/63380,101794:101786/63686,101795:101786/63687,101970:101786/64436,102031:101786/65082
customer_id,2004009,2004009,2004009,2004009,2004009,2004009,2004019,2004016,2004016,2004016,...,2004078,2004078,2004078,2004078,2004078,2004078,2004078,2004078,2004078,2004078
customer_name_1,PM Personal GmbH,PM Personal GmbH,PM Personal GmbH,PM Personal GmbH,PM Personal GmbH,PM Personal GmbH,United Personal Management AG,inter personal GmbH,inter personal GmbH,inter personal GmbH,...,PS Schweiz AG,PS Schweiz AG,PS Schweiz AG,PS Schweiz AG,PS Schweiz AG,PS Schweiz AG,PS Schweiz AG,PS Schweiz AG,PS Schweiz AG,PS Schweiz AG
debtor_id,512,512,512,512,512,512,INTER715,717,717,717,...,101790,101790,101790,101786,101786,101790,101786,101786,101786,101786
debtor_name_1,Elektropartner AG,Elektropartner AG,Elektropartner AG,Elektropartner AG,Elektropartner AG,Elektropartner AG,Pergola Design AG,Team Fortis GmbH,Team Fortis GmbH,Team Fortis GmbH,...,Malergeschäft Ferati GmbH,Malergeschäft Ferati GmbH,Malergeschäft Ferati GmbH,Wood Living AG,Wood Living AG,Malergeschäft Ferati GmbH,Wood Living AG,Wood Living AG,Wood Living AG,Wood Living AG
invoice_number,2042,2043,2044,2045,2046,2047,1063,1108,1109,1110,...,101516,101517,101518,101659,101660,101685,101794,101795,101970,102031
invoice_date,2013-08-13 00:00:00,2013-09-10 00:00:00,2013-09-17 00:00:00,2013-09-24 00:00:00,2013-09-30 00:00:00,2013-10-08 00:00:00,2014-04-16 00:00:00,2014-05-14 00:00:00,2014-05-14 00:00:00,2014-05-14 00:00:00,...,2018-06-25 00:00:00,2018-06-25 00:00:00,2018-06-25 00:00:00,2018-07-02 00:00:00,2018-07-02 00:00:00,2018-07-03 00:00:00,2018-07-09 00:00:00,2018-07-09 00:00:00,2018-07-17 00:00:00,2018-07-23 00:00:00
due_date,2013-08-23 00:00:00,2013-09-20 00:00:00,2013-09-27 00:00:00,2013-10-04 00:00:00,2013-10-10 00:00:00,2013-10-18 00:00:00,2014-04-26 00:00:00,2014-05-24 00:00:00,2014-05-24 00:00:00,2014-05-24 00:00:00,...,2018-07-05 00:00:00,2018-07-05 00:00:00,2018-07-05 00:00:00,2018-07-12 00:00:00,2018-07-12 00:00:00,2018-07-13 00:00:00,2018-07-19 00:00:00,2018-07-19 00:00:00,2018-07-27 00:00:00,2018-08-02 00:00:00
invoice_amount,4594.6,2751.85,2801,2850.1,3120.4,2555.3,2257.2,1542.25,8655.3,1542.25,...,1277.05,1277.05,1277.05,1933.75,1933.75,1277.05,1890.8,1890.8,1117.3,1001.25
purchase_amount,0,0,0,0,0,0,0,1542.25,8655.3,1542.25,...,0,0,0,0,0,0,0,0,0,0
purchase_amount_open,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
pamount = inst.payment_amount.apply(lambda x:len(x))
pdate = inst.payment_amount.apply(lambda x:len(x))

## 1. Defining snapshot slices

In order to create snapshots of different time frames, the report date will be progressively changed and used to slice the dataframe.  

In [11]:
ReportDate = datetime.datetime(2018, 9, 28) #date data was received

daterange = pd.date_range(start=inst.invoice_date.min(), end=ReportDate, freq='M')

In [12]:
len(daterange)

62

In [13]:
pd.to_datetime(str(daterange[0]).split(' ')[0], yearfirst=True)

Timestamp('2013-07-31 00:00:00')

In [14]:
daterange[0]<ReportDate

True

In [15]:
#this is very slow

for snap in range(len(daterange)):
    label = "sshot_"+str(snap)+'_'
    repdate = pd.to_datetime(str(daterange[snap]).split(' ')[0], yearfirst=True)
    inst[label]=False
    inst.loc[inst.invoice_date<repdate, label]=True
    add_main_features(inst, repdate, prefix=label)

#from work
datafolder2 = 'C:/Users/{0}/Tradeteq Dropbox/Davide Mariani/thesis_project/'.format(user)
filename2 = 'snapshots.pkl'
inst.to_pickle(datafolder2+filename2)


#just load the file snapshots.pkl
#inst = pd.read_pickle(datafolder2+filename2)

Addding main network features for snapshot with date < 2013-07-31 00:00:00
Addding main network features for snapshot with date < 2013-08-31 00:00:00
Addding main network features for snapshot with date < 2013-09-30 00:00:00
Addding main network features for snapshot with date < 2013-10-31 00:00:00
Addding main network features for snapshot with date < 2013-11-30 00:00:00
Addding main network features for snapshot with date < 2013-12-31 00:00:00
Addding main network features for snapshot with date < 2014-01-31 00:00:00
Addding main network features for snapshot with date < 2014-02-28 00:00:00
Addding main network features for snapshot with date < 2014-03-31 00:00:00
Addding main network features for snapshot with date < 2014-04-30 00:00:00
Addding main network features for snapshot with date < 2014-05-31 00:00:00
Addding main network features for snapshot with date < 2014-06-30 00:00:00
Addding main network features for snapshot with date < 2014-07-31 00:00:00
Addding main network feat

KeyboardInterrupt: 

In [20]:
def select_date(x):
    """
    This function is used to select the payment dates in a time snapshot, where we need to exlude dates happening after the reportdate of that particular snapshot.
    It is meant to be used inside an 'apply' with axis=1 to be rowwise.
    """
    dates = x.payment_date
    last_item = int(x.tmp_dates_to_count)
    return dates[:last_item]

prefix = 'sshot_0_'
inst.loc[inst['sshot_0_'],'tmp_dates_to_count'] = inst.loc[inst['sshot_0_'],"payment_date"].apply(lambda x:sum(pd.Series(x)<ReportDate)) #this retrieve the index of the last payment snapshot to snapshot (it is a temp column)
inst.loc[inst['sshot_0_'],prefix+"payment_date"] = inst.loc[inst['sshot_0_'],["payment_date", "tmp_dates_to_count"]].apply(select_date, axis=1)

In [21]:
inst.columns

Index(['customer_id', 'customer_name_1', 'debtor_id', 'debtor_name_1',
       'invoice_number', 'invoice_date', 'due_date', 'invoice_amount',
       'purchase_amount', 'purchase_amount_open', 'discharge_type',
       'discharge_amount', 'discharge_date', 'posting_date',
       'transaction_type', 'document_stack_id', 'booking_text', 'input_date',
       'creation_date', 'factoring_type', 'debt_collection_date',
       'last_payment_date', 'reminder_date', 'test_feature_tested',
       'test_characteristic_id', 'bill_id', 'cancellation_date', 'value_date',
       'currency', 'purchase_examination', 'prosecution', 'deduction_amount',
       'payment_amount', 'payment_date', 'document_status', 'uid', 'ttype',
       'nrecords', 'ttypeset', 'discharge_loss', 'has_impairment1',
       'has_impairment2', 'has_impairment3', 'is_pastdue90', 'is_pastdue180',
       'has_prosecution', 'last_payment_amount', 'total_repayment',
       'total_impairment', 'is_open', 'we_payment_share', 'has_purchas

In [22]:
inst[["payment_date", "tmp_dates_to_count"]].apply(select_date, axis=1)

ValueError: ('cannot convert float NaN to integer', 'occurred at index 2932:79/233')

In [17]:
inst.loc[inst['sshot_0_'],["payment_date", "tmp_dates_to_count"]]

Unnamed: 0_level_0,payment_date,tmp_dates_to_count
uid,Unnamed: 1_level_1,Unnamed: 2_level_1
2744:79/231,"[NaT, NaT]",0.0
2861:79/232,"[NaT, NaT]",0.0


In [16]:
inst['sshot_0_']

uid
2744:79/231             True
2861:79/232             True
2932:79/233            False
1472:489/688           False
2042:512/645           False
2998:79/234            False
3043:506/229           False
3098:79/235            False
1533:489/689           False
1603:527/651           False
1604:527/652           False
1623:489/690           False
2043:512/646           False
3348:79/236            False
3378:506/230           False
2044:512/647           False
3399:79/237            False
1748:489/691           False
2045:512/648           False
3541:79/238            False
1764:489/692           False
2046:512/649           False
1793:489/693           False
3627:79/239            False
1811:489/694           False
2047:512/650           False
3842:79/240            False
1894:489/695           False
1915:489/696           False
3938:79/241            False
                       ...  
71181:582154/55287     False
71182:582154/55288     False
2017-3638:102/47561    False
2017-3639:

In [11]:
#this is a check cell
selnum=56
print(daterange[selnum])
selector = 'sshot_'+str(selnum)+'_'
inst[inst[selector]][[c for c in inst.columns if selector in c]+ \
                     ['invoice_date','payment_date','tmp_dates_to_count', 
                      'purchase_amount', 'has_purchase', 'due_date']].transpose()

2018-03-31 00:00:00


uid,2744:79/231,2861:79/232,2932:79/233,1472:489/688,2042:512/645,2998:79/234,3043:506/229,3098:79/235,1533:489/689,1603:527/651,...,3066:23/54992,3081:29/55953,3071:30/54994,3082:34/55001,3077:35/54999,3076:55/54998,3058:56/54988,3059:59/54989,3070:7/54993,3072:71/54995
sshot_56_,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
sshot_56_payment_date,[],[],"[2016-01-30 00:00:00, 2016-01-30 00:00:00]",[],"[2016-01-30 00:00:00, 2016-01-30 00:00:00, 201...",[],"[2016-01-30 00:00:00, 2016-01-30 00:00:00, 201...",[],[],[],...,[],[],[],[],[],[],[],[],[],[]
sshot_56_payment_amount,[],[],"[1370.5, 1370.5]",[],"[164.35, 164.35, 164.35, 164.35]",[],"[1119.0, 1119.0, 1119.0, 1119.0]",[],[],[],...,[],[],[],[],[],[],[],[],[],[]
sshot_56_last_payment_amount,0,0,1370.5,0,164.35,0,1119,0,0,0,...,0,0,0,0,0,0,0,0,0,0
sshot_56_last_payment_date,NaT,NaT,2016-01-30 00:00:00,NaT,2016-01-30 00:00:00,NaT,2016-01-30 00:00:00,NaT,NaT,NaT,...,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT,NaT
sshot_56_total_repayment,0,0,1370.5,0,164.35,0,1119,0,0,0,...,0,0,0,0,0,0,0,0,0,0
sshot_56_is_pastdue90,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
sshot_56_is_pastdue180,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
sshot_56_payment_date_mismatch,,,897,,890,,883,,,,...,,,,,,,,,,
sshot_56_is_open,False,False,False,False,False,False,False,False,False,False,...,True,True,True,True,True,True,True,True,True,True


In [29]:
inst.columns

Index(['customer_id', 'customer_name_1', 'debtor_id', 'debtor_name_1',
       'invoice_number', 'invoice_date', 'due_date', 'invoice_amount',
       'purchase_amount', 'purchase_amount_open',
       ...
       'sshot_0_cd_pd_mismatch_mean', 'sshot_0_cd_pd_mismatch_std',
       'sshot_1_cd_lent_c', 'sshot_1_cd_repaid_c', 'sshot_1_cd_pastdue90_c',
       'sshot_1_cd_pastdue180_c', 'sshot_1_cd_trend_a',
       'sshot_1_cd_we_payment_share', 'sshot_1_cd_pd_mismatch_mean',
       'sshot_1_cd_pd_mismatch_std'],
      dtype='object', length=1047)

In [32]:
#this much more slow
#Adding buyer/seller pair attributes for a certain snapshot

for selnum in range(0,8): #test#(len(daterange)):
    prefix = "cd_" #stands for customer/debtor
    g_cb = inst[inst['sshot_'+str(selnum)+'_']].groupby(["customer_name_1", "debtor_name_1"])
    decision_date_col = "value_date"


    print("Adding buyer/seller pair attributes to snapshot {}...".format(selnum))
    selector = 'sshot_'+str(selnum)+'_'
    for (customer, debtor), igroup in g_cb:
        #for each instrument in this group, already sorted by invoice_date
        for idx, (id, ii) in enumerate(igroup.iterrows()):
            add_node_stats(inst, igroup, idx, id, ii, decision_date_col, prefix, prefix_read=selector)

#Adding the ratio columns for the previously calculated stats
#cl = [prefix+"repaid_", prefix+"impaired1_", prefix+"impaired2_", 
#      prefix+"pastdue90_", prefix+"pastdue180_"]
#for c in cl:
#    inst[c+"r"] = inst[c+"c"] / inst[prefix+"lent_c"]

Adding buyer/seller pair attributes to snapshot 0...
Adding buyer/seller pair attributes to snapshot 1...
Adding buyer/seller pair attributes to snapshot 2...
Adding buyer/seller pair attributes to snapshot 3...
Adding buyer/seller pair attributes to snapshot 4...
Adding buyer/seller pair attributes to snapshot 5...
Adding buyer/seller pair attributes to snapshot 6...
Adding buyer/seller pair attributes to snapshot 7...


In [33]:
inst[inst['sshot_7_']].transpose()

uid,2744:79/231,2861:79/232,2932:79/233,1472:489/688,2042:512/645,2998:79/234,3043:506/229,3098:79/235,1533:489/689,1603:527/651,...,5443:381/287,5500:381/288,5501:381/289,5507:381/290,5508:381/291,5509:381/292,5561:381/293,5562:381/294,5568:381/295,5569:381/296
customer_id,2004008,2004008,2004008,2004009,2004009,2004008,2004008,2004008,2004009,2004009,...,2004008,2004008,2004008,2004008,2004008,2004008,2004008,2004008,2004008,2004008
customer_name_1,jobs united GmbH,jobs united GmbH,jobs united GmbH,PM Personal GmbH,PM Personal GmbH,jobs united GmbH,jobs united GmbH,jobs united GmbH,PM Personal GmbH,PM Personal GmbH,...,jobs united GmbH,jobs united GmbH,jobs united GmbH,jobs united GmbH,jobs united GmbH,jobs united GmbH,jobs united GmbH,jobs united GmbH,jobs united GmbH,jobs united GmbH
debtor_id,79,79,79,489,512,79,506,79,489,527,...,381,381,381,381,381,381,381,381,381,381
debtor_name_1,Quadroni Linard,Quadroni Linard,Quadroni Linard,Style Interiors,Elektropartner AG,Quadroni Linard,Elektro DOM GmbH,Quadroni Linard,Style Interiors,Ilanz Keramik GmbH,...,Trigon Elektro AG,Trigon Elektro AG,Trigon Elektro AG,Trigon Elektro AG,Trigon Elektro AG,Trigon Elektro AG,Trigon Elektro AG,Trigon Elektro AG,Trigon Elektro AG,Trigon Elektro AG
invoice_number,2744,2861,2932,1472,2042,2998,3043,3098,1533,1603,...,5443,5500,5501,5507,5508,5509,5561,5562,5568,5569
invoice_date,2013-07-23 00:00:00,2013-07-30 00:00:00,2013-08-06 00:00:00,2013-08-13 00:00:00,2013-08-13 00:00:00,2013-08-13 00:00:00,2013-08-20 00:00:00,2013-08-20 00:00:00,2013-08-27 00:00:00,2013-09-03 00:00:00,...,2014-02-11 00:00:00,2014-02-18 00:00:00,2014-02-18 00:00:00,2014-02-18 00:00:00,2014-02-18 00:00:00,2014-02-18 00:00:00,2014-02-25 00:00:00,2014-02-25 00:00:00,2014-02-25 00:00:00,2014-02-25 00:00:00
due_date,2013-08-02 00:00:00,2013-08-09 00:00:00,2013-08-16 00:00:00,2013-08-23 00:00:00,2013-08-23 00:00:00,2013-08-23 00:00:00,2013-08-30 00:00:00,2013-08-30 00:00:00,2013-09-06 00:00:00,2013-09-13 00:00:00,...,2014-02-21 00:00:00,2014-02-28 00:00:00,2014-02-28 00:00:00,2014-02-28 00:00:00,2014-02-28 00:00:00,2014-02-28 00:00:00,2014-03-07 00:00:00,2014-03-07 00:00:00,2014-03-07 00:00:00,2014-03-07 00:00:00
invoice_amount,913.7,2233.45,1370.5,9195.1,4594.6,2233.45,1880.8,1370.5,2589.3,1066.8,...,885.05,2670.3,2918.7,2293.9,2293.9,2229.1,3098,2864.15,768.95,722.5
purchase_amount,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
purchase_amount_open,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
