# Trades features - Data Exploration

This notebook starts from the file generated at step 04 (notebook `04_trades_featureEng.ipynb`) and performs data exploration and visualization tasks.

In [1]:
import datetime
import itertools

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from bokeh.io import output_file, output_notebook, reset_output, show

from scripts_viz.visualization_utils import *
from scripts_viz.visualization_utils import TTQcolor


# Configure Bokeh so that show() renders figures inline in the notebook
output_notebook()

## Data import

In [3]:
# Input/output locations (relative to the notebook folder)
datafolder = "../data/"
viz_outputs = "../project_report/"
filename = '03_instrumentsdf_deg1stats.pkl'

# Date on which the data extract was received
ReportDate = pd.to_datetime('2018-09-28', yearfirst=True)

# Load the instruments dataframe.
# Caution: unpickling can execute arbitrary code, so only load pickles from a trusted source.
inst = pd.read_pickle(datafolder + filename)

# Quick look at the first rows
inst.head()

Unnamed: 0_level_0,customer_id,customer_name_1,debtor_id,debtor_name_1,invoice_number,invoice_date,due_date,invoice_amount,purchase_amount,purchase_amount_open,...,c_pastdue90_c,c_pastdue180_c,c_trend_a,c_we_payment_share,c_pd_mismatch_mean,c_pd_mismatch_std,c_repaid_r,c_impaired1_r,c_pastdue90_r,c_pastdue180_r
uid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2744:79/231,2004008,Castillo GmbH,79,Sana Hyannis Sarl,2744,2013-07-23,2013-08-02,913.7,0.0,0.0,...,0.0,0.0,0.0,,,,,,,
2861:79/232,2004008,Castillo GmbH,79,Sana Hyannis Sarl,2861,2013-07-30,2013-08-09,2233.45,0.0,0.0,...,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0
2932:79/233,2004008,Castillo GmbH,79,Sana Hyannis Sarl,2932,2013-08-06,2013-08-16,1370.5,0.0,0.0,...,0.0,0.0,7.185198,,,,0.0,0.0,0.0,0.0
1472:489/688,2004009,Orpheus Wyandotte Supply LLC,489,Isfahan SA,1472,2013-08-13,2013-08-23,9195.1,0.0,0.0,...,0.0,0.0,0.0,,,,,,,
2042:512/645,2004009,Orpheus Wyandotte Supply LLC,512,Aldrich Chloe GmbH,2042,2013-08-13,2013-08-23,4594.6,0.0,0.0,...,0.0,0.0,0.0,,,,0.0,0.0,0.0,0.0


In [3]:
#calculating instruments amount in euro (uniform currency)
# Walk the amount and currency columns in lockstep instead of calling
# inst.iloc[i] twice per row: each iloc[i] materialises a full row Series,
# which makes the original loop needlessly slow on large frames.
# convert_to_eur comes from scripts_viz.visualization_utils (star import).
inst['invoice_amount_eur'] = [
    convert_to_eur(amount, currency)
    for amount, currency in zip(inst['invoice_amount'], inst['currency'])
]

In [4]:
# Distribution of the natural log of the invoice amounts expressed in EUR
log_invoice_amount_eur = np.log(inst['invoice_amount_eur'])
show(
    distplot(
        log_invoice_amount_eur,
        xaxisname='invoice amount (log)',
        yaxisname='distribution',
    )
)

In [5]:
#creation of the dataset to count number of invoices and amount
years = range(2013,2019)
months = range(1,13)

# Extract year and month of every value date once, instead of re-running a
# lambda over the whole column twice for each (year, month) pair.
value_year = inst['value_date'].apply(lambda x: x.year)
value_month = inst['value_date'].apply(lambda x: x.month)

count_dict = {}
amount_dict = {}

for y in years:
    for m in months:
        # Only the months from August 2013 to September 2018 are considered
        if (y == 2013 and m < 8) or (y == 2018 and m >= 10):
            continue
        in_month = (value_year == y) & (value_month == m)
        key = str(y)+'_'+str(m)
        # Number of instruments and total EUR amount with value date in this month
        count_dict[key] = int(in_month.sum())
        amount_dict[key] = inst.loc[in_month, 'invoice_amount_eur'].sum()

#dataframes creation (one row each, one column per 'year_month' key)
count_df = pd.DataFrame(count_dict, index = ['count'])
amount_df = pd.DataFrame(amount_dict, index = ['amount'])

#normalization for comparison: each row is divided by its own maximum
gen_df_norm = pd.concat([count_df, amount_df])
gen_df_norm.loc['count'] = gen_df_norm.loc['count']/max(gen_df_norm.loc['count'])
gen_df_norm.loc['amount'] = gen_df_norm.loc['amount']/max(gen_df_norm.loc['amount'])

# The plotted series are normalized; the raw min/max of counts and amounts
# (amounts in millions of EUR) are passed as the custom axis ranges.
# timeSeries comes from scripts_viz.visualization_utils.
count_amount_ts = timeSeries(
    gen_df_norm,
    colors = [TTQcolor['azureBlue'], TTQcolor['yellowOrange']],
    title='Invoices from 2013 to 2018',
    rightY=True,
    Xlabel = 'Year_Month',
    custom_Y_left = (min(count_df.loc['count']), max(count_df.loc['count'])),
    Ylabel = 'Number of instruments',
    Ylabel_2 = 'Mln of Euro (€)',
    custom_Y_right = (min(amount_df.loc['amount'])/1000000,
                      max(amount_df.loc['amount'])/1000000),
    showYGrid=False,
    legend_names = ['count (LHS)', 'amount (RHS)'],
    plot_w=1800,
    plot_h=650,
    legend_font_size = '16pt',
    axis_label_font_size = '16pt',
    y_axis_ticks_font_size = '12pt',
    x_axis_ticks_font_size = '9pt',
)

In [6]:
# Inline preview only: the HTML export of this figure (output_file to
# 'instruments_count_amount.html') is done in a later cell.
show(count_amount_ts)

In [7]:
#credit events count timeseries
# count_and_amount comes from scripts_viz.visualization_utils; presumably it
# returns (count frame, amount frame, combined frame) per month -- confirm there.

# Instruments flagged with an impairment
has_imp1 = inst[inst['has_impairment1']]
has_imp1_count, has_imp1_amount, has_imp1_ca = count_and_amount(has_imp1)

# Instruments flagged as past due by more than 90 days
pastdue90 = inst[inst['is_pastdue90']]
pastdue90_count, pastdue90_amount, pastdue90_ca = count_and_amount(pastdue90)

# Instruments flagged as past due by more than 180 days
pastdue180 = inst[inst['is_pastdue180']]
pastdue180_count, pastdue180_amount, pastdue180_ca = count_and_amount(pastdue180)

#concatenation of credit events count dataset
names = ['count', 'count_hasimp1','count_over90', 'count_over180']
# Start from the 'count' row only: count_df is reassigned here, so concatenating
# the whole frame would stack the event rows again on every re-run of this cell
# and make the index assignment below fail with a length mismatch.
count_df = pd.concat([count_df.loc[['count']], has_imp1_count, pastdue90_count, pastdue180_count])
count_df.index = names

# Palette shared by the plots below
colors = [TTQcolor['azureBlue'], TTQcolor['redBrown'], TTQcolor['bloodRed'], TTQcolor['yell'], TTQcolor['richOrange'], 
         TTQcolor['richPeach']]

In [8]:
# Export the credit events time series to an HTML page and display it.
# Note: the output_file() target stays active for later show() calls until it is changed.
output_file(viz_outputs+'events_count.html', title='Count of credit events from 2013 to 2018')
events_count_ts = timeSeries(
    count_df.loc[names],
    plot_h=600,
    colors = colors,
    legend_font_size = '16pt',
    axis_label_font_size = '16pt',
    y_axis_ticks_font_size = '12pt',
    x_axis_ticks_font_size = '9pt',
    title='Credit events frequency from 2013 to 2018',
)
show(events_count_ts)

In [9]:
# Point Bokeh's file output at the count/amount report page, then show the
# figure built earlier (count_amount_ts) so that it is written to that file.
output_file(viz_outputs+'instruments_count_amount.html', title='Count and amount of invoices from 2013 to 2018')
show(count_amount_ts)

In [10]:
# Boolean flags used to split the EUR invoice amounts by overdue status
flag_pastdue90 = inst['is_pastdue90']
flag_pastdue180 = inst['is_pastdue180']

# Past due over 90 days but not over 180 days
ov90_only = inst.loc[flag_pastdue90 & ~flag_pastdue180, 'invoice_amount_eur']
# Past due over 180 days
ov180 = inst.loc[flag_pastdue180, 'invoice_amount_eur']
# Not past due over 90 days
not_ov = inst.loc[~flag_pastdue90, 'invoice_amount_eur']


In [12]:
# The output_file() target set in an earlier cell ('instruments_count_amount.html')
# is still active, so a plain show() here would overwrite that report page with
# this figure. Reset Bokeh's output state and go back to inline-only rendering.
reset_output()
output_notebook()

# Stacked distributions of the log EUR invoice amounts for the overdue subsets.
# NOTE(review): `logscale=True` is passed to show(), not to stacked_distplot() --
#   looks like a misplaced parenthesis; confirm stacked_distplot accepts it before moving.
# NOTE(review): three legend names / colors are given but only two series are passed;
#   `not_ov` is computed in the previous cell and never plotted -- confirm intent.
# NOTE(review): the x-axis label says 'number of pastdues' while the data are invoice amounts.
show(stacked_distplot(np.log(ov90_only), np.log(ov180), colors=colors[2:5], legendnames=['is_pastdue90', 'is_pastdue180', 'is_not_overdue'],
                     yaxisname='distribution', xaxisname='number of pastdues (log)', boxtextsize='9pt'), logscale=True)