In [1]:
import os
import sys
import pandas as pd
from pathlib import Path
import numpy as np
from kmodes.kmodes import KModes
from kmodes.kprototypes import KPrototypes
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
'''
Customer-level features to derive from the invoice data:
- Fraction of invoices fully paid (# cleared invoices / # invoices per customer) = sum(IsInvoicePaidFully) / count(NumInvoices)
- Fraction of invoices fully paid on time (# invoices cleared on or before the due date / # invoices per customer) = sum(IsClearedOnTime) / count(NumInvoices)
- Average time until first payment = mean(RaisedToFirstPayment)
- Average payment amount = mean(AveragePaymentValue)
- Average number of payments = mean(NumPayments)
- Number of invoices (groupby count)
'''

# Load the engineered invoice-level table; the path is relative to the
# notebook's working directory.
df = pd.read_csv(
    filepath_or_buffer='./Engineered_Data/invoice_data.csv',
)

In [3]:
# Display the invoice-level frame exactly as read from invoice_data.csv.
df

Unnamed: 0,AveragePaymentValue,CustomerKey,DueDate,EarliestPaymentDate,InvoiceKey,NumPayments,OriginalInvoiceAmount,LatestPaymentDate,PaymentType,TotalPaymentValue,RaisedDate,StatementTransactionType,IsInvoicePaidFully,RaisedToFirstPayment,RaisedToCleared,IsClearedOnTime
0,168.000,16745090,2018-03-02,2018-11-23,195519971,1.0,168.00,2018-11-23,Barclays Multi,168.00,2018-01-03,INV,1,324,324.0,0
1,819.600,16743947,2018-02-03,2018-10-26,196442471,1.0,819.60,2018-10-26,Barclays Multi,819.60,2018-01-04,INV,1,295,295.0,0
2,718.200,16836264,2018-02-08,2018-11-05,196563186,1.0,749.40,2018-11-05,Barclays Multi,718.20,2018-01-09,INV,0,300,,0
3,1873.080,16843107,2018-03-02,2018-09-04,196711370,1.0,1873.08,2018-09-04,Barclays Multi,1873.08,2018-01-11,INV,1,236,236.0,0
4,1316.090,16739410,2018-02-11,2018-09-26,197134638,1.0,1316.09,2018-09-26,Barclays Multi,1316.09,2018-01-12,INV,1,257,257.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
272237,24.815,16849111,2019-07-30,2019-07-30,196911580,2.0,49.63,2019-08-30,Barclays Multi,49.63,2019-06-30,INV,1,30,61.0,0
272238,178.165,16847651,2019-07-30,2019-09-24,195362531,2.0,356.33,2019-10-07,Barclays Multi,356.33,2019-06-30,INV,1,86,99.0,0
272239,36.310,16917308,2019-07-30,2019-07-29,196871858,2.0,72.62,2019-08-09,Barclays Multi,72.62,2019-06-30,INV,1,29,40.0,0
272240,311.440,16782671,2019-07-30,2019-07-22,195891626,2.0,311.44,2019-09-18,CheckM8,622.88,2019-06-30,INV,1,22,80.0,0


In [4]:
# Tag every invoice row with a constant 1 so that a later per-customer
# 'count' over this column yields the number of invoices.
df = df.assign(NumInvoices=1)

In [5]:
# Collapse the invoice-level rows into one row per CustomerKey.
#
# The original dict literal listed 'AveragePaymentValue' twice; Python keeps
# only the last value for a repeated key, so the duplicate was dead code and
# has been removed. The resulting columns and their order are unchanged.
#
# Gotcha: `df` is rebound to the aggregated frame, which keeps the same column
# names. Re-running this cell without re-running the load cell would aggregate
# the already-aggregated rows instead of the raw invoices.
#
# NOTE(review): the planning notes list mean(RaisedToFirstPayment) as a
# feature, but it is not aggregated here. The dropped duplicate entry may have
# been meant for it -- confirm before adding, since that would add a column to
# the exported customer table.
df = df.groupby('CustomerKey').agg({
    'IsClearedOnTime': 'sum',       # per-customer total of the per-invoice flag
    'IsInvoicePaidFully': 'sum',    # per-customer total of the per-invoice flag
    'AveragePaymentValue': 'mean',
    'NumPayments': 'mean',
    'NumInvoices': 'count',         # number of invoice rows per customer
}).reset_index()

In [6]:
# Display the frame after grouping by CustomerKey (one row per customer).
df

Unnamed: 0,CustomerKey,IsClearedOnTime,IsInvoicePaidFully,AveragePaymentValue,NumPayments,NumInvoices
0,16721154,4,11,59.680000,1.0,11
1,16721167,0,12,190.075000,1.0,12
2,16721203,0,12,64.442500,1.0,12
3,16721207,0,24,515.975417,1.0,24
4,16721212,9,11,411.229091,1.0,11
...,...,...,...,...,...,...
12846,16980240,16,22,426.218182,1.0,22
12847,16980268,16,16,437.421250,1.0,16
12848,16980375,0,62,167.441129,1.0,62
12849,16980435,0,12,342.846667,1.0,12


In [7]:
# Turn the per-customer totals into fractions of that customer's invoice count.
# Callables are used so each ratio is computed from the frame being extended.
df = df.assign(
    FractionFullyPaid=lambda frame: frame['IsInvoicePaidFully'] / frame['NumInvoices'],
    FractionClearedOnTime=lambda frame: frame['IsClearedOnTime'] / frame['NumInvoices'],
)

In [8]:
# Display the frame with the FractionFullyPaid / FractionClearedOnTime columns added.
df

Unnamed: 0,CustomerKey,IsClearedOnTime,IsInvoicePaidFully,AveragePaymentValue,NumPayments,NumInvoices,FractionFullyPaid,FractionClearedOnTime
0,16721154,4,11,59.680000,1.0,11,1.0,0.363636
1,16721167,0,12,190.075000,1.0,12,1.0,0.000000
2,16721203,0,12,64.442500,1.0,12,1.0,0.000000
3,16721207,0,24,515.975417,1.0,24,1.0,0.000000
4,16721212,9,11,411.229091,1.0,11,1.0,0.818182
...,...,...,...,...,...,...,...,...
12846,16980240,16,22,426.218182,1.0,22,1.0,0.727273
12847,16980268,16,16,437.421250,1.0,16,1.0,1.000000
12848,16980375,0,62,167.441129,1.0,62,1.0,0.000000
12849,16980435,0,12,342.846667,1.0,12,1.0,0.000000


In [9]:
# After aggregation the per-invoice column names no longer describe the
# values (sums and means per customer), so give them customer-level names.
customer_column_names = {
    "IsClearedOnTime": "NumClearedOnTime",
    "IsInvoicePaidFully": "NumPaidFully",
    "AveragePaymentValue": "AveragePaymentAmount",
    "NumPayments": "AverageNumPayments",
}
df = df.rename(columns=customer_column_names)

In [10]:
# Display the frame after the columns were renamed to their customer-level names.
df

Unnamed: 0,CustomerKey,NumClearedOnTime,NumPaidFully,AveragePaymentAmount,AverageNumPayments,NumInvoices,FractionFullyPaid,FractionClearedOnTime
0,16721154,4,11,59.680000,1.0,11,1.0,0.363636
1,16721167,0,12,190.075000,1.0,12,1.0,0.000000
2,16721203,0,12,64.442500,1.0,12,1.0,0.000000
3,16721207,0,24,515.975417,1.0,24,1.0,0.000000
4,16721212,9,11,411.229091,1.0,11,1.0,0.818182
...,...,...,...,...,...,...,...,...
12846,16980240,16,22,426.218182,1.0,22,1.0,0.727273
12847,16980268,16,16,437.421250,1.0,16,1.0,1.000000
12848,16980375,0,62,167.441129,1.0,62,1.0,0.000000
12849,16980435,0,12,342.846667,1.0,12,1.0,0.000000


In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) for every numeric column.
df.describe()

Unnamed: 0,CustomerKey,NumClearedOnTime,NumPaidFully,AveragePaymentAmount,AverageNumPayments,NumInvoices,FractionFullyPaid,FractionClearedOnTime
count,12851.0,12851.0,12851.0,12851.0,12851.0,12851.0,12851.0,12851.0
mean,16822130.0,7.12715,21.116645,318.011819,1.00292,21.184499,0.997031,0.403839
std,80986.59,20.233905,72.337124,314.160163,0.050506,72.693455,0.021205,0.335002
min,16721150.0,0.0,0.0,0.01,1.0,10.0,0.0,0.0
25%,16742370.0,1.0,11.0,99.437576,1.0,11.0,1.0,0.083333
50%,16819800.0,5.0,12.0,192.359091,1.0,12.0,1.0,0.363636
75%,16885810.0,9.0,13.0,434.215606,1.0,13.0,1.0,0.692308
max,16984800.0,1116.0,3635.0,1804.255833,4.5,3636.0,1.0,1.0


In [12]:
# Write the per-customer feature table to disk; index=False keeps the
# positional row index out of the CSV.
df.to_csv(
    './Engineered_Data/customer_data.csv',
    index=False,
)