In [1]:
import os
import sys
import pandas as pd
from pathlib import Path
import numpy as np
from kmodes.kmodes import KModes
from kmodes.kprototypes import KPrototypes

In [2]:
DATA_DIR = "data"
data_files = [str(filePath) for filePath in Path(DATA_DIR).glob("**/*") if filePath.is_file()]
data_labels = ['Debtor', 'Invoice', 'Payments']
data_dict = {data_labels[i]: pd.read_csv(data_files[i]) for i in range(len(data_labels))}

In [3]:
df = pd.merge(data_dict['Invoice'], data_dict['Payments'], on = ['InvoiceKey', 'CustomerKey'])

In [4]:
df

Unnamed: 0,InvoiceKey,CustomerKey,IsCreditInvoice,RaisedDate,DueDate,OriginalInvoiceAmount,AmountOutstanding,StatementTransactionType,ExtractDate,PaymentValue,PaymentDate,PaymentType
0,197057173,16776992,1,2018-01-02,2018-01-02,-645.66,0.0,JRN,2020-09-18 16:21:06.333,-645.66,2018-12-17,Barclays Multi
1,196559812,16837363,1,2018-01-02,2018-01-02,-139.37,0.0,JRN,2020-09-18 16:21:06.333,-139.37,2018-11-01,Barclays Multi
2,195300967,16724474,1,2018-01-03,2018-01-03,-88.50,0.0,JRN,2020-09-18 16:21:06.333,-88.50,2018-09-24,Barclays Multi
3,195519971,16745090,0,2018-01-03,2018-03-02,168.00,0.0,INV,2020-09-18 16:21:06.333,168.00,2018-11-23,Barclays Multi
4,196697217,16817529,1,2018-01-03,2018-01-03,-626.38,0.0,JRN,2020-09-18 16:21:06.333,-626.38,2019-12-17,Barclays Multi
...,...,...,...,...,...,...,...,...,...,...,...,...
361514,196340182,16725749,0,2019-06-30,2019-07-30,95.90,0.0,INV,2020-09-18 16:21:06.333,95.90,2019-08-14,Barclays Multi
361515,196340751,16728765,0,2019-06-30,2019-07-30,406.09,0.0,INV,2020-09-18 16:21:06.333,406.09,2019-08-05,Barclays Multi
361516,196341240,16971160,0,2019-06-30,2019-07-30,130.56,0.0,INV,2020-09-18 16:21:06.333,130.56,2019-07-24,CheckM8
361517,196341411,16764236,0,2019-06-30,2019-07-30,15.00,0.0,INV,2020-09-18 16:21:06.333,15.00,2019-07-08,Barclays Multi


In [5]:
categorical_feats = ['IsCreditInvoice', 'StatementTransactionType', 'PaymentType']
continuous_feats = ['OriginalInvoiceAmount', 'AmountOutstanding', 'PaymentValue']

In [6]:
df[categorical_feats]

Unnamed: 0,IsCreditInvoice,StatementTransactionType,PaymentType
0,1,JRN,Barclays Multi
1,1,JRN,Barclays Multi
2,1,JRN,Barclays Multi
3,0,INV,Barclays Multi
4,1,JRN,Barclays Multi
...,...,...,...
361514,0,INV,Barclays Multi
361515,0,INV,Barclays Multi
361516,0,INV,CheckM8
361517,0,INV,Barclays Multi


In [7]:
# encoding categorical variables as integers, not needed
'''
df["StatementTransactionType"] = df["StatementTransactionType"].astype('category').cat.codes
df["PaymentType"] = df["PaymentType"].astype('category').cat.codes
df["IsCreditInvoice"] = df["IsCreditInvoice"].astype('category').cat.codes
'''


'\ndf["StatementTransactionType"] = df["StatementTransactionType"].astype(\'category\').cat.codes\ndf["PaymentType"] = df["PaymentType"].astype(\'category\').cat.codes\ndf["IsCreditInvoice"] = df["IsCreditInvoice"].astype(\'category\').cat.codes\n'

In [36]:
df[categorical_feats + continuous_feats].describe()

Unnamed: 0,IsCreditInvoice,OriginalInvoiceAmount,AmountOutstanding,PaymentValue
count,361519.0,361519.0,361519.0,361519.0
mean,0.023741,3306.604,0.064838,3176.434
std,0.152243,54101.08,15.607989,53589.45
min,0.0,-1165304.0,-308.28,-1165304.0
25%,0.0,81.42,0.0,80.76
50%,0.0,216.0,0.0,214.18
75%,0.0,715.8,0.0,708.37
max,1.0,7738426.0,7714.15,7738426.0


In [9]:
km =  KPrototypes(n_clusters=3, init='Huang', verbose=1, n_init=2)

results = []

for _ in range(5):
    km =  KPrototypes(n_clusters=3, init='Huang', verbose=1, n_init=2)
    clusters = km.fit_predict(df[categorical_feats + continuous_feats].sample(frac=0.2, random_state=1), categorical=[0,1,2])
    
    results.append([km, clusters])

Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 1, iteration: 1/100, moves: 53, ncost: 392121146114826.56
Run: 1, iteration: 2/100, moves: 24, ncost: 391575849154200.6
Run: 1, iteration: 3/100, moves: 22, ncost: 391530463407313.06
Run: 1, iteration: 4/100, moves: 8, ncost: 391514759730932.3
Run: 1, iteration: 5/100, moves: 2, ncost: 391513184541050.6
Run: 1, iteration: 6/100, moves: 0, ncost: 391513184541050.6
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 2, iteration: 1/100, moves: 716, ncost: 246464296326100.34
Run: 2, iteration: 2/100, moves: 164, ncost: 192242770115936.44
Run: 2, iteration: 3/100, moves: 44, ncost: 170651321022738.7
Run: 2, iteration: 4/100, moves: 22, ncost: 149674553262530.1
Run: 2, iteration: 5/100, moves: 10, ncost: 137392656926656.39
Run: 2, iteration: 6/100, moves: 3, ncost: 135424866796063.38
Run: 2, iteration: 7/100, moves: 0, ncost: 135424866796063.38
Best run was number 2
Init: in

In [13]:
for result in results:
    print(result[0].cluster_centroids_[0])

[[ 2.63799772e+03  1.65602745e-01  2.51642091e+03]
 [ 2.78753608e+06  0.00000000e+00  2.70994735e+06]
 [-8.51641860e+05  0.00000000e+00 -8.39953880e+05]]
[[1.94427534e+03 1.65777039e-01 1.85395771e+03]
 [3.34391234e+06 0.00000000e+00 3.26613394e+06]
 [7.23465262e+05 0.00000000e+00 6.90715583e+05]]
[[3.34391234e+06 0.00000000e+00 3.26613394e+06]
 [7.23465262e+05 1.02729070e-15 6.90715583e+05]
 [1.94427534e+03 1.65777039e-01 1.85395771e+03]]
[[1.94427534e+03 1.65777039e-01 1.85395771e+03]
 [7.23465262e+05 0.00000000e+00 6.90715583e+05]
 [3.34391234e+06 0.00000000e+00 3.26613394e+06]]
[[-8.51641860e+05  0.00000000e+00 -8.39953880e+05]
 [ 2.78753608e+06  0.00000000e+00  2.70994735e+06]
 [ 2.63799772e+03  1.65602745e-01  2.51642091e+03]]


In [20]:
cluster_centers = [result[0].cluster_centroids_[0] for result in results]

In [37]:
cluster_centers

[array([[   2637.99772044,       0.16560274,    2516.42091359],
        [2787536.07941179,       0.        , 2709947.34823531],
        [-851641.86      ,       0.        , -839953.88      ]]),
 array([[   1944.2753386 ,       0.16577704,    1853.9577132 ],
        [3343912.34363638,       0.        , 3266133.93727273],
        [ 723465.26180723,       0.        ,  690715.5833735 ]]),
 array([[3343912.34363637,       0.        , 3266133.93727274],
        [ 723465.26180723,       0.        ,  690715.58337349],
        [   1944.2753386 ,       0.16577704,    1853.9577132 ]]),
 array([[   1944.2753386 ,       0.16577704,    1853.9577132 ],
        [ 723465.26180723,       0.        ,  690715.5833735 ],
        [3343912.34363637,       0.        , 3266133.93727273]]),
 array([[-851641.86      ,       0.        , -839953.88      ],
        [2787536.07941177,       0.        , 2709947.3482353 ],
        [   2637.99772044,       0.16560274,    2516.42091359]])]

In [25]:
cluster_centers[0].shape

(3, 3)

In [27]:
cont_clusters = np.mean(cluster_centers, axis=0)

In [31]:
np.set_printoptions(suppress=True)

In [34]:
for i in range(len(continuous_feats)):
    print(f"Cluster {i}: {np.round(cont_clusters[i], 2)}")

Cluster 0: [499759.41      0.1  486480.88]
Cluster 1: [2073183.01       0.   2013491.96]
Cluster 2: [644063.6       0.07 624253.2 ]


In [35]:
continuous_feats
categorical_feats

['IsCreditInvoice', 'StatementTransactionType', 'PaymentType']

In [None]:
'''
results: 
['IsCreditInvoice', 'StatementTransactionType', 'PaymentType']
cluster 1: ['0' 'INV' 'Barclays Multi']
cluster 2: ['1' 'CSH' 'Barclays Multi']

'OriginalInvoiceAmount', 'AmountOutstanding', 'PaymentValue'
Cluster 0: [499759.41      0.1  486480.88]
Cluster 1: [2073183.01       0.   2013491.96]
Cluster 2: [644063.6       0.07 624253.2 ]

'''

