In [13]:
import os
import sys
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.mixture import GaussianMixture # https://pythonmachinelearning.pro/clustering-with-gaussian-mixture-models/

In [4]:
DATA_DIR = "data"
data_files = [str(filePath) for filePath in Path(DATA_DIR).glob("**/*") if filePath.is_file()]

In [5]:
data_files

['data/DebtorExtract.csv',
 'data/InvoiceExtract.csv',
 'data/PaymentsExtract.csv']

In [6]:
pd.read_csv('data/DebtorExtract.csv')

Unnamed: 0,CustomerKey,PostcodeOuter
0,16721147,B11
1,16721148,DN31
2,16721154,BA11
3,16721165,IP18
4,16721166,B79
...,...,...
88094,16987538,HU12
88095,16987539,S3
88096,16987540,FK1
88097,16989241,PE2


In [9]:
pd.read_csv('data/InvoiceExtract.csv')

Unnamed: 0,InvoiceKey,CustomerKey,IsCreditInvoice,RaisedDate,DueDate,OriginalInvoiceAmount,AmountOutstanding,StatementTransactionType,ExtractDate
0,197017110,16744809,1,2018-01-01,2018-08-31,-61589.70,-61589.70,CRN,2020-09-18 16:21:06.333
1,197188287,16744809,1,2018-01-01,2018-12-30,-2826.19,-2826.19,CRN,2020-09-18 16:21:06.333
2,197057183,16798371,1,2018-01-02,2018-01-02,-150.00,0.00,CSH,2020-09-18 16:21:06.333
3,195398038,16732292,0,2018-01-02,2018-02-01,485.04,0.00,INV,2020-09-18 16:21:06.333
4,197057173,16776992,1,2018-01-02,2018-01-02,-645.66,0.00,JRN,2020-09-18 16:21:06.333
...,...,...,...,...,...,...,...,...,...
1009322,196341240,16971160,0,2019-06-30,2019-07-30,130.56,0.00,INV,2020-09-18 16:21:06.333
1009323,196341244,16829160,0,2019-06-30,2019-07-30,63.60,0.00,INV,2020-09-18 16:21:06.333
1009324,196341411,16764236,0,2019-06-30,2019-07-30,15.00,0.00,INV,2020-09-18 16:21:06.333
1009325,196341507,16832368,0,2019-06-30,2019-07-30,74.76,0.00,INV,2020-09-18 16:21:06.333


In [7]:
pd.read_csv('data/PaymentsExtract.csv')

Unnamed: 0,InvoiceKey,CustomerKey,PaymentValue,PaymentDate,PaymentType
0,196327726,16909966,137.82,2018-08-31,Barclays Multi
1,195354898,16758142,123.05,2020-09-15,Barclays Multi
2,196819246,16758142,145.76,2020-09-15,Barclays Multi
3,196385894,16765211,1149.72,2020-09-15,Barclays Multi
4,197226397,16907262,39.94,2020-09-15,Barclays Multi
...,...,...,...,...,...
549743,195439402,16836504,78.16,2018-08-31,Barclays Multi
549744,197095238,16772250,1771.79,2018-08-31,Barclays Multi
549745,195527079,16802894,41.89,2018-08-31,Barclays Multi
549746,195695090,16965351,1377.00,2018-08-31,Barclays Multi


In [8]:
data_labels = ['Debtor', 'Invoice', 'Payments']
data_dict = {data_labels[i]: pd.read_csv(data_files[i]) for i in range(len(data_labels))}

In [10]:
df = pd.merge(data_dict['Invoice'], data_dict['Payments'], on = ['InvoiceKey', 'CustomerKey'])

In [11]:
df

Unnamed: 0,InvoiceKey,CustomerKey,IsCreditInvoice,RaisedDate,DueDate,OriginalInvoiceAmount,AmountOutstanding,StatementTransactionType,ExtractDate,PaymentValue,PaymentDate,PaymentType
0,197057173,16776992,1,2018-01-02,2018-01-02,-645.66,0.0,JRN,2020-09-18 16:21:06.333,-645.66,2018-12-17,Barclays Multi
1,196559812,16837363,1,2018-01-02,2018-01-02,-139.37,0.0,JRN,2020-09-18 16:21:06.333,-139.37,2018-11-01,Barclays Multi
2,195300967,16724474,1,2018-01-03,2018-01-03,-88.50,0.0,JRN,2020-09-18 16:21:06.333,-88.50,2018-09-24,Barclays Multi
3,195519971,16745090,0,2018-01-03,2018-03-02,168.00,0.0,INV,2020-09-18 16:21:06.333,168.00,2018-11-23,Barclays Multi
4,196697217,16817529,1,2018-01-03,2018-01-03,-626.38,0.0,JRN,2020-09-18 16:21:06.333,-626.38,2019-12-17,Barclays Multi
...,...,...,...,...,...,...,...,...,...,...,...,...
361514,196340182,16725749,0,2019-06-30,2019-07-30,95.90,0.0,INV,2020-09-18 16:21:06.333,95.90,2019-08-14,Barclays Multi
361515,196340751,16728765,0,2019-06-30,2019-07-30,406.09,0.0,INV,2020-09-18 16:21:06.333,406.09,2019-08-05,Barclays Multi
361516,196341240,16971160,0,2019-06-30,2019-07-30,130.56,0.0,INV,2020-09-18 16:21:06.333,130.56,2019-07-24,CheckM8
361517,196341411,16764236,0,2019-06-30,2019-07-30,15.00,0.0,INV,2020-09-18 16:21:06.333,15.00,2019-07-08,Barclays Multi


In [12]:
categorical_feats = ['IsCreditInvoice', 'StatementTransactionType', 'PaymentType']
continuous_feats = ['OriginalInvoiceAmount', 'AmountOutstanding', 'PaymentValue']

In [14]:
gmm = GaussianMixture(n_components=2)

In [18]:
df[['IsCreditInvoice'] + continuous_feats]

Unnamed: 0,IsCreditInvoice,OriginalInvoiceAmount,AmountOutstanding,PaymentValue
0,1,-645.66,0.0,-645.66
1,1,-139.37,0.0,-139.37
2,1,-88.50,0.0,-88.50
3,0,168.00,0.0,168.00
4,1,-626.38,0.0,-626.38
...,...,...,...,...
361514,0,95.90,0.0,95.90
361515,0,406.09,0.0,406.09
361516,0,130.56,0.0,130.56
361517,0,15.00,0.0,15.00


In [19]:
gmm.fit(df[['IsCreditInvoice'] + continuous_feats])

GaussianMixture(covariance_type='full', init_params='kmeans', max_iter=100,
                means_init=None, n_components=2, n_init=1, precisions_init=None,
                random_state=None, reg_covar=1e-06, tol=0.001, verbose=0,
                verbose_interval=10, warm_start=False, weights_init=None)

In [None]:
def distanceFunc(X, MU):
    # Inputs
    # X: is an NxD matrix (N observations and D dimensions)
    # MU: is an KxD matrix (K means and D dimensions)
    # Outputs
    # pair_dist: is the squared pairwise distance matrix (NxK)
    # TODO
    pair_dist = tf.transpose(tf.reduce_sum(tf.square(X - MU), axis=2))
    return pair_dist

In [None]:
def log_GaussPDF(X, mu, sigma):
    # Inputs
    # X: N X D
    # mu: K X D
    # sigma: K X 1

    # Outputs:
    # log Gaussian PDF N X K

    # TODO
    dim = data.shape[1]
    dist = distanceFunc(X, mu)
    sigma = tf.transpose(sigma)
    gauss_coeff = tf.log(2 * np.pi * sigma)
    pdf = -(1/2)*dim*gauss_coeff - (dist/(2*sigma))
    return pdf

In [None]:
def log_posterior(log_PDF, log_pi):
    # Input
    # log_PDF: log Gaussian PDF N X K
    # log_pi: K X 1

    # Outputs
    # log_post: N X K

    # TODO
    log_pi = tf.transpose(log_pi)
    prob = log_pi + log_PDF
    prob_sum = hlp.reduce_logsumexp(prob, keep_dims=True)
    posterior = prob - prob_sum
    return posterior

In [None]:
def loss_function(log_PDF, log_pi):
    loss = -tf.reduce_sum(hlp.reduce_logsumexp(log_PDF + log_pi, 1, keep_dims=True), axis =0)
    return loss