In [20]:
# One-time environment setup: the third-party packages this notebook imports.
# Kept commented out so that re-running the notebook never triggers an install.
# pip install pandas
# pip install scikit-learn
# pip install coremltools
# pip install matplotlib
# pip install seaborn
# pip install kmodes

In [21]:
import numpy as np  # for numerical computation 
import pandas as pd # for data manipulation for tabular data

import matplotlib.pyplot as plt # quick data visualization
import matplotlib as mpl

import seaborn as sns # visualization with modeling information

from sklearn.preprocessing import StandardScaler # Standardization of Your Variables
from sklearn.cluster import KMeans # for K-Means clustering algorithm
from sklearn.pipeline import make_pipeline

import time # for our customized timer function

from coremltools.converters import sklearn as coreml_converter

In [22]:
# Read the raw transaction records into a DataFrame, then print a schema summary
# (column names, non-null counts, dtypes) as a quick sanity check.
data = pd.read_csv("transactions.csv")

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   id           30 non-null     object 
 1   Amount       30 non-null     float64
 2   Category     30 non-null     object 
 3   Description  30 non-null     object 
 4   date         30 non-null     object 
dtypes: float64(1), object(4)
memory usage: 1.3+ KB


In [23]:
# Preview the first five transactions (five rows is the default for head()).
data.head()

Unnamed: 0,id,Amount,Category,Description,date
0,T0-28,23.6,Dining,Deliveroo,2024-11-16
1,T0-27,7.5,Dining,Breakfast,2024-11-16
2,T0-26,230.02,Shopping,Apple Monthly Instalment,2024-11-14
3,T0-25,5.0,Dining,Koufu,2024-11-13
4,T0-24,3.8,Dining,Coffeebot,2024-11-12


### WRANGLE

In [24]:
# Min-max scale the 'Amount' column into the range [1, 100] and store the result
# in a new 'Amount-Adj' column (the original 'Amount' values are left untouched).
amount = data["Amount"]
amount_min = amount.min()
amount_span = amount.max() - amount_min

if amount_span == 0:
    # Every amount is identical (or there is a single row): the plain formula would
    # compute 0 / 0 and fill the whole column with NaN, so pin each row to the
    # lower bound of the target range instead.
    data["Amount-Adj"] = 1.0
else:
    # (x - min) / (max - min) maps onto [0, 1]; "* 99 + 1" stretches that to [1, 100].
    data["Amount-Adj"] = (amount - amount_min) / amount_span * 99 + 1

data.head(5)

Unnamed: 0,id,Amount,Category,Description,date,Amount-Adj
0,T0-28,23.6,Dining,Deliveroo,2024-11-16,10.550731
1,T0-27,7.5,Dining,Breakfast,2024-11-16,3.574017
2,T0-26,230.02,Shopping,Apple Monthly Instalment,2024-11-14,100.0
3,T0-25,5.0,Dining,Koufu,2024-11-13,2.490677
4,T0-24,3.8,Dining,Coffeebot,2024-11-12,1.970673


In [25]:
# Work on an independent copy so that standardized values never overwrite `data`.
data_Z = data.copy()

# Scaler that the next step fits on the 'Amount' column of `data_Z`.
scaler = StandardScaler()

In [26]:
# Replace the raw amounts in the working copy with their standardized values.
# The double brackets pass a one-column DataFrame, because the scaler works on
# 2-D input rather than on a 1-D Series.
data_Z["Amount"] = scaler.fit_transform(data_Z[["Amount"]])

In [27]:
# Preview the first five rows of the standardized copy (five is head()'s default).
data_Z.head()

Unnamed: 0,id,Amount,Category,Description,date,Amount-Adj
0,T0-28,0.082696,Dining,Deliveroo,2024-11-16,10.550731
1,T0-27,-0.307137,Dining,Breakfast,2024-11-16,3.574017
2,T0-26,5.080788,Shopping,Apple Monthly Instalment,2024-11-14,100.0
3,T0-25,-0.36767,Dining,Koufu,2024-11-13,2.490677
4,T0-24,-0.396725,Dining,Coffeebot,2024-11-12,1.970673


In [39]:
# Columns of `data_Z` that are handed to the clustering step.
selected_attributes = [
    "Amount",    # numerical feature
    "Category",  # categorical feature
]

In [29]:
# dependencies for kmodes & kprototypes

from kmodes.kprototypes import KPrototypes

In [40]:
# K-Prototypes is an unsupervised clustering algorithm for mixed data: it combines
# the k-means treatment of numerical columns with the k-modes treatment of
# categorical columns.
kprotypes_algo = KPrototypes(
    n_clusters = 3,
    random_state = 240424,  # fixed seed so the cluster assignment is reproducible
)

# Feature matrix for clustering, restricted to the selected columns.
cluster_features = data_Z[selected_attributes]

# `categorical` takes column POSITIONS within the feature matrix, not column names.
# Derive them from the dtypes rather than hard-coding [1], so the call stays correct
# if `selected_attributes` is reordered or extended.
categorical_positions = [
    position
    for position, dtype in enumerate(cluster_features.dtypes)
    if not pd.api.types.is_numeric_dtype(dtype)
]

# Fit the model and store each row's cluster label alongside the data.
data_Z["clusters"] = kprotypes_algo.fit_predict(
    cluster_features,
    categorical = categorical_positions,
)

# Show the clustered features next to the transaction date.
data_Z[["Amount", "Category", "date", "clusters"]]

Unnamed: 0,Amount,Category,date,clusters
0,0.082696,Dining,2024-11-16,0
1,-0.307137,Dining,2024-11-16,0
2,5.080788,Shopping,2024-11-14,2
3,-0.36767,Dining,2024-11-13,0
4,-0.396725,Dining,2024-11-12,0
5,0.288509,Dining,2024-11-11,0
6,0.419986,Shopping,2024-11-10,0
7,-0.101324,Dining,2024-11-10,0
8,-0.290187,Dining,2024-11-09,0
9,-0.450963,Transportation,2024-11-08,1


In [31]:
# Disabled scatter plot, kept for reference only.
# NOTE(review): "spend" is not among the columns reported by data.info() nor added
# in the wrangling step, so a real y column is needed before re-enabling this.
# (
#     data
#     .plot
#     .scatter(x = "Amount-Adj",
#              y = "spend")
# )

In [32]:


# Disabled Core ML export step.
# NOTE(review): `pipeline` is not defined in any visible cell, so it would have to
# be created before these lines can be re-enabled.
# # Save to CoreML
# coreml_model = coreml_converter.convert(pipeline)
# coreml_model.save("TransactionClustering.mlmodel")
