## Import Libraries

In [1]:
import pandas as pd
import nltk as lp
import string
import re
import numpy as np

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

## Data processing

### Load in data

In [5]:
def load_expenses(file_path: str) -> pd.DataFrame:
    """
    Load expenses from a CSV file.

    Parameters
    ----------
    file_path : str
        The path to the CSV file.

    Returns
    -------
    pd.DataFrame
        A DataFrame containing the expenses.
    """
    df = pd.read_csv("expenses.csv", sep=',', parse_dates=['date'], names=['date','account_name','from_acc_num','to_acc_num','code','af_bij','amount_(eur)','type','description'], header=0)
    df['amount_(eur)'] = [float(i.replace(',','.')) for i in df['amount_(eur)']]
    df = df.fillna(0)
    return df

## Model Training

### Define categories

In [32]:
categories = ['entertainment','food','gifts_donation','housing','income','insurance','legal','loans','personal','savings_investments','subscriptions','taxes','transportation']
sub_categories = [['video/dvd','games','movies','concerts','sporting_events','live_theater','other'],
                  ['groceries','dine_out','other'],
                  ['birthday'],
                  ['mortgage_rent','phone','energy','water','gas','cable','waste_removal','maintenance_repairs','supplies','other'],
                  ['investments','salary','stocks','allowance','commission','interest','government_payments','gifts','payback'],
                  ['home','health','life','other'],
                  ['attorney','alimony','other'],
                  ['personal','credit_card','student','housing','other'],
                  ['medical','personal_care','clothing','tech','music','sports','other'],
                  ['retirement','investment','bank'],
                  ['phone','entertainment','fitness','learning','other'],
                  ['federal','state','local','income','other'],
                  ['vehicle_payment','public_transport','licensing','fuel','maintenance','other']]

#### Combine lists into dictionary

In [104]:
categories_dict = {categories[i]: sub_categories[i] for i in range(len(categories))}
categories_dict

{'entertainment': ['video/dvd',
  'games',
  'movies',
  'concerts',
  'sporting_events',
  'live_theater',
  'other'],
 'food': ['groceries', 'dine_out', 'other'],
 'gifts_donation': ['birthday'],
 'housing': ['mortgage_rent',
  'phone',
  'energy',
  'water',
  'gas',
  'cable',
  'waste_removal',
  'maintenance_repairs',
  'supplies',
  'other'],
 'income': ['investments',
  'salary',
  'stocks',
  'allowance',
  'commission',
  'interest',
  'government_payments',
  'gifts',
  'payback'],
 'insurance': ['home', 'health', 'life', 'other'],
 'legal': ['attorney', 'alimony', 'other'],
 'loans': ['personal', 'credit_card', 'student', 'housing', 'other'],
 'personal': ['medical',
  'personal_care',
  'clothing',
  'tech',
  'music',
  'sports',
  'other'],
 'savings_investments': ['retirement', 'investment', 'bank'],
 'subscriptions': ['phone', 'entertainment', 'fitness', 'learning', 'other'],
 'taxes': ['federal', 'state', 'local', 'income', 'other'],
 'transportation': ['vehicle_payme

### Convert text to numbers

#### Text preprocessing

##### Define columns with texts as object_columns

In [27]:
object_columns = df.loc[:,df.dtypes == 'object'].columns

for i, value in enumerate(object_columns):
    df[value] = df[value].astype(dtype='str')

##### Remove acc numbers columns

In [59]:
object_columns = object_columns.drop(['from_acc_num','to_acc_num'])

##### Convert all strings to lowercase, remove all punctuations, remove words combined with digits

In [60]:
for i, value in enumerate(object_columns):
    df[value] = df[value].str.lower() ## converts all strings to lowercase
    df[value] = df[value].apply(lambda x: re.sub('[%s]' % re.escape(string.punctuation), '', x)) ## removes all punctuation from text
    df[value] = df[value].apply(lambda x: re.sub('\w*\d\w*', '', x)) # removes words with digits like m34f1f31 from each row

##### Tokenization of words

In [64]:
for i, value in enumerate(object_columns):
    df[f'{value}_tokens'] = df[value].apply(lambda x: word_tokenize(x)) ## converts all strings to lowercase and store as new column

In [65]:
df.loc[:,'account_name_tokens':]

Unnamed: 0,account_name_tokens,code_tokens,af_bij_tokens,type_tokens,description_tokens
0,"[cbr, via, ing]",[id],[af],[ideal],"[naam, cbr, via, ing, omschrijving, betalen, m..."
1,"[hr, zy, lian]",[gt],[bij],"[online, bankieren]","[van, oranje, spaarrekening, valutadatum]"
2,"[sports, supplements, ltd]",[id],[af],[ideal],"[naam, sports, supplements, ltd, omschrijving,..."
3,"[mw, e, guerrero, mavrou]",[gt],[af],"[online, bankieren]","[naam, mw, e, guerrero, mavrou, omschrijving, ..."
4,"[jumbo, overwinningspl, groningen]",[ba],[af],[betaalautomaat],"[pasvolgnr, transactie, term, valutadatum]"
...,...,...,...,...,...
5953,"[abnamro, groningen, nld]",[gm],[af],[geldautomaat],"[pasvolgnr, transactie, term, valutadatum]"
5954,"[kantine, scheikunde, groningen, nld]",[ba],[af],[betaalautomaat],"[pasvolgnr, transactie, term, valutadatum]"
5955,"[kantine, scheikunde, groningen, nld]",[ba],[af],[betaalautomaat],"[pasvolgnr, transactie, term, valutadatum]"
5956,[globalcollect],[id],[af],[ideal],"[naam, globalcollect, omschrijving, iban, kenm..."


##### Stopword removal

In [66]:
english_stopwords = stopwords.words('english')
dutch_stopwords = stopwords.words('dutch')

In [75]:
for i,columns in enumerate(object_columns):
    for j,rows in enumerate(df.index):
        print(columns, rows)
        print(df[columns][rows])
        # df.replace(df[columns][rows], [word for word in df[columns][rows].split() if word not in english_stopwords or word not in dutch_stopwords])
        break
    break

account_name 0
cbr via ing


##### Stemming and Lemmatization

##### Part of Speech Tagging

##### Information Retrieval

# Use sklearn-hierarchical-classification to classify category and subcategory

In [2]:
import pandas as pd
import re
import numpy as np

from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

## Read data

In [92]:
df = pd.read_excel("expenses.xlsx", sheet_name='Ledger', names=['date','name','from_acc_num','to_acc_num','code','debit_credit','amount','transaction_type','description','category','sub_category'], header=0)

## Read categories and subcategories

In [93]:
from sql_connector import CreateURL
from sqlalchemy import create_engine
from sqlalchemy import text
from dotenv import dotenv_values

# connect to azure sql database using sqlalchemy
url = CreateURL(database='main_db')
engine = create_engine(url, echo=True)

with engine.connect() as conn:
    result = conn.execute(text("SELECT categories.name as category, subcategories.name as subcategory FROM categories join subcategories on categories.id = subcategories.category_id"))
    result = result.fetchall()
    result = pd.DataFrame(result,)
    categories_dict = result.groupby('category')['subcategory'].apply(list).to_dict()

2023-06-16 23:11:44,928 INFO sqlalchemy.engine.Engine SELECT CAST(SERVERPROPERTY('ProductVersion') AS VARCHAR)
2023-06-16 23:11:44,929 INFO sqlalchemy.engine.Engine [raw sql] ()
2023-06-16 23:11:44,940 INFO sqlalchemy.engine.Engine SELECT schema_name()
2023-06-16 23:11:44,941 INFO sqlalchemy.engine.Engine [generated in 0.00062s] ()
2023-06-16 23:11:44,976 INFO sqlalchemy.engine.Engine SELECT CAST('test max support' AS NVARCHAR(max))
2023-06-16 23:11:44,977 INFO sqlalchemy.engine.Engine [generated in 0.00079s] ()
2023-06-16 23:11:44,988 INFO sqlalchemy.engine.Engine SELECT 1 FROM fn_listextendedproperty(default, default, default, default, default, default, default)
2023-06-16 23:11:44,988 INFO sqlalchemy.engine.Engine [generated in 0.00059s] ()
2023-06-16 23:11:45,016 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2023-06-16 23:11:45,017 INFO sqlalchemy.engine.Engine SELECT categories.name as category, subcategories.name as subcategory FROM categories join subcategories on categories.id

## Preprocess data

In [94]:
df[['amount']] = df[['amount']].replace(',', '.', regex=True)
df[['amount']] = df[['amount']].astype(dtype='float')
df['to_acc_num'].replace(np.nan, '0', inplace=True)

In [95]:
# Remove all rows with credit transactions
df = df[df['debit_credit'] == 'Debit']

# preprocess text
columns_to_preprocess = ['name', 'code', 'debit_credit', 'transaction_type', 'description']
df[columns_to_preprocess] = df[columns_to_preprocess].apply(lambda x: x.str.lower()) ## converts all strings to lowercase
df[columns_to_preprocess] = df[columns_to_preprocess].applymap(lambda x: re.sub('[%s]' % re.escape(string.punctuation), '', x)) ## removes all punctuation from text
df[columns_to_preprocess] = df[columns_to_preprocess].applymap(lambda x: re.sub('\w*\d\w*', '', x)) # removes words with digits like m34f1f31 from each row

# tokenize words - only needed if not using sklearn vectorizer
# for i, value in enumerate(columns_to_preprocess):
#     df[f'{value}_tokens'] = df[value].apply(lambda x: word_tokenize(x)) ## converts all strings to lowercase and store as new column

english_stopwords = stopwords.words('english')
dutch_stopwords = stopwords.words('dutch')

df

Unnamed: 0,date,name,from_acc_num,to_acc_num,code,debit_credit,amount,transaction_type,description,category,sub_category
1,2017-09-15,adyen,NL58INGB0658210033,NL48RABO0132394782,id,debit,18.00,ideal,name adyen description pokerstars adyenpayme...,,
2,2017-09-15,nationalenederlanden,NL58INGB0658210033,NL38INGB0703076094,ic,debit,8.12,sepa direct debit,name nationalenederlanden description ing stud...,,
4,2017-09-15,stichting derdengelden buckaroo,NL58INGB0658210033,NL19DEUT0319821366,id,debit,207.83,ideal,name stichting derdengelden buckaroo descripti...,,
5,2017-09-18,cmjump xl groningen groningen,NL58INGB0658210033,0,ba,debit,2.75,payment terminal,card sequence no transaction term value d...,,
6,2017-09-18,jump xl groningen bv groningen,NL58INGB0658210033,0,ba,debit,12.00,payment terminal,card sequence no transaction term value d...,,
...,...,...,...,...,...,...,...,...,...,...,...
7042,2023-06-11,amazon eu sarl,NL58INGB0658210033,DE64202208000093111064,id,debit,16.99,ideal,name amazon eu sarl description amazonretail...,,
7043,2023-06-11,parking wsc bruxelles bel,NL58INGB0658210033,0,ba,debit,1.50,payment terminal,card sequence no transaction term apple p...,Transportation,Other
7044,2023-06-11,parking front zaventem bel,NL58INGB0658210033,0,ba,debit,5.00,payment terminal,card sequence no transaction term apple p...,Transportation,Other
7045,2023-06-11,starbucks pz zaventem bel,NL58INGB0658210033,0,ba,debit,22.40,payment terminal,card sequence no transaction term apple p...,Food,Dine Out


## Split to features and target variable

In [15]:
# Split the data into features and target variable
X = df[['date', 'name', 'from_acc_num', 'to_acc_num', 'code', 'debit_credit', 'amount', 'transaction_type', 'description']]
y_category = df['category']
y_subcategory = df['sub_category']

In [None]:
# Define the preprocessing for the text and numeric features
preprocessor = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(), ['name', 'to_acc_num', 'code', 'debit_credit', 'transaction_type', 'description']),
        ('num', StandardScaler(), ['amount_(eur)'])
    ])
preprocessor

In [18]:
# Create a pipeline that combines the preprocessor with a classifier for the top-level categories
category_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])
category_pipeline

## Train model

In [None]:
# Train the model for the top-level categories
X_train, X_test, y_train, y_test = train_test_split(X, y_category, test_size=0.2, random_state=42)
category_pipeline.fit(X_train, y_train)

## Predict and Evaluate

In [None]:
# Evaluate the model for the top-level categories
y_pred = category_pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

# Now create a separate model for each top-level category to predict the subcategories
for category in y_category.unique():
    # Filter the data for this category
    X_category = X[y_category == category]
    y_subcategory_category = y_subcategory[y_category == category]

    # Create a pipeline for this category
    subcategory_pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier())
    ])

    # Train the model for this category
    X_train, X_test, y_train, y_test = train_test_split(X_category, y_subcategory_category, test_size=0.2, random_state=42)
    subcategory_pipeline.fit(X_train, y_train)

    # Evaluate the model for this category
    y_pred = subcategory_pipeline.predict(X_test)
    print(f"Classification report for {category}:")
    print(classification_report(y_test, y_pred))


# Unsupervised

In [96]:
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.cluster import KMeans

# Assume 'df' is your DataFrame.
X = df[['name', 'description', 'amount', 'transaction_type', 'to_acc_num']]

# Create transformers
vectorizer = TfidfVectorizer()
scaler = StandardScaler()
encoder = OneHotEncoder()

# Define preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('text_name', vectorizer, 'name'),
        ('text_desc', vectorizer, 'description'),
        ('text_to_acc_num', vectorizer, 'to_acc_num'),
        ('num', scaler, ['amount']),
        ('transaction_type', encoder, ['transaction_type']),
    ])

# Fit and transform the data
X_processed = preprocessor.fit_transform(X)

## Fit and Cluster

In [97]:
# Fit KMeans model
kmeans = KMeans(n_clusters=10, random_state=42).fit(X_processed)

## Assign cluster labels

In [98]:
# Assign clusters back to our dataframe
df['cluster'] = kmeans.labels_

In [123]:
feature_names = []

# Get the transformers of the preprocessor
for name, transformer, _ in preprocessor.transformers_:
    if name in ['text_name', 'text_desc', 'text_to_acc_num']: # if transformer is a vectorizer
        feature_names.extend([f'{name}_{f}' for f in transformer.get_feature_names_out()])
    elif name == 'num':  # if transformer is a scaler
        feature_names.extend(transformer.get_feature_names_out())
    elif name == 'transaction_type':  # if transformer is an encoder
        feature_names.extend(transformer.get_feature_names_out())

len(feature_names)

2598

In [None]:
# Get the cluster centroids
centroids = kmeans.cluster_centers_
feature_names = np.array(feature_names)
# Get the top 10 features (words) for each cluster
top_n = 20
cluster_top_features = []
for i, centroid in enumerate(centroids):
    # Get the indices that would sort the centroid array in descending order and select top_n indices
    top_indices = centroid.argsort()[::-1][:top_n]
    # Get corresponding features
    top_features = feature_names[[top_indices]]

    # Append the top features for this cluster to the list of cluster top features
    cluster_top_features.append(top_features)

# Create a DataFrame with the cluster number and top features
df_cluster_top_features = pd.DataFrame({'cluster': range(10), 'top_features': cluster_top_features})

In [None]:
clustered_df = pd.merge(df, df_cluster_top_features, on='cluster', how='left')
clustered_df.to_excel('clustered_df.xlsx', index=False, sheet_name='10_clusters')

## Define function to test n number of clusters

In [149]:
# Function to fit kmeans, get top features and assign clusters to the data
def cluster_data(df, n_clusters=10, top_n=20):
    # Assume 'df' is your DataFrame.
    X = df[['name', 'description', 'amount', 'transaction_type', 'to_acc_num']]

    # Create transformers
    vectorizer = TfidfVectorizer()
    scaler = StandardScaler()
    encoder = OneHotEncoder()

    # Define preprocessor
    preprocessor = ColumnTransformer(
        transformers=[
            ('name', vectorizer, 'name'),
            ('desc', vectorizer, 'description'),
            # ('text_to_acc_num', vectorizer, 'to_acc_num'),
            ('num', scaler, ['amount']),
            ('transaction_type', encoder, ['transaction_type']),
        ])

    # Fit and transform the data
    X_processed = preprocessor.fit_transform(X)

    # Fit KMeans model
    kmeans = KMeans(n_clusters=n_clusters, random_state=42).fit(X_processed)

    # Assign clusters back to our dataframe
    df['cluster'] = kmeans.labels_

    feature_names = []

    # Get the transformers of the preprocessor
    for name, transformer, _ in preprocessor.transformers_:
        if name in ['name', 'desc',]: # if transformer is a vectorizer
            feature_names.extend([f'{name}_{f}' for f in transformer.get_feature_names_out()])
        elif name == 'num':  # if transformer is a scaler
            feature_names.extend(transformer.get_feature_names_out())
        elif name == 'transaction_type':  # if transformer is an encoder
            feature_names.extend(transformer.get_feature_names_out())

    feature_names = np.array(feature_names)
    # Get the cluster centroids
    centroids = kmeans.cluster_centers_
    # Get the top 10 features (words) for each cluster
    cluster_top_features = []
    for i, centroid in enumerate(centroids):
        # Get the indices that would sort the centroid array in descending order and select top_n indices
        top_indices = centroid.argsort()[::-1][:top_n]
        # Get corresponding features
        top_features = feature_names[[top_indices]]

        # Append the top features for this cluster to the list of cluster top features
        cluster_top_features.append(top_features)

    # Create a DataFrame with the cluster number and top features
    df_cluster_top_features = pd.DataFrame({'cluster': range(n_clusters), 'top_features': cluster_top_features})

    clustered_df = pd.merge(df, df_cluster_top_features, on='cluster', how='left')
    return X_processed, kmeans, df_cluster_top_features, clustered_df


In [136]:
# Call the function for a list of cluster numbers, first create a list of clusters, call the function, then save the results into excel on different sheets
cluster_numbers = [5, 10, 15, 20, 25, 30]
clustered_dfs = []
for n in cluster_numbers:
    _, _, _, clustered_df = cluster_data(df, n_clusters=n)
    clustered_dfs.append(clustered_df)

with pd.ExcelWriter('clustered_dfs.xlsx') as writer:
    for i, clustered_df in enumerate(clustered_dfs):
        clustered_df.to_excel(writer, sheet_name=f'{cluster_numbers[i]}_clusters', index=False)


In [156]:
import plotly.graph_objects as go
from sklearn.decomposition import TruncatedSVD
n_clusters = 13
# Run the clustering function to get the reduced features, cluster labels, top features, and the clustered dataframe
X_processed, kmeans, df_cluster_top_features, clustered_df = cluster_data(df, n_clusters=n_clusters, top_n=30)

# Use TruncatedSVD
svd = TruncatedSVD(n_components=3)
reduced_features = svd.fit_transform(X_processed)

# Create a trace for each cluster
traces = []
for i in range(n_clusters):  # assuming you have 10 clusters
    cluster_i = reduced_features[clustered_df['cluster']==i]
    
    top_features_i = df_cluster_top_features.loc[i, 'top_features']  # get the top features for cluster i
    
    # Convert arrays to lists of strings
    top_features_i = ['<br>'.join(feature.tolist()) for feature in top_features_i]
    
    trace = go.Scatter3d(
        x=cluster_i[:,0], 
        y=cluster_i[:,1], 
        z=cluster_i[:,2],
        mode='markers',
        name='Cluster '+str(i),
        text=[top_features_i]*len(cluster_i),  # set the tooltip text
        hoverinfo='text',  # show only the tooltip text
    )
    traces.append(trace)

# Define layout
layout = go.Layout(
    title='Clusters after Truncated SVD in 3D',
    scene=dict(
        xaxis=dict(title='Component 1'),
        yaxis=dict(title='Component 2'),
        zaxis=dict(title='Component 3'),
    ),
    autosize=False,
    width=500,
    height=500,
    margin=dict(l=0, r=0, b=0, t=0)
)

fig = go.Figure(data=traces, layout=layout)
fig.show()

