# Categorizing Banking Transactions with Machine Learning

    by Gerrit Nowald

In [7]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

## input

In [8]:
# base name of the transactions file (without extension)
file_db = 'transactions'

# column names in transactions file, keyed by the role they play here
clm = {
    'text': 'description',
    'category': 'category',
}

## load data

In [9]:
# read '<file_db>.csv' into a DataFrame, decoding the file as ISO-8859-1
transactions = pd.read_csv(
    filepath_or_buffer=file_db + '.csv',
    encoding='ISO-8859-1',
)

## definitions

In [10]:
def PreProcText(texts):
    """Reduce transaction texts to their distinct keywords.

    Each text is lower-cased, the German characters ä/ö/ü/ß are
    transliterated to ae/oe/ue/ss, every character other than a-z and the
    space is replaced by a space, and words shorter than three characters
    are dropped.  Every remaining word is kept once, in order of first
    appearance, so the output is identical from run to run.

    Missing values (NaN/None) produce an empty string instead of raising.

    input & output: pandas series (output holds space-separated keywords)
    """
    # a missing description would otherwise stay NaN and break the word filter below
    texts = texts.fillna('').astype(str).str.lower()

    # literal (non-regex) transliteration of German special characters
    for special, ascii_form in (('ä', 'ae'), ('ö', 'oe'), ('ü', 'ue'), ('ß', 'ss')):
        texts = texts.str.replace(special, ascii_form, regex=False)

    texts = texts.str.replace('[^a-z ]', ' ', regex=True)     # removes all non-alphabetical characters

    # dict.fromkeys keeps every word only once while preserving first-occurrence order
    return texts.str.split().apply(
        lambda words: ' '.join(dict.fromkeys(word for word in words if len(word) >= 3))
    )

# word-count features: single words only (1-grams), at most 500 features
vectorizer = CountVectorizer(max_features=500, ngram_range=(1, 1))

# random forest classifier with 100 trees
classifier = RandomForestClassifier(n_estimators=100)

## train classifier

In [11]:
# text pre-processing
keywords = PreProcText(transactions[clm['text']])

# split data: the first tenth of the rows is held out for testing,
# all remaining rows are used for training
ind_sep            = int(transactions.shape[0] / 10) # part of data for testing
transactions_train = transactions.iloc[ind_sep:]
transactions_test  = transactions.iloc[:ind_sep]
keywords_train     =     keywords.iloc[ind_sep:]
keywords_test      =     keywords.iloc[:ind_sep]

# feature extraction (the vocabulary is fitted on the training rows only)
y_train = transactions_train[clm['category']]
X_train = vectorizer.fit_transform(keywords_train).toarray()

# list of keywords used for training with frequency (for development)
# column totals via ndarray.sum(axis=0): one vectorised call instead of
# adding the rows one by one with the builtin sum()
keyword_list = pd.DataFrame(np.column_stack(( vectorizer.get_feature_names_out() , X_train.sum(axis=0) )) )

# train classifier
classifier.fit(X_train, y_train)

## test classifier

In [12]:
# feature extraction (reuses the vocabulary fitted during training)
y_test = transactions_test[clm['category']]
X_test = vectorizer.transform(keywords_test).toarray()

# classification
y_pred = classifier.predict(X_test)

# compare prediction & real data
# formatted with .0f (rounds) instead of int() (truncates): float error could
# otherwise under-report by one point, e.g. int(100 * 0.29) == 28
print(f'accuracy: {100 * accuracy_score(y_test, y_pred):.0f} %')

accuracy: 73 %
