In [24]:
# !pip3 install langchain_community

In [25]:
from langchain_community.llms import Ollama

llm = Ollama(model="llama3")
llm.invoke("The first man on the moon was ...")

'A classic question!\n\nThe first two men to walk on the moon were Neil Armstrong and Edwin "Buzz" Aldrin. They landed on July 20, 1969, as part of the Apollo 11 mission.\n\nNeil Armstrong stepped out of the lunar module Eagle and became the first person to set foot on the moon\'s surface at 2:56 UTC on July 21, 1969. He famously declared, "That\'s one small step for man, one giant leap for mankind" as he took his first steps.\n\nBuzz Aldrin joined Armstrong on the surface shortly afterwards, and they spent about two and a half hours collecting samples and conducting experiments.'

In [26]:
llm.invoke("Can you add an appropriate category next to each of the following expenses. Respond with a list of categories separated by commas. For example, Birtat Restaurant Amsterdam,Spotify AB by Adyen - \
Entertainment, Beta Boulders Ams Amsterdam Nld - Sports, etc.: \
ISS Catering Services De Meern, Vishandel Sier AMSTELVEEN, Ministerie van Justitie en Veiligheid, Etos AMSTERDAM NLD, Bistro Bar Amsterdam")

'Here is the list with categories added:\n\nISS Catering Services De Meern,Food and Beverage, Vishandel Sier AMSTELVEEN,Food and Beverage, Ministerie van Justitie en Veiligheid,Taxes, Etos AMSTERDAM NLD,Health and Wellness, Bistro Bar Amsterdam,Entertainment'

# Read transaction data

In [27]:
# Read the transactions_2022_2023.csv file 
import pandas as pd
df = pd.read_csv("https://raw.githubusercontent.com/thu-vu92/local-llms-analyse-finance/refs/heads/main/transactions_2022_2023.csv")
df.head()

Unnamed: 0,Date,Name / Description,Expense/Income,Amount (EUR)
0,2023-12-30,Belastingdienst,Expense,9.96
1,2023-12-30,Tesco Breda,Expense,17.53
2,2023-12-30,Monthly Appartment Rent,Expense,451.0
3,2023-12-30,Vishandel Sier Amsterdam,Expense,12.46
4,2023-12-29,Selling Paintings,Income,13.63


In [28]:
# Get unique transactions in the Name / Description column
unique_transactions = df["Name / Description"].unique()
len(unique_transactions)

23

In [29]:
unique_transactions[1:10]

array(['Tesco Breda', 'Monthly Appartment Rent',
       'Vishandel Sier Amsterdam', 'Selling Paintings',
       'Spotify Ab By Adyen', 'Tk Maxx Amsterdam Da', 'Consulting',
       'Aidsfonds', 'Tls Bv Inz Ov-Chipkaart'], dtype=object)

# Categorise bank transactions with Llama3

In [30]:
# Get index list
#https://stackoverflow.com/questions/47518609/for-loop-range-and-interval-how-to-include-last-step
def hop(start, stop, step):
    for i in range(start, stop, step):
        yield i
    yield stop

index_list = list(hop(0, len(unique_transactions), 30))
index_list

[0, 23]

In [31]:
# Output validation
from pydantic import BaseModel, field_validator
from typing import List

# Validate response format - check if it actually contains hyphen ("-")
class ResponseChecks(BaseModel):
    data: List[str]

    @field_validator("data")
    def check(cls, value):
        for item in value:
            if len(item) > 0:
                assert "-" in item, "String does not contain hyphen."

# Test validation
ResponseChecks(data = ['Hello - World', 'Hello - there!'])

ResponseChecks(data=None)

In [32]:
llm.invoke("Can you add an appropriate category next to each of the following expenses. Respond with a list of categories separated by commas. For example, Spotify AB by Adyen - \
Entertainment, Beta Boulders Ams Amsterdam Nld - Sports, etc.: \
ISS Catering Services De Meern, Vishandel Sier AMSTELVEEN, Ministerie van Justitie en Veiligheid, Etos AMSTERDAM NLD, Bistro Bar Amsterdam")

'Here is the list with categories added:\n\nISS Catering Services De Meern - Food, Vishandel Sier AMSTELVEEN - Home, Ministerie van Justitie en Veiligheid - Taxes, Etos AMSTERDAM NLD - Health and Beauty, Bistro Bar Amsterdam - Entertainment'

In [33]:
# Test out the function
categorize_transactions('ISS Catering Services De Meern, Vishandel Sier AMSTELVEEN, Etos AMSTERDAM NLD, Bistro Bar Amsterdam',
                        llm)

['* ISS Catering Services De Meern - Food', '* Vishandel Sier AMSTELVEEN - Food', '* Etos AMSTERDAM NLD - Shopping', '* Bistro Bar Amsterdam - Entertainment']


Unnamed: 0,Transaction vs category,Transaction,Category
0,* ISS Catering Services De Meern - Food,* ISS Catering Services De Meern,Food
1,* Vishandel Sier AMSTELVEEN - Food,* Vishandel Sier AMSTELVEEN,Food
2,* Etos AMSTERDAM NLD - Shopping,* Etos AMSTERDAM NLD,Shopping
3,* Bistro Bar Amsterdam - Entertainment,* Bistro Bar Amsterdam,Entertainment


In [34]:
# Intialise the categories_df_all dataframe
categories_df_all = pd.DataFrame()
max_tries = 7

# Loop through the index_list
for i in range(0, len(index_list)-1):
    transaction_names = unique_transactions[index_list[i]:index_list[i+1]]
    transaction_names = ','.join(transaction_names)

    # Try and validate output, if it fails, try again for max_tries=7 times
    for j in range(1, max_tries):
        try:
            categories_df = categorize_transactions(transaction_names, llm)
            categories_df_all = pd.concat([categories_df_all, categories_df], ignore_index=True)
            
        except:
            if j < max_tries:
                continue
            else:
                raise Exception(f"Cannot categorise transactions indexes {i} to {i+1}.")
        break

['1. Belastingdienst - Taxes', '2. Tesco Breda - Grocery', '3. Monthly Appartment Rent - Housing', '4. Vishandel Sier Amsterdam - Fish', '5. Selling Paintings - Artwork', '6. Spotify Ab By Adyen - Entertainment', '7. Tk Maxx Amsterdam Da - Clothing', '8. Consulting - Work', '9. Aidsfonds - Charity', '10. Tls Bv Inz Ov-Chipkaart - Transportation', '11. Etos Amsterdam - Pharmacy', '12. Beta Boulders Ams Amsterdam Nld - Sport', '13. Salary - Income', '14. Bouldermuur Bv Amsterdam - Housing', '15. Bratat Restaurant Amsterdam - Dining', '16. Freelancing - Work', '17. Tikkie - Food Delivery', '18. Blogging - Business', '19. Taxi Utrecht - Transportation', '20. Apple Services - Technology', '21. Amazon Lux - E-commerce', '22. Classpass* Monthly - Fitness', '23. Audible Uk AdblCo/Pymt Gbr - Entertainment']


In [35]:
categories_df_all.head()

Unnamed: 0,Transaction vs category,Transaction,Category
0,1. Belastingdienst - Taxes,1. Belastingdienst,Taxes
1,2. Tesco Breda - Grocery,2. Tesco Breda,Grocery
2,3. Monthly Appartment Rent - Housing,3. Monthly Appartment Rent,Housing
3,4. Vishandel Sier Amsterdam - Fish,4. Vishandel Sier Amsterdam,Fish
4,5. Selling Paintings - Artwork,5. Selling Paintings,Artwork


In [36]:
# categories_df_all.to_csv("categories_df_all.csv", index=False)

In [37]:
# Get unique categories in categories_df_all
unique_categories = categories_df_all["Category"].unique()
unique_categories

array(['Taxes', 'Grocery', 'Housing', 'Fish', 'Artwork', 'Entertainment',
       'Clothing', 'Work', 'Charity', 'Transportation', 'Pharmacy',
       'Sport', 'Income', 'Dining', 'Food Delivery', 'Business',
       'Technology', 'E-commerce', 'Fitness'], dtype=object)

In [38]:
# Drop NA values
categories_df_all = categories_df_all.dropna()

# If category contains "Food", then categorise as "Food and Drinks"
categories_df_all.loc[categories_df_all['Category'].str.contains("Food"), 'Category'] = "Food and Drinks"
# If category contains "Clothing", then categorise as "Clothing"
categories_df_all.loc[categories_df_all['Category'].str.contains("Clothing"), 'Category'] = "Clothing"
# If category contains "Services", then categorise as "Services"
categories_df_all.loc[categories_df_all['Category'].str.contains("Services"), 'Category'] = "Services"
# If category contains "Health" or "Wellness", then categorise as "Health and Wellness"
categories_df_all.loc[categories_df_all['Category'].str.contains("Health"), 'Category'] = "Health and Wellness"
# If category contains "Sport", then categorise as "Sport and Fitness"
categories_df_all.loc[categories_df_all['Category'].str.contains("Sport"), 'Category'] = "Sport and Fitness"
# If category contains "Fitness", then categorise as "Sport and Fitness"
categories_df_all.loc[categories_df_all['Category'].str.contains("Fitness"), 'Category'] = "Sport and Fitness"
# If category contains "Travel", then categorise as "Travel"
categories_df_all.loc[categories_df_all['Category'].str.contains("Travel"), 'Category'] = "Travel"

In [39]:
# Remove the numbering eg "1. " from Transaction column
categories_df_all['Transaction'] = categories_df_all['Transaction'].str.replace(r'\d+\.\s+', '')
categories_df_all

Unnamed: 0,Transaction vs category,Transaction,Category
0,1. Belastingdienst - Taxes,1. Belastingdienst,Taxes
1,2. Tesco Breda - Grocery,2. Tesco Breda,Grocery
2,3. Monthly Appartment Rent - Housing,3. Monthly Appartment Rent,Housing
3,4. Vishandel Sier Amsterdam - Fish,4. Vishandel Sier Amsterdam,Fish
4,5. Selling Paintings - Artwork,5. Selling Paintings,Artwork
5,6. Spotify Ab By Adyen - Entertainment,6. Spotify Ab By Adyen,Entertainment
6,7. Tk Maxx Amsterdam Da - Clothing,7. Tk Maxx Amsterdam Da,Clothing
7,8. Consulting - Work,8. Consulting,Work
8,9. Aidsfonds - Charity,9. Aidsfonds,Charity
9,10. Tls Bv Inz Ov-Chipkaart - Transportation,10. Tls Bv Inz Ov-Chipkaart,Transportation


In [41]:
# Merge the categories_df_all with the transactions_2022_2023.csv dataframe (df)

df = pd.read_csv("https://raw.githubusercontent.com/thu-vu92/local-llms-analyse-finance/refs/heads/main/transactions_2022_2023.csv")
df.loc[df['Name / Description'].str.contains("Spotify"), 'Name / Description'] = "Spotify Ab By Adyen"
df = pd.merge(df, categories_df_all, left_on='Name / Description', right_on='Transaction', how='left')
df

Unnamed: 0,Date,Name / Description,Expense/Income,Amount (EUR),Transaction vs category,Transaction,Category
0,2023-12-30,Belastingdienst,Expense,9.96,,,
1,2023-12-30,Tesco Breda,Expense,17.53,,,
2,2023-12-30,Monthly Appartment Rent,Expense,451.0,,,
3,2023-12-30,Vishandel Sier Amsterdam,Expense,12.46,,,
4,2023-12-29,Selling Paintings,Income,13.63,,,
5,2023-12-29,Spotify Ab By Adyen,Expense,12.19,,,
6,2023-12-23,Tk Maxx Amsterdam Da,Expense,27.08,,,
7,2023-12-22,Consulting,Income,541.57,,,
8,2023-12-22,Aidsfonds,Expense,10.7,,,
9,2023-12-20,Consulting,Income,2641.93,,,
