In [None]:
# %pip install langchain_community
# %pip install pandas
%pip install google-generativeai langchain-google-genai python-dotenv pandas

In [None]:
from IPython.display import display
from IPython.display import Markdown
import textwrap


def to_markdown(text):
    """Render ``text`` as a Markdown block quote.

    Bullet characters ('•') are swapped for an indented '*' list marker, and
    every line (blank lines included) is prefixed with '> '.
    """
    def _every_line(_line):
        # textwrap.indent skips whitespace-only lines by default; quote them too.
        return True

    bulleted = text.replace('•', '  *')
    quoted = textwrap.indent(bulleted, '> ', predicate=_every_line)
    return Markdown(quoted)

In [1]:
import os

import google.generativeai as genai

# Resolve the Gemini API key. On Colab it comes from the per-user secrets store;
# anywhere else `google.colab` cannot be imported, so fall back to the
# GOOGLE_API_KEY environment variable instead of failing on the import.
try:
    from google.colab import userdata
except ImportError:
    userdata = None  # not running on Colab

if userdata is not None:
    GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
else:
    GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')

genai.configure(api_key=GOOGLE_API_KEY)

ModuleNotFoundError: No module named 'google.generativeai'

In [None]:
# from langchain_community.llms import Ollama

In [None]:
# llm = Ollama(model="gemma")
# llm.invoke("1+1 = ?")

In [None]:
# llm.invoke("Can you add an appropriate category next to each of the following expenses. Respond with a list of categories separated by commas. For example, Spotify AB by Adyen - \
# Entertainment, Beta Boulders Ams Amsterdam Nld - Sports, etc.: \
# Taxi Utrecht, Ministerie van Justitie en Veiligheid, Etos AMSTERDAM NLD, Bistro Bar Amsterdam")

In [None]:
# Gemini model used for the requests in this notebook
model = genai.GenerativeModel(model_name="gemini-pro")


class GeminiLLM:
    """Adapter exposing ``invoke(prompt) -> str`` on top of a Gemini model.

    ``categorize_transactions`` talks to its LLM through ``llm.invoke(...)`` and
    splits the returned text, so this wraps ``generate_content(...).text``.
    """

    def __init__(self, model):
        self._model = model

    def invoke(self, prompt):
        """Send ``prompt`` to the wrapped model and return the reply text."""
        return self._model.generate_content(prompt).text


# `llm` is what the categorisation cells further down pass to
# categorize_transactions; without a definition they raise NameError on a
# fresh kernel (the only other definition is commented out).
llm = GeminiLLM(model)

# Smoke test: ask the model to categorise a handful of expenses
prompt_parts = [
    "Can you add an appropriate category next to each of the following expenses. "
    "Respond with a list of categories separated by commas. "
    "For example, Spotify AB by Adyen - "
    "Entertainment, Beta Boulders Ams Amsterdam Nld - Sports, etc.: "
    "Taxi Utrecht, Ministerie van Justitie en Veiligheid, Etos AMSTERDAM NLD, Bistro Bar Amsterdam",
]

response = model.generate_content(prompt_parts)
print(response.text)

### Read transaction data

In [None]:
# Load the transactions_2022_2023.csv export and preview its first rows
import pandas as pd

df = pd.read_csv(filepath_or_buffer="transactions_2022_2023.csv")
df.head()

In [None]:
# Collect each distinct value of the "Name / Description" column and count them
descriptions = df["Name / Description"]
unique_transactions = descriptions.unique()
len(unique_transactions)

In [None]:
# Preview the entries at positions 1 to 9 (position 0 is not part of this slice)
unique_transactions[1:10]

### Categorise bank transactions with an LLM

In [None]:
# Get index list
#https://stackoverflow.com/questions/47518609/for-loop-range-and-interval-how-to-include-last-step
def hop(start, stop, step):
    """Yield ``start, start + step, ...`` below ``stop``, then ``stop`` itself.

    Unlike a plain ``range``, the end point is always emitted as the final
    value, so consecutive pairs of the output cover the whole interval.
    """
    yield from range(start, stop, step)
    yield stop

# Batch boundaries: every 30th position plus the final length
index_list = [*hop(0, len(unique_transactions), 30)]
index_list

In [None]:
def categorize_transactions(transaction_names, llm):
    """Ask an LLM to label each transaction with a spending category.

    Parameters
    ----------
    transaction_names : str
        Transaction names already joined into a single string; it is appended
        verbatim to the prompt.
    llm : object
        Anything with an ``invoke(prompt)`` method that returns either the
        reply text or an object carrying the text in a ``content`` attribute.

    Returns
    -------
    pandas.DataFrame
        One row per line of the reply, with columns 'Transaction vs category'
        (the raw line), 'Transaction' and 'Category'. A line is split on its
        last ' - '; lines without that separator get a missing Category.
    """
    reply = llm.invoke("Can you add an appropriate category to the following expenses. For example: Spotify AB by Adyen - Entertainment, Beta Boulders Ams Amsterdam Nld - Sport, etc.. Categories should be less than 4 words. " + transaction_names)

    # Chat-style models hand back a message object instead of a plain string.
    if not isinstance(reply, str):
        reply = getattr(reply, 'content', reply)
    response = reply.split('\n')

    print(response)

    # Put in dataframe
    categories_df = pd.DataFrame({'Transaction vs category': response})

    # Split once, from the right: a merchant name may itself contain ' - ',
    # and a reply with no separator at all must still yield both columns
    # rather than raising on a column-count mismatch.
    parts = (
        categories_df['Transaction vs category']
        .str.rsplit(' - ', n=1, expand=True)
        .reindex(columns=[0, 1])
    )
    categories_df['Transaction'] = parts[0]
    categories_df['Category'] = parts[1]

    return categories_df

In [None]:
# Try the function on a small hand-picked sample
categorize_transactions(
    transaction_names='ISS Catering Services De Meern, Taxi Utrecht, Etos AMSTERDAM NLD, Bistro Bar Amsterdam',
    llm=llm,
)

In [None]:
# Categorise the unique transactions batch by batch.
# Each batch's frame is collected in a list and concatenated once at the end:
# growing a DataFrame with pd.concat inside the loop re-copies all earlier rows
# on every iteration, and concatenating onto an empty frame warns on recent pandas.
batch_frames = []

# Consecutive entries of index_list are the [start, end) bounds of one batch
for batch_start, batch_end in zip(index_list[:-1], index_list[1:]):
    transaction_names = ','.join(unique_transactions[batch_start:batch_end])
    batch_frames.append(categorize_transactions(transaction_names, llm))

# pd.concat refuses an empty list, so keep the empty-frame result for that case
categories_df_all = (
    pd.concat(batch_frames, ignore_index=True) if batch_frames else pd.DataFrame()
)

In [None]:
# Display the combined categorisation results from all batches
categories_df_all

In [None]:
# categories_df_all.to_csv("categories_df_all.csv", index=False)

In [None]:
# List the distinct category labels present in categories_df_all
category_column = categories_df_all["Category"]
unique_categories = category_column.unique()
unique_categories

In [None]:
# Discard rows that have a missing value in any column
categories_df_all = categories_df_all.dropna()

# Collapse free-form labels into a fixed set of categories.
# Each rule is (pattern searched for in Category, label to assign); rules are
# applied in order, so a label rewritten by an earlier rule is what later
# rules see.
category_rules = [
    ("Food", "Food and Drinks"),
    ("Clothing", "Clothing"),
    ("Services", "Services"),
    ("Health|Wellness", "Health and Wellness"),
    ("Sport", "Sport and Fitness"),
    ("Travel", "Travel"),
]

for pattern, label in category_rules:
    matches = categories_df_all['Category'].str.contains(pattern)
    categories_df_all.loc[matches, 'Category'] = label

In [None]:
# Remove the numbering eg "1. " from Transaction column.
# regex=True is required: since pandas 2.0 str.replace treats the pattern as a
# literal string by default, which would leave the numbering in place.
categories_df_all['Transaction'] = categories_df_all['Transaction'].str.replace(r'\d+\.\s+', '', regex=True)
categories_df_all

In [None]:
# Merge the categories_df_all with the transactions_2022_2023.csv dataframe (df)
df = pd.read_csv("transactions_2022_2023.csv")

# Collapse every Spotify description into one name.
# na=False keeps rows with a missing description out of the mask instead of
# producing an NA-containing mask that cannot be used for indexing.
is_spotify = df['Name / Description'].str.contains("Spotify", na=False)
df.loc[is_spotify, 'Name / Description'] = "Spotify Ab By Adyen"

# Keep one row per Transaction (the first one seen). If the same transaction
# name appears more than once in the lookup, a left merge would duplicate the
# matching rows of df and inflate the transaction list.
category_lookup = categories_df_all.drop_duplicates(subset='Transaction')

df = pd.merge(
    df,
    category_lookup,
    left_on='Name / Description',
    right_on='Transaction',
    how='left',
    validate='many_to_one',  # fail loudly if the lookup keys are not unique
)
df

In [None]:
# Write the categorised transactions to disk, without the index column
df.to_csv(path_or_buf="transactions_2022_2023_categorized.csv", index=False)