In [18]:
!pip3 install langchain_community



In [19]:
from langchain_community.llms import Ollama

In [20]:
llm = Ollama(model="llama2")
llm.invoke("The first man on the moon was ...")

'\nThe first man to walk on the moon was Neil Armstrong, who landed on the moon on July 20, 1969 as part of the Apollo 11 mission. Armstrong stepped out of the lunar module Eagle and onto the moon\'s surface at 2:56 UTC on July 20, 1969, famously declaring "That\'s one small step for man, one giant leap for mankind" as he took his first steps. He was followed by fellow astronaut Edwin "Buzz" Aldrin, who also walked on the moon during the mission.'

In [21]:
llm.invoke("Can you add an appropriate category next to each of the following expenses. Respond with a list of categories separated by commas. For example, Software Engineer - \
Salary, PVR Cinemas - Entertainment, etc.: \
Software Engineer, PVR Cinemas, Dream Apartment, Disney+ Hotstar, Corner House, Cult.Fit, Electricity Bill, Website Design, Big Bazaar, ACT Fibernet, Water Bill")

'\nSure! Here are some categories that could be added next to each of the expenses you mentioned:\n\nSoftware Engineer - Salary: Professional Services\nPVR Cinemas - Entertainment: Recreation\nDream Apartment - Rent/Mortgage: Housing\nDisney+ Hotstar - Subscription: Entertainment\nCorner House - Restaurant: Food and Beverage\nCult.Fit - Gym Membership: Health and Wellness\nElectricity Bill - Utilities: Bills and Fees\nWebsite Design - Professional Services: Technology\nBig Bazaar - Groceries: Shopping\nACT Fibernet - Internet Service Provider: Technology\nWater Bill - Utilities: Bills and Fees\n\nI hope this helps! Let me know if you have any other questions.'

### Read transaction data

In [22]:
# Read the transactions_2022_2023.csv file 
import pandas as pd
df = pd.read_csv("/Users/darshan/Documents/Projects/DataScience/Personal/FinancialDashboard/Transactions_2023-24.csv")
df

Unnamed: 0,Date,Transaction Names,Transaction Type,Amount (INR)
0,02-01-2023,Software Engineer,Income,75000
1,12-01-2023,PVR Cinemas,Expense,1307
2,14-01-2023,Dream Apartment,Expense,28000
3,17-01-2023,Disney+ Hotstar,Expense,800
4,21-01-2023,Corner House,Expense,2330
...,...,...,...,...
259,21-12-2024,Easyday,Expense,3837
260,21-12-2024,MTR 1924,Expense,3708
261,23-12-2024,ACT Fibernet,Expense,1500
262,24-12-2024,Website Design,Income,15000


In [23]:
# Get unique transactions in the Name / Description column
unique_transactions = df["Transaction Names"].unique()
len(unique_transactions)

26

In [24]:
unique_transactions[1:10]

array(['PVR Cinemas', 'Dream Apartment', 'Disney+ Hotstar',
       'Corner House', 'Cult.Fit', 'Electricity Bill', 'Website Design',
       'Big Bazaar', 'ACT Fibernet'], dtype=object)

### Categorise bank transactions with Llama2

In [25]:
# Get index list
#https://stackoverflow.com/questions/47518609/for-loop-range-and-interval-how-to-include-last-step
def hop(start, stop, step):
    for i in range(start, stop, step):
        yield i
    yield stop

index_list = list(hop(0, len(unique_transactions), 30))
index_list

[0, 26]

In [26]:
def categorize_transactions(transaction_names, llm):
    response = llm.invoke("Can you add an appropriate category to the following expenses. For example: Disney+ Hotstar - Entertainment, Software Engineer - Salary, etc.. Categories should be less than 4 words. " + transaction_names)
    response = response.split('\n')

    print(response)

    # Put in dataframe
    categories_df = pd.DataFrame({'Transaction vs category': response})
    categories_df[['Transaction', 'Category']] = categories_df['Transaction vs category'].str.split(' - ', expand=True)
    
    return categories_df

In [27]:
# Test out the function
categorize_transactions(' PVR Cinemas, Dream Apartment, Corner House, Cult.Fit, Electricity Bill, Website Design, Big Bazaar, ACT Fibernet, Water Bill',
                        llm)

['', 'Sure! Here are the appropriate categories for each expense:', '', '1. PVR Cinemas - Entertainment', '2. Dream Apartment - Housing', '3. Corner House - Rent/Mortgage', '4. Cult.Fit - Fitness', '5. Electricity Bill - Utilities', '6. Website Design - Business', '7. Big Bazaar - Groceries', '8. ACT Fibernet - Internet', '9. Water Bill - Utilities']


Unnamed: 0,Transaction vs category,Transaction,Category
0,,,
1,Sure! Here are the appropriate categories for ...,Sure! Here are the appropriate categories for ...,
2,,,
3,1. PVR Cinemas - Entertainment,1. PVR Cinemas,Entertainment
4,2. Dream Apartment - Housing,2. Dream Apartment,Housing
5,3. Corner House - Rent/Mortgage,3. Corner House,Rent/Mortgage
6,4. Cult.Fit - Fitness,4. Cult.Fit,Fitness
7,5. Electricity Bill - Utilities,5. Electricity Bill,Utilities
8,6. Website Design - Business,6. Website Design,Business
9,7. Big Bazaar - Groceries,7. Big Bazaar,Groceries


In [28]:
# Intialise the categories_df_all dataframe
categories_df_all = pd.DataFrame()

# Loop through the index_list
for i in range(0, len(index_list)-1):
    transaction_names = unique_transactions[index_list[i]:index_list[i+1]]
    transaction_names = ','.join(transaction_names)

    categories_df = categorize_transactions(transaction_names, llm)
    categories_df_all = pd.concat([categories_df_all, categories_df], ignore_index=True)

[' Sure! Here are the expenses you provided, categorized into appropriate categories:', '', '1. Disney+ Hotstar - Entertainment', '2. Software Engineer - Salary', '3. PVR Cinemas - Entertainment', '4. Dream Apartment - Housing', '5. Corner House - Food/Dining', '6. Cult.Fit - Fitness', '7. Electricity Bill - Utilities', '8. Website Design - Technology', '9. Big Bazaar - Grocery Shopping', '10. ACT Fibernet - Internet Service Provider', '11. Water Bill - Utilities', '12. Zee5 - Entertainment', '13. Easyday - Grocery Shopping', "14. Koshy's - Food/Dining", '15. MTR 1924 - Restaurant', '16. Reliance Fresh - Grocery Shopping', '17. Netflix - Entertainment', '18. Barbecue Nation - Restaurant', "19. Spencer's - Gifts/Souvenirs", '20. Toit - Food/Dining', '21. The Permit Room - Restaurant', '22. Foodworld - Grocery Shopping', '23. Amazon Prime - Entertainment', '24. More Supermarket - Grocery Shopping', '25. Truffles - Gifts/Souvenirs', '26. Chai Point - Food/Dining', '', 'I hope this helps! 

In [29]:
categories_df_all

Unnamed: 0,Transaction vs category,Transaction,Category
0,"Sure! Here are the expenses you provided, cat...","Sure! Here are the expenses you provided, cat...",
1,,,
2,1. Disney+ Hotstar - Entertainment,1. Disney+ Hotstar,Entertainment
3,2. Software Engineer - Salary,2. Software Engineer,Salary
4,3. PVR Cinemas - Entertainment,3. PVR Cinemas,Entertainment
5,4. Dream Apartment - Housing,4. Dream Apartment,Housing
6,5. Corner House - Food/Dining,5. Corner House,Food/Dining
7,6. Cult.Fit - Fitness,6. Cult.Fit,Fitness
8,7. Electricity Bill - Utilities,7. Electricity Bill,Utilities
9,8. Website Design - Technology,8. Website Design,Technology


In [None]:
categories_df_all.to_csv("Categories_df_all.csv", index=False)

In [31]:
# Get unique categories in categories_df_all
unique_categories = categories_df_all["Category"].unique()
unique_categories

array([None, 'Entertainment', 'Salary', 'Housing', 'Food/Dining',
       'Fitness', 'Utilities', 'Technology', 'Grocery Shopping',
       'Internet Service Provider', 'Restaurant', 'Gifts/Souvenirs'],
      dtype=object)

In [32]:
# Drop NA values
categories_df_all = categories_df_all.dropna()

# If category contains "Food/Dining" or "Restaurant", then categorise as "Food and Drinks"
categories_df_all.loc[categories_df_all['Category'].str.contains("Food/Dining|Restaurant"), 'Category'] = "Food and Drinks"

In [33]:
# Remove the numbering eg "1. " from Transaction column
categories_df_all['Transaction'] = categories_df_all['Transaction'].str.replace(r'^\d+\.\s+', '', regex=True)
categories_df_all

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  categories_df_all['Transaction'] = categories_df_all['Transaction'].str.replace(r'^\d+\.\s+', '', regex=True)


Unnamed: 0,Transaction vs category,Transaction,Category
2,1. Disney+ Hotstar - Entertainment,Disney+ Hotstar,Entertainment
3,2. Software Engineer - Salary,Software Engineer,Salary
4,3. PVR Cinemas - Entertainment,PVR Cinemas,Entertainment
5,4. Dream Apartment - Housing,Dream Apartment,Housing
6,5. Corner House - Food/Dining,Corner House,Food and Drinks
7,6. Cult.Fit - Fitness,Cult.Fit,Fitness
8,7. Electricity Bill - Utilities,Electricity Bill,Utilities
9,8. Website Design - Technology,Website Design,Technology
10,9. Big Bazaar - Grocery Shopping,Big Bazaar,Grocery Shopping
11,10. ACT Fibernet - Internet Service Provider,ACT Fibernet,Internet Service Provider


In [37]:
# Merge the categories_df_all with the transactions_2022_2023.csv dataframe (df)
df = pd.read_csv("/Users/darshan/Documents/Projects/DataScience/Personal/FinancialDashboard/Transactions_2023-24.csv")
df.loc[df['Transaction Names'].str.contains("Disney"), 'Transaction Names'] = "Disney+ Hotstar"
df = pd.merge(df, categories_df_all, left_on='Transaction Names', right_on='Transaction', how='left')
df

Unnamed: 0,Date,Transaction Names,Transaction Type,Amount (INR),Transaction vs category,Transaction,Category
0,02-01-2023,Software Engineer,Income,75000,2. Software Engineer - Salary,Software Engineer,Salary
1,12-01-2023,PVR Cinemas,Expense,1307,3. PVR Cinemas - Entertainment,PVR Cinemas,Entertainment
2,14-01-2023,Dream Apartment,Expense,28000,4. Dream Apartment - Housing,Dream Apartment,Housing
3,17-01-2023,Disney+ Hotstar,Expense,800,1. Disney+ Hotstar - Entertainment,Disney+ Hotstar,Entertainment
4,21-01-2023,Corner House,Expense,2330,5. Corner House - Food/Dining,Corner House,Food and Drinks
...,...,...,...,...,...,...,...
259,21-12-2024,Easyday,Expense,3837,13. Easyday - Grocery Shopping,Easyday,Grocery Shopping
260,21-12-2024,MTR 1924,Expense,3708,15. MTR 1924 - Restaurant,MTR 1924,Food and Drinks
261,23-12-2024,ACT Fibernet,Expense,1500,10. ACT Fibernet - Internet Service Provider,ACT Fibernet,Internet Service Provider
262,24-12-2024,Website Design,Income,15000,8. Website Design - Technology,Website Design,Technology


In [38]:
df.to_csv("Categorized_transactions.csv", index=False)