# Process
1. Install llama_cpp
2. Install llama model from Hugging Face
3. Test model with CLI
4. Test model from Jupyter
  a. Import model
  b. Add model path
5. Import data
6. Clean data
7. Test LLM with fake transactions
8. Test LLL with sample of transactions
9. Run with full list of transaction names
10. Relabel categories back to the data frame for further analysis

## Initialize the LLM

In [1]:
import torch
print(torch.__version__)

ModuleNotFoundError: No module named 'torch'

In [457]:
from transformers import AutoModelForCausalLM
 
model = AutoModelForCausalLM.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1")
model.to("cuda")
 
generated_ids = model.generate(tokens, max_new_tokens=1000, do_sample=True)

# decode with mistral tokenizer
result = tokenizer.decode(generated_ids[0].tolist())
print(result)

ImportError: 
AutoModelForCausalLM requires the PyTorch library but it was not found in your environment. Checkout the instructions on the
installation page: https://pytorch.org/get-started/locally/ and follow the ones that match your environment.
Please note that you may need to restart your runtime after installation.


## Test connection to model by generating text
---

In [306]:
prompt = "Write a poem about cats."
output = llm(prompt)

print(output['choices'][0]['text'])

llama_perf_context_print:        load time =    2162.49 ms
llama_perf_context_print: prompt eval time =       0.00 ms /     7 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /    15 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =    3236.25 ms /    22 tokens


 independent and mysterious, yet soft and affectionate
Eyes like lanterns in


## Read the transactions
---

In [365]:
# Read the credittransactions_2023_2024.csv file 
import pandas as pd
df_cc = pd.read_csv("data/rawdata/expenses_2024/2024-12-14_credit_card_transaction.csv")
df_bc = pd.read_csv("data/rawdata/expenses_2024/2024-12-14_bank_card_transaction.csv")
df_bc['Name'] = df_bc['Name'].str.capitalize()


In [366]:
categories_string = ', '.join(df_cc["Category"].unique().tolist()) # Convert to a string

## Function to categorize transactions
---

In [367]:
# Get unique transactions in the Name / Description column
unique_transactions = df_bc["Name"].unique()
len(unique_transactions)

18

In [440]:
import pandas as pd
import json
import re

def hop(start, end, step):
    """
    Generator that yields tuples representing start and end indices for each chunk.
    """
    for i in range(start, end, step):
        yield i, min(i + step, end)

def categorize_transactions(transaction_names, categories_string, llm):
    """Categorizes transactions with robust error handling and improved prompt."""

    system_prompt = "You are a financial assistant. You classify expenses and income.\n"
    prompt = (
        system_prompt +
        f"Categories:\n{categories_string}\n\n"
        f"Transactions:\n{transaction_names}\n\n"
        "Categorize EACH transaction. Return ONLY a valid JSON list of objects, where each object has a 'Transaction' and 'Category' key. If a category can't be determined, use 'Other'. Ensure the JSON is correctly formatted. Example:\n"
        '[{"Transaction": "Netflix", "Category": "Entertainment"}, {"Transaction": "Unusual Transaction", "Category": "Other"}]\n'
        "Do not include any other text or explanations outside of the JSON. If you cannot produce valid JSON, return an empty JSON list:"
    )

    try:
        response = llm(prompt)

        if isinstance(response, dict) and 'choices' in response and response['choices']:
            response_text = response['choices'][0]['text']
        elif isinstance(response, str):
            response_text = response
        else:
            print(f"Unexpected LLM response format: {type(response), response}")
            return None

        response_text = response_text.strip()

        # Extract the first valid JSON-like substring
        match = re.search(r"\[.*\]", response_text, re.DOTALL) #Find the first thing that looks like a json list
        if match:
            response_text = match.group(0)
        else:
            print("No JSON-like string found in LLM response")
            print(f"LLM Response: {response_text}")
            return pd.DataFrame() #Return an empty dataframe if nothing is found

        response_text = response_text.strip()
        response_text = re.sub(r'\n', '', response_text)
        response_text = re.sub(r',\s*}', '}', response_text)

        try:
            data = json.loads(response_text)
            if not isinstance(data, list):
                print("LLM did not return a list")
                print(f"LLM Response: {response_text}")
                return None
            for item in data:
                if not isinstance(item, dict) or 'Transaction' not in item or 'Category' not in item:
                    print("LLM returned an invalid list format")
                    print(f"LLM Response: {response_text}")
                    return None
            categories_df = pd.DataFrame(data)
            return categories_df

        except json.JSONDecodeError as e:
            print(f"Invalid JSON from LLM: {e}")
            print(f"LLM Response: {response_text}")
            return pd.DataFrame() #Return an empty dataframe if parsing fails

    except Exception as e:
        print(f"An error occurred: {e}")
        return None


In [441]:
def categorize_transactions_chunked(unique_transactions, categories_string, llm, chunk_size=5):
    """
    Categorizes transactions in chunks using an LLM and builds a DataFrame.

    Args:
        unique_transactions: List of unique transaction names.
        categories_string: String listing the possible categories.
        llm: The LLM function.
        chunk_size: Number of transactions to process in each chunk.

    Returns:
        A pandas DataFrame with Transaction and Category columns, or None on error.
    """

    all_categorized_transactions = []
    for start_index, end_index in hop(0, len(unique_transactions), chunk_size):
        transaction_chunk = unique_transactions[start_index:end_index]
        transaction_names_string = ", ".join(transaction_chunk)

        try:
            categorized_chunk = categorize_transactions(transaction_names_string, categories_string, llm)
            if categorized_chunk is not None:
                all_categorized_transactions.extend(categorized_chunk.to_dict('records'))
        except Exception as e:
            print(f"Error categorizing chunk {start_index} to {end_index - 1}: {e}")

    if all_categorized_transactions:
        return pd.DataFrame(all_categorized_transactions)

In [442]:
# Example Usage
transactions = ["Netflix", "Salary", "Groceries", "Amazon Prime", "Rent", "Coffee", "Gas", "Extra Transaction 1", "Extra Transaction 2", "Extra Transaction 3", "Extra Transaction 4", "Extra Transaction 5", "Extra Transaction 6"]
unique_transactions = list(set(transactions))
categories_string = "Entertainment, Income, Groceries, Subscriptions, Housing, Food, Transportation, Other"

df = categorize_transactions_chunked(unique_transactions, categories_string, llm)
if df is not None:
    print(df)

Llama.generate: 39 prefix-match hit, remaining 119 prompt tokens to eval
llama_perf_context_print:        load time =    3026.49 ms
llama_perf_context_print: prompt eval time =       0.00 ms /   119 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /    15 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =    2163.64 ms /   134 tokens


Invalid JSON from LLM: Extra data: line 1 column 3 (char 2)
LLM Response: []]


Llama.generate: 36 prefix-match hit, remaining 122 prompt tokens to eval
llama_perf_context_print:        load time =    3026.49 ms
llama_perf_context_print: prompt eval time =       0.00 ms /   122 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /    15 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =    2286.62 ms /   137 tokens


Invalid JSON from LLM: Extra data: line 1 column 4 (char 3)
LLM Response: [] [] [] [] [] [] [] [] [] [] [] [] [] [] [] []


Llama.generate: 36 prefix-match hit, remaining 117 prompt tokens to eval
llama_perf_context_print:        load time =    3026.49 ms
llama_perf_context_print: prompt eval time =       0.00 ms /   117 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /    15 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =    2179.51 ms /   132 tokens


Invalid JSON from LLM: Extra data: line 1 column 3 (char 2)
LLM Response: []]}}]]]]]]]]]]]]]]]]]]]]]]]]]]


In [439]:
df

In [429]:
transactions = ["Netflix"]
unique_transactions = list(set(transactions))
categories_string = "Entertainment, Other"
df = categorize_transactions_chunked(unique_transactions, categories_string, llm)
print(df)

Llama.generate: 18 prefix-match hit, remaining 111 prompt tokens to eval
llama_perf_context_print:        load time =    3155.54 ms
llama_perf_context_print: prompt eval time =       0.00 ms /   111 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /    15 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =    2253.22 ms /   126 tokens


Invalid JSON from LLM: Extra data: line 1 column 3 (char 2)
LLM Response: []]
None


In [446]:
system_prompt = "You are a helpful AI assistant that only returns JSON."
test_prompt = "```json\n[{'test': 'value'}]\n```"
# Assuming your llm function supports system prompts:
test_response = llm(test_prompt, system_prompt=system_prompt) 

TypeError: Llama.__call__() got an unexpected keyword argument 'system_prompt'

### Sources

https://www.youtube.com/watch?v=h_GTxRFYETY

https://huggingface.co/meta-llama/Llama-3.2-1B