# Transaction Categorization with Ollama

This notebook reads uncategorized transactions from the PostgreSQL database and categorizes them: keyword-based mapping rules are tried first, and a local Ollama model is used for any transaction the rules do not match.

In [None]:
import os
import psycopg2
import requests
import json
import time

# Configuration
# Endpoint that categorize_transaction() POSTs its prompt to.
OLLAMA_URL = "http://ollama.mapletyne.com/api/generate"
# Value sent as the "model" field of the request payload.
MODEL_NAME = "gpt-oss" # Change this to your installed model

# Allowed category labels. The list is interpolated into the model prompt and
# is also used to validate the model's reply: the reply is scanned for each
# label in list order and the first label found in it is returned, so order
# matters when a reply mentions more than one label. A reply containing none
# of these labels is mapped to "Other".
CATEGORIES = [
    "Groceries",
    "Dining Out",
    "Rent/Mortgage",
    "Utilities",
    "Transportation",
    "Entertainment",
    "Shopping",
    "Health",
    "Insurance",
    "Income",
    "Money Transfer",
    "Subscription",
    "Banking Fees",
    "Cloud Infrastructure",
    "Investment",
    "Debt/Credit Card",
    "LLM Tools",
    "Other"
]

# === MAPPING RULES SECTION ===
# Rules are processed in order (Rank 1 is top of list). First match wins.
# 'keywords': list of strings to search for in description (case-insensitive)
# 'logic': 'OR' (match any keyword) or 'AND' (match all keywords)
#
# Keywords are matched as plain substrings of the description, so a short
# keyword in an earlier rule shadows any longer keyword that contains it in a
# later rule. Keep the more specific rule above the more general one.
CATEGORY_RULES = [
    {
        "category": "Utilities",
        "logic": "OR",
        "keywords": ["hydro", "freedom mobile", "rogers", "bell", "water", "electricity"]
    },
    {
        "category": "Dining Out",
        "logic": "OR",
        "keywords": ["uber eats", "ubereats", "mcdonalds", "starbucks", "tim hortons", "subway", "restaurant", "cafe"]
    },
    {
        # Must stay above Shopping: Shopping's "amazon" keyword is a substring
        # of "amazon web services" and would otherwise claim these charges
        # before the Cloud Infrastructure rule further down is ever reached.
        "category": "Cloud Infrastructure",
        "logic": "OR",
        "keywords": ["amazon web services"]
    },
    {
        "category": "Shopping",
        "logic": "OR",
        "keywords": ["amazon", "amzn", "walmart", "yorkdale", "hudson bay", "sport check", "h and m", "h&m", "best buy"]
    },
    {
        "category": "Health",
        "logic": "OR",
        "keywords": ["shoppers drug mart", "pharmacy", "doctor", "dental", "hospital"]
    },
    {
        "category": "Insurance",
        "logic": "OR",
        "keywords": ["resp", "life insurance", "house insurance", "c insurance", "auto insurance"]
    },
    {
        "category": "Rent/Mortgage",
        "logic": "OR",
        "keywords": ["1570 lawrence", "rent", "mortgage"]
    },
    {
        "category": "Subscription",
        "logic": "OR",
        "keywords": ["recurring", "netflix", "spotify", "adobe", "apple.com/bill", "sqsp"]
    },
    {
        # "amazon web services" is handled by the dedicated rule placed above
        # Shopping; the remaining cloud keywords do not collide with earlier rules.
        "category": "Cloud Infrastructure",
        "logic": "OR",
        "keywords": ["aws", "microsoft", "azure", "google cloud"]
    },
    {
        "category": "LLM Tools",
        "logic": "OR",
        "keywords": ["openai", "grok", "anthropic", "claude", "midjourney", "warp.dev", "cursor"]
    },
    {
        "category": "Investment",
        "logic": "OR",
        "keywords": ["managed_buy", "managed_sell", "managed buy", "managed sell", "dividend", "gic", "investment", "wealthsimple", "questrade"]
    },
    {
        "category": "Banking Fees",
        "logic": "OR",
        "keywords": ["plan fee", "overdraft", "over draft", "monthly fee", "service fee"]
    },
    {
        "category": "Money Transfer",
        "logic": "OR",
        "keywords": ["interac", "etransfer", "e-transfer", "lemfi", "sent", "received", "transfer", "misc payment"]
    },
    {
        "category": "Debt/Credit Card",
        "logic": "OR",
        "keywords": ["loan", "interest payment", "credit card", "line of credit"]
    },
    {
        # "uber" is safe here because "uber eats"/"ubereats" are claimed by
        # the Dining Out rule earlier in the list.
        "category": "Transportation",
        "logic": "OR",
        "keywords": ["presto", "uber", "lyft", "parking", "gas"]
    },
    {
        "category": "Groceries",
        "logic": "OR",
        "keywords": ["metro", "whole foods", "longos", "loblaws", "nofrills", "sobeys"]
    },
    {
        "category": "Income",
        "logic": "OR",
        "keywords": ["payroll", "deposit"]
    },
    {
        # Generic "fee" is only reached after the specific Banking Fees
        # phrases above have had their chance to match.
        "category": "Other",
        "logic": "OR",
        "keywords": ["reimbursement", "fee", "discount"]
    }
]

def _parse_env_file(path):
    """Parse a KEY=VALUE file at *path* into a dict of strings.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    skipped. Whitespace around keys and values is removed, a leading
    ``export `` on the key is dropped, and one pair of matching surrounding
    quotes is stripped from the value.
    """
    parsed = {}
    with open(path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line or line.startswith('#') or '=' not in line:
                continue
            key, value = line.split('=', 1)
            key = key.strip()
            if key.startswith('export '):
                key = key[len('export '):].strip()
            value = value.strip()
            # DB_PASS="secret" should yield secret, not "secret".
            if len(value) >= 2 and value[0] == value[-1] and value[0] in ('"', "'"):
                value = value[1:-1]
            parsed[key] = value
    return parsed


def get_db_connection():
    """Open a PostgreSQL connection configured from a .env file.

    The first existing file among the candidate paths is read for DB_HOST
    (default 'localhost'), DB_NAME, DB_USER and DB_PASS. If no .env file is
    found, the connection is attempted with only the default host.

    Returns:
        A psycopg2 connection, or None if the .env file could not be read or
        the connection attempt failed (the error is printed).
    """
    # '.env' is already resolved against the current working directory, so a
    # separate os.getcwd() entry would only name the same file a second time.
    possible_paths = [
        '.env',
        '/home/danielaroko/applications/data_exploration/simplefin/.env'
    ]
    env_path = next((p for p in possible_paths if os.path.exists(p)), None)

    env_vars = {}
    if env_path:
        try:
            env_vars = _parse_env_file(env_path)
        except (OSError, UnicodeDecodeError) as e:
            print(f"Error reading .env: {e}")
            return None

    try:
        return psycopg2.connect(
            host=env_vars.get('DB_HOST', 'localhost'),
            database=env_vars.get('DB_NAME'),
            user=env_vars.get('DB_USER'),
            password=env_vars.get('DB_PASS')
        )
    except psycopg2.Error as e:
        print(f"Database connection error: {e}")
        return None

def check_rules(description):
    """Return the category of the first rule in CATEGORY_RULES that matches.

    Matching is a case-insensitive substring test of each rule keyword
    against *description*. A rule with logic 'OR' matches when any keyword is
    present; a rule with logic 'AND' matches only when all are present. Rules
    with no keywords, or with any other logic value, never match.

    Args:
        description: Transaction description text. May be None or empty
            (for example a NULL database column).

    Returns:
        The matching rule's category, or None when no rule matches or the
        description is None/empty.
    """
    if not description:
        # Nothing to search in; also avoids AttributeError on None.lower().
        return None

    desc_lower = description.lower()

    for rule in CATEGORY_RULES:
        keywords = [k.lower() for k in rule.get('keywords', [])]
        if not keywords:
            continue

        logic = rule.get('logic', 'OR').upper()
        if logic == 'OR':
            # Match if ANY keyword is present
            matched = any(k in desc_lower for k in keywords)
        elif logic == 'AND':
            # Match only if ALL keywords are present
            matched = all(k in desc_lower for k in keywords)
        else:
            # Unknown logic values are ignored rather than guessed at.
            matched = False

        if matched:
            return rule.get('category')

    return None

def categorize_transaction(description, amount):
    """Pick a category for one transaction.

    The keyword rules are consulted first; only when they yield nothing is
    the Ollama endpoint asked. The model's reply is scanned for each entry of
    CATEGORIES in list order and the first one found is returned; a reply
    containing none of them becomes "Other".

    Returns:
        A category string, or None if the Ollama request or the handling of
        its response raised (the error is printed).
    """
    # Deterministic keyword rules take priority over the model.
    matched_by_rule = check_rules(description)
    if matched_by_rule:
        return matched_by_rule

    category_list = ', '.join(CATEGORIES)
    prompt = f"""
    You are a financial assistant. Categorize the following transaction into exactly one of these categories: {category_list}.
    
    Transaction Description: "{description}"
    Amount: {amount}
    
    Reply ONLY with the category name. Do not add any punctuation or extra text.
    """

    request_body = {"model": MODEL_NAME, "prompt": prompt, "stream": False}

    try:
        # Bounded wait so an unreachable API cannot hang the whole run.
        reply = requests.post(OLLAMA_URL, json=request_body, timeout=30)
        reply.raise_for_status()
        answer = reply.json().get("response", "").strip().lower()

        # First known category mentioned in the reply wins; otherwise "Other".
        return next(
            (name for name in CATEGORIES if name.lower() in answer),
            "Other",
        )
    except Exception as e:
        print(f"Ollama error: {e}")
        return None

def process_uncategorized_transactions():
    """Categorize every transaction whose category is NULL or empty.

    Fetches the uncategorized rows, asks categorize_transaction() for a
    category for each, and writes the result back with one commit per row so
    that progress made before a later failure is kept. Rows for which no
    category could be determined are reported as failed and left unchanged.
    Progress and errors are printed; nothing is returned.
    """
    conn = get_db_connection()
    if not conn:
        return

    try:
        # The context manager closes the cursor even if a row raises.
        with conn.cursor() as cur:
            # Fetch transactions with NULL category
            # Also can re-process 'Other' if we want, but for now just NULL or empty
            cur.execute("SELECT transaction_id, description, amount FROM transactions WHERE category IS NULL OR category = ''")
            rows = cur.fetchall()

            print(f"Found {len(rows)} uncategorized transactions.")

            count = 0
            for t_id, desc, amt in rows:
                print(f"Processing: {desc} ({amt})...", end=" ")

                # A NULL description arrives as None; pass an empty string so
                # one such row cannot abort the whole batch.
                category = categorize_transaction(desc or "", amt)

                if category:
                    print(f"-> {category}")
                    cur.execute("UPDATE transactions SET category = %s WHERE transaction_id = %s", (category, t_id))
                    conn.commit()  # Commit each row so earlier work survives a later error
                    count += 1
                else:
                    print("-> Failed")

            print(f"Done. Categorized {count} transactions.")

    except Exception as e:
        print(f"Processing error: {e}")
    finally:
        # conn is known to be a live connection here; closing it also
        # discards any uncommitted work.
        conn.close()

# Run the categorization pass when this cell is executed.
process_uncategorized_transactions()