In [None]:
"""
Tag and categorize the historical expenses from Splitwise.
The expense data is cached locally in expenses.json.
The tags are stored locally in tags.json in the form of
{
  "wants": [<list of str of expense names in lower case>]
}
PS: There's no ML. It's just honest labour to categorize based on one's own judgement.
"""

In [None]:
import json
import os
from datetime import datetime, timedelta

import dotenv
import requests

# Load environment variables (e.g. SPLITWISE_TOKEN) from a local .env file, if present.
dotenv.load_dotenv()

# Splitwise endpoint queried by fetch_expenses.
BASE_URL = "https://secure.splitwise.com/api/v3.0/get_expenses"
# Bearer-token header, built once when this cell runs.
# NOTE: os.getenv returns None when SPLITWISE_TOKEN is unset, which renders as the
# literal header "Bearer None" instead of failing here.
HEADERS = {"Authorization": f"Bearer {os.getenv('SPLITWISE_TOKEN')}"}


def fetch_expenses(start_date, end_date, limit=200, timeout=30):
    """
    Fetch the expenses dated between start_date and end_date in a single request.

    No pagination is performed: at most ``limit`` expenses are requested for
    the window.

    Args:
        start_date (datetime): Lower bound, sent as ``dated_after``.
        end_date (datetime): Upper bound, sent as ``dated_before``.
        limit (int): Maximum number of expenses requested for the window.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        dict: The decoded JSON body of the response.

    Raises:
        requests.exceptions.RequestException: On connection problems, a
            timeout, or a non-2xx status (via ``raise_for_status``).
    """
    params = {
        # The trailing "Z" marks the timestamps as UTC.
        "dated_after": start_date.isoformat() + "Z",
        "dated_before": end_date.isoformat() + "Z",
        "limit": limit,
    }
    # Without a timeout, requests waits indefinitely on a stalled connection.
    response = requests.get(BASE_URL, headers=HEADERS, params=params, timeout=timeout)
    response.raise_for_status()  # Turn HTTP error statuses into exceptions
    return response.json()


# Fetch the expense history in consecutive fixed-size windows (30 days by default)
def get_all_expenses(start_date=None, end_date=None, window_days=30, limit=200):
    """
    Collect expenses between start_date and end_date, one window at a time.

    Args:
        start_date (datetime | None): Start of the range; defaults to 2022-04-01.
        end_date (datetime | None): End of the range; defaults to 2025-01-10.
        window_days (int): Length of each request window in days.
        limit (int): Per-window limit passed on to fetch_expenses.

    Returns:
        list: The "expenses" entries of every successfully fetched window, in
        fetch order. If a request fails, fetching stops and the expenses
        collected so far are returned.

    Raises:
        ValueError: If window_days is not positive (the loop would never end).
    """
    if window_days <= 0:
        raise ValueError("window_days must be positive")
    if start_date is None:
        start_date = datetime(2022, 4, 1)
    if end_date is None:
        end_date = datetime(2025, 1, 10)

    window = timedelta(days=window_days)
    all_expenses = []

    while start_date < end_date:
        # Clamp the final window so nothing dated after end_date is requested.
        next_date = min(start_date + window, end_date)
        print(f"Fetching expenses from {start_date} to {next_date}...")
        try:
            batch = fetch_expenses(start_date, next_date, limit).get("expenses", [])
        except requests.exceptions.RequestException as e:
            # Best effort: report the failure and return what was gathered.
            print(f"Error fetching data: {e}")
            break
        if len(batch) >= limit:
            # A full window may mean the server held back further expenses.
            print(
                f"Warning: window returned {len(batch)} expenses (limit {limit}); "
                "results may be truncated."
            )
        all_expenses.extend(batch)
        start_date = next_date

    return all_expenses


# Save the combined expense list to a JSON file
def save_to_file(data, filename="expenses.json"):
    """
    Write data to filename as JSON indented by four spaces.

    Args:
        data: Any JSON-serializable object.
        filename (str): Destination path; an existing file is overwritten.

    Raises:
        TypeError: If data is not JSON-serializable. The existing file is
            left untouched in that case.
    """
    # Serialize before opening: mode "w" truncates the file immediately, so a
    # serialization error after opening would wipe or corrupt the cached data.
    serialized = json.dumps(data, indent=4)
    with open(filename, "w", encoding="utf-8") as file:
        file.write(serialized)


# Read JSON data back from a file
def read_from_file(filename="expenses.json"):
    """
    Load and return the JSON content of filename.

    Args:
        filename (str): Path of the JSON file to read.

    Returns:
        The decoded JSON value (whatever type the file's top level holds).

    Raises:
        FileNotFoundError: If filename does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Decode as UTF-8 explicitly; the locale default would garble non-ASCII
    # text in a hand-edited file on systems whose locale is not UTF-8.
    with open(filename, "r", encoding="utf-8") as file:
        return json.load(file)


In [None]:
# Download the expense history from Splitwise (needs SPLITWISE_TOKEN and network access).
# NOTE: get_all_expenses stops at the first request error and returns what it has
# collected so far, so check the printed log before treating the result as complete.
all_expenses = get_all_expenses()

In [None]:
# Sanity check: number of expenses collected across all fetched windows.
len(all_expenses)

In [None]:
# Overwrite the local cache (expenses.json by default) with the fetched list.
save_to_file(all_expenses)

In [None]:
# Verify by reading back; loaded_expenses is the list the later cells work from.
loaded_expenses = read_from_file()
len(loaded_expenses)

In [None]:
from dateutil import parser, tz
from typing import Optional
from requests import HTTPError

# Cache for the current user's id, filled on the first successful get_user_id() call.
USER_ID: Optional[str] = None


def get_user_id() -> str:
    """
    Return the id of the authenticated Splitwise user, fetching it on first use.

    The id is read from ``["user"]["id"]`` of the get_current_user response and
    cached in the module-level USER_ID, so later calls make no request.

    NOTE(review): the annotation says str, but the value is whatever the API
    returns under user.id; the expense loop compares it with ``user['user_id']``
    using ``==``, so both must have the same type -- confirm.

    :return: the current user's id
    :raises HTTPError: if the endpoint answers with a status other than 200
    """
    global USER_ID

    if USER_ID:
        return USER_ID

    url = "https://secure.splitwise.com/api/v3.0/get_current_user"

    headers = {
        'Authorization': f'Bearer {os.getenv("SPLITWISE_TOKEN")}'
    }

    # Use a timeout so a stalled connection cannot hang the notebook.
    response = requests.get(url, headers=headers, timeout=30)

    if response.status_code != 200:
        raise HTTPError(f'Invalid Splitwise response {response.status_code} {response.text}', response=response)

    USER_ID = response.json()["user"]["id"]
    return USER_ID


# Flatten the cached expenses into {"date", "name", "cost"} records that hold
# only the current user's owed share.
items = []
for item in loaded_expenses:
    parsed_date = parser.parse(item['date'])
    is_deleted = item['deleted_at']
    name = item['description'].strip()

    # Skip deleted expenses and settlement entries.
    if is_deleted or name in ('Payment', 'Settle all balances'):
        continue

    # The day is taken after converting to the machine's local timezone.
    local_date = parsed_date.astimezone(tz.tzlocal())
    record = {"date": local_date.strftime("%Y-%m-%d"), "name": name}

    for participant in item['users']:
        if participant['user_id'] != get_user_id():
            continue
        # Expenses the current user is not part of never reach this point.
        record["cost"] = float(participant['owed_share'].strip())
        items.append(record)

In [None]:
# Number of expense records that carry a share for the current user.
len(items)

In [None]:
from collections import defaultdict
# Count how often each lower-cased expense name occurs.
all_expense_names_count = defaultdict(int)
for item in items:
    # Records are dicts keyed by "date"/"name"/"cost"; item[1] raised KeyError.
    all_expense_names_count[item["name"].lower()] += 1
# Show the names ordered from most to least frequent.
dict(sorted(all_expense_names_count.items(), key=lambda pair: -pair[1]))

In [None]:
import re

# Cache for the tag -> keywords mapping; None means "not loaded yet".
TAG_MAPPING = None


def get_tags():
    """
    Return the tag -> keywords mapping, loading tags.json on first use.

    The result is cached in the module-level TAG_MAPPING. A file whose JSON
    content is falsy (e.g. ``null`` or ``{}``) yields an empty dict.

    Returns:
        dict: Mapping of tag name to a list of keyword strings.
    """
    global TAG_MAPPING

    # Test against None, not truthiness: an empty mapping must be cached too,
    # otherwise tags.json is re-read on every call.
    if TAG_MAPPING is None:
        TAG_MAPPING = read_from_file('tags.json') or {}
    return TAG_MAPPING


def tag_expense(expense_name, tag_mapping=None):
    """
    Assign tags to an expense based on its name.

    Every keyword is used as a regular expression and searched for anywhere in
    the name, ignoring case. A tag applies when at least one of its keywords
    matches.

    Args:
        expense_name (str): The name of the expense.
        tag_mapping (dict | None): Mapping of tag -> list of keyword patterns.
            When omitted, the mapping returned by get_tags() is used.

    Returns:
        list: The matching tags in mapping order, or ["other"] if none match.
    """
    if tag_mapping is None:
        tag_mapping = get_tags()

    # re.IGNORECASE replaces lower-casing both sides: lower-casing the pattern
    # corrupts regex syntax (e.g. \S becomes \s, (?P<...> becomes (?p<...>).
    tags = [
        tag
        for tag, keywords in tag_mapping.items()
        if any(re.search(keyword, expense_name, re.IGNORECASE) for keyword in keywords)
    ]

    return tags or ["other"]

In [None]:
from collections import defaultdict
# Attach a "tags" list to every expense record, in place.
# tag_expense returns ["other"] when no keyword matches, so unclassified names
# can be found afterwards by filtering for records tagged "other".
for expense in items:
    tags = tag_expense(expense["name"])
    expense["tags"] = tags

In [None]:
import pandas as pd

# One row per (expense, tag): parse the date, derive its calendar month, and
# repeat multi-tag expenses once per tag, so their cost counts under each tag.
df = (
    pd.DataFrame(items)
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .assign(month=lambda frame: frame['date'].dt.to_period('M'))
    .explode('tags')
)

# Total cost per month and tag, in long form.
monthly_data = (
    df.groupby(['month', 'tags'])['cost']
    .sum()
    .reset_index()
)

# Months as rows, tags as columns; a month without spend for a tag shows 0.
pivot_data = (
    monthly_data
    .pivot(index='month', columns='tags', values='cost')
    .fillna(0)
)
pivot_data

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Find the 95th percentile over every cell of the pivot (zero-filled months
# included) and clip anything above it, so outlier months do not dominate.
threshold = np.quantile(pivot_data.to_numpy().ravel(), 0.95)
capped_data = pivot_data.clip(upper=threshold)

In [None]:
from scipy.ndimage import gaussian_filter1d

# Draw one line per tag from the capped monthly totals. Each series is
# Gaussian-smoothed (sigma=2), so the curves show trends, not exact amounts.
fig, ax = plt.subplots(figsize=(15, 9))
months = capped_data.index.to_timestamp()

for tag in capped_data.columns:
    ax.plot(months, gaussian_filter1d(capped_data[tag], sigma=2), label=tag)

ax.set_title('Monthly Aggregate Cost')
ax.set_xlabel('Month')
ax.set_ylabel('Total Cost')
ax.legend(title='Tags')
ax.grid()
fig.tight_layout()
plt.show()

In [None]:
# Display the capped monthly totals that feed the plot.
capped_data