# Spendee Data

In [5]:
import pandas as pd
import altair as alt

PATH = "../data/transactions_export_2025-07-24_unfcu.csv"

df = pd.read_csv(PATH)

# Explore categories, totals, and time ranges
df

Unnamed: 0,Date,Wallet,Type,Category name,Amount,Currency,Note,Labels,Author
0,2021-11-11T21:02:30+00:00,UNFCU 🌏,Expense,Shopping,-24.654016,USD,Kenko UV FILTER 67MM,photography,Eduardo Cifuentes
1,2023-12-22T03:03:38+00:00,UNFCU 🌏,Expense,Gifts,-50.000000,USD,Vino matrimonio julu montelig,,Eduardo Cifuentes
2,2023-12-25T13:09:10+00:00,UNFCU 🌏,Expense,Restaurant,-13.119902,USD,,,Eduardo Cifuentes
3,2023-12-25T13:09:49+00:00,UNFCU 🌏,Expense,Shopping,-15.306552,USD,Almohada avipm cuello,,Eduardo Cifuentes
4,2023-12-25T13:10:07+00:00,UNFCU 🌏,Expense,Coffee,-2.951978,USD,,,Eduardo Cifuentes
...,...,...,...,...,...,...,...,...,...
2051,2025-07-07T13:40:16+00:00,UNFCU 🌏,Expense,Restaurant,-11.976334,USD,,,Eduardo Cifuentes
2052,2025-07-07T20:36:54+00:00,UNFCU 🌏,Expense,Healthcare,-94.743076,USD,Vavuna qdenga 1 dosis,,Eduardo Cifuentes
2053,2025-07-08T00:39:42+00:00,UNFCU 🌏,Expense,Acommodation,-17.519964,USD,,,Eduardo Cifuentes
2054,2025-07-08T14:44:39+00:00,UNFCU 🌏,Expense,Coffee,-3.011120,USD,,,Eduardo Cifuentes


In [None]:
df.describe()

Unnamed: 0,Amount
count,2056.0
mean,-17.948601
std,120.913962
min,-3115.0
25%,-9.761
50%,-4.739202
75%,-2.074212
max,1342.1375


# Cleaning

In [9]:
# cleaning

# category
df.rename(columns = {'Category name':'category_name'}, inplace=True)
df.info()

df["category_name"] = df["category_name"].astype("category")

df['date_format'] = pd.to_datetime(df['Date']).dt.strftime("%Y-%m") ## %Y-%m-%d"
df['amount'] = df['Amount'].abs()

# filter
df = df[(df['Type'] == 'Expense') & (df['category_name'] != 'Savings')]


df.groupby('date_format').amount.sum()

<class 'pandas.core.frame.DataFrame'>
Index: 2043 entries, 0 to 2055
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   Date           2043 non-null   object  
 1   Wallet         2043 non-null   object  
 2   Type           2043 non-null   object  
 3   category_name  2043 non-null   category
 4   Amount         2043 non-null   float64 
 5   Currency       2043 non-null   object  
 6   Note           859 non-null    object  
 7   Labels         620 non-null    object  
 8   Author         2043 non-null   object  
 9   date_format    2043 non-null   object  
 10  amount         2043 non-null   float64 
dtypes: category(1), float64(2), object(8)
memory usage: 178.3+ KB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.rename(columns = {'Category name':'category_name'}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["category_name"] = df["category_name"].astype("category")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['date_format'] = pd.to_datetime(df['Date']).dt.strftime("%Y-%m") ## %Y-%m-%d"
A value is trying to be set on a copy of a slice fr

date_format
2021-11      CLP $25
2023-12     CLP $442
2024-01   CLP $1,887
2024-02   CLP $3,323
2024-03   CLP $9,002
2024-04     CLP $983
2024-05   CLP $2,049
2024-06   CLP $2,665
2024-07   CLP $1,931
2024-08   CLP $1,650
2024-09   CLP $2,973
2024-10   CLP $3,556
2024-11   CLP $7,038
2024-12   CLP $1,756
2025-01     CLP $173
2025-02     CLP $264
2025-03      CLP $30
2025-04      CLP $96
2025-05      CLP $40
2025-06     CLP $116
2025-07     CLP $137
Name: amount, dtype: float64

# Confi - Number Format

Show currency in CLP or USD

In [8]:
# formatting

# display
pd.options.display.float_format = 'CLP ${:,.0f}'.format

In [None]:
df.dtypes


## Budgets

In [15]:
# read budgets
import json

with open('../utils/budgets.json') as json_file:
    budgets = json.load(json_file)

budgets

{'Rent': 'Gastos fijos',
 'Sport': 'Gastos fijos',
 'Personal Care': 'Gastos fijos',
 'Groceries': 'Gastos fijos',
 'Utilities': 'Gastos fijos',
 'Coffee-Snacks': 'Chao culpa',
 'Restaurant': 'Chao culpa',
 'Alcohol': 'Chao culpa',
 'Activities': 'Chao culpa',
 'Shopping': 'Chao culpa',
 'Travel': 'Viajes',
 'Investments': 'Inversion',
 'Savings': 'Ahorro'}

# Plots

In [None]:
source = df

tool = alt.TooltipValue = 'amount'
tooltip=['date', 'price', 'symbol', alt.Tooltip('amount:Q', format=':,.0f')]


alt.Chart(source).mark_bar(
    cornerRadiusTopLeft=3,
    cornerRadiusTopRight=3
).encode(
    y='date_format',
    x='amount',
    color='category_name:N',
    tooltip = ['Note', alt.Tooltip("mean(amount):Q", format="$,.0f")]
).properties(height=600, width=800)

In [None]:
# scatter

# remove outliers
# new_df = df.drop(df[df['amount'] > 1_000_000].index)
# new_df_ii = new_df.drop(new_df[new_df['category_name'] == 'Rent'].index)

# TODO: exluce Rent and shopping over 1 M clp

alt.Chart(source).mark_circle().encode(
    alt.Y('category_name', scale=alt.Scale(zero=False)),
    alt.X('amount', scale=alt.Scale(zero=False, padding=1)),
    size='amount',
    color= 'category_name',
    tooltip= 'Note'
).properties(height=1000, width=600)