## Loading Data

In [1]:
import os
import warnings
from getpass import getpass

import pandas as pd

In [2]:
def load_year_frames(data_dir="data"):
    """Load every per-year CSV found in ``data_dir`` into a dict of DataFrames.

    Files that do not end in ``.csv`` and files whose name contains
    ``"combined"`` are skipped. Each file is read as UTF-8 first and re-read
    as cp1252 if decoding fails. ``Date`` is parsed day-first and ``Item`` /
    ``Category`` are cast to ``str``.

    Parameters
    ----------
    data_dir : str
        Directory to scan. It is created if it does not exist yet.

    Returns
    -------
    dict
        Maps the filename without its ``.csv`` suffix to the loaded frame,
        in sorted filename order.
    """
    os.makedirs(data_dir, exist_ok=True)

    frames = {}
    # os.listdir() returns entries in arbitrary order; sorting keeps the dict
    # (and anything concatenated from it later) reproducible between runs.
    for filename in sorted(os.listdir(data_dir)):
        if not filename.endswith(".csv") or "combined" in filename:
            continue

        path = os.path.join(data_dir, filename)
        try:
            df = pd.read_csv(path, encoding="utf-8")
        except UnicodeDecodeError:
            df = pd.read_csv(path, encoding="cp1252")

        # Count missing values on the raw frame: casting to str afterwards can
        # turn a missing Item/Category into the text "nan", which would no
        # longer be reported as null.
        null_count = int(df.isnull().sum().sum())

        df["Date"] = pd.to_datetime(df["Date"], dayfirst=True)
        df["Item"] = df["Item"].astype(str)
        df["Category"] = df["Category"].astype(str)

        frames[filename.removesuffix(".csv")] = df
        print(f"Loaded {filename} with {len(df)} rows. Null Entries: {null_count}")

    return frames


dfs = load_year_frames("data")

Loaded 2018.csv with 112 rows. Null Entries: 0
Loaded 2019.csv with 154 rows. Null Entries: 0
Loaded 2020.csv with 109 rows. Null Entries: 0
Loaded 2021.csv with 168 rows. Null Entries: 0
Loaded 2022.csv with 135 rows. Null Entries: 0
Loaded 2023.csv with 296 rows. Null Entries: 0
Loaded 2024.csv with 216 rows. Null Entries: 0


In [3]:
# Stack the per-year frames from `dfs` into one table and save it as
# data/combined.csv.
if not dfs:
    # pd.concat() on an empty list fails with a generic message; say what is
    # actually wrong instead.
    raise ValueError("No yearly CSV files were loaded from 'data'; nothing to combine.")

# pd.concat builds a new frame, so no explicit per-frame copies are made here.
combined = list(dfs.values())
df_combined = pd.concat(combined, ignore_index=True)
df_combined.to_csv("data/combined.csv", index=False, encoding="utf-8")

df_combined.info()
# The preview is the last expression of the cell so the notebook renders it;
# placed mid-cell its result was silently discarded.
df_combined.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1190 entries, 0 to 1189
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   Item      1190 non-null   object        
 1   Category  1190 non-null   object        
 2   Cost      1190 non-null   float64       
 3   Date      1190 non-null   datetime64[ns]
dtypes: datetime64[ns](1), float64(1), object(2)
memory usage: 37.3+ KB


## Create categories

In [10]:
from openai import OpenAI

In [11]:
# Use the OPENAI_API_KEY environment variable when it is set, so the cell can
# run without a prompt. Otherwise ask with getpass, which does not echo the
# typed key, so the secret does not end up in the saved notebook output.
api_key = os.environ.get("OPENAI_API_KEY") or getpass("Enter your OpenAI API key: ")
client = OpenAI(api_key=api_key)

In [39]:
# System message: sets the assistant's role and tone for the categorisation task.
system_prompt = """
You are a helpful assistant that creates practical and intuitive categories for personal purchase items.
Your task is to group items into a small set of consistent, human-friendly categories that would make sense for budgeting or spending review. 
You should be concise, avoid overly specific categories, and use everyday language.
"""

# One distinct item name per line; this text is embedded in the user message.
items = df_combined["Item"].unique()
items_string = "\n".join(items.tolist())

# User message: instructions, then the item list, then the required reply format.
user_prompt = (
    """
I have a list of items from my personal purchase history. Please group them into intuitive, non-overlapping spending categories. Your task is to:

1. Identify and list a practical set of categories to organize these purchases. Keep the number under 10.
2. Make sure the categories are distinct — avoid overlapping groups. For example:
   - Separate 'Gaming' (video games and in-game purchases) from 'Digital Subscriptions' (e.g. software, music services, cloud apps).
   - Use 'Apparel' only for wearable clothing — do not include accessories like bags or watches unless they are strictly clothing-related.
   - Combine 'Fitness', 'Health', and 'Personal Care' into a single category.
   - Use 'Music & Soundtracks' for both physical and digital music/soundtrack purchases. Music streaming subscriptions should go under 'Digital Subscriptions'.
   - Separate 'Books' into its own category — distinct from 'Entertainment' or 'Education'.
   - Manga and light novels should be included under the 'Books & Literature' category.
   - Group items clearly intended for collection (e.g. enamel pins, commemorative coins, special edition memorabilia) into a category called 'Collectibles'.
   - Include artbooks that are themed around games, movies, or anime as part of the 'Collectibles' category (not 'Books').
3. **Do not invent or include categories that are not clearly represented by actual examples from the item list.**

For each category you do create, provide:
- A short explanation of what kinds of items belong in it
- Exactly 3 example items from my list that belong in that category

Here is the list of items:

"""
    + items_string
    + """

Please organize the response in this format:

Category Details:
- **[Category Name]**: [short explanation]  
  Example items (Only 3): [item a], [item b], [item c]
...
"""
)

In [40]:
# Upper bound on the length of the model's reply, in tokens.
MAX_RESPONSE_TOKENS = 1000

# Ask the model for the category list, using the system and user prompts.
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt},
    ],
    temperature=0.2,
    max_tokens=MAX_RESPONSE_TOKENS,
)

# A finish_reason of "length" means the reply was cut off at the token cap,
# so the category list may be incomplete. Surface that instead of silently
# using a partial answer.
if response.choices[0].finish_reason == "length":
    warnings.warn(
        f"Model reply was truncated at {MAX_RESPONSE_TOKENS} tokens; "
        "the category list may be incomplete. Increase MAX_RESPONSE_TOKENS and re-run."
    )

In [41]:
# Show the model's reply; `lines` holds the reply text split on newlines.
reply_text = response.choices[0].message.content
lines = reply_text.split("\n")
print(*lines, sep="\n")

Category Details:

- **Food & Beverages**: Items related to meals, snacks, and drinks.  
  Example items (Only 3): Waffle (NYP), Coke, Chicken Chop (AMK HUB)

- **Books & Literature**: Physical and digital books, including manga and light novels.  
  Example items (Only 3): Halo: Legacy of Onyx (Book), The Last Wish (Book), Sword Art Online Progressive 5 (Kindle)

- **Entertainment**: Movies, movie tickets, and video games.  
  Example items (Only 3): Thor: Ragnarok DVD, Avengers: Infinity War Tickets, Far Cry: New Dawn (Digital)

- **Digital Subscriptions**: Recurring digital services and software subscriptions.  
  Example items (Only 3): Netflix Subscription, Spotify Premium Subscription, Adobe Creative Cloud (Student Plan)

- **Gaming**: Video games, in-game purchases, and gaming-related hardware.  
  Example items (Only 3): $100 Xbox Gift Card, Destiny 2 - 3000 Silver, Xbox Series X

- **Electronics & Accessories**: Gadgets, electronic accessories, and related items.  
  Example i

## Update CSVs with new categories