In [None]:
# Groceries_dataset.csv downloaded from Kaggle https://www.kaggle.com/datasets/heeraldedhia/groceries-dataset

In [20]:
import pandas as pd
import numpy as np
import tensorflow as tf
import json

In [21]:
# Read the Kaggle groceries file into a DataFrame (one purchased item per row)
groceries_csv_path = "data/Groceries_dataset.csv"
groceries = pd.read_csv(groceries_csv_path)
groceries.head()


Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk


In [22]:
# Collapse the item-level rows into shopping baskets: every item recorded for
# the same member on the same date ends up in one row, as a list of item names.
basket_keys = ["Member_number", "Date"]
items_per_basket = groceries.groupby(basket_keys)["itemDescription"].apply(list)
baskets = items_per_basket.reset_index(name="items")

baskets.head()


Unnamed: 0,Member_number,Date,items
0,1000,15-03-2015,"[sausage, whole milk, semi-finished bread, yog..."
1,1000,24-06-2014,"[whole milk, pastry, salty snack]"
2,1000,24-07-2015,"[canned beer, misc. beverages]"
3,1000,25-11-2015,"[sausage, hygiene articles]"
4,1000,27-05-2015,"[soda, pickled vegetables]"


In [23]:
# load trained model from task 1
item_model = tf.keras.models.load_model("../task-1/model.keras")

# load label map (index -> category name)
# JSON object keys are always strings, so lookups into idx_to_label must use
# str(index) rather than the integer index itself.
# The encoding is passed explicitly: without it open() uses a
# platform-dependent default, whereas JSON text is conventionally UTF-8.
with open("../task-1/label_map.json", "r", encoding="utf-8") as label_file:
    idx_to_label = json.load(label_file)


In [24]:
# predict category for a single item - written with help from ChatGPT
def predict_category_for_item(item_name):
    """Return the predicted category name for one grocery item.

    The item name is converted to a string, wrapped in a one-element object
    array and passed to ``item_model.predict``. The position of the highest
    score in the first prediction row is then looked up in ``idx_to_label``,
    which is keyed by the index written as a string.
    """
    single_item_batch = np.array([str(item_name)], dtype=object)

    # only one item was sent in, so only the first prediction row matters
    scores = item_model.predict(single_item_batch, verbose=0)[0]
    best_index = int(np.argmax(scores))

    return idx_to_label[str(best_index)]


In [25]:
# convert each basket from items to categories
# whole basket -> list of categories
def get_categories_for_basket(item_list, item_to_category=None):
    """Return the sorted, de-duplicated categories for one basket.

    Parameters
    ----------
    item_list : iterable
        Item names in the basket.
    item_to_category : dict, optional
        Precomputed mapping from ``str(item)`` to category name. Items found
        in it skip the model call; any item missing from it (or every item,
        when the mapping is omitted) is classified with
        ``predict_category_for_item``.

    Returns
    -------
    list
        The unique category names of the basket, in sorted order.
    """
    known = item_to_category if item_to_category is not None else {}

    categories = set()
    for item in item_list:
        # predict_category_for_item stringifies its input, so str(item) is
        # the right cache key: equal keys always give the same prediction.
        key = str(item)
        if key in known:
            categories.add(known[key])
        else:
            categories.add(predict_category_for_item(item))

    # remove duplicates (the set) and sort
    return sorted(categories)


# Classify every distinct item name once instead of once per occurrence:
# the same names repeat across many baskets, and each call to
# predict_category_for_item runs the model.
unique_item_names = sorted(
    {str(item) for items in baskets["items"] for item in items}
)
item_to_category = {
    name: predict_category_for_item(name) for name in unique_item_names
}

# apply to every basket
baskets["categories"] = baskets["items"].apply(
    lambda items: get_categories_for_basket(items, item_to_category)
)

baskets[["items", "categories"]].head()


Unnamed: 0,items,categories
0,"[sausage, whole milk, semi-finished bread, yog...","[Dairy & Eggs, Grains & Bakery, Pantry Items]"
1,"[whole milk, pastry, salty snack]","[Dairy & Eggs, Other, Pantry Items]"
2,"[canned beer, misc. beverages]",[Pantry Items]
3,"[sausage, hygiene articles]",[Pantry Items]
4,"[soda, pickled vegetables]","[Grains & Bakery, Pantry Items]"


In [28]:
# convert lists into strings - ChatGPT was used for the first part of this cell
# each list column gets a "<name>_str" twin with its entries joined by ";"
for list_column in ("items", "categories"):
    baskets[list_column + "_str"] = baskets[list_column].apply(";".join)

# keep only the useful columns
output_columns = ["Member_number", "Date", "items_str", "categories_str"]
baskets_to_save = baskets[output_columns]

baskets_to_save.to_csv("data/baskets_with_categories.csv", index=False)

print("saved baskets_with_categories.csv")


saved baskets_with_categories.csv
