In [None]:
# Groceries_dataset.csv downloaded from Kaggle https://www.kaggle.com/datasets/heeraldedhia/groceries-dataset

In [32]:
import pandas as pd
import numpy as np
import tensorflow as tf
import json
import random

from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, models


In [33]:
# Raw purchase log; the preview below shows the columns
# Member_number, Date and itemDescription.
groceries_csv_path = "data/Groceries_dataset.csv"
groceries = pd.read_csv(groceries_csv_path)
groceries.head(n=5)

Unnamed: 0,Member_number,Date,itemDescription
0,1808,21-07-2015,tropical fruit
1,2552,05-01-2015,whole milk
2,2300,19-09-2015,pip fruit
3,1187,12-12-2015,other vegetables
4,3037,01-02-2015,whole milk


In [34]:
# group items into baskets by member id and date (one shopping trip)

# A (Member_number, Date) pair identifies one trip; every itemDescription
# recorded for that pair is collected into a single Python list.
trip_keys = ["Member_number", "Date"]
items_per_trip = groceries.groupby(trip_keys)["itemDescription"].apply(list)

# Turn the grouped Series back into a flat frame with an "items" column.
baskets = items_per_trip.reset_index(name="items")

baskets.head()

Unnamed: 0,Member_number,Date,items
0,1000,15-03-2015,"[sausage, whole milk, semi-finished bread, yog..."
1,1000,24-06-2014,"[whole milk, pastry, salty snack]"
2,1000,24-07-2015,"[canned beer, misc. beverages]"
3,1000,25-11-2015,"[sausage, hygiene articles]"
4,1000,27-05-2015,"[soda, pickled vegetables]"


In [35]:
# load task 1 classification model

item_model = tf.keras.models.load_model("../task-1/model.keras")

# idx_to_label is looked up with string keys (idx_to_label[str(index)]) because
# JSON object keys are always strings.
# JSON text is UTF-8 by specification, so decode it explicitly instead of
# relying on the platform's default locale encoding.
with open("../task-1/label_map.json", "r", encoding="utf-8") as f:
    idx_to_label = json.load(f)


In [36]:
# predict category for each item

# Memo of item name -> predicted category label. Item names repeat across
# rows of the dataset, and every `item_model.predict` call carries a fixed
# overhead, so each distinct name is classified only once.
# Re-running this cell resets the memo (do so after reloading `item_model`).
_category_cache = {}


def predict_category(item_name):
    """Return the task 1 category label predicted for a single item.

    Args:
        item_name: Item description; it is converted with ``str()`` before
            being passed to the model.

    Returns:
        The entry of ``idx_to_label`` for the highest-scoring class index.

    Raises:
        KeyError: If the winning class index has no entry in ``idx_to_label``.
    """
    key = str(item_name)

    if key not in _category_cache:
        # The model is fed a one-element object array holding the raw string.
        x = np.array([key], dtype=object)
        predictions = item_model.predict(x, verbose=0)[0]
        index = int(np.argmax(predictions))
        # idx_to_label comes from JSON, so its keys are strings.
        _category_cache[key] = idx_to_label[str(index)]

    return _category_cache[key]

In [37]:
# apply task 1 to a whole basket - basket -> list of categories

def get_categories_for_basket(item_list):
    """Map a basket's items to its distinct categories, sorted alphabetically.

    Each item is classified with ``predict_category``; duplicate categories
    collapse to a single entry. An empty basket yields an empty list.
    """
    distinct_categories = {predict_category(item) for item in item_list}
    return sorted(distinct_categories)


In [38]:
# create category lists for all baskets
# (each basket's item list is run through get_categories_for_basket)
basket_categories = baskets["items"].map(get_categories_for_basket)
baskets["categories"] = basket_categories

# data is fully prepared now
baskets.head()

Unnamed: 0,Member_number,Date,items,categories
0,1000,15-03-2015,"[sausage, whole milk, semi-finished bread, yog...","[Dairy & Eggs, Grains & Bakery, Pantry Items]"
1,1000,24-06-2014,"[whole milk, pastry, salty snack]","[Dairy & Eggs, Other, Pantry Items]"
2,1000,24-07-2015,"[canned beer, misc. beverages]",[Pantry Items]
3,1000,25-11-2015,"[sausage, hygiene articles]",[Pantry Items]
4,1000,27-05-2015,"[soda, pickled vegetables]","[Grains & Bakery, Pantry Items]"


In [39]:
# convert category names to numbers and vice versa

# Alphabetically sorted list of every distinct category found in any basket.
all_categories = sorted(
    {category for basket_cats in baskets["categories"] for category in basket_cats}
)

# used chatgpt for cat_to_idx and idx_to_cat logic
# category name -> position in all_categories
cat_to_idx = {name: position for position, name in enumerate(all_categories)}

# position -> category name (inverse of cat_to_idx)
idx_to_cat = {position: name for name, position in cat_to_idx.items()}

num_categories = len(cat_to_idx)
num_categories


8

In [40]:
# multi hot encoding function
def make_multi_hot(categories):
    """Encode category names as a multi-hot vector of length ``num_categories``.

    The slot ``cat_to_idx[name]`` is set to 1 for every name in ``categories``;
    all other slots stay 0. An empty input gives an all-zero vector.

    Raises:
        KeyError: If a name is missing from ``cat_to_idx``.
    """
    encoded = np.zeros(num_categories)
    positions = [cat_to_idx[name] for name in categories]
    encoded[positions] = 1
    return encoded

In [None]:
# build training data - the model needs to learn to predict categories

# Leave-one-out construction: for a basket with at least two categories, each
# category in turn becomes the target (y) and the remaining categories of the
# same basket become the multi-hot input (X).
feature_rows = []
target_ids = []

for basket_cats in baskets["categories"]:
    # A single-category basket would only produce an all-zero input.
    if len(basket_cats) >= 2:
        for held_out in basket_cats:
            remaining = [name for name in basket_cats if name != held_out]
            feature_rows.append(make_multi_hot(remaining))
            target_ids.append(cat_to_idx[held_out])

X = np.array(feature_rows)
y = np.array(target_ids)

X.shape, y.shape


((25157, 8), (25157,))

In [42]:
# train test split

# 20% of the samples are held out; stratifying on y and fixing random_state
# makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts


In [43]:
# define model

# Seed the Python, NumPy and TensorFlow RNGs before any weights are created so
# that weight initialisation and batch shuffling repeat on Restart & Run All.
# 42 matches the random_state used for the train/test split.
tf.keras.utils.set_random_seed(42)

# Input: multi-hot vector of the categories already in the basket.
# Output: softmax distribution over the num_categories possible targets.
rec_model = models.Sequential([
    layers.Input(shape=(num_categories,)),
    layers.Dense(64, activation="relu"),
    layers.Dense(32, activation="relu"),
    layers.Dense(num_categories, activation="softmax")
])

# Targets in y are integer class ids (cat_to_idx values), hence the sparse
# variant of categorical cross-entropy.
rec_model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

rec_model.summary()


In [44]:
# train model

# Validation metrics are computed on the X_test / y_test part of the split.
fit_options = {
    "validation_data": (X_test, y_test),
    "epochs": 10,
    "batch_size": 128,
}

rec_model.fit(X_train, y_train, **fit_options)


Epoch 1/10
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.5600 - loss: 1.3281 - val_accuracy: 0.6222 - val_loss: 1.0342
Epoch 2/10
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 591us/step - accuracy: 0.6260 - loss: 0.9964 - val_accuracy: 0.6242 - val_loss: 0.9948
Epoch 3/10
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 594us/step - accuracy: 0.6255 - loss: 0.9755 - val_accuracy: 0.6252 - val_loss: 0.9813
Epoch 4/10
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 581us/step - accuracy: 0.6256 - loss: 0.9688 - val_accuracy: 0.6258 - val_loss: 0.9772
Epoch 5/10
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 620us/step - accuracy: 0.6277 - loss: 0.9649 - val_accuracy: 0.6268 - val_loss: 0.9742
Epoch 6/10
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.6279 - loss: 0.9622 - val_accuracy: 0.6212 - val_loss: 0.9748
Epoch 7/10
[1m158/1

<keras.src.callbacks.history.History at 0x157b1d5d0>

In [45]:
# save the model and label map

rec_model.save("rec_model.keras")

# all_categories is the sorted category list; an entry's position in the list
# is the class index used for the model's targets (see cat_to_idx).
serialized_categories = json.dumps(all_categories)
with open("all_categories.json", "w") as categories_file:
    categories_file.write(serialized_categories)