In [73]:
import pandas as pd
import numpy as np
import json
import random
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras import layers, models


In [74]:
# read the baskets file; the cells below rely on its "categories_str" column
baskets = pd.read_csv(filepath_or_buffer="data/baskets_with_categories.csv")

# turn "a;b;c" into a python list ["a","b","c"]
def string_to_list(s):
    """Split a ';'-separated string into a list of clean tokens.

    Args:
        s: the raw cell value, e.g. "a;b;c". Missing values (None / NaN)
           and empty or whitespace-only strings are accepted.

    Returns:
        A list of tokens with surrounding whitespace removed. Empty tokens
        (from a trailing or doubled ';') are dropped so they never end up
        as a bogus "" category. Missing or empty input gives [].
    """
    if pd.isna(s):
        return []
    # str() keeps a stray non-string cell (e.g. a number) from raising AttributeError
    tokens = (part.strip() for part in str(s).split(";"))
    return [token for token in tokens if token]

baskets["categories_list"] = baskets["categories_str"].apply(string_to_list)

# every distinct category seen in any basket, in alphabetical order
all_categories = sorted(
    {name for basket_cats in baskets["categories_list"] for name in basket_cats}
)

# category -> index, following the alphabetical order above
cat_to_idx = {name: position for position, name in enumerate(all_categories)}

# and the reverse: index -> category
idx_to_cat = {position: name for name, position in cat_to_idx.items()}

num_cats = len(cat_to_idx)
print("number of categories:", num_cats)
print("categories:", all_categories)


number of categories: 8
categories: ['Beverages', 'Dairy & Eggs', 'Frozen Foods', 'Grains & Bakery', 'Other', 'Pantry Items', 'Produce', 'Snacks']


In [75]:
# multi-hot encoding function
# e.g. [1, 0, 1, 0] -> 1 = category present, 0 = category absent
def make_multi_hot_vector(categories):
    """Encode a collection of category names as a float32 multi-hot vector.

    The vector has ``num_cats`` entries; the entry at ``cat_to_idx[name]`` is
    1.0 for every given name found in ``cat_to_idx``. Names that are not in
    ``cat_to_idx`` are skipped without error.
    """
    encoded = np.zeros(num_cats, dtype=np.float32)
    for name in categories:
        position = cat_to_idx.get(name)
        if position is None:
            # unknown category: leave the vector untouched
            continue
        encoded[position] = 1.0
    return encoded

In [76]:
# take each basket's category list and, for each category in it, hide that category and ask the model to predict it from the rest

X_list = []
y_list = []

for basket_cats in baskets["categories_list"]:
    # a basket with fewer than 2 categories has nothing left once one is hidden
    if len(basket_cats) >= 2:
        for held_out in basket_cats:
            # model input = every category of the basket except the hidden one
            remaining = [name for name in basket_cats if name != held_out]
            if remaining:
                features = make_multi_hot_vector(remaining)
                label = cat_to_idx[held_out]

                X_list.append(features)
                y_list.append(label)

X = np.array(X_list)
y = np.array(y_list)

print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (25157, 8)
y shape: (25157,)


In [77]:
# hold out 20% of the samples for testing; stratify on y so both parts keep
# the same label proportions, and fix random_state so the split is repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = split_parts


In [78]:
# define and compile the model (training happens in the next cell)

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation and batch shuffling are the same on every Restart & Run All.
# Note: this also seeds the stdlib `random` module used elsewhere in the notebook.
tf.keras.utils.set_random_seed(42)

# input: multi-hot vector of the visible categories
# output: probability for each category of being the hidden one
rec_model = models.Sequential([
    layers.Input(shape=(num_cats,)),
    layers.Dense(64, activation="relu"),
    layers.Dense(32, activation="relu"),
    layers.Dense(num_cats, activation="softmax"),
])

# labels are integer class indices, hence the sparse variant of the loss
rec_model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

rec_model.summary()


In [79]:
# train for 10 epochs; the held-out split is only used to report val_* metrics
history = rec_model.fit(
    x=X_train,
    y=y_train,
    batch_size=128,
    epochs=10,
    validation_data=(X_test, y_test),
)


Epoch 1/10
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.5740 - loss: 1.3069 - val_accuracy: 0.6238 - val_loss: 1.0224
Epoch 2/10
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 617us/step - accuracy: 0.6244 - loss: 0.9884 - val_accuracy: 0.6254 - val_loss: 0.9874
Epoch 3/10
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 650us/step - accuracy: 0.6267 - loss: 0.9718 - val_accuracy: 0.6202 - val_loss: 0.9835
Epoch 4/10
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 634us/step - accuracy: 0.6266 - loss: 0.9664 - val_accuracy: 0.6254 - val_loss: 0.9791
Epoch 5/10
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 632us/step - accuracy: 0.6276 - loss: 0.9633 - val_accuracy: 0.6266 - val_loss: 0.9713
Epoch 6/10
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 608us/step - accuracy: 0.6258 - loss: 0.9609 - val_accuracy: 0.6266 - val_loss: 0.9725
Epoch 7/10
[1m158

In [80]:
# load text classifier from task 1
item_model = tf.keras.models.load_model("../task-1/model.keras")

# load label map for task 1 model (index -> category name)
# JSON object keys are always strings, so lookups into this map use str(index).
# The encoding is given explicitly so the read does not depend on the
# platform's default text encoding.
with open("../task-1/label_map.json", "r", encoding="utf-8") as f:
    idx_to_label_text = json.load(f)


In [81]:
# one item name -> task 1 category
def predict_category_from_text(item_name):
    """Return the task 1 category label predicted for a single item name.

    The name is converted with ``str()``, wrapped in a one-element object
    array for ``item_model.predict``, and the index of the highest score is
    looked up in ``idx_to_label_text`` (whose keys are strings).
    """
    batch = np.array([str(item_name)], dtype=object)
    scores = item_model.predict(batch, verbose=0)[0]
    top_index = int(np.argmax(scores))
    return idx_to_label_text[str(top_index)]

In [82]:
# basket of item names -> list of task 1 categories
def get_categories_from_items(item_list):
    """Return the sorted, duplicate-free task 1 categories for a basket.

    Every entry of ``item_list`` is classified with
    ``predict_category_from_text``; the distinct labels are returned in
    sorted order.
    """
    distinct_labels = {predict_category_from_text(entry) for entry in item_list}
    return sorted(distinct_labels)

In [83]:
# map task 2 categories (from the Kaggle dataset) to the categories used in final_grocery_dataset.csv

all_items = pd.read_csv(filepath_or_buffer="../task-1/data/final_grocery_dataset.csv")

# keys: category names passed to recommend_item_from_category
# values: the all_items["Category"] values that count as that category

# categories that merge several catalogue categories
category_mapping = {
    "Grains & Bakery": ["Bakery", "Pasta & Grains"],
    "Pantry Items": ["Pantry", "Canned Goods", "Condiments & Sauces"],
    "Meat & Deli": ["Meat & Seafood", "Deli"],
}

# categories whose name is the same on both sides
category_mapping.update({
    shared_name: [shared_name]
    for shared_name in (
        "Beverages",
        "Dairy & Eggs",
        "Frozen Foods",
        "Produce",
        "Snacks",
        "Household",
        "Personal Care",
        "Pet Supplies",
        "Other",
    )
})


In [84]:
# pick a random item from the recommended category

def recommend_item_from_category(model_category):
    """Pick a random item belonging to the given category.

    ``model_category`` is looked up in ``category_mapping`` to get the
    matching ``all_items["Category"]`` values; one "Item" from those rows is
    chosen with ``random.choice``.

    Returns ``None`` when the category is not in ``category_mapping`` or no
    row of ``all_items`` matches.
    """
    item_cats = category_mapping.get(model_category)
    if item_cats is None:
        return None

    in_category = all_items["Category"].isin(item_cats)
    candidates = all_items.loc[in_category, "Item"].tolist()

    return random.choice(candidates) if candidates else None


In [85]:
def recommend_extra_item(item_list):
    """Recommend one extra item for a basket of item names.

    Steps: classify each item with the task 1 model, express those categories
    in the vocabulary the task 2 model was trained on (``cat_to_idx``), ask
    the task 2 model which category is most likely missing, and draw an
    example item from that category.

    Args:
        item_list: iterable of item names.

    Returns:
        An item name, or ``None`` when the basket yields no categories, when
        every category the task 2 model knows is already in the basket, or
        when no remaining category has an item to offer.
    """
    # 1. turn item names into categories using the task 1 model
    predicted_cats = get_categories_from_items(item_list)

    if len(predicted_cats) == 0:
        return None

    # category_mapping goes "task 2 category -> item-catalogue categories".
    # Invert it so a label that is not itself a task 2 category (presumably
    # the fine-grained catalogue names - confirm against label_map.json) can
    # still be encoded instead of being silently dropped by
    # make_multi_hot_vector.
    item_cat_to_model_cat = {}
    for model_cat, item_cats in category_mapping.items():
        for item_cat in item_cats:
            item_cat_to_model_cat.setdefault(item_cat, model_cat)

    basket_cats = set()
    for c in predicted_cats:
        # labels already in the task 2 vocabulary pass through unchanged
        model_cat = c if c in cat_to_idx else item_cat_to_model_cat.get(c)
        if model_cat in cat_to_idx:
            basket_cats.add(model_cat)
    cats = sorted(basket_cats)

    # 2. turn categories into a multi-hot vector
    x_vec = make_multi_hot_vector(cats).reshape(1, -1)

    # 3. use the task 2 model to score every category
    probs = rec_model.predict(x_vec, verbose=0)[0]

    # do not recommend a category that is already in the basket
    in_basket = {cat_to_idx[c] for c in cats}

    # 4. walk the categories from most to least likely (stable sort, so ties
    #    keep the lowest index first, like argmax) and return an example item
    #    from the first one that is not in the basket and has items
    for idx in np.argsort(-probs, kind="stable"):
        idx = int(idx)
        if idx in in_basket:
            continue
        extra_item = recommend_item_from_category(idx_to_cat[idx])
        if extra_item is not None:
            return extra_item

    # every category is already in the basket, or none had an item to offer
    return None


In [86]:
# quick end-to-end check of the recommender on a small example basket
test_basket = ["pasta", "olive oil"]
print("Basket:", test_basket)

rec_item = recommend_extra_item(item_list=test_basket)
print("Recommended item:", rec_item)

Basket: ['pasta', 'olive oil']
Recommended item: crackers
