In [None]:
# dataset (data.csv) link: https://huggingface.co/datasets/AmirMohseni/GroceryList/viewer/default/train?utm_source=chatgpt.com
# off_grocery_dataset.cvs contains pets at home, eataly and open food facts scraped data

In [34]:
import numpy as np
import tensorflow as tf
import keras as keras
import pandas as pd
import json
from keras import layers
from sklearn.model_selection import train_test_split
# print(keras.__version__)
# keras.backend.backend()


In [35]:
# basic cleaning 
def clean_text(s: str) -> str:
    s = str(s).lower().strip()
    s = " ".join(s.split())
    return s

# load datasets
df_basic = pd.read_csv("data/data.csv", names=["Item", "Category"])
df_off = pd.read_csv("data/off_grocery_dataset.csv")

# clean text
for df in (df_basic, df_off):
    df["Item"] = df["Item"].apply(clean_text)
    df["Category"] = df["Category"].str.strip()

# merge + remove dupes
df = pd.concat([df_basic, df_off], ignore_index=True)
df = df.drop_duplicates(subset=["Item", "Category"]).reset_index(drop=True)

# print(df["Category"].value_counts())
# print("Total rows:", len(df))

print(len(df_basic))
print(len(df_off))
print(len(df))




226
980
1179


In [36]:
CAP = 120  # per category

df = (
    df.groupby("Category", group_keys=False)
      .apply(lambda g: g.sample(min(len(g), CAP), random_state=42))
      .reset_index(drop=True)
)

print(df["Category"].value_counts())

Category
Bakery                 120
Pet Supplies           120
Snacks                 120
Dairy & Eggs           108
Condiments & Sauces    107
Beverages               94
Produce                 62
Canned Goods            55
Meat & Seafood          43
Pasta & Grains          43
Frozen Foods            36
Household               35
Pantry                  18
Personal Care           16
Deli                    13
Category                 1
Name: count, dtype: int64


  .apply(lambda g: g.sample(min(len(g), CAP), random_state=42))


In [39]:
df = df[df["Category"] != "Category"].reset_index(drop=True)

# encode labels
categories = sorted(df["Category"].unique())
cat_to_idx = {c: i for i, c in enumerate(categories)}
idx_to_cat = {i: c for c, i in cat_to_idx.items()}

df["label"] = df["Category"].map(cat_to_idx)
# print(cat_to_idx)

In [40]:
# train/text split

X_train, X_test, y_train, y_test = train_test_split(
    df["Item"].values,
    df["label"].values,
    test_size=0.2,
    random_state=42,
    stratify=df["label"].values
)

In [41]:
# text vectoriser
max_tokens = 5000
sequence_length = 20
vectorizer = layers.TextVectorization(
    max_tokens=max_tokens,
    output_mode="int",
    output_sequence_length=sequence_length
)
vectorizer.adapt(X_train)

In [42]:
# define model 

num_classes = len(categories)

inputs = keras.Input(shape=(1,), dtype=tf.string)
x = vectorizer(inputs)
x = layers.Embedding(input_dim=max_tokens, output_dim=64)(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dense(64, activation="relu")(x)
x = layers.Dropout(0.2)(x)
outputs = layers.Dense(num_classes, activation="softmax")(x)

model = keras.Model(inputs, outputs)
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()


In [43]:
# train
history = model.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=20,
    batch_size=32
)

Epoch 1/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.1216 - loss: 2.6784 - val_accuracy: 0.1069 - val_loss: 2.6246
Epoch 2/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.1248 - loss: 2.5984 - val_accuracy: 0.1069 - val_loss: 2.5432
Epoch 3/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.1232 - loss: 2.5715 - val_accuracy: 0.1069 - val_loss: 2.5097
Epoch 4/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.1422 - loss: 2.5410 - val_accuracy: 0.1384 - val_loss: 2.4937
Epoch 5/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.1801 - loss: 2.5198 - val_accuracy: 0.1698 - val_loss: 2.4786
Epoch 6/20
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.1912 - loss: 2.4999 - val_accuracy: 0.1887 - val_loss: 2.4514
Epoch 7/20
[1m20/20[0m [32m━━━━━━━━━━

In [44]:
# evaluate 
test_loss, test_acc = model.evaluate(X_test, y_test)
print("Test accuracy:", test_acc)

[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.4596 - loss: 1.9254 
Test accuracy: 0.4595959484577179


In [45]:
# this is in the interface file (only here for testing) 

def predict_category(item_name: str, threshold=0.5):
    item_name = clean_text(item_name)

    x = tf.constant([[item_name]])
    probs = model.predict(x, verbose=0)[0]

    max_prob = float(probs.max())
    pred_idx = int(probs.argmax())

    if max_prob < threshold:
        return "Other", max_prob

    return idx_to_cat[pred_idx], max_prob


print(predict_category("oat milk"))
print(predict_category("chocolate biscuits"))
print(predict_category("sourdough bread"))
print(predict_category("toothpaste")) # personal care is not a category yet - should return 'other'
print(predict_category("oil")) 
print(predict_category("dry cat food")) 


('Other', 0.22215628623962402)
('Other', 0.27774450182914734)
('Other', 0.27907776832580566)
('Other', 0.1435113549232483)
('Other', 0.14941711723804474)
('Pet Supplies', 0.8857770562171936)


In [46]:
# save model
model.save("model.keras")

# save label map
with open("label_map.json", "w") as f:
    json.dump({int(k): v for k, v in idx_to_cat.items()}, f)