In [None]:
# Source of data/data.csv: https://huggingface.co/datasets/AmirMohseni/GroceryList/viewer/default/train

In [82]:
import numpy as np
import tensorflow as tf
import keras as keras
import pandas as pd
import json
from keras import layers
from sklearn.model_selection import train_test_split
# print(keras.__version__)
# keras.backend.backend()


In [67]:
# Category labels that are retained for training; the loading cell drops
# every row whose "Category" value is not in this set.
KEEP_CATEGORIES = set(
    ["Snacks", "Bakery", "Dairy & Eggs", "Pantry", "Beverages"]
)

In [68]:
# basic text normalisation
def clean_text(s: str) -> str:
    """Return ``s`` lower-cased with whitespace normalised.

    The value is first coerced with ``str()``, so non-string input is accepted
    (a missing value such as NaN therefore becomes the literal text "nan").
    Leading/trailing whitespace is removed and every internal run of
    whitespace is collapsed to a single space.
    """
    # split() with no arguments discards leading/trailing whitespace and
    # treats any run of whitespace as one separator.
    words = str(s).lower().split()
    return " ".join(words)

# read both raw sources
# data.csv is given explicit column names; off_grocery_dataset.csv is expected
# to provide its own "Item" / "Category" header.
df_basic = pd.read_csv("data/data.csv", names=["Item", "Category"])
df_off = pd.read_csv("data/off_grocery_dataset.csv")

# normalise the text columns of each frame in place
for frame in (df_basic, df_off):
    frame["Item"] = frame["Item"].apply(clean_text)
    frame["Category"] = frame["Category"].str.strip()

# restrict each source to the categories listed in KEEP_CATEGORIES
df_basic = df_basic.loc[df_basic["Category"].isin(KEEP_CATEGORIES)]
df_off = df_off.loc[df_off["Category"].isin(KEEP_CATEGORIES)]

# stack the two sources, then drop repeated (Item, Category) pairs
df = (
    pd.concat([df_basic, df_off], ignore_index=True)
    .drop_duplicates(subset=["Item", "Category"])
    .reset_index(drop=True)
)

# quick sanity check of the class balance and dataset size
print(df["Category"].value_counts())
print("Total rows:", len(df))


Category
Snacks          186
Bakery          168
Dairy & Eggs    122
Pantry          104
Beverages        97
Name: count, dtype: int64
Total rows: 677


In [69]:
CAP = 120  # maximum number of rows kept per category

# Down-sample over-represented categories so that none contributes more than
# CAP rows; categories with fewer rows are kept in full.
#
# The groups are sampled by iterating over the groupby rather than through
# groupby(...).apply(...): apply() operating on the grouping column is
# deprecated in pandas, and once the grouping column is excluded the
# "Category" column would be missing from the result. Iterating yields each
# group with all of its columns, in sorted key order, so the rows selected
# (and their order) match what apply() produced.
df = pd.concat(
    [
        group.sample(n=min(len(group), CAP), random_state=42)
        for _, group in df.groupby("Category")
    ]
).reset_index(drop=True)

print(df["Category"].value_counts())

Category
Bakery          120
Dairy & Eggs    120
Snacks          120
Pantry          104
Beverages        97
Name: count, dtype: int64


  .apply(lambda g: g.sample(min(len(g), CAP), random_state=42))


In [None]:
# encode labels
# Category names are sorted before indices are assigned, so the
# name <-> index mapping is the same on every run.
categories = sorted(df["Category"].unique())
idx_to_cat = dict(enumerate(categories))
cat_to_idx = {name: idx for idx, name in idx_to_cat.items()}

# integer class id for every row, looked up from its category name
df["label"] = df["Category"].map(cat_to_idx)

{'Bakery': 0, 'Beverages': 1, 'Dairy & Eggs': 2, 'Pantry': 3, 'Snacks': 4}


In [71]:
# train/test split

# Hold out 20% of the rows for testing. The split is stratified on the label
# so both parts keep the same category proportions, and the fixed
# random_state makes it repeatable.
item_texts = df["Item"].values
label_ids = df["label"].values

X_train, X_test, y_train, y_test = train_test_split(
    item_texts,
    label_ids,
    test_size=0.2,
    stratify=label_ids,
    random_state=42,
)

In [72]:
# text vectoriser
max_tokens = 5000       # upper bound on the vocabulary size
sequence_length = 20    # every item is padded/truncated to this many token ids

# Maps each raw item string to a fixed-length sequence of integer token ids.
vectorizer = layers.TextVectorization(
    output_mode="int",
    max_tokens=max_tokens,
    output_sequence_length=sequence_length,
)

# Build the vocabulary from the training texts only (the test split is not seen).
vectorizer.adapt(X_train)

In [73]:
# define model

# Seed the Python, NumPy and backend random generators before any weights are
# created. The data split and sampling cells already pin random_state=42;
# without this call the weight initialisation and dropout differ on every run,
# so the reported accuracy cannot be reproduced.
keras.utils.set_random_seed(42)

num_classes = len(categories)

# The model takes one raw string per example; vectorisation happens inside the
# model, so the saved model can be fed plain text at inference time.
inputs = keras.Input(shape=(1,), dtype=tf.string)
x = vectorizer(inputs)
# Learn a 64-dimensional embedding for each token id.
x = layers.Embedding(input_dim=max_tokens, output_dim=64)(x)
# Average the token embeddings into a single fixed-size vector per item.
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dense(64, activation="relu")(x)
x = layers.Dropout(0.2)(x)
# One probability per category.
outputs = layers.Dense(num_classes, activation="softmax")(x)

model = keras.Model(inputs, outputs)
model.compile(
    optimizer="adam",
    # labels are integer class ids (not one-hot), hence the sparse loss
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"]
)

model.summary()


In [78]:
# train
# NOTE: fit() continues from the model's current weights, so re-running this
# cell keeps training the same model. Re-run the model-definition cell first
# to start from freshly initialised weights.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=20,
    validation_split=0.2,  # fraction of the training arrays held out for validation
)

Epoch 1/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.8408 - loss: 0.9060 - val_accuracy: 0.6333 - val_loss: 1.1881
Epoch 2/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8464 - loss: 0.8287 - val_accuracy: 0.6667 - val_loss: 1.1467
Epoch 3/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9134 - loss: 0.7460 - val_accuracy: 0.6111 - val_loss: 1.1213
Epoch 4/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.8799 - loss: 0.6977 - val_accuracy: 0.6556 - val_loss: 1.0970
Epoch 5/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9078 - loss: 0.6355 - val_accuracy: 0.6667 - val_loss: 1.0669
Epoch 6/20
[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.9162 - loss: 0.5715 - val_accuracy: 0.6667 - val_loss: 1.0246
Epoch 7/20
[1m12/12[0m [32m━━━━━━━━━━

In [79]:
# evaluate
# Score the model on the held-out test split; evaluate() returns the loss
# followed by the compiled metrics (accuracy here).
test_metrics = model.evaluate(x=X_test, y=y_test)
test_loss, test_acc = test_metrics
print("Test accuracy:", test_acc)

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7257 - loss: 0.7117 
Test accuracy: 0.7256637215614319


In [None]:
# predict_category lives in the interface file; this commented-out copy is kept here only for ad-hoc testing

# def predict_category(item_name: str, threshold=0.5):
#     item_name = clean_text(item_name)

#     x = tf.constant([[item_name]])
#     probs = model.predict(x, verbose=0)[0]

#     max_prob = float(probs.max())
#     pred_idx = int(probs.argmax())

#     if max_prob < threshold:
#         return "Other", max_prob

#     return idx_to_cat[pred_idx], max_prob


# print(predict_category("oat milk"))
# print(predict_category("chocolate biscuits"))
# print(predict_category("sourdough bread"))
# print(predict_category("toothpaste"))  # personal care is not a category yet - should return "Other"
# print(predict_category("oil")) 


In [None]:
# persist the trained model (native Keras format)
model.save("model.keras")

# persist the index -> category-name mapping next to the model
# Keys are coerced to plain ints before dumping; JSON itself stores object
# keys as strings, so a consumer must convert them back to int after loading.
label_map = {int(idx): name for idx, name in idx_to_cat.items()}
with open("label_map.json", "w") as fh:
    json.dump(label_map, fh)