## imports, again... 

In [21]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import pickle

## load processed data from previous step 

In [22]:
df = pd.read_csv("../data/processed/groceries_processed.csv")

print("Data preview:")
display(df.head())

Data preview:


Unnamed: 0,item,category,item_clean,category_id
0,apples,Fruit & Vegetables,apple,4
1,bananas,Fruit & Vegetables,banana,4
2,oranges,Fruit & Vegetables,orange,4
3,spinach,Fruit & Vegetables,spinach,4
4,carrots,Fruit & Vegetables,carrot,4


Set up inputs / lables

In [23]:
X_raw = df['item_clean'].values
y = df['category_id'].values

## Vectorising the text - Adding n-grams


In [24]:
vectorizer = TfidfVectorizer(ngram_range=(1,2))
X = vectorizer.fit_transform(X_raw).toarray()

print(f"Input shape: {X.shape}")
print(f"Number of categories: {len(np.unique(y))}")

Input shape: (50, 93)
Number of categories: 9


## Building model with tensorflow!

In [25]:
num_categories = len(np.unique(y))
input_dim = X.shape[1]

model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(input_dim,)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(num_categories, activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

## Train the new model on ALL the data

In [7]:
history = model.fit(
    X, y,
    epochs=50,   # adjust for dataset size
    batch_size=8,
    verbose=1
)

Epoch 1/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.1000 - loss: 2.1992  
Epoch 2/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.1200 - loss: 2.1684 
Epoch 3/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.2400 - loss: 2.1450     
Epoch 4/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.2600 - loss: 2.1218 
Epoch 5/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.3000 - loss: 2.1005 
Epoch 6/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.4000 - loss: 2.0785 
Epoch 7/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.4400 - loss: 2.0550 
Epoch 8/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5200 - loss: 2.0309 
Epoch 9/50
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[

## Save the trained model and vectorize!

In [26]:
model.save("../models/grocery_classifier_model.keras")

import pickle
with open("../models/vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)

print("Model and vectorizer saved in models/")


Model and vectorizer saved in models/
