# Ingredient Line Classifier

This notebook builds a classifier to tag ingredient lines into 4 categories:
- `ingredient_only` — e.g., "Tomato"
- `ingredient_with_qty` — e.g., "Milk 200 ml"
- `instruction_like` — e.g., "Chop the onions"
- `non_food` — e.g., "Plastic wrap"


In [None]:
import re

import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline


In [None]:
# Read the labelled training split and the test split from disk
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv'))

# Show the shape and the full contents of each split (both are tiny)
print("Training data shape:", train_df.shape)
print("\nTraining data:", train_df, sep="\n")
print("\nTest data shape:", test_df.shape)
print("\nTest data:", test_df, sep="\n")


Training data shape: (12, 2)

Training data:
                     text                label
0                  Tomato      ingredient_only
1                   Onion      ingredient_only
2             Butter 50 g  ingredient_with_qty
3             Milk 200 ml  ingredient_with_qty
4         Chop the onions     instruction_like
5   Simmer for 10 minutes     instruction_like
6            Plastic wrap             non_food
7            Baking paper             non_food
8                  Eggs 2  ingredient_with_qty
9               Olive oil      ingredient_only
10        Slice the bread     instruction_like
11            Paper towel             non_food

Test data shape: (8, 2)

Test data:
                 text  label
0              Garlic    NaN
1          Sugar 20 g    NaN
2       Warm in a pan    NaN
3       Aluminum foil    NaN
4          Rice 150 g    NaN
5               Cumin    NaN
6  Stir for 2 minutes    NaN
7                Salt    NaN


In [None]:
# Count how many training rows carry each label, then list the distinct labels
label_column = train_df['label']
label_counts = label_column.value_counts()

print("Label distribution:")
print(label_counts)
print("\nUnique labels:", label_column.unique())


Label distribution:
label
ingredient_only        3
ingredient_with_qty    3
instruction_like       3
non_food               3
Name: count, dtype: int64

Unique labels: ['ingredient_only' 'ingredient_with_qty' 'instruction_like' 'non_food']


## Feature Engineering

Looking at the patterns:
- `ingredient_with_qty`: contains numbers/units (g, ml, kg, etc.)
- `instruction_like`: contains action verbs (chop, simmer, stir, etc.)
- `non_food`: items like "plastic wrap", "baking paper"
- `ingredient_only`: simple ingredient names

The approach below combines TF-IDF n-gram features with a few hand-written rule-based features, since the training set is very small.


In [4]:
# Explore the training texts for the surface patterns listed above.

# A number followed by a unit. IGNORECASE already covers upper-case units, and
# the trailing \b keeps "l"/"g" from matching the first letter of a longer word
# (e.g. "2 large eggs").
quantity_pattern = re.compile(r'\d+\s*(?:g|ml|kg|l)\b', re.IGNORECASE)

print("Examples with quantities:")
for text in train_df['text']:
    if quantity_pattern.search(text):
        print(f"  {text}")

print("\nExamples that look like instructions:")
instruction_keywords = ['chop', 'simmer', 'stir', 'slice', 'warm', 'mix']
# Match keywords as whole words so that e.g. "mixed" does not count as "mix".
instruction_pattern = re.compile(
    r'\b(?:' + '|'.join(instruction_keywords) + r')\b', re.IGNORECASE
)
for text in train_df['text']:
    if instruction_pattern.search(text):
        print(f"  {text}")


Examples with quantities:
  Butter 50 g
  Milk 200 ml

Examples that look like instructions:
  Chop the onions
  Simmer for 10 minutes
  Slice the bread


In [5]:
# Pull the raw text and label columns out of the dataframes as arrays
X_train, y_train = train_df['text'].values, train_df['label'].values
X_test = test_df['text'].values

n_train, n_test = len(X_train), len(X_test)
print(f"Training on {n_train} examples")
print(f"Predicting on {n_test} examples")


Training on 12 examples
Predicting on 8 examples


In [16]:
# Hand-written rule-based features
# With so few training examples, simple rule-based signals may help the model

def has_quantity(text):
    """Return 1 if ``text`` contains a number followed by a weight/volume unit, else 0.

    Recognised units are g, kg, ml, l, gram and liter, each with an optional
    plural "s". Matching is case-insensitive and whitespace between the number
    and the unit is optional ("50 g" and "50g" both match).

    The unit must end at a word boundary, so a digit that merely precedes a word
    beginning with a unit letter ("2 large eggs", "3 garlic cloves") is not
    treated as a quantity. A bare count with no unit ("Eggs 2") does not match.

    Args:
        text: Ingredient line to inspect.

    Returns:
        int: 1 when a number + unit pair is found, otherwise 0.
    """
    # Trailing \b: without it "l" and "g" match the first letter of any word.
    pattern = r'\d+\s*(?:kg|g|ml|l|gram|liter)s?\b'
    return 1 if re.search(pattern, text.lower()) else 0

def has_instruction_verb(text):
    """Return 1 if ``text`` contains a cooking-instruction verb as a whole word, else 0.

    Matching is case-insensitive. Verbs are matched as complete words rather
    than substrings, so ingredient names that merely contain a verb
    ("wheat" contains "heat", "cookies" contains "cook", "chopped tomatoes"
    contains "chop") are not flagged as instructions.

    Args:
        text: Ingredient line to inspect.

    Returns:
        int: 1 when one of the listed verbs appears as a word, otherwise 0.
    """
    verbs = ['chop', 'simmer', 'stir', 'slice', 'warm', 'mix', 'cook', 'heat', 'boil', 'fry']
    # \b on both sides restricts each verb to a standalone word.
    pattern = r'\b(?:' + '|'.join(verbs) + r')\b'
    return 1 if re.search(pattern, text.lower()) else 0

def has_non_food_keyword(text):
    """Return 1 if the lower-cased ``text`` contains any non-food keyword, else 0.

    Keywords are matched as plain substrings, so a keyword embedded in a longer
    word also counts.

    NOTE(review): 'baking' also occurs in food names such as "baking powder",
    which this function therefore flags as well.
    """
    lowered = text.lower()
    for keyword in ('wrap', 'paper', 'foil', 'towel', 'plastic', 'baking', 'aluminum'):
        if keyword in lowered:
            return 1
    return 0

# Build one row of three binary rule flags per training text:
# [has quantity, has instruction verb, has non-food keyword]
X_train_features = np.array([
    [has_quantity(text), has_instruction_verb(text), has_non_food_keyword(text)]
    for text in X_train
])

print("Rule-based features shape:", X_train_features.shape)
print("Sample features:", X_train_features[:3])


Rule-based features shape: (12, 3)
Sample features: [[0 0 0]
 [0 0 0]
 [1 0 0]]


In [None]:
# Build TF-IDF features and fit a classifier on rule-based + TF-IDF features.
# LogisticRegression, hstack and csr_matrix are imported in the imports cell at
# the top of the notebook, so this cell runs after a kernel restart.

# Unigram + bigram TF-IDF over the raw training text
tfidf = TfidfVectorizer(ngram_range=(1, 2), lowercase=True, min_df=1)
X_train_tfidf = tfidf.fit_transform(X_train)

print("TF-IDF features shape:", X_train_tfidf.shape)

# Combine rule-based and TF-IDF features: rule columns first, TF-IDF columns
# after. The test matrix is assembled in the same column order.
X_train_combined = hstack([csr_matrix(X_train_features), X_train_tfidf])
print("Combined features shape:", X_train_combined.shape)

# Logistic regression with balanced class weights and a fixed seed
model = LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42)
model.fit(X_train_combined, y_train)
print("Model trained!")


TF-IDF features shape: (12, 38)
Combined features shape: (12, 41)
Model trained!


In [18]:
# Score the model on the same data it was fit on (not a held-out estimate)
y_train_pred = model.predict(X_train_combined)
train_report = classification_report(y_train, y_train_pred)

print("Training set performance:")
print(train_report)


Training set performance:
                     precision    recall  f1-score   support

    ingredient_only       1.00      1.00      1.00         3
ingredient_with_qty       1.00      1.00      1.00         3
   instruction_like       1.00      1.00      1.00         3
           non_food       1.00      1.00      1.00         3

           accuracy                           1.00        12
          macro avg       1.00      1.00      1.00        12
       weighted avg       1.00      1.00      1.00        12



In [19]:
# Assemble the test matrix with the same layout as training:
# three rule-based columns followed by the TF-IDF columns
X_test_features = np.array([
    [has_quantity(text), has_instruction_verb(text), has_non_food_keyword(text)]
    for text in X_test
])
X_test_tfidf = tfidf.transform(X_test)  # transform only: reuse the fitted vectorizer
X_test_combined = hstack([csr_matrix(X_test_features), X_test_tfidf])

# Predict a label for every test line
y_test_pred = model.predict(X_test_combined)

# Pair each test text with its predicted label
predictions_df = pd.DataFrame({'text': X_test, 'pred': y_test_pred})

print("Predictions:")
print(predictions_df)


Predictions:
                 text                 pred
0              Garlic      ingredient_only
1          Sugar 20 g  ingredient_with_qty
2       Warm in a pan     instruction_like
3       Aluminum foil             non_food
4          Rice 150 g  ingredient_with_qty
5               Cumin      ingredient_only
6  Stir for 2 minutes     instruction_like
7                Salt      ingredient_only


In [20]:
# Cross-check each prediction against the rule-based signals
print("\nChecking predictions:")
for i, (text, pred) in enumerate(zip(X_test, y_test_pred)):
    print(f"{i+1}. '{text}' -> {pred}")

    # A rule that fires while the model chose a different label is worth a look
    qty_conflict = pred != 'ingredient_with_qty' and has_quantity(text)
    verb_conflict = pred != 'instruction_like' and has_instruction_verb(text)
    non_food_conflict = pred != 'non_food' and has_non_food_keyword(text)

    if qty_conflict:
        print(f"   (has quantity but predicted as {pred})")
    if verb_conflict:
        print(f"   (has instruction verb but predicted as {pred})")
    if non_food_conflict:
        print(f"   (has non-food keyword but predicted as {pred})")



Checking predictions:
1. 'Garlic' -> ingredient_only
2. 'Sugar 20 g' -> ingredient_with_qty
3. 'Warm in a pan' -> instruction_like
4. 'Aluminum foil' -> non_food
5. 'Rice 150 g' -> ingredient_with_qty
6. 'Cumin' -> ingredient_only
7. 'Stir for 2 minutes' -> instruction_like
8. 'Salt' -> ingredient_only


In [None]:
# Write the text/pred table to disk without the dataframe index column
output_path = 'predictions.csv'
predictions_df.to_csv(output_path, index=False)
print("Saved predictions to predictions.csv")
