<a href="https://colab.research.google.com/github/crunchdomo/llm_conversation/blob/main/preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Collecting joblib (from nltk)
  Downloading joblib-1.5.0-py3-none-any.whl.metadata (5.6 kB)
Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 1.5/1.5 MB 27.0 MB/s eta 0:00:00
Downloading joblib-1.5.0-py3-none-any.whl (307 kB)
Installing collected packages: joblib, nltk

   ---------------------------------------- 0/2 [joblib]
   -------------------- ------------------- 1/2 [nltk]
   -------------------- ------------------- 1/2 [nltk]
   -------------------- ------------------- 1/2 [nltk]
   -------------------- ------------------- 1/2 [nltk]
   -------------------- ------------------- 1/2 [nltk]
   -------------------- ------------------- 1/2 [nltk]
   -------------------- ------------------- 1/2 [nltk]
   ---------------------------------------- 2/2 [nltk]

Successfully installed joblib-1.5.0 nltk-3.

In [10]:
import pandas as pd
import re

# Read the raw recipe table from the working directory
df = pd.read_csv('13k-recipes.csv')

# Remove the columns that the cells below do not read
columns_to_drop = ['Ingredients', 'Image_Name']
df = df.drop(columns=columns_to_drop)

# Function to convert imperial units to metric
def convert_to_metric(ingredient):
    """Rewrite cup / tablespoon / teaspoon quantities in a string as millilitres.

    Conversion factors: 1 cup = 240 ml, 1 tablespoon = 15 ml, 1 teaspoon = 5 ml.

    Recognised quantities are integers ("2"), decimals ("1.5"), simple
    fractions ("3/4") and mixed numbers ("1 1/2"). Unit names are matched as
    whole words, case-insensitively, so "2 cupcakes" is left untouched.
    Ranges such as "1-2 cups" are not treated specially: only the number
    directly in front of the unit is converted.

    Args:
        ingredient: Ingredient text. Non-string values (e.g. the NaN pandas
            uses for missing cells) are returned unchanged.

    Returns:
        The text with every recognised quantity replaced by "<amount> ml".
        Amounts are rounded to two decimals and printed without trailing
        zeros ("480 ml", "1.25 ml").
    """
    if not isinstance(ingredient, str):
        # re.sub would raise TypeError on NaN/None; pass such values through.
        return ingredient

    # Mixed number, bare fraction, or integer/decimal. The lookbehind keeps a
    # match from starting in the middle of a longer number ("1/2", "0.5"), and
    # the denominator pattern rules out division by zero.
    quantity = r'(?<![\d./])(\d+\s+\d+/[1-9]\d*|\d+/[1-9]\d*|\d+(?:\.\d+)?)'
    ml_per_unit = (('cups?', 240), ('tablespoons?', 15), ('teaspoons?', 5))

    def to_number(text):
        # Sum the whitespace-separated parts so "1 1/2" evaluates to 1.5.
        total = 0.0
        for part in text.split():
            if '/' in part:
                top, bottom = part.split('/')
                total += int(top) / int(bottom)
            else:
                total += float(part)
        return total

    def format_ml(amount):
        # Two decimals at most, without trailing zeros: 480.0 -> "480".
        return f"{amount:.2f}".rstrip('0').rstrip('.')

    for unit, factor in ml_per_unit:
        pattern = re.compile(quantity + r'\s*' + unit + r'\b', re.IGNORECASE)
        ingredient = pattern.sub(
            lambda m, factor=factor: f"{format_ml(to_number(m.group(1)) * factor)} ml",
            ingredient,
        )
    return ingredient

# Run the unit conversion over every entry of the ingredient column
metric_ingredients = df['Cleaned_Ingredients'].apply(convert_to_metric)
df['Cleaned_Ingredients'] = metric_ingredients

# Write the converted table to a new CSV, leaving out the row index
df.to_csv('ingredients_preprocessed.csv', index=False)


In [11]:
import spacy
from spacy.lang.en import English

# Rule-based sentence splitting (lightweight): start from the English language
# class and attach only the "sentencizer" component.
nlp = English()
# Kept as the last expression of the cell, so the notebook displays the
# component object that add_pipe returns.
nlp.add_pipe("sentencizer")

# Alternative for statistical accuracy (requires a model download):
# !python -m spacy download en_core_web_sm
# nlp = spacy.load("en_core_web_sm")


<spacy.pipeline.sentencizer.Sentencizer at 0x18e97ee1f50>

In [20]:
import pandas as pd
import re
import nltk
# sent_tokenize reads the "punkt_tab" tables on recent NLTK releases (the
# version installed above is 3.9.1) and the legacy "punkt" package on older
# ones; fetch both so the tokenizer works with either.
nltk.download("punkt")
nltk.download("punkt_tab")
from nltk.tokenize import sent_tokenize
import json

# Load the CSV written by the unit-conversion cell (read from the working directory)
df = pd.read_csv("ingredients_preprocessed.csv")

def ingredients_to_list(ingredients_str):
    """Turn one ingredient cell into a list of ingredient strings.

    Two input shapes are handled:

    * A stringified Python list such as "['1 cup sugar, divided', '2 eggs']"
      is parsed as a literal, so commas inside an item do not split it and no
      brackets or quotes leak into the result.
    * Anything else is split on commas and semicolons.

    Args:
        ingredients_str: Raw cell value. Non-string values (e.g. NaN) give [].

    Returns:
        A list of stripped, non-empty ingredient strings.
    """
    import ast

    if not isinstance(ingredients_str, str):
        return []

    text = ingredients_str.strip()
    if text.startswith('[') and text.endswith(']'):
        try:
            # literal_eval only evaluates literals, never arbitrary code.
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            parsed = None  # not a valid literal; use the delimiter split below
        if isinstance(parsed, (list, tuple)):
            cleaned = (str(entry).strip() for entry in parsed)
            return [entry for entry in cleaned if entry]

    items = re.split(r',|;', ingredients_str)
    return [item.strip() for item in items if item.strip()]

def merge_steps(sentences):
    """Group consecutive sentences into instruction steps.

    Each sentence has its runs of whitespace collapsed to single spaces and is
    trimmed. It is then either attached to the step currently being built or
    starts a new step, as decided by should_merge. The first sentence always
    opens the first step.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A list of step strings, each one the space-joined sentences of a group.
    """
    groups = []  # each entry holds the sentences that make up one step
    for raw in sentences:
        cleaned = re.sub(r'\s+', ' ', raw).strip()
        if groups and should_merge(cleaned, groups[-1]):
            groups[-1].append(cleaned)
        else:
            groups.append([cleaned])
    return [" ".join(group) for group in groups]

def should_merge(sent, current_step):
    """Decide whether `sent` continues the step being built.

    Args:
        sent: Candidate sentence (callers pass it whitespace-normalised).
        current_step: Sentences already collected for the current step.

    Returns:
        False when there is no current step to attach to. Otherwise True when
        the sentence is empty, starts with a lowercase letter, has fewer than
        five words, or opens with "and ", "or ", "then " or "but ".
    """
    if not current_step:
        return False
    if not sent:
        # An empty fragment cannot open a step, and sent[0] below would raise
        # IndexError, so fold it into the current step.
        return True
    return (sent[0].islower() or
            len(sent.split()) < 5 or
            sent.startswith(('and ', 'or ', 'then ', 'but ')))


# NOTE(review): these imports and the punkt download repeat the ones at the top
# of this cell; the second download only re-checks that the package is current.
import re
import nltk
nltk.download("punkt")
from nltk.tokenize import sent_tokenize

def extract_steps(instructions, max_steps=20):
    """Split free-text recipe instructions into a list of steps.

    Three strategies are tried in order, and the first that yields more than
    one step wins:

    1. Explicit numbering at the start of a line: "1.", "2)", or "Step 3"
       optionally followed by ".", ")" or ":". The marker and its trailing
       punctuation are removed from the step text. A line that starts with a
       decimal quantity such as "1.5 cups" is not treated as a marker.
    2. Paragraphs separated by one or more blank lines.
    3. Sentence tokenisation with sent_tokenize, where a sentence is appended
       to the previous step if it has fewer than eight words, starts with a
       lowercase letter, or opens with "and ", "or ", "then ", "but " or "(".

    If a strategy produces more than `max_steps` steps, the surplus is folded
    into the last step by merge_excess_steps.

    Args:
        instructions: Raw instruction text. Non-strings and texts shorter than
            10 characters after stripping give [].
        max_steps: Upper bound on the number of returned steps.

    Returns:
        A list of step strings.
    """
    if not isinstance(instructions, str) or len(instructions.strip()) < 10:
        return []

    text = instructions.strip()

    # 1. Try to split by numbered steps.
    #    (?!\d) keeps "1.5 cups" at the start of a line from being read as "1.";
    #    [\.\):]? consumes the punctuation that follows "Step N".
    numbered = re.split(
        r'(?:^|\n)\s*(?:\d+[\.\)](?!\d)|Step\s*\d+[\.\):]?)\s*', text)
    steps = [s.strip() for s in numbered if s.strip()]
    if len(steps) > 1:
        return merge_excess_steps(steps, max_steps) if len(steps) > max_steps else steps

    # 2. Try to split by paragraphs
    paragraphs = [p.strip() for p in re.split(r'\n{2,}', text) if p.strip()]
    if len(paragraphs) > 1:
        return merge_excess_steps(paragraphs, max_steps) if len(paragraphs) > max_steps else paragraphs

    # 3. Fallback: sentence tokenization and aggressive merging
    sentences = sent_tokenize(text)
    merged = []
    current = ""
    for sent in sentences:
        # The word-count test comes first so an empty sentence never reaches sent[0].
        continues_previous = (
            len(sent.split()) < 8
            or sent[0].islower()
            or sent.startswith(('and ', 'or ', 'then ', 'but ', '('))
        )
        if continues_previous:
            current += " " + sent
        else:
            if current:
                merged.append(current.strip())
            current = sent
    if current:
        merged.append(current.strip())

    # If too many steps, merge the last ones
    return merge_excess_steps(merged, max_steps) if len(merged) > max_steps else merged

def merge_excess_steps(steps, max_steps):
    """Cap a list of steps at `max_steps` by joining the surplus into the last one.

    Args:
        steps: List of step strings.
        max_steps: Maximum number of steps allowed in the result.

    Returns:
        `steps` itself when it already fits. Otherwise a new list holding the
        first `max_steps - 1` steps followed by one step that is the
        space-joined remainder.

    Raises:
        ValueError: If the list has to be shortened but `max_steps` is below 1.
            Slicing with a zero or negative limit would return more steps than
            requested instead of capping them.
    """
    if len(steps) <= max_steps:
        return steps
    if max_steps < 1:
        raise ValueError(f"max_steps must be at least 1, got {max_steps}")
    merged = steps[:max_steps - 1]
    merged.append(' '.join(steps[max_steps - 1:]))
    return merged



def process_instructions(text):
    """Tokenise instruction text into sentences and group them into steps.

    Args:
        text: Raw instruction text. Any non-string value gives [].

    Returns:
        The list produced by merge_steps for the sentences that sent_tokenize
        finds in `text`.
    """
    return merge_steps(sent_tokenize(text)) if isinstance(text, str) else []

# Build one record per recipe: title, ingredient list, and steps keyed 1..n
structured_recipes = []
for idx, row in df.iterrows():
    steps = extract_steps(row["Instructions"])
    structured_recipes.append({
        "title": row["Title"],
        "ingredients": ingredients_to_list(row["Cleaned_Ingredients"]),
        # Integer keys here are written as strings ("1", "2", ...) in the JSON file.
        "instructions": dict(enumerate(steps, start=1)),
    })

# Write every record to a single UTF-8 JSON file, keeping non-ASCII characters as-is
with open("structured_recipes.json", "w", encoding="utf-8") as out_file:
    json.dump(structured_recipes, out_file, ensure_ascii=False, indent=4)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\oenfa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\oenfa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
