<a href="https://colab.research.google.com/github/crunchdomo/llm_conversation/blob/main/preprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import pandas as pd
import re

# Columns removed right after loading because nothing below reads them
columns_to_drop = ['Ingredients', 'Image_Name']

# Read the raw recipe table and rebind `df` to a copy without those columns
df = pd.read_csv('13k-recipes.csv')
df = df.drop(columns=columns_to_drop)

# Millilitres per imperial volume unit (1 cup = 240 ml, 1 tbsp = 15 ml, 1 tsp = 5 ml)
_ML_PER_UNIT = {'cup': 240, 'tablespoon': 15, 'teaspoon': 5}

# A quantity (mixed number, simple fraction, or integer/decimal) followed by a
# unit name. The lookbehind stops a match from starting in the middle of a
# number (e.g. at the "2" of "1/2"); the trailing \b keeps "cup" from matching
# inside longer words such as "cupcakes".
_QUANTITY_UNIT_RE = re.compile(
    r'(?<![\d/.])'
    r'(\d+\s+\d+/\d+|\d+/\d+|\d+(?:\.\d+)?|\.\d+)'
    r'\s*(cup|tablespoon|teaspoon)s?\b',
    re.IGNORECASE,
)


def _quantity_to_ml(match):
    """Build the "<n> ml" replacement text for one quantity+unit regex match."""
    quantity = 0.0
    # A mixed number such as "1 1/2" arrives as two whitespace-separated parts.
    for part in match.group(1).split():
        if '/' in part:
            numerator, denominator = part.split('/')
            if int(denominator) == 0:
                # Malformed fraction: leave the original text untouched.
                return match.group(0)
            quantity += int(numerator) / int(denominator)
        else:
            quantity += float(part)

    millilitres = round(quantity * _ML_PER_UNIT[match.group(2).lower()], 2)
    if millilitres == int(millilitres):
        return f"{int(millilitres)} ml"
    # Keep at most two decimals and drop trailing zeros ("22.50" -> "22.5").
    return f"{millilitres:.2f}".rstrip('0').rstrip('.') + " ml"


# Function to convert imperial units to metric
def convert_to_metric(ingredient):
    """Replace cup / tablespoon / teaspoon quantities in a string with millilitres.

    Integer, decimal ("1.5"), fractional ("1/2") and mixed ("1 1/2") quantities
    are converted; unit names are matched case-insensitively and only as whole
    words, so text such as "2 cupcakes" is left alone.

    Args:
        ingredient: Ingredient text. Non-string values (e.g. None or NaN from a
            missing cell) are returned unchanged instead of raising.

    Returns:
        The text with every recognised quantity rewritten as "<n> ml", or the
        input itself when it is not a string.
    """
    if not isinstance(ingredient, str):
        return ingredient
    return _QUANTITY_UNIT_RE.sub(_quantity_to_ml, ingredient)

# Rewrite the ingredient text column with metric volumes
df = df.assign(
    Cleaned_Ingredients=df['Cleaned_Ingredients'].apply(convert_to_metric)
)

# Persist the processed table without the row index
df.to_csv('ingredients_preprocessed.csv', index=False)


In [7]:
import spacy
from spacy.lang.en import English

# Rule-based sentence splitting (lightweight): create an English pipeline and
# add only the "sentencizer" component to it. add_pipe is the last expression
# in the cell, so its return value is what the cell displays.
nlp = English()
nlp.add_pipe("sentencizer")

# Alternative for statistical accuracy (requires a model download); left
# disabled:
# !python -m spacy download en_core_web_sm
# nlp = spacy.load("en_core_web_sm")


<spacy.pipeline.sentencizer.Sentencizer at 0x7c12ee39bc50>

In [16]:
import pandas as pd
import spacy
from spacy.lang.en import English
import nltk

# Fetch the NLTK tokenizer data. 'punkt_tab' is requested in addition to
# 'punkt' because the tokenizer error specifically asked for that resource.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource)

from nltk.tokenize import sent_tokenize

# Reload the table written by the preprocessing cell
df = pd.read_csv("ingredients_preprocessed.csv")

# Lightweight rule-based sentence splitter
nlp = English()
nlp.add_pipe("sentencizer")

# Statistical alternative (requires model download); left disabled:
# !python -m spacy download en_core_web_sm
# nlp = spacy.load("en_core_web_sm")


def split_with_nltk(text):
    """Split ``text`` into sentences with NLTK's ``sent_tokenize``.

    Any value that is not a ``str`` yields an empty list instead of being
    passed to the tokenizer.
    """
    if not isinstance(text, str):
        return []
    return sent_tokenize(text)


def split_with_spacy(text):
    """Split ``text`` into sentences using the module-level pipeline ``nlp``.

    Returns the text of each sentence span in document order. Any value that
    is not a ``str`` yields an empty list.
    """
    if not isinstance(text, str):
        return []
    return [span.text for span in nlp(text).sents]

# No intermediate split_with_nltk pass is applied here: its result was stored
# in df["Instruction_Steps"], which the process_instructions() assignment
# further down overwrites before anything reads it, so the extra pass over
# every row was wasted work. The NLTK download and the `nlp` pipeline are
# already set up at the top of this cell and are not repeated either.

# Re-imports kept so the merging helpers below have `re` (and the other
# modules) in scope even when this cell is run on its own.
import pandas as pd
import spacy
from spacy.lang.en import English
import nltk
import re

def merge_steps(sentences):
    """Group tokenized sentences into recipe steps.

    Each sentence has runs of whitespace collapsed to a single space and is
    stripped. ``should_merge`` then decides whether the sentence is appended
    to the step currently being built or starts a new one. Sentences that are
    empty after cleaning are skipped.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        list[str]: One string per step, its sentences joined by single spaces.
    """
    merged = []
    current_step = []

    for sent in sentences:
        # Clean up whitespace
        sent = re.sub(r'\s+', ' ', sent).strip()

        # A blank fragment carries no instruction and would otherwise surface
        # as an empty step (or as stray padding inside one).
        if not sent:
            continue

        if should_merge(sent, current_step):
            current_step.append(sent)
        else:
            # Close the step in progress before starting a new one.
            if current_step:
                merged.append(" ".join(current_step))
            current_step = [sent]

    # Flush the final step.
    if current_step:
        merged.append(" ".join(current_step))

    return merged

def should_merge(sent, current_step, min_words=5):
    """Decide whether ``sent`` continues the step currently being built.

    Args:
        sent: Candidate sentence (string).
        current_step: Sentences collected so far for the current step.
        min_words: Sentences with fewer whitespace-separated words than this
            are treated as short continuations. Defaults to 5.

    Returns:
        bool: False when ``current_step`` is empty (nothing to merge into).
        Otherwise True if the sentence starts with a lowercase character, has
        fewer than ``min_words`` words, or starts with one of the listed
        conjunctions.
    """
    if not current_step:
        return False
    # sent[:1] instead of sent[0] so an empty string does not raise IndexError;
    # an empty sentence then falls through to the word-count rule.
    # NOTE(review): every listed conjunction prefix is lowercase, so the
    # startswith check is already covered by the lowercase test. If "And " /
    # "Then " were meant to merge too, this needs a case-insensitive match —
    # confirm intent.
    return (sent[:1].islower() or
            len(sent.split()) < min_words or
            sent.startswith(('and ', 'or ', 'then ', 'but ')))

# Updated processing pipeline
def process_instructions(text):
    """Turn one instructions string into a list of merged recipe steps.

    The text is split into sentences with NLTK's ``sent_tokenize`` and the
    sentences are then grouped by ``merge_steps``. Any value that is not a
    ``str`` yields an empty list.
    """
    if isinstance(text, str):
        return merge_steps(sent_tokenize(text))
    return []

# Build the merged step list for every recipe, then a labelled copy of it
df["Instruction_Steps"] = df["Instructions"].apply(process_instructions)
df['Numbered_Steps'] = df['Instruction_Steps'].apply(
    lambda steps: [f"Step {number}: {step}" for number, step in enumerate(steps, start=1)]
)

# Show the numbered steps for the row labelled 0
print(df.loc[0, 'Numbered_Steps'])


# Only one splitter should populate "Instruction_Steps". To try the spaCy
# sentencizer instead of the NLTK-based pipeline, replace the first assignment
# above with the line below:
# df["Instruction_Steps"] = df["Instructions"].astype(str).apply(split_with_spacy)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


['Step 1: Pat chicken dry with paper towels, season all over with 2 tsp. salt, and tie legs together with kitchen twine.', 'Step 2: Let sit at room temperature 1 hour.', 'Step 3: Meanwhile, halve squash and scoop out seeds.', 'Step 4: Run a vegetable peeler along ridges of squash halves to remove skin.', 'Step 5: Cut each half into ½"-thick wedges; arrange on a rimmed baking sheet.', 'Step 6: Combine sage, rosemary, and 6 Tbsp. melted butter in a large bowl; pour half of mixture over squash on baking sheet.', 'Step 7: Sprinkle squash with allspice, red pepper flakes, and ½ tsp. salt and season with black pepper; toss to coat.', 'Step 8: Add bread, apples, oil, and ¼ tsp. salt to remaining herb butter in bowl; season with black pepper and toss to combine. Set aside.', 'Step 9: Place onion and vinegar in a small bowl; season with salt and toss to coat.', 'Step 10: Let sit, tossing occasionally, until ready to serve.', 'Step 11: Place a rack in middle and lower third of oven; preheat to 4

In [17]:
# Write the frame, now including the "Instruction_Steps" and "Numbered_Steps"
# columns, back to the same path the previous cell reads from, overwriting it.
# NOTE(review): the list-valued columns are presumably stored as their string
# form in the CSV — confirm how downstream readers parse them.
df.to_csv('ingredients_preprocessed.csv', index=False)
