<a href="https://colab.research.google.com/github/crunchdomo/llm_conversation/blob/main/recipe_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
import os
import csv
import json
import ast
import pandas as pd
import networkx as nx
from itertools import combinations
from sklearn.feature_extraction.text import CountVectorizer
from openai import OpenAI
from slugify import slugify
import matplotlib.pyplot as plt
from pyvis.network import Network

# Configuration
API_KEY = ''  # Replace with valid OpenAI key
INPUT_CSV = '13k-recipes.csv'
OUTPUT_FOLDER = 'selected_recipes'
NUM_RECIPES_TO_SELECT = 200  # Adjust based on your needs

# Initialize OpenAI client
client = OpenAI(api_key=API_KEY)

def visualize_network(G, ingredient_names, centrality):
    """Visualize ingredient network with Pyvis"""
    net = Network(height="800px", width="100%", notebook=True)

    # Add nodes with centrality-based sizing
    max_centrality = max(centrality.values())
    for node in G.nodes():
        net.add_node(node,
                    label=ingredient_names[node],
                    size=centrality[node]/max_centrality*50,
                    title=f"Centrality: {centrality[node]:.4f}")

    # Add edges with weight-based scaling
    for u, v, data in G.edges(data=True):
        net.add_edge(u, v, value=data['weight']/10)

    net.show_buttons(filter_=['physics'])
    net.show("ingredient_network.html")

def plot_centrality_distribution(centrality):
    """Plot histogram of centrality values"""
    plt.figure(figsize=(10,6))
    plt.hist(list(centrality.values()), bins=50, color='skyblue')
    plt.title('Eigenvector Centrality Distribution')
    plt.xlabel('Centrality Score')
    plt.ylabel('Number of Ingredients')
    plt.savefig('centrality_distribution.png')
    plt.close()


def select_recipes():
    """Select diverse recipes using ingredient network analysis"""
    print("Loading data and analyzing ingredients...")

    # Read and preprocess data
    df = pd.read_csv(INPUT_CSV)
    df['Cleaned_Ingredients'] = df['Cleaned_Ingredients'].apply(ast.literal_eval)

    # Create ingredient presence matrix
    vectorizer = CountVectorizer(binary=True, max_features=500)
    ingredient_matrix = vectorizer.fit_transform(
        df['Cleaned_Ingredients'].apply(' '.join)
    )
    ingredient_names = vectorizer.get_feature_names_out()

    # Build co-occurrence network
    G = nx.Graph()
    for i in range(ingredient_matrix.shape[0]):
        ingredients = ingredient_matrix[i].nonzero()[1]
        for pair in combinations(ingredients, 2):
            if G.has_edge(pair[0], pair[1]):
                G[pair[0]][pair[1]]['weight'] += 1
            else:
                G.add_edge(pair[0], pair[1], weight=1)

    # Find most central ingredients
    centrality = nx.eigenvector_centrality(G)
    top_ingredients = sorted(centrality, key=centrality.get, reverse=True)[:100]

    # Select recipes containing these ingredients
    selected_indices = []
    for ing_idx in top_ingredients:
        ing_name = ingredient_names[ing_idx]
        mask = df['Cleaned_Ingredients'].apply(lambda x: ing_name in x)
        if mask.sum() > 1:  # Ensure we have recipes to sample
            selected_indices.extend(df[mask].sample(2).index.tolist())

    # Deduplicate and limit selection
    return list(set(selected_indices))[:NUM_RECIPES_TO_SELECT]

def generate_questions(title, ingredients):
    """Generate cooking questions using GPT-4"""
    prompt = f"Generate 5 question variations a user might ask a chef about a recipe titled '{title}' with these ingredients: {ingredients}."
    response = client.chat.completions.create(
        model='gpt-4',
        messages=[{'role': 'user', 'content': prompt}],
        max_tokens=150,
        temperature=0.7
    )
    return response.choices[0].message.content.strip()

def main():
    # Create output folder
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)

    # Select recipes using network analysis
    selected_indices = select_recipes()
    print(f"Selected {len(selected_indices)} recipes for processing")

    # Process selected recipes
    with open(INPUT_CSV, newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for idx, recipe in enumerate(reader, start=1):
            df_index = idx - 1  # Convert to 0-based index
            if df_index not in selected_indices:
                continue

            # Create recipe folder
            safe_title = slugify(recipe['Title'], max_length=40)
            recipe_folder = os.path.join(OUTPUT_FOLDER, f"{safe_title}_{idx}")
            os.makedirs(recipe_folder, exist_ok=True)

            # Save recipe data
            with open(os.path.join(recipe_folder, 'recipe.json'), 'w') as f:
                json.dump(recipe, f, indent=2)

            # Generate and save questions
            try:
                ingredients = ast.literal_eval(recipe['Cleaned_Ingredients'])
                questions = generate_questions(recipe['Title'], ingredients)
                with open(os.path.join(recipe_folder, 'questions.txt'), 'w') as f:
                    f.write(questions)
                print(f"✅ Processed: {recipe['Title']}")
            except Exception as e:
                print(f"❌ Error processing {recipe['Title']}: {str(e)}")

if __name__ == "__main__":
    main()


Loading data and analyzing ingredients...
Selected 40 recipes for processing
✅ Processed: Pajeon Sauce
✅ Processed: Kiribath (Coconut Milk Rice)
✅ Processed: One-Pot Pie with Callaloo, Plantain, Goat Cheese, and Cornmeal Crust
✅ Processed: Christmas Cake
✅ Processed: Carrot Curry
✅ Processed: Buddha Bowl with Roasted Sweet Potatoes, Spiced Chickpeas, and Chard
✅ Processed: Quinoa and Sweet Potato Bakes
✅ Processed: 3-Ingredient Gingersnap Icebox Cake
✅ Processed: Butter-Roasted Turkey Breasts
✅ Processed: Buttermilk-Fried Ramps
✅ Processed: Spaghetti Pie


KeyboardInterrupt: 