<a href="https://colab.research.google.com/github/darlon31/FlavorGraph/blob/master/Full_Hybrid_System_Test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# CELL 1: Setup and Dependencies
!pip install -q transformers torch networkx pandas numpy scikit-learn matplotlib seaborn tqdm gdown

import os
import pickle
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from transformers import AutoTokenizer, T5ForConditionalGeneration
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.notebook import tqdm
import time
import logging

# Set up logging.
# force=True replaces any handlers the hosting environment (e.g. Colab) has
# already attached to the root logger. Without it basicConfig() silently does
# nothing in that situation and the INFO messages below are never shown.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s',
                    force=True)
logger = logging.getLogger(__name__)

In [32]:
# CELL 2: Mount Google Drive and Setup
# The google.colab package is imported here, so this cell only runs inside Colab.
from google.colab import drive
# force_remount=True asks for a fresh mount even when the drive is already mounted.
drive.mount('/content/drive', force_remount=True)

# Make sure we're in the content directory and clean up
# The chdir comes first so the working directory is /content (not inside the
# tree being deleted) when the old checkout is removed.
os.chdir('/content')
# Remove any earlier checkout so the clone in the next cell starts from scratch.
!rm -rf /content/FlavorGraph
logger.info("Environment prepared")

Mounted at /content/drive


In [33]:
# CELL 3: Clone Repository and Check Structure
# Clone the repository
!git clone https://github.com/darlon31/FlavorGraph.git
logger.info("FlavorGraph repository cloned successfully")

# Check the actual structure
print("\nChecking actual directory structure:")
!ls -R /content/FlavorGraph

# Change to the FlavorGraph directory
os.chdir('FlavorGraph')
logger.info(f"Current working directory: {os.getcwd()}")

Cloning into 'FlavorGraph'...
remote: Enumerating objects: 327, done.[K
remote: Counting objects: 100% (78/78), done.[K
remote: Compressing objects: 100% (44/44), done.[K
remote: Total 327 (delta 42), reused 58 (delta 33), pack-reused 249 (from 1)[K
Receiving objects: 100% (327/327), 20.83 MiB | 18.96 MiB/s, done.
Resolving deltas: 100% (190/190), done.

Checking actual directory structure:
/content/FlavorGraph:
images	input  LICENSE	output	README.md  src

/content/FlavorGraph/images:
embeddings.png	flavorgraph2vec.png  flavorgraph.png

/content/FlavorGraph/input:
'dict_ingr2cate - Top300+FDB400+HyperFoods104=616.csv'	 node_classification_hub.csv
 edges_191120.csv					 nodes_191120.csv

/content/FlavorGraph/output:
 kitchenette_embeddings.pkl  'place output files here'

/content/FlavorGraph/src:
dataloader.py  graph2vec.py  main.py  model.py	parser.py  plotter.py  utils.py  walkers.py


In [34]:
# CELL 4: Configuration and Path Verification
class Config:
    """Configuration class for the FlavorGraph Hybrid Recipe System

    Holds the directory layout, data-file paths, model settings and generation
    settings used by the rest of the notebook.

    Args:
        base_dir: Optional root directory of the FlavorGraph checkout. When
            omitted, the Colab path is used inside Colab and the original
            local default otherwise.
    """

    def __init__(self, base_dir=None):
        # Detect environment. get_ipython() only exists when running under
        # IPython/Jupyter, so guard the lookup instead of raising NameError
        # when this class is used from plain Python.
        try:
            shell = get_ipython()
        except NameError:
            shell = None
        self.IN_COLAB = 'google.colab' in str(shell)

        # Base paths (an explicit base_dir overrides the auto-detection)
        if base_dir is not None:
            self.BASE_DIR = base_dir
        elif self.IN_COLAB:
            self.BASE_DIR = "/content/FlavorGraph"
        else:
            self.BASE_DIR = "c:/Users/dario/FlavorGraph"

        # Directory structure - using the correct paths as shown in the repository
        self.INPUT_DIR = os.path.join(self.BASE_DIR, "input")
        self.OUTPUT_DIR = os.path.join(self.BASE_DIR, "output")
        self.SRC_DIR = os.path.join(self.BASE_DIR, "src")

        # Data files with exact names from the repository
        self.NODES_FILE = os.path.join(self.INPUT_DIR, "nodes_191120.csv")
        self.EDGES_FILE = os.path.join(self.INPUT_DIR, "edges_191120.csv")
        self.CATEGORIES_FILE = os.path.join(self.INPUT_DIR, "dict_ingr2cate - Top300+FDB400+HyperFoods104=616.csv")
        self.NODE_CLASSIFICATION_FILE = os.path.join(self.INPUT_DIR, "node_classification_hub.csv")
        self.EMBEDDING_FILE = os.path.join(self.OUTPUT_DIR, "kitchenette_embeddings.pkl")

        # Model settings
        self.EMBEDDING_DIM = 300
        self.MOLECULAR_DIM = 881
        self.MODEL_NAME = "flax-community/t5-recipe-generation"
        self.MODEL_CACHE_DIR = os.path.join(self.BASE_DIR, "model_cache")

        # Generation settings (passed as keyword arguments to model.generate)
        self.GENERATION_CONFIG = {
            "max_length": 512,
            "min_length": 64,
            "no_repeat_ngram_size": 3,
            "do_sample": True,
            "top_k": 60,
            "top_p": 0.95,
            "temperature": 0.7
        }

        # Hybrid settings
        self.SIMILARITY_THRESHOLD = 0.7
        self.MAX_SIMILAR_INGREDIENTS = 3

    def verify_paths(self):
        """Verify all paths exist and log their status

        Returns:
            bool: True when every configured directory and data file exists,
            False as soon as at least one of them is missing.
        """
        paths = {
            'BASE_DIR': self.BASE_DIR,
            'INPUT_DIR': self.INPUT_DIR,
            'OUTPUT_DIR': self.OUTPUT_DIR,
            'SRC_DIR': self.SRC_DIR,
            'NODES_FILE': self.NODES_FILE,
            'EDGES_FILE': self.EDGES_FILE,
            'CATEGORIES_FILE': self.CATEGORIES_FILE,
            'NODE_CLASSIFICATION_FILE': self.NODE_CLASSIFICATION_FILE,
            'EMBEDDING_FILE': self.EMBEDDING_FILE
        }

        all_exist = True
        logger.info("\nVerifying paths:")
        for name, path in paths.items():
            if os.path.exists(path):
                if os.path.isfile(path):
                    # Files are reported together with their size in KB.
                    size = os.path.getsize(path)
                    logger.info(f"{name}: ✓ ({path}) - Size: {size/1024:.2f} KB")
                else:
                    logger.info(f"{name}: ✓ ({path}) - Directory")
            else:
                logger.warning(f"{name}: ✗ ({path}) - Not found")
                all_exist = False
        return all_exist

# Create and verify configuration
config = Config()
logger.info(f"Running in Colab: {config.IN_COLAB}")
logger.info(f"Current working directory: {os.getcwd()}")
logger.info(f"Base directory: {config.BASE_DIR}")

if not config.verify_paths():
    logger.warning("Some required paths are missing!")
else:
    logger.info("All paths verified successfully!")

# Print out all available files in input and output directories for verification
print("\nInput directory contents:")
!ls -l {config.INPUT_DIR}
print("\nOutput directory contents:")
!ls -l {config.OUTPUT_DIR}


Input directory contents:
total 5392
-rw-r--r-- 1 root root   16219 Nov 30 13:05 'dict_ingr2cate - Top300+FDB400+HyperFoods104=616.csv'
-rw-r--r-- 1 root root 5155973 Nov 30 13:05  edges_191120.csv
-rw-r--r-- 1 root root    1484 Nov 30 13:05  node_classification_hub.csv
-rw-r--r-- 1 root root  343416 Nov 30 13:05  nodes_191120.csv

Output directory contents:
total 8644
-rw-r--r-- 1 root root 8845756 Nov 30 13:05  kitchenette_embeddings.pkl
-rw-r--r-- 1 root root      25 Nov 30 13:05 'place output files here'


In [36]:
# CELL 5: Setup Hybrid System Structure
# Create hybrid_system directory
!mkdir -p /content/FlavorGraph/hybrid_system

# Create necessary Python files
hybrid_files = {
    'config.py': '''import os
import logging

logging.basicConfig(level=logging.INFO,
                   format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class Config:
    """Configuration class for the FlavorGraph Hybrid Recipe System"""

    def __init__(self):
        # Detect environment
        self.IN_COLAB = 'google.colab' in str(get_ipython())

        # Base paths
        if self.IN_COLAB:
            self.BASE_DIR = "/content/FlavorGraph"
        else:
            self.BASE_DIR = "c:/Users/dario/FlavorGraph"

        # Directory structure
        self.INPUT_DIR = os.path.join(self.BASE_DIR, "input")
        self.OUTPUT_DIR = os.path.join(self.BASE_DIR, "output")
        self.SRC_DIR = os.path.join(self.BASE_DIR, "src")

        # Data files
        self.NODES_FILE = os.path.join(self.INPUT_DIR, "nodes_191120.csv")
        self.EDGES_FILE = os.path.join(self.INPUT_DIR, "edges_191120.csv")
        self.CATEGORIES_FILE = os.path.join(self.INPUT_DIR, "dict_ingr2cate - Top300+FDB400+HyperFoods104=616.csv")
        self.NODE_CLASSIFICATION_FILE = os.path.join(self.INPUT_DIR, "node_classification_hub.csv")
        self.EMBEDDING_FILE = os.path.join(self.OUTPUT_DIR, "kitchenette_embeddings.pkl")

        # Model settings
        self.EMBEDDING_DIM = 300
        self.MOLECULAR_DIM = 881
        self.MODEL_NAME = "flax-community/t5-recipe-generation"
        self.MODEL_CACHE_DIR = os.path.join(self.BASE_DIR, "model_cache")

        # Generation settings
        self.GENERATION_CONFIG = {
            "max_length": 512,
            "min_length": 64,
            "no_repeat_ngram_size": 3,
            "do_sample": True,
            "top_k": 60,
            "top_p": 0.95,
            "temperature": 0.7
        }

        # Hybrid settings
        self.SIMILARITY_THRESHOLD = 0.7
        self.MAX_SIMILAR_INGREDIENTS = 3

    def verify_paths(self):
        """Verify all paths exist and log their status"""
        paths = {
            'BASE_DIR': self.BASE_DIR,
            'INPUT_DIR': self.INPUT_DIR,
            'OUTPUT_DIR': self.OUTPUT_DIR,
            'SRC_DIR': self.SRC_DIR,
            'NODES_FILE': self.NODES_FILE,
            'EDGES_FILE': self.EDGES_FILE,
            'CATEGORIES_FILE': self.CATEGORIES_FILE,
            'NODE_CLASSIFICATION_FILE': self.NODE_CLASSIFICATION_FILE,
            'EMBEDDING_FILE': self.EMBEDDING_FILE
        }

        all_exist = True
        logger.info("\\nVerifying paths:")
        for name, path in paths.items():
            if os.path.exists(path):
                if os.path.isfile(path):
                    size = os.path.getsize(path)
                    logger.info(f"{name}: ✓ ({path}) - Size: {size/1024:.2f} KB")
                else:
                    logger.info(f"{name}: ✓ ({path}) - Directory")
            else:
                logger.warning(f"{name}: ✗ ({path}) - Not found")
                all_exist = False
        return all_exist''',
    'ingredient_processor.py': '''from config import Config
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pickle

class IngredientProcessor:
    def __init__(self, config):
        self.config = config
        self._load_data()

    def _load_data(self):
        """Load all necessary data files"""
        # Load ingredient nodes
        self.nodes_df = pd.read_csv(self.config.NODES_FILE)

        # Load categories
        self.categories_df = pd.read_csv(self.config.CATEGORIES_FILE)

        # Load embeddings
        with open(self.config.EMBEDDING_FILE, 'rb') as f:
            self.embeddings = pickle.load(f)
''',
    'recipe_generator.py': '''from transformers import AutoTokenizer, T5ForConditionalGeneration
from config import Config
import torch

class RecipeGenerator:
    def __init__(self, config):
        self.config = config
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self._load_model()

    def _load_model(self):
        """Load the T5 model and tokenizer"""
        self.tokenizer = AutoTokenizer.from_pretrained(self.config.MODEL_NAME)
        self.model = T5ForConditionalGeneration.from_pretrained(self.config.MODEL_NAME)
        self.model.to(self.device)
''',
    'test_config.py': '''import unittest
from config import Config
import os

class TestConfig(unittest.TestCase):
    def setUp(self):
        self.config = Config()

    def test_paths_exist(self):
        """Test that all required paths exist"""
        self.assertTrue(self.config.verify_paths())
''',
    'test_hybrid.py': '''import unittest
from config import Config
from ingredient_processor import IngredientProcessor
from recipe_generator import RecipeGenerator

class TestHybridSystem(unittest.TestCase):
    def setUp(self):
        self.config = Config()
        self.ingredient_processor = IngredientProcessor(self.config)
        self.recipe_generator = RecipeGenerator(self.config)
'''
}

# Write files to hybrid_system directory
for filename, content in hybrid_files.items():
    filepath = f'/content/FlavorGraph/hybrid_system/{filename}'
    with open(filepath, 'w') as f:
        f.write(content)

print("Created hybrid system files:")
!ls -l /content/FlavorGraph/hybrid_system/

Created hybrid system files:
total 20
-rw-r--r-- 1 root root 3024 Nov 30 13:08 config.py
-rw-r--r-- 1 root root  663 Nov 30 13:08 ingredient_processor.py
-rw-r--r-- 1 root root  585 Nov 30 13:08 recipe_generator.py
-rw-r--r-- 1 root root  284 Nov 30 13:08 test_config.py
-rw-r--r-- 1 root root  366 Nov 30 13:08 test_hybrid.py


In [55]:
# CELL 5: Setup System Components
# Second cell labelled "CELL 5": it overwrites recipe_generator.py and
# ingredient_processor.py in hybrid_system with fuller implementations.
import os

# Create hybrid_system directory
# exist_ok=True makes this a no-op when the directory is already there.
os.makedirs('/content/FlavorGraph/hybrid_system', exist_ok=True)

# Update recipe_generator.py
recipe_generator_code = '''from transformers import AutoTokenizer, T5ForConditionalGeneration
import torch
import logging

logger = logging.getLogger(__name__)

class RecipeGenerator:
    def __init__(self, config):
        """Set up the tokenizer and the T5 model described by ``config``.

        Reads ``config.MODEL_NAME`` and ``config.MODEL_CACHE_DIR``, picks CUDA
        when it is available (CPU otherwise) and moves the model there.
        """
        self.config = config
        use_gpu = torch.cuda.is_available()
        self.device = torch.device("cuda" if use_gpu else "cpu")
        logger.info(f"Loading recipe generation model on {self.device}...")

        # Both downloads share the same checkpoint name and cache location.
        model_name = self.config.MODEL_NAME
        cache_dir = self.config.MODEL_CACHE_DIR
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)
        pretrained = T5ForConditionalGeneration.from_pretrained(model_name, cache_dir=cache_dir)
        self.model = pretrained.to(self.device)

        logger.info("Recipe generation model loaded successfully!")

    def generate_recipe(self, ingredients):
        """Generate a recipe from a list of ingredients"""
        # Format input
        input_text = f"ingredients: {', '.join(ingredients)}"
        input_ids = self.tokenizer(
            input_text,
            return_tensors="pt",
            max_length=512,
            truncation=True
        ).input_ids.to(self.device)

        # Generate recipe
        outputs = self.model.generate(
            input_ids,
            **self.config.GENERATION_CONFIG
        )

        # Decode and return
        recipe = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return recipe'''

# Persist the generated source as hybrid_system/recipe_generator.py
recipe_generator_path = '/content/FlavorGraph/hybrid_system/recipe_generator.py'
with open(recipe_generator_path, 'w') as module_file:
    module_file.write(recipe_generator_code)

# Update ingredient_processor.py
ingredient_processor_code = '''from config import Config
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import logging

logger = logging.getLogger(__name__)

class IngredientProcessor:
    def __init__(self, config):
        """Store ``config`` and immediately load the ingredient data.

        Args:
            config: Object providing the NODES_FILE, CATEGORIES_FILE and
                EMBEDDING_FILE paths that ``_load_data`` reads.
        """
        self.config = config
        # Loading happens eagerly, so file or data problems surface at construction time.
        self._load_data()

    def _normalize_ingredient_name(self, name):
        """Normalize ingredient name for consistent matching"""
        # Convert to lowercase and remove spaces
        norm_name = name.lower().strip()

        # Handle plural forms
        if norm_name.endswith('es'):
            norm_name = norm_name[:-2]
        elif norm_name.endswith('s'):
            norm_name = norm_name[:-1]

        # Replace spaces with underscores
        norm_name = norm_name.replace(' ', '_')

        # Common substitutions
        substitutions = {
            'tomatoes': 'tomato',
            'onions': 'onion',
            'garlic_cloves': 'garlic',
            'garlic_clove': 'garlic',
            'bell_peppers': 'bell_pepper',
            'carrots': 'carrot'
        }

        return substitutions.get(norm_name, norm_name)

    def _load_data(self):
        """Load all necessary data files"""
        logger.info("Loading ingredient data...")
        # Load ingredient nodes and clean data
        self.nodes_df = pd.read_csv(self.config.NODES_FILE)
        self.nodes_df = self.nodes_df.dropna(subset=['name'])
        self.nodes_df['name'] = self.nodes_df['name'].astype(str)

        # Load categories
        self.categories_df = pd.read_csv(self.config.CATEGORIES_FILE)

        # Load embeddings and create mapping
        with open(self.config.EMBEDDING_FILE, 'rb') as f:
            embeddings_dict = pickle.load(f)

        # Create name to embedding mapping
        self.name_to_embedding = {}
        self.embeddings_list = []
        self.valid_ingredients = []

        for name in self.nodes_df['name']:
            try:
                # Try both original and normalized names
                norm_name = self._normalize_ingredient_name(name)
                if norm_name in embeddings_dict:
                    self.name_to_embedding[name] = len(self.embeddings_list)
                    self.embeddings_list.append(embeddings_dict[norm_name])
                    self.valid_ingredients.append(name)
            except AttributeError:
                continue

        if not self.embeddings_list:
            raise ValueError("No valid embeddings found!")

        self.embeddings = np.array(self.embeddings_list)
        logger.info(f"Successfully loaded {len(self.valid_ingredients)} ingredients with embeddings")

    def find_similar_ingredients(self, ingredient_name, n=3):
        """Find similar ingredients based on embedding similarity

        Args:
            ingredient_name: Name to look up; the original spelling is tried
                first, then its normalized form.
            n: Maximum number of results to return.

        Returns:
            list[dict]: Up to ``n`` entries with the keys 'name', 'similarity'
            and 'category', ordered from most to least similar. The query
            itself and ingredients whose name contains (or is contained in)
            the query name are left out.

        Raises:
            ValueError: If neither spelling of the name has an embedding.
        """
        # Normalize input name
        norm_name = self._normalize_ingredient_name(ingredient_name)

        # Try both original and normalized names
        if ingredient_name in self.name_to_embedding:
            query_idx = self.name_to_embedding[ingredient_name]
        elif norm_name in self.name_to_embedding:
            query_idx = self.name_to_embedding[norm_name]
        else:
            raise ValueError(f"No embedding found for ingredient '{ingredient_name}' (normalized: '{norm_name}')")

        if n <= 0:
            return []

        ingredient_embedding = self.embeddings[query_idx].reshape(1, -1)
        similarities = cosine_similarity(ingredient_embedding, self.embeddings)[0]

        # Compare candidate names against both the name as typed and the
        # matched ingredient, so "tomatoes" filters "tomato_juice" just like
        # "tomato" does.
        query_forms = {ingredient_name.lower(), self.valid_ingredients[query_idx].lower()}

        # Walk all candidates from most to least similar until n survive the
        # filters. The query is excluded by index rather than by assuming it
        # sorts first, and the pool cannot run dry after name filtering.
        similar_ingredients = []
        for cand_idx in np.argsort(-similarities, kind='stable'):
            if cand_idx == query_idx:
                continue

            ingredient = self.valid_ingredients[cand_idx]
            cand_lower = ingredient.lower()

            # Skip ingredients that are too similar in name
            if any(cand_lower in form or form in cand_lower for form in query_forms):
                continue

            similar_ingredients.append({
                'name': ingredient,
                'similarity': float(similarities[cand_idx]),
                'category': self.get_ingredient_category(ingredient)
            })

            if len(similar_ingredients) >= n:
                break

        return similar_ingredients

    def get_ingredient_category(self, ingredient_name):
        """Look up the category of an ingredient.

        An exact match on the 'ingredient' column wins. Failing that, the
        normalized name is compared with the lowercased, underscore-joined
        column values. 'Unknown' is returned when neither lookup hits.
        """
        table = self.categories_df

        # First attempt: the name exactly as given
        exact_rows = table[table['ingredient'] == ingredient_name]
        if exact_rows.shape[0] > 0:
            return exact_rows['category'].iloc[0]

        # Second attempt: compare normalized forms on both sides
        wanted = self._normalize_ingredient_name(ingredient_name)
        canonical_names = table['ingredient'].str.lower().str.replace(' ', '_')
        loose_rows = table[canonical_names == wanted]
        if loose_rows.shape[0] > 0:
            return loose_rows['category'].iloc[0]
        return 'Unknown'

    def suggest_substitutions(self, ingredient_name, n=3, same_category_only=True):
        """Suggest ingredient substitutions with preference for same category

        Args:
            ingredient_name: Ingredient to replace (original or normalizable name).
            n: Maximum number of suggestions.
            same_category_only: When True, candidates sharing the query's
                category are listed first and other candidates only fill the
                remaining slots. When False, the most similar candidates are
                returned regardless of category.

        Returns:
            list[dict]: Up to ``n`` entries as produced by
            ``find_similar_ingredients``.

        Raises:
            ValueError: If the ingredient has no embedding.
        """
        if ingredient_name not in self.name_to_embedding:
            norm_name = self._normalize_ingredient_name(ingredient_name)
            if norm_name not in self.name_to_embedding:
                raise ValueError(f"No embedding found for ingredient '{ingredient_name}' (normalized: '{norm_name}')")

        # Get original ingredient's category
        original_category = self.get_ingredient_category(ingredient_name)

        # Get similar ingredients (a larger pool when category filtering follows)
        similar_ingredients = self.find_similar_ingredients(ingredient_name, n=n*2 if same_category_only else n)

        # 'Unknown' only means "no category on record". Treating it as a real
        # category would rank other uncategorized items above more similar
        # ones, so similarity alone decides in that case.
        if not same_category_only or original_category == 'Unknown':
            return similar_ingredients[:n]

        # Filter for same category and take top n
        same_category_substitutes = [
            ing for ing in similar_ingredients
            if ing['category'] == original_category
        ][:n]

        # If we don't have enough same-category substitutes, add others
        if len(same_category_substitutes) < n:
            other_substitutes = [
                ing for ing in similar_ingredients
                if ing['category'] != original_category
            ][:n - len(same_category_substitutes)]
            same_category_substitutes.extend(other_substitutes)

        return same_category_substitutes

    def generate_recipe_variations(self, ingredients, n_variations=3, substitution_prob=0.3):
        """Generate recipe variations by substituting ingredients

        Args:
            ingredients: List of ingredient names of the base recipe.
            n_variations: Number of ingredient lists to produce.
            substitution_prob: Chance, between 0 and 1, that any single
                ingredient is swapped for its best substitute.

        Returns:
            list[list[str]]: ``n_variations`` ingredient lists, each as long
            as ``ingredients``. Ingredients without an embedding or without
            any substitute are kept unchanged.

        Raises:
            ValueError: If ``substitution_prob`` is outside [0, 1].
        """
        if not 0.0 <= substitution_prob <= 1.0:
            raise ValueError(f"substitution_prob must be between 0 and 1, got {substitution_prob}")

        variations = []
        for _ in range(n_variations):
            variation = []
            for ingredient in ingredients:
                replacement = ingredient
                # Draws from numpy's global random state: seed it for repeatable output
                if np.random.random() < substitution_prob:
                    try:
                        substitutes = self.suggest_substitutions(ingredient, n=1)
                    except ValueError:
                        # No embedding for this ingredient: keep it as it is
                        substitutes = []
                    if substitutes:
                        replacement = substitutes[0]['name']
                variation.append(replacement)
            variations.append(variation)
        return variations

    def print_data_stats(self):
        """Print statistics about the loaded data

        Reports the number of rows in ``nodes_df``, the number of entries in
        ``valid_ingredients``, the number of distinct values in the 'category'
        column of ``categories_df`` and five randomly chosen valid ingredients.
        """
        print("\\nData Statistics:")
        print(f"Total nodes in dataset: {len(self.nodes_df)}")
        print(f"Valid ingredients with embeddings: {len(self.valid_ingredients)}")
        print(f"Total categories: {len(self.categories_df['category'].unique())}")
        print("\\nSample of valid ingredients:")
        # np.random.choice samples with replacement by default, so a name can appear twice
        print(np.random.choice(self.valid_ingredients, 5))'''

# Persist the generated source as hybrid_system/ingredient_processor.py
ingredient_processor_path = '/content/FlavorGraph/hybrid_system/ingredient_processor.py'
with open(ingredient_processor_path, 'w') as module_file:
    module_file.write(ingredient_processor_code)

print("Files updated successfully!")

Files updated successfully!


In [56]:
# CELL 6: Comprehensive System Test
import importlib
import sys

# Make the generated modules importable no matter what the current working
# directory is or which cells were run before.
HYBRID_SYSTEM_DIR = '/content/FlavorGraph/hybrid_system'
if HYBRID_SYSTEM_DIR not in sys.path:
    sys.path.insert(0, HYBRID_SYSTEM_DIR)

import ingredient_processor
import recipe_generator

# The module files are rewritten by the cell above. Reload them so the classes
# used here match what is on disk instead of a copy imported earlier.
importlib.reload(ingredient_processor)
importlib.reload(recipe_generator)
from ingredient_processor import IngredientProcessor
from recipe_generator import RecipeGenerator

# Fix the random seeds so substitutions and sampled recipes are repeatable
np.random.seed(42)
torch.manual_seed(42)

# Initialize components
config = Config()
ingredient_proc = IngredientProcessor(config)
recipe_gen = RecipeGenerator(config)

# Print data statistics
ingredient_proc.print_data_stats()

# Test ingredient substitutions
test_ingredients = [
    "chicken",
    "tomatoes",
    "onion",
    "garlic",
    "bell peppers",
    "carrots"
]

print("\n=== Testing Smart Substitutions ===")
for ingredient in test_ingredients:
    try:
        print(f"\nSubstitutes for {ingredient}:")
        substitutes = ingredient_proc.suggest_substitutions(ingredient)
        for sub in substitutes:
            print(f"- {sub['name']} (Category: {sub['category']}, Similarity: {sub['similarity']:.3f})")
    except ValueError as e:
        # Ingredients without an embedding are reported and skipped
        print(f"Cannot find substitutes for {ingredient}: {str(e)}")

# Test recipe generation with variations
print("\n=== Testing Recipe Generation with Variations ===")
base_recipe = ["chicken", "rice", "tomatoes", "onion", "garlic"]
print("\nOriginal ingredients:", base_recipe)

variations = ingredient_proc.generate_recipe_variations(base_recipe, n_variations=2)
for i, variation in enumerate(variations, 1):
    print(f"\nVariation {i} ingredients:", variation)
    recipe = recipe_gen.generate_recipe(variation)
    print("Recipe:", recipe)


Data Statistics:
Total nodes in dataset: 8297
Valid ingredients with embeddings: 2552
Total categories: 17

Sample of valid ingredients:
['barbecue_seasoning' 'karo_syrup' 'tomato_juice' 'dijon_mustard' 'tea']

=== Testing Smart Substitutions ===

Substitutes for chicken:
- beef (Category: Meat/Animal Product, Similarity: 0.861)
- meat (Category: Meat/Animal Product, Similarity: 0.843)
- pork (Category: Meat/Animal Product, Similarity: 0.838)

Substitutes for tomatoes:
Cannot find substitutes for tomatoes: No embedding found for ingredient 'tomatoes'

Substitutes for onion:
- paprika (Category: Plant/Vegetable, Similarity: 0.876)
- basil (Category: Plant/Vegetable, Similarity: 0.865)
- pepper (Category: Spice, Similarity: 0.896)

Substitutes for garlic:
- beef_mince (Category: Unknown, Similarity: 0.499)
- boneless_chicken (Category: Unknown, Similarity: 0.490)
- brinjal (Category: Unknown, Similarity: 0.474)

Substitutes for bell peppers:
Cannot find substitutes for bell peppers: No 