<a href="https://colab.research.google.com/github/darlon31/FlavorGraph/blob/HybridSystem/Full_Hybrid_System_Test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# CELL 1: Setup and Dependencies
!pip install -q transformers torch networkx pandas numpy scikit-learn matplotlib seaborn tqdm gdown

import os
import pickle
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from transformers import AutoTokenizer, T5ForConditionalGeneration
from sklearn.metrics.pairwise import cosine_similarity
from tqdm.notebook import tqdm
import time
import logging

# Set up logging
logging.basicConfig(level=logging.INFO,
                   format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

In [32]:
# CELL 2: Mount Google Drive and Setup
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Make sure we're in the content directory and clean up
os.chdir('/content')
!rm -rf /content/FlavorGraph
logger.info("Environment prepared")

Mounted at /content/drive


In [33]:
# CELL 3: Clone Repository and Check Structure
# Clone the repository
!git clone https://github.com/darlon31/FlavorGraph.git
logger.info("FlavorGraph repository cloned successfully")

# Check the actual structure
print("\nChecking actual directory structure:")
!ls -R /content/FlavorGraph

# Change to the FlavorGraph directory
os.chdir('FlavorGraph')
logger.info(f"Current working directory: {os.getcwd()}")

Cloning into 'FlavorGraph'...
remote: Enumerating objects: 327, done.[K
remote: Counting objects: 100% (78/78), done.[K
remote: Compressing objects: 100% (44/44), done.[K
remote: Total 327 (delta 42), reused 58 (delta 33), pack-reused 249 (from 1)[K
Receiving objects: 100% (327/327), 20.83 MiB | 18.96 MiB/s, done.
Resolving deltas: 100% (190/190), done.

Checking actual directory structure:
/content/FlavorGraph:
images	input  LICENSE	output	README.md  src

/content/FlavorGraph/images:
embeddings.png	flavorgraph2vec.png  flavorgraph.png

/content/FlavorGraph/input:
'dict_ingr2cate - Top300+FDB400+HyperFoods104=616.csv'	 node_classification_hub.csv
 edges_191120.csv					 nodes_191120.csv

/content/FlavorGraph/output:
 kitchenette_embeddings.pkl  'place output files here'

/content/FlavorGraph/src:
dataloader.py  graph2vec.py  main.py  model.py	parser.py  plotter.py  utils.py  walkers.py


In [34]:
# CELL 4: Configuration and Path Verification
class Config:
    """Configuration class for the FlavorGraph Hybrid Recipe System"""

    def __init__(self):
        # Detect environment
        self.IN_COLAB = 'google.colab' in str(get_ipython())

        # Base paths
        if self.IN_COLAB:
            self.BASE_DIR = "/content/FlavorGraph"
        else:
            self.BASE_DIR = "c:/Users/dario/FlavorGraph"

        # Directory structure - using the correct paths as shown in the repository
        self.INPUT_DIR = os.path.join(self.BASE_DIR, "input")
        self.OUTPUT_DIR = os.path.join(self.BASE_DIR, "output")
        self.SRC_DIR = os.path.join(self.BASE_DIR, "src")

        # Data files with exact names from the repository
        self.NODES_FILE = os.path.join(self.INPUT_DIR, "nodes_191120.csv")
        self.EDGES_FILE = os.path.join(self.INPUT_DIR, "edges_191120.csv")
        self.CATEGORIES_FILE = os.path.join(self.INPUT_DIR, "dict_ingr2cate - Top300+FDB400+HyperFoods104=616.csv")
        self.NODE_CLASSIFICATION_FILE = os.path.join(self.INPUT_DIR, "node_classification_hub.csv")
        self.EMBEDDING_FILE = os.path.join(self.OUTPUT_DIR, "kitchenette_embeddings.pkl")

        # Model settings
        self.EMBEDDING_DIM = 300
        self.MOLECULAR_DIM = 881
        self.MODEL_NAME = "flax-community/t5-recipe-generation"
        self.MODEL_CACHE_DIR = os.path.join(self.BASE_DIR, "model_cache")

        # Generation settings
        self.GENERATION_CONFIG = {
            "max_length": 512,
            "min_length": 64,
            "no_repeat_ngram_size": 3,
            "do_sample": True,
            "top_k": 60,
            "top_p": 0.95,
            "temperature": 0.7
        }

        # Hybrid settings
        self.SIMILARITY_THRESHOLD = 0.7
        self.MAX_SIMILAR_INGREDIENTS = 3

    def verify_paths(self):
        """Verify all paths exist and log their status"""
        paths = {
            'BASE_DIR': self.BASE_DIR,
            'INPUT_DIR': self.INPUT_DIR,
            'OUTPUT_DIR': self.OUTPUT_DIR,
            'SRC_DIR': self.SRC_DIR,
            'NODES_FILE': self.NODES_FILE,
            'EDGES_FILE': self.EDGES_FILE,
            'CATEGORIES_FILE': self.CATEGORIES_FILE,
            'NODE_CLASSIFICATION_FILE': self.NODE_CLASSIFICATION_FILE,
            'EMBEDDING_FILE': self.EMBEDDING_FILE
        }

        all_exist = True
        logger.info("\nVerifying paths:")
        for name, path in paths.items():
            if os.path.exists(path):
                if os.path.isfile(path):
                    size = os.path.getsize(path)
                    logger.info(f"{name}: ✓ ({path}) - Size: {size/1024:.2f} KB")
                else:
                    logger.info(f"{name}: ✓ ({path}) - Directory")
            else:
                logger.warning(f"{name}: ✗ ({path}) - Not found")
                all_exist = False
        return all_exist

# Create and verify configuration
config = Config()
logger.info(f"Running in Colab: {config.IN_COLAB}")
logger.info(f"Current working directory: {os.getcwd()}")
logger.info(f"Base directory: {config.BASE_DIR}")

if not config.verify_paths():
    logger.warning("Some required paths are missing!")
else:
    logger.info("All paths verified successfully!")

# Print out all available files in input and output directories for verification
print("\nInput directory contents:")
!ls -l {config.INPUT_DIR}
print("\nOutput directory contents:")
!ls -l {config.OUTPUT_DIR}


Input directory contents:
total 5392
-rw-r--r-- 1 root root   16219 Nov 30 13:05 'dict_ingr2cate - Top300+FDB400+HyperFoods104=616.csv'
-rw-r--r-- 1 root root 5155973 Nov 30 13:05  edges_191120.csv
-rw-r--r-- 1 root root    1484 Nov 30 13:05  node_classification_hub.csv
-rw-r--r-- 1 root root  343416 Nov 30 13:05  nodes_191120.csv

Output directory contents:
total 8644
-rw-r--r-- 1 root root 8845756 Nov 30 13:05  kitchenette_embeddings.pkl
-rw-r--r-- 1 root root      25 Nov 30 13:05 'place output files here'


In [36]:
# CELL 5: Setup Hybrid System Structure
# Create hybrid_system directory
!mkdir -p /content/FlavorGraph/hybrid_system

# Create necessary Python files
hybrid_files = {
    'config.py': '''import os
import logging

logging.basicConfig(level=logging.INFO,
                   format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

class Config:
    """Configuration class for the FlavorGraph Hybrid Recipe System"""

    def __init__(self):
        # Detect environment
        self.IN_COLAB = 'google.colab' in str(get_ipython())

        # Base paths
        if self.IN_COLAB:
            self.BASE_DIR = "/content/FlavorGraph"
        else:
            self.BASE_DIR = "c:/Users/dario/FlavorGraph"

        # Directory structure
        self.INPUT_DIR = os.path.join(self.BASE_DIR, "input")
        self.OUTPUT_DIR = os.path.join(self.BASE_DIR, "output")
        self.SRC_DIR = os.path.join(self.BASE_DIR, "src")

        # Data files
        self.NODES_FILE = os.path.join(self.INPUT_DIR, "nodes_191120.csv")
        self.EDGES_FILE = os.path.join(self.INPUT_DIR, "edges_191120.csv")
        self.CATEGORIES_FILE = os.path.join(self.INPUT_DIR, "dict_ingr2cate - Top300+FDB400+HyperFoods104=616.csv")
        self.NODE_CLASSIFICATION_FILE = os.path.join(self.INPUT_DIR, "node_classification_hub.csv")
        self.EMBEDDING_FILE = os.path.join(self.OUTPUT_DIR, "kitchenette_embeddings.pkl")

        # Model settings
        self.EMBEDDING_DIM = 300
        self.MOLECULAR_DIM = 881
        self.MODEL_NAME = "flax-community/t5-recipe-generation"
        self.MODEL_CACHE_DIR = os.path.join(self.BASE_DIR, "model_cache")

        # Generation settings
        self.GENERATION_CONFIG = {
            "max_length": 512,
            "min_length": 64,
            "no_repeat_ngram_size": 3,
            "do_sample": True,
            "top_k": 60,
            "top_p": 0.95,
            "temperature": 0.7
        }

        # Hybrid settings
        self.SIMILARITY_THRESHOLD = 0.7
        self.MAX_SIMILAR_INGREDIENTS = 3

    def verify_paths(self):
        """Verify all paths exist and log their status"""
        paths = {
            'BASE_DIR': self.BASE_DIR,
            'INPUT_DIR': self.INPUT_DIR,
            'OUTPUT_DIR': self.OUTPUT_DIR,
            'SRC_DIR': self.SRC_DIR,
            'NODES_FILE': self.NODES_FILE,
            'EDGES_FILE': self.EDGES_FILE,
            'CATEGORIES_FILE': self.CATEGORIES_FILE,
            'NODE_CLASSIFICATION_FILE': self.NODE_CLASSIFICATION_FILE,
            'EMBEDDING_FILE': self.EMBEDDING_FILE
        }

        all_exist = True
        logger.info("\\nVerifying paths:")
        for name, path in paths.items():
            if os.path.exists(path):
                if os.path.isfile(path):
                    size = os.path.getsize(path)
                    logger.info(f"{name}: ✓ ({path}) - Size: {size/1024:.2f} KB")
                else:
                    logger.info(f"{name}: ✓ ({path}) - Directory")
            else:
                logger.warning(f"{name}: ✗ ({path}) - Not found")
                all_exist = False
        return all_exist''',
    'ingredient_processor.py': '''from config import Config
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pickle

class IngredientProcessor:
    def __init__(self, config):
        self.config = config
        self._load_data()

    def _load_data(self):
        """Load all necessary data files"""
        # Load ingredient nodes
        self.nodes_df = pd.read_csv(self.config.NODES_FILE)

        # Load categories
        self.categories_df = pd.read_csv(self.config.CATEGORIES_FILE)

        # Load embeddings
        with open(self.config.EMBEDDING_FILE, 'rb') as f:
            self.embeddings = pickle.load(f)
''',
    'recipe_generator.py': '''from transformers import AutoTokenizer, T5ForConditionalGeneration
from config import Config
import torch

class RecipeGenerator:
    def __init__(self, config):
        self.config = config
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self._load_model()

    def _load_model(self):
        """Load the T5 model and tokenizer"""
        self.tokenizer = AutoTokenizer.from_pretrained(self.config.MODEL_NAME)
        self.model = T5ForConditionalGeneration.from_pretrained(self.config.MODEL_NAME)
        self.model.to(self.device)
''',
    'test_config.py': '''import unittest
from config import Config
import os

class TestConfig(unittest.TestCase):
    def setUp(self):
        self.config = Config()

    def test_paths_exist(self):
        """Test that all required paths exist"""
        self.assertTrue(self.config.verify_paths())
''',
    'test_hybrid.py': '''import unittest
from config import Config
from ingredient_processor import IngredientProcessor
from recipe_generator import RecipeGenerator

class TestHybridSystem(unittest.TestCase):
    def setUp(self):
        self.config = Config()
        self.ingredient_processor = IngredientProcessor(self.config)
        self.recipe_generator = RecipeGenerator(self.config)
'''
}

# Write files to hybrid_system directory
for filename, content in hybrid_files.items():
    filepath = f'/content/FlavorGraph/hybrid_system/{filename}'
    with open(filepath, 'w') as f:
        f.write(content)

print("Created hybrid system files:")
!ls -l /content/FlavorGraph/hybrid_system/

Created hybrid system files:
total 20
-rw-r--r-- 1 root root 3024 Nov 30 13:08 config.py
-rw-r--r-- 1 root root  663 Nov 30 13:08 ingredient_processor.py
-rw-r--r-- 1 root root  585 Nov 30 13:08 recipe_generator.py
-rw-r--r-- 1 root root  284 Nov 30 13:08 test_config.py
-rw-r--r-- 1 root root  366 Nov 30 13:08 test_hybrid.py


In [37]:
# CELL 6: Test Hybrid System Components
import sys
sys.path.append('/content/FlavorGraph/hybrid_system')

from config import Config
from ingredient_processor import IngredientProcessor
from recipe_generator import RecipeGenerator

# Initialize components
config = Config()
print("\n=== Configuration Verification ===")
config.verify_paths()

print("\n=== Testing Ingredient Processor ===")
try:
    ingredient_processor = IngredientProcessor(config)
    print("✓ Ingredient data loaded successfully")
    print(f"Number of ingredients: {len(ingredient_processor.nodes_df)}")
    print(f"Number of categories: {len(ingredient_processor.categories_df)}")
    print(f"Embedding shape: {ingredient_processor.embeddings.shape}")
except Exception as e:
    print(f"✗ Error loading ingredient data: {str(e)}")

print("\n=== Testing Recipe Generator ===")
try:
    recipe_generator = RecipeGenerator(config)
    print("✓ T5 model and tokenizer loaded successfully")
    print(f"Model name: {config.MODEL_NAME}")
    print(f"Running on device: {recipe_generator.device}")
except Exception as e:
    print(f"✗ Error loading recipe generator: {str(e)}")

# Display sample of ingredient data
print("\n=== Sample Ingredient Data ===")
print("\nFirst 5 ingredients:")
print(ingredient_processor.nodes_df.head())

print("\nFirst 5 categories:")
print(ingredient_processor.categories_df.head())


=== Configuration Verification ===

=== Testing Ingredient Processor ===
✓ Ingredient data loaded successfully
Number of ingredients: 8298
Number of categories: 616
✗ Error loading ingredient data: 'dict' object has no attribute 'shape'

=== Testing Recipe Generator ===


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.56k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

✓ T5 model and tokenizer loaded successfully
Model name: flax-community/t5-recipe-generation
Running on device: cpu

=== Sample Ingredient Data ===

First 5 ingredients:
   node_id                    name  id   node_type  is_hub
0        0       1%_fat_buttermilk NaN  ingredient  no_hub
1        1   1%_fat_cottage_cheese NaN  ingredient  no_hub
2        3               10%_cream NaN  ingredient  no_hub
3        4               100%_bran NaN  ingredient  no_hub
4        5  10_inch_flour_tortilla NaN  ingredient  no_hub

First 5 categories:
         ingredient          category
0           abalone           Seafood
1             acorn          Nut/Seed
2  active_dry_yeast            Fungus
3       adzuki_bean  Cereal/Crop/Bean
4              agar   Plant/Vegetable


In [41]:
# CELL 7A: Update IngredientProcessor
%%writefile /content/FlavorGraph/hybrid_system/ingredient_processor.py
from config import Config
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import logging

logger = logging.getLogger(__name__)

class IngredientProcessor:
    def __init__(self, config):
        self.config = config
        self._load_data()

    def _load_data(self):
        """Load all necessary data files"""
        logger.info("Loading ingredient data...")
        # Load ingredient nodes
        self.nodes_df = pd.read_csv(self.config.NODES_FILE)

        # Load categories
        self.categories_df = pd.read_csv(self.config.CATEGORIES_FILE)

        # Load embeddings and convert to numpy array
        with open(self.config.EMBEDDING_FILE, 'rb') as f:
            embeddings_dict = pickle.load(f)
            # Convert dictionary to ordered numpy array
            self.ingredient_ids = sorted(embeddings_dict.keys())
            self.embeddings = np.array([embeddings_dict[id_] for id_ in self.ingredient_ids])

        logger.info(f"Loaded {len(self.nodes_df)} ingredients and {self.embeddings.shape[0]} embeddings")

    def find_similar_ingredients(self, ingredient_name, n=3):
        """Find similar ingredients based on embedding similarity"""
        # Get ingredient ID
        ingredient_row = self.nodes_df[self.nodes_df['name'] == ingredient_name]
        if len(ingredient_row) == 0:
            raise ValueError(f"Ingredient '{ingredient_name}' not found")

        ingredient_id = ingredient_row.index[0]
        if ingredient_id not in range(len(self.ingredient_ids)):
            raise ValueError(f"No embedding found for ingredient '{ingredient_name}'")

        # Get embedding and calculate similarities
        ingredient_embedding = self.embeddings[ingredient_id].reshape(1, -1)
        similarities = cosine_similarity(ingredient_embedding, self.embeddings)[0]

        # Get top similar ingredients (excluding self)
        similar_indices = np.argsort(similarities)[::-1][1:n+1]
        similar_ingredients = [
            {
                'name': self.nodes_df.iloc[idx]['name'],
                'similarity': similarities[idx],
                'category': self.get_ingredient_category(self.nodes_df.iloc[idx]['name'])
            }
            for idx in similar_indices
        ]

        return similar_ingredients

    def get_ingredient_category(self, ingredient_name):
        """Get category for an ingredient"""
        category_row = self.categories_df[self.categories_df['ingredient'] == ingredient_name]
        return category_row['category'].iloc[0] if len(category_row) > 0 else 'Unknown'

Overwriting /content/FlavorGraph/hybrid_system/ingredient_processor.py


In [42]:
# CELL 7B: Update RecipeGenerator
%%writefile /content/FlavorGraph/hybrid_system/recipe_generator.py
from transformers import AutoTokenizer, T5ForConditionalGeneration
from config import Config
import torch
import logging

logger = logging.getLogger(__name__)

class RecipeGenerator:
    def __init__(self, config):
        self.config = config
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self._load_model()

    def _load_model(self):
        """Load the T5 model and tokenizer"""
        logger.info("Loading T5 model and tokenizer...")
        self.tokenizer = AutoTokenizer.from_pretrained(self.config.MODEL_NAME)
        self.model = T5ForConditionalGeneration.from_pretrained(self.config.MODEL_NAME)
        self.model.to(self.device)

    def generate_recipe(self, ingredients):
        """Generate a recipe from a list of ingredients"""
        # Format input text
        input_text = "generate recipe: " + ", ".join(ingredients)

        # Tokenize and generate
        inputs = self.tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True)
        inputs = {k: v.to(self.device) for k, v in inputs.items()}

        # Generate recipe
        outputs = self.model.generate(
            **inputs,
            **self.config.GENERATION_CONFIG
        )

        # Decode and return recipe
        recipe = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return recipe

Overwriting /content/FlavorGraph/hybrid_system/recipe_generator.py


In [43]:
# CELL 8: Test Hybrid Recipe System
# Reload our updated modules
import importlib
import ingredient_processor
import recipe_generator
importlib.reload(ingredient_processor)
importlib.reload(recipe_generator)

from ingredient_processor import IngredientProcessor
from recipe_generator import RecipeGenerator

# Initialize components
config = Config()
ingredient_proc = IngredientProcessor(config)
recipe_gen = RecipeGenerator(config)

# Test similar ingredients
test_ingredient = "chicken"
print(f"\n=== Finding similar ingredients to '{test_ingredient}' ===")
similar_ingredients = ingredient_proc.find_similar_ingredients(test_ingredient)
for ing in similar_ingredients:
    print(f"- {ing['name']} (Category: {ing['category']}, Similarity: {ing['similarity']:.3f})")

# Test recipe generation
test_ingredients = ["chicken", "rice", "tomatoes"]
print(f"\n=== Generating recipe for {', '.join(test_ingredients)} ===")
recipe = recipe_gen.generate_recipe(test_ingredients)
print("\nGenerated Recipe:")
print(recipe)


=== Finding similar ingredients to 'chicken' ===
- cider (Category: Beverage, Similarity: 0.408)
- dried_calimyrna_fig (Category: Unknown, Similarity: 0.402)
- demi_glace (Category: Unknown, Similarity: 0.398)

=== Generating recipe for chicken, rice, tomatoes ===

Generated Recipe:
title: crock pot chicken and rice ingredients: 1 chicken, cut up and skinned 1 c. rice 1 can tomatoes, diced 2 cans v 8 juice 1 small can v 8.6 oz. tomatoes with green chilies directions: place chicken in ccrockpot. add rice, tomatoes and juice. cover and cook on low 8 to 9 hours.
