In [6]:
import os
from getpass import getpass

from langchain.chains.query_constructor.base import AttributeInfo
from langchain.docstore.document import Document
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain_openai import AzureChatOpenAI

### Load embeddings

In [7]:
from langchain_community.embeddings import HuggingFaceEmbeddings
# Name of the Hugging Face embedding model, kept in one place so it is easy to swap.
EMBEDDING_MODEL_NAME = "multi-qa-mpnet-base-dot-v1"
embedding_model = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

### Load data from S3

In [8]:
import boto3
import pandas as pd

# Bucket that holds the recipes dataset.
bucket_name = "recipes-rag"
# S3 client used to download the dataset file.
s3 = boto3.client("s3")

In [9]:
file_key = 'recipes_w_cleaning_time_combined_features.parquet'
# Build the local destination once so the download and the read cannot drift apart.
local_path = f'../data/{file_key}'

# The download fails if the target directory is missing (e.g. on a fresh
# checkout), so create it up front; exist_ok makes the cell safe to re-run.
os.makedirs(os.path.dirname(local_path), exist_ok=True)
s3.download_file(bucket_name, file_key, local_path)
df = pd.read_parquet(local_path)

df.head()

Unnamed: 0,Name,RecipeCategory,Description,Keywords_string,RecipeIngredientQuantities,RecipeIngredientParts,RecipeInstructions,AggregatedRating,ReviewCount,CookTime_Minutes,PrepTime_Minutes,TotalTime_Minutes,Combined_Features,Combined_Features_Clean
0,Low-Fat Berry Blue Frozen Dessert,Frozen Desserts,Make and share this Low-Fat Berry Blue Frozen ...,Dessert Low Protein Low Cholesterol Healthy Fr...,"[4, 1⁄4, 1, 1]","[blueberries, granulated sugar, vanilla yogurt...",Toss 2 cups berries with sugar. Let stand for ...,4.5,4.0,1440,45,1485,Low-Fat Berry Blue Frozen Dessert Frozen Desse...,Low-Fat Berry Blue Frozen Dessert Frozen Desse...
1,Biryani,Chicken Breast,Make and share this Biryani recipe from Food.com.,Chicken Thigh & Leg Chicken Poultry Meat Asian...,"[1, 4, 2, 2, 8, 1⁄4, 8, 1⁄2, 1, 1, 1⁄4, 1⁄4, 1...","[saffron, milk, hot green chili peppers, onion...",Soak saffron in warm milk for 5 minutes and pu...,3.0,1.0,25,240,265,Biryani Chicken Breast Make and share this Bir...,Biryani Chicken Breast Make share Biryani reci...
2,Best Lemonade,Beverages,This is from one of my first Good House Keepi...,Low Protein Low Cholesterol Healthy Summer < 6...,"[1 1⁄2, 1, None, 1 1⁄2, None, 3⁄4]","[sugar, lemons, rind of, lemon, zest of, fresh...","Into a 1 quart Jar with tight fitting lid, put...",4.5,10.0,5,30,35,Best Lemonade Beverages This is from one of my...,Best Lemonade Beverages one first Good House K...
3,Carina's Tofu-Vegetable Kebabs,Soy/Tofu,This dish is best prepared a day in advance to...,Beans Vegetable Low Cholesterol Weeknight Broi...,"[12, 1, 2, 1, 10, 1, 3, 2, 2, 2, 1, 2, 1⁄2, 1⁄...","[extra firm tofu, eggplant, zucchini, mushroom...","Drain the tofu, carefully squeezing out excess...",4.5,2.0,20,1440,1460,Carina's Tofu-Vegetable Kebabs Soy/Tofu This d...,Carina's Tofu-Vegetable Kebabs Soy/Tofu dish b...
4,Cabbage Soup,Vegetable,Make and share this Cabbage Soup recipe from F...,Low Protein Vegan Low Cholesterol Healthy Wint...,"[46, 4, 1, 2, 1]","[plain tomato juice, cabbage, onion, carrots, ...",Mix everything together and bring to a boil. R...,4.5,11.0,30,20,50,Cabbage Soup Vegetable Make and share this Cab...,Cabbage Soup Vegetable Make share Cabbage Soup...


### Generate Qdrant store with a small subset of data

#### Subset the data

In [10]:
# Upper bound on the number of recipes loaded into the vector store.
max_num_docs = 1000
# Fixed seed so that Restart & Run All draws the same rows every time.
RANDOM_SEED = 42

# Cap n at the frame length: sampling without replacement raises ValueError
# when n is larger than the number of available rows.
df_subset = df.sample(n=min(max_num_docs, len(df)), random_state=RANDOM_SEED)
df_subset.shape

(1000, 14)

In [11]:
# (metadata key, source column, fallback used when the cell is empty)
_METADATA_FIELDS = (
    ("name", "Name", "No Name Available"),
    ("description", "Description", "No Description Available"),
    ("recipe_category", "RecipeCategory", "No Category Available"),
    ("keywords", "Keywords_string", "No Keywords Available"),
    (
        "recipe_ingredient_parts",
        "RecipeIngredientParts",
        "No Recipe Ingredient Parts Available",
    ),
    (
        "recipe_instructions",
        "RecipeInstructions",
        "No Recipe Instructions Available",
    ),
    ("aggregated_rating", "AggregatedRating", "No Rating Available"),
    ("review_count", "ReviewCount", "No Reviews Available"),
)


def create_documents(df):
    """Convert each row of a recipes DataFrame into a ``Document``.

    Missing values are replaced with empty strings and every column is cast to
    ``str``, so all metadata values (including ratings and review counts) are
    stored as strings; non-scalar cells end up as their ``str()`` form. Empty
    values are then replaced by a human-readable "No ... Available" fallback.

    Every input row yields exactly one document. Rows without a rating are
    kept and receive the "No Rating Available" fallback. (An earlier version
    called ``dropna`` on ``AggregatedRating`` but discarded the result, so no
    rows were ever filtered; that no-op has been removed.)

    Args:
        df: DataFrame containing the columns ``Name``, ``Description``,
            ``RecipeCategory``, ``Keywords_string``, ``RecipeIngredientParts``,
            ``RecipeInstructions``, ``AggregatedRating``, ``ReviewCount`` and
            ``Combined_Features``. The frame is not modified.

    Returns:
        A list of ``Document`` objects, one per row and in row order, whose
        ``page_content`` is the row's ``Combined_Features`` text.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # fillna and astype both return new frames, so the caller's frame is left
    # untouched without needing an explicit deep copy.
    cleaned = df.fillna("").astype(str)

    documents = []
    for _index, row in cleaned.iterrows():
        # After the string cast, a missing value is "" (falsy) and falls back.
        metadata = {
            key: row[column] or fallback
            for key, column, fallback in _METADATA_FIELDS
        }
        page_content = row["Combined_Features"] or "No Content Available"
        documents.append(Document(page_content=page_content, metadata=metadata))

    return documents

In [12]:
# Turn the sampled recipes into Document objects (one per row).
documents = create_documents(df_subset)

In [13]:
# Inspect the first document to sanity-check its metadata and page content.
documents[0]

Document(metadata={'name': 'Slow Cooker Corn Chowder', 'description': 'I got this recipe from Allrecipes where Christine Benjamin submitted it. I have tweaked it to my families taste and we adore it! Nothing better on a cold winter evening.', 'recipe_category': 'Chowders', 'keywords': 'Corn Vegetable Low Cholesterol Healthy Weeknight', 'recipe_ingredient_parts': "['potatoes' 'butter' 'onions' 'ham' 'chicken bouillon cubes' 'celery'\n 'evaporated milk' 'whole kernel corn' 'carrots']", 'recipe_instructions': 'In a slow cooker, place potatoes, onions, ham, celery, carrots, red pepper, whole kernel corn, butter, and salt and pepper to taste. Add enough water to cover and the bouillon cubes. Cook on low setting for 8 to 9 hours and then stir in evaporated milk and cream corn. Cook for 30 minutes more. Add mashed potatoes flakes if you like a thicker chowder. Stir to blend well.', 'aggregated_rating': '5.0', 'review_count': '10.0'}, page_content='Slow Cooker Corn Chowder Chowders I got this 

In [14]:
from langchain_community.vectorstores import Qdrant

# Embed the documents and index them in a Qdrant instance at ":memory:".
store = Qdrant.from_documents(
    documents=documents,
    embedding=embedding_model,
    location=":memory:",
)

## DON'T COMMIT THIS VALUE TO THE GITHUB REPO

In [15]:
# Never hardcode the key in this cell. Use the value already present in the
# environment; otherwise prompt for it without echoing, so the secret is never
# written into the notebook source.
# NOTE(review): a key literal was previously stored in this cell; treat that key
# as compromised and rotate it.
if not os.environ.get("AZURE_OPENAI_API_KEY"):
    os.environ["AZURE_OPENAI_API_KEY"] = getpass("Azure OpenAI API key: ")

In [16]:
# Connection settings for the AzureChatOpenAI model, named so they are easy to find.
AZURE_OPENAI_API_VERSION = "2024-06-01"
AZURE_OPENAI_DEPLOYMENT = "capstone_gpt4o"
AZURE_OPENAI_ENDPOINT = "https://capstone1.openai.azure.com/"

# The API key is read from the environment rather than written here.
llm = AzureChatOpenAI(
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    azure_deployment=AZURE_OPENAI_DEPLOYMENT,
    openai_api_version=AZURE_OPENAI_API_VERSION,
    openai_api_key=os.environ["AZURE_OPENAI_API_KEY"],
)

### Validate that the Azure OpenAI integration is working


In [17]:
# Smoke test: send one small translation request through the configured model.
system_message = (
    "system",
    "You are a helpful assistant that translates English to French. Translate the user sentence.",
)
human_message = ("human", "I love programming.")
messages = [system_message, human_message]
llm.invoke(messages)

AIMessage(content="J'adore la programmation.", response_metadata={'token_usage': {'completion_tokens': 5, 'prompt_tokens': 31, 'total_tokens': 36}, 'model_name': 'gpt-4o-2024-05-13', 'system_fingerprint': 'fp_abc28019ad', 'prompt_filter_results': [{'prompt_index': 0, 'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 'safe'}}}], 'finish_reason': 'stop', 'logprobs': None, 'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'filtered': False, 'severity': 'safe'}, 'violence': {'filtered': False, 'severity': 'safe'}}}, id='run-4e1d642c-5b87-41dd-8a2d-16f65515cf1e-0')

### Define metadata

In [18]:
# (metadata field name, description) pairs; every field is declared as a string.
_FIELD_DESCRIPTIONS = [
    ("name", "The name of the recipe"),
    ("description", "A brief description of the recipe"),
    (
        "recipe_category",
        "The category of the recipe, such as 'Quick Breads', 'Desserts', etc.",
    ),
    ("keywords", "Keywords associated with the recipe"),
    ("recipe_ingredient_parts", "The ingredients required for the recipe"),
    ("recipe_instructions", "The instructions to prepare the recipe"),
    ("aggregated_rating", "The aggregated rating for the recipe"),
    ("review_count", "The number of reviews for the recipe"),
]

metadata_field_info = [
    AttributeInfo(name=field_name, description=field_description, type="string")
    for field_name, field_description in _FIELD_DESCRIPTIONS
]

# Short description of what each document's page content holds.
document_content_description = "Detailed information about a recipe"

In [19]:
# Build the self-query retriever; the vector store must already be initialized.
retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=store,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info,
)

In [20]:
# Example usage of the retriever: run one natural-language query and print each hit.
query = "Show me a quick bread recipe that uses honey as an ingredient and is easy to make."
results = retriever.invoke(query)

# (printed label, metadata key) pairs, in display order.
printed_fields = [
    ("Recipe Name:", "name"),
    ("Description:", "description"),
    ("Category:", "recipe_category"),
    ("Keywords:", "keywords"),
    ("Ingredients:", "recipe_ingredient_parts"),
    ("Instructions:", "recipe_instructions"),
    ("Rating:", "aggregated_rating"),
    ("Reviews:", "review_count"),
]

for result in results:
    for label, key in printed_fields:
        print(label, result.metadata[key])
    print("Content:", result.page_content)
    print("\n")


RateLimitError: Error code: 429 - {'error': {'code': '429', 'message': 'Requests to the ChatCompletions_Create Operation under Azure OpenAI API version 2024-06-01 have exceeded token rate limit of your current OpenAI S0 pricing tier. Please retry after 86400 seconds. Please go here: https://aka.ms/oai/quotaincrease if you would like to further increase the default rate limit.'}}