## About the sheet:

1. Creates two retreivers, first retreiver a standard one. Creates the semantic embeddings, picks top k chunks.

2. Second retreiver, enhanced descriptions, embedding generation with BM25 and semantically. Reranking the chunks to pick top k.

In [4]:
import boto3
import numpy as np
import json
import pandas as pd
import time
import os
from sentence_transformers import SentenceTransformer
import faiss
import hashlib
from rank_bm25 import BM25Okapi
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle
from sentence_transformers import CrossEncoder
import requests
from sklearn.metrics.pairwise import cosine_similarity 


In [5]:
df = pd.read_csv("Imputed_decription.csv").drop(["Unnamed: 0.1", "Unnamed: 0"], axis = 1)
df["categories"] = np.where(df["categories"].isna(), "Unknown", df["categories"])
df["rating"] = np.where(df["rating"].isna(), 0, df["rating"])
df["review_count"] = np.where(df["review_count"].isna(), 0, df["review_count"])
df["price"] = np.where(df["price"].isna(), "$", df["price"])

In [6]:
QnA = pd.read_csv("QnA.csv")


In [9]:
from dotenv import load_dotenv
import os


load_dotenv("Credentials")  

bedrock_client = boto3.client(
    service_name = "bedrock-runtime",
    region_name="us-east-2",
    aws_access_key_id = os.getenv("aws_access_key_id"), 
    aws_secret_access_key = os.getenv("aws_secret_access_key")
)

In [30]:
def add_restaurant_id(df):

    address_cols = ['address1', 'city', 'state', 'zip_code', 'country']
    
    df[address_cols] = df[address_cols].fillna('').astype(str)
    df['address_fingerprint'] = df[address_cols].apply(lambda x: '|'.join(x), axis=1)
    
    df['restaurant_id'] = df.apply(
        lambda x: hashlib.sha256(
            f"{x['restaurant_name']}_{x['address_fingerprint']}".encode()
        ).hexdigest()[:12],  
        axis=1
    )
    
    return df.drop(columns=['address_fingerprint'])


df = add_restaurant_id(df)

def _normalize_price(price_str):
    price_str = str(price_str).strip().lower()
    return {
        '$': 'low',
        '$$': 'mid',
        '$$$': 'high',
        'low': 'low',
        'mid': 'mid',
        'high': 'high'
    }.get(price_str, 'unknown')

    
def _clean_categories(category_str):
    """Normalize and split category strings"""
    if pd.isna(category_str):
        return []
    return [
        cat.strip().title()
        for cat in str(category_str).split(',')
        if cat.strip() != ''
    ]


def _normalize_price(price_str):
    """Standardize price indicators"""
    price_str = str(price_str).strip().lower()
    return {
        '$': 'low',
        '$$': 'mid',
        '$$$': 'high',
        'low': 'low',
        'mid': 'mid',
        'high': 'high'
    }.get(price_str, 'unknown')

def prepare_chunks(df):
    """Create item-wise chunks for RAG with enriched metadata"""
    chunks = []
        
    for _, row in df.iterrows():
        ingredients = [ing.strip() for ing in row['ingredient_name'].split(',') if ing.strip()]
        
        description = row['menu_description'].strip()
        
        if description == "Unknown":
            description = f"A {row['menu_category']} featuring {', '.join(ingredients)}"
        
        # Build chunk text
        chunk_text = (
            f"{row['menu_item']} ({row['menu_category']}) at {row['restaurant_name']}. "
            f"Description: {description}. Ingredients: {', '.join(ingredients)}. "
            f"Price: {_normalize_price(row['price'])}. Location: {row['city']}, {row['state']}, {row['zip_code']}."
        )
        
        metadata = {
            "restaurant_id": row['restaurant_id'],
            "restaurant_name": row['restaurant_name'].strip(),
            "menu_category": row['menu_category'].strip().title(),
            "menu_description": row['menu_description'].strip(),
            "menu_item": row.get('menu_item', 'Unknown Item').strip(), 
            "price_tier": _normalize_price(row['price']),
            "cuisine_types": [ct.strip() for ct in row['categories'].split(',') if ct.strip()],
            "ingredients": ingredients,
            "location": f"{row['city']}, {row['state']}",
            "item_id": row['item_id']
        }
        
        chunks.append({"text": chunk_text, "metadata": metadata})
    
    return chunks

def get_titan_embeddings(texts, dimensions=512):
    embeddings = []
    cnt = 0 
    for row, text in enumerate(texts):

        body = json.dumps({
            "inputText": text,
            "dimensions": dimensions,
            "normalize": True
        })
        
        response = bedrock_client.invoke_model(
            modelId="amazon.titan-embed-text-v2:0",
            contentType="application/json",
            accept="*/*",
            body=body
        )
        
        # Parse the response
        response_body = json.loads(response['body'].read())
        
        # Extract the embedding
        embedding = response_body['embedding']
        embeddings.append(embedding)
        cnt+=1

        if (cnt %1000) == 0:
            print(cnt)
    
    return np.array(embeddings)

In [29]:

def create_tfidf_pipeline(chunks):
    """Create TF-IDF vectorizer and matrix using chunk texts"""
    corpus = [chunk['text'] for chunk in chunks]
    vectorizer = TfidfVectorizer(
        stop_words='english',
        ngram_range=(1, 3),
        min_df=2,
        max_df=0.9,
        sublinear_tf=True
    )
    return vectorizer, vectorizer.fit_transform(corpus)

def create_faiss_index(embeddings):
    """Create FAISS index with dimension validation"""
    embeddings = np.array(embeddings).astype('float32')
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index


def retrieve_with_tfidf(query, chunks, vectorizer, tfidf_matrix, top_n=50):
    query_vec = vectorizer.transform([query])
    
    cosine_sim = cosine_similarity(query_vec, tfidf_matrix).flatten()
    
    # Pair scores with chunks using matrix row indices
    scored_chunks = sorted(
        [(i, score) for i, score in enumerate(cosine_sim)],
        key=lambda x: x[1], 
        reverse=True
    )[:top_n]
    
    return [{
        "metadata": {
            **chunks[i]["metadata"],
            "text": chunks[i]["text"],
            "chunk_id": chunks[i]["chunk_id"]
        },
        "text": chunks[i]["text"],
        "chunk_id": chunks[i]["chunk_id"],
        "score": float(score)
    } for i, score in scored_chunks]
    
def refine_with_bm25(query, tfidf_results, top_n=20):
    texts = [res["text"] for res in tfidf_results]
    bm25 = BM25Okapi([doc.split() for doc in texts])
    scores = bm25.get_scores(query.split())
    
    return [{
        "metadata": {  # Move all fields into metadata
            **res["metadata"],
            "text": res["text"],
            "chunk_id": res["chunk_id"]
        },
        "text": res["text"],
        "chunk_id": res["chunk_id"],
        "score": float(scores[i])
    } for i, res in enumerate(tfidf_results)][:top_n]

def retrieve_with_embeddings(query, faiss_index, faiss_index_to_chunk, top_n=20):
    query_embedding = get_titan_embeddings([query])
    distances, indices = faiss_index.search(
        np.array(query_embedding).astype('float32'), 
        top_n
    )
    
    return [{
        "metadata": {  # Move all fields into metadata
            **faiss_index_to_chunk[idx],
            "text": faiss_index_to_chunk[idx]["text"],
            "chunk_id": faiss_index_to_chunk[idx]["chunk_id"]
        },
        "text": faiss_index_to_chunk[idx]["text"],
        "chunk_id": faiss_index_to_chunk[idx]["chunk_id"],
        "score": float(1 / (1 + distances[0][i]))
    } for i, idx in enumerate(indices[0]) if idx in faiss_index_to_chunk]
    
def hybrid_retrieval(query, chunks, vectorizer, tfidf_matrix, 
                    faiss_index, faiss_index_to_chunk, reranker, top_k=5):
    """Hybrid retrieval with consistent metadata structure"""
    # Retrieve and refine results
    tfidf_results = retrieve_with_tfidf(query, chunks, vectorizer, tfidf_matrix, 50)
    bm25_results = refine_with_bm25(query, tfidf_results, 20)
    vector_results = retrieve_with_embeddings(query, faiss_index, faiss_index_to_chunk, 20)
    
    # Combine and deduplicate
    seen = set()
    combined = []
    for res in bm25_results + vector_results:
        uid = res["metadata"].get("item_id", f"{res['metadata']['restaurant_id']}_{res['metadata']['menu_item']}")
        if uid not in seen:
            seen.add(uid)
            combined.append(res)
    
    # Rerank final results
    return rerank_results(query, combined, reranker, top_k)
    
    


def rerank_results(query, combined_results, reranker, top_k=5):
    """Re-rank retrieved results using a Cross-Encoder"""
    if not combined_results:
        return []

    query_doc_pairs = [(query, res["text"]) for res in combined_results]

    scores = reranker.predict(query_doc_pairs)

    for i, res in enumerate(combined_results):
        res['combined_score'] = scores[i]
        res['reranker_score'] = scores[i]  
    
    sorted_results = sorted(combined_results, key=lambda x: x["reranker_score"], reverse=True)[:top_k]
    
    return sorted_results

def initialize_system(chunks, embeddings):
    chunk_embeddings = np.array(embeddings).astype('float32')
    faiss_index = create_faiss_index(chunk_embeddings)
    
    # Create index mapping
    faiss_index_to_chunk = {
        i: {
            **chunk["metadata"],
            "text": chunk["text"],
            "chunk_id": chunk["chunk_id"]
        } for i, chunk in enumerate(chunks)
    }
    
    vectorizer, tfidf_matrix = create_tfidf_pipeline(chunks)
    
    return {
        "faiss_index": faiss_index,
        "faiss_index_to_chunk": faiss_index_to_chunk,
        "vectorizer": vectorizer,
        "tfidf_matrix": tfidf_matrix,
        "reranker": CrossEncoder("cross-encoder/ms-marco-MiniLM-L-6-v2")
    }

def generate_final_response(query, retrieved_chunks, historic_context, bedrock_client):
    restaurant_entries = []
    for chunk in retrieved_chunks[:3]: 
        meta = chunk.get('metadata', {})
        entry = (
            f"Restaurant: {meta.get('restaurant_name', 'Unknown')}\n"
            f"- Menu Item: {meta.get('menu_item', 'N/A')} (ID: {meta.get('item_id', '')})\n"
            f"- Price: {meta.get('price_tier', '').capitalize()}\n"
            f"- Cuisine: {', '.join(meta.get('cuisine_types', []))}\n"
            f"- Ingredients: {', '.join(meta.get('ingredients', []))}"
        )
        restaurant_entries.append(entry)

    # Preparing historical context if available
    history_block = ""
    if historic_context.get('Historic_context', -1) not in [-1, "Data not available on Wikipedia"]:
        history_block = (
            f"\n\nHistorical Context: {historic_context['Historic_context']}"
            f"\nSource: {historic_context['link']}"
        )

    generation_prompt = f"""You are a restaurant information specialist. Use this structure:

    {{restaurants}}

    {{history}}

    **Guidelines**
    - Present restaurants in bullet points
    - ALWAYS include item IDs from metadata
    - For history: Only include if relevant to query
    - If asked sources: Reference item IDs like "(ID: 24399115)"
    - Never mention chunks or retrieval process

    **Current Query**: {query}
    """

    payload = {
        "modelId": "arn:aws:bedrock:us-east-2:302263051966:inference-profile/us.anthropic.claude-3-5-haiku-20241022-v1:0",
        "body": json.dumps({
            "anthropic_version": "bedrock-2023-05-31",
            "max_tokens": 500,
            "temperature": 0.2,
            "system": """You are a restaurant concierge for San Francisco. Follow these rules:
            1. Use ONLY the provided restaurant metadata
            2. For history: Use only the provided Wikipedia context
            3. Always cite item IDs from metadata
            4. If no history requested: Don't mention it""",
            "messages": [{
                "role": "user",
                "content": [{
                    "type": "text",
                    "text": generation_prompt.format(
                        restaurants="\n\n".join(restaurant_entries),
                        history=history_block
                    )
                }]
            }]
        })
    }

    try:
        response = bedrock_client.invoke_model(**payload)
        result = json.loads(response["body"].read().decode("utf-8"))
        raw_response = result.get("content", [{}])[0].get("text", "")
        
        return raw_response.replace("(ID:", "(Item ID:").replace("ID:", "Item ID:")
    
    except Exception as e:
        print(f"Generation error: {str(e)[:200]}")
        return f"Could not generate response. Please check your query. Reference IDs: {', '.join([str(c['metadata']['item_id']) for c in retrieved_chunks[:3]])}"
        
def find_information_missing_in_chunks(query, retrieved_chunks, bedrock_client):
    
    enhancement_prompt = f"""Analyze the given restaurant-related query and retrieved search results.  
    Determine if the query explicitly asks for historical or cultural context about a dish, ingredient, or restaurant practice,  
    and check whether this information is present in the retrieved search results.  

    **INPUT:**  
    - **Query:** {query}  
    - **Retrieved Chunks:** {retrieved_chunks}  

    **OUTPUT GUIDELINES:**  
    - If the query **does not explicitly** ask for historical/cultural context (e.g., it only asks for restaurants or locations), return an **empty string** (`""`).  
    - If the query **explicitly** asks for historical or cultural context, check if the retrieved chunks answer it.  
    - If the historical or cultural context is required but **not answered** in the retrieved chunks, return a **concise Wikipedia search term** in the format: `"History of <topic>"`.  
    - Do **not** generate historical terms unless they are explicitly requested.  
    - Do **not** include any extra text, explanations, or formatting beyond the required search term.  
"""

    payload = {
        "modelId": "arn:aws:bedrock:us-east-2:302263051966:inference-profile/us.anthropic.claude-3-5-haiku-20241022-v1:0",
        "body": json.dumps({
            "anthropic_version": "bedrock-2023-05-31",
            "max_tokens": 200,
            "temperature": 0.3,
            "system": """You are an expert in restaurant data analysis.  
Your task is to determine if a restaurant-related query requires missing historical or cultural context.  
- Only return a Wikipedia search term **if the query explicitly asks for historical or cultural context** and the retrieved information does not answer it.  
- If the query is about finding restaurants, return an empty string.  
- Do not assume historical context is needed just because a food item is mentioned.""",
            "messages": [{
                "role": "user",
                "content": [{
                    "type": "text", 
                    "text": enhancement_prompt
                }]
            }]
        })
    }

    try:
        response = bedrock_client.invoke_model(**payload)
        result = json.loads(response["body"].read().decode("utf-8"))
        search_term = result.get("content", [{}])[0].get("text", "").strip()
        
        if search_term:
            wiki_summary =  fetch_wikipedia_summary(search_term[1: -1])
            if wiki_summary:
                url = f"https://en.wikipedia.org/api/rest_v1/page/summary/"
                link = url + search_term[1:-1]
                return {"Historic_context" : wiki_summary, "link" : link}
            else:
                return {"Historic_context" : "Data not available on Wikipedia", "link" : "Data not available on Wikipedia"}

        return {"Historic_context" : -1, "link" : "Data not available on Wikipedia"}
        
    except Exception as e:
        print(f"Analysis error: {str(e)[:200]}")
        return {"Historic_context" : -1, "link" : "Data not available on Wikipedia"}
        
def fetch_wikipedia_summary(title):
    url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{title}"
    response = requests.get(url)
    return response.json().get("extract", "-1")


In [23]:
from sentence_transformers import CrossEncoder

reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')


query = QnA["question"][0]

system = initialize_system(chunks, loaded_embeddings)


results = hybrid_retrieval(
    query_text,
    chunks=chunks[:1100],
    vectorizer=system["vectorizer"],
    tfidf_matrix=system["tfidf_matrix"],
    faiss_index=system["faiss_index"],
    faiss_index_to_chunk=system["faiss_index_to_chunk"],
    reranker=system["reranker"],
    top_k=5
)

for res in results:
    print(f"""\
[Chunk {res['chunk_id']}] {res['metadata']['menu_item']}
Combined Score: {res['combined_score']:.2f} (Reranker: {res['reranker_score']:.2f}, Original: {res['score']:.2f})
{res['metadata']['menu_description']}
Ingredients: {', '.join(res['metadata']['ingredients'])}
{'-'*40}""")

NameError: name 'loaded_embeddings' is not defined

In [None]:
# Initialize variables to track hit scores
hit_scores = []

# Iterate over QnA dataset to evaluate retrieval performance
for i, row in QnA.iterrows():
    query_text = row["question"]  
    correct_chunk_id = row["chunk_id"]  

    
    # Retrieve top-k results using the hybrid retrieval system
    retrieved_results = hybrid_retrieval(
        query_text,
        chunks=chunks[:1100],  # Ensure consistent chunk structure
        vectorizer=system["vectorizer"],
        tfidf_matrix=system["tfidf_matrix"],
        faiss_index=system["faiss_index"],
        faiss_index_to_chunk=system["faiss_index_to_chunk"],
        reranker=system["reranker"],
        top_k=5
    )

    # Check if the correct chunk is among retrieved results
    hit = int(any(result["chunk_id"] == correct_chunk_id for result in retrieved_results))
    hit_scores.append(hit)

    # Debugging info
    print(f"Question: {query_text}")
    print(f"Correct Chunk ID: {correct_chunk_id}")
    print(f"Retrieved Chunk IDs: {[result['chunk_id'] for result in retrieved_results]}")
    print(f"Hit: {hit}\n")

# Compute final hit rate
overall_hit_rate = sum(hit_scores) / len(hit_scores)
print(f"Overall Hit Rate: {overall_hit_rate * 100:.2f}%")


Question: In the heart of San Francisco's vibrant culinary scene, which liquid velvet whispers tales of Portuguese tradition while dancing with dark chocolate and caramel notes?
Correct Chunk ID: 0
Retrieved Chunk IDs: [0, 580, 896, 252, 132]
Hit: 1

Question: What artisanal San Francisco treat transforms a humble nut into a crave-worthy delicacy with just a whisper of oceanic seasoning?
Correct Chunk ID: 1
Retrieved Chunk IDs: [131, 1, 794, 4, 0]
Hit: 1

Question: What Mediterranean snack transforms a simple olive into a zesty flavor adventure with just two unexpected botanical companions?
Correct Chunk ID: 2
Retrieved Chunk IDs: [2, 232, 876, 9, 865]
Hit: 1

Question: Where can you find a sweet, spreadable creation that marries fruit and dairy in the Mission District's culinary landscape?
Correct Chunk ID: 3
Retrieved Chunk IDs: [12, 217, 131, 205, 247]
Hit: 0

Question: What culinary creation transforms a simple spread into a symphony of textures that could make a cheese board blush

In [105]:
overall_hit_rate

0.48846153846153845

## Lets see how the chunks with better descriptions behave 

In [10]:
with open('chunks_new.pkl', 'rb') as f:
    chunks_enhanced = pickle.load(f)

In [11]:
texts_enhanced = [chunk["text"] for chunk in chunks_enhanced]
embeddings_enhanced = get_titan_embeddings(texts_enhanced[:1100])

1000


In [123]:

with open('embeddings_enhanced.pkl', 'wb') as f:
    pickle.dump(embeddings_enhanced, f)

with open('embeddings_enhanced.pkl', 'rb') as f:
    loaded_embeddings_enhanced = pickle.load(f)


In [31]:

system = initialize_system(chunks_enhanced[:1100], loaded_embeddings_enhanced)

query = "Resturants serving sushi?"

results = hybrid_retrieval(
    query,
    chunks=chunks_enhanced[:1100],
    vectorizer=system["vectorizer"],
    tfidf_matrix=system["tfidf_matrix"],
    faiss_index=system["faiss_index"],
    faiss_index_to_chunk=system["faiss_index_to_chunk"],
    reranker=system["reranker"],
    top_k=5
)

for res in results:
    print(f"""\
[Chunk {res['chunk_id']}] {res['metadata']['menu_item']}
Combined Score: {res['combined_score']:.2f} (Reranker: {res['reranker_score']:.2f}, Original: {res['score']:.2f})
{res['metadata']['menu_description']}
Ingredients: {', '.join(res['metadata']['ingredients'])}
{'-'*40}""")

[Chunk 689] 3 sushi special combo
Combined Score: -3.93 (Reranker: -3.93, Original: 0.00)
eight pieces of caterpillar roll two pieces of salmon nigiri two pieces of tuna nigiri
Ingredients: caterpillar roll, salmon, tuna
----------------------------------------
[Chunk 727] 303 hamachi nigiri sushi
Combined Score: -3.95 (Reranker: -3.95, Original: 0.00)
yellow tail two pieces
Ingredients: yellow tail, rice
----------------------------------------
[Chunk 726] 301 maguro nigiri sushi
Combined Score: -4.31 (Reranker: -4.31, Original: 0.00)
tuna two pieces
Ingredients: rice, tuna
----------------------------------------
[Chunk 725] 302 sake nigiri sushi
Combined Score: -4.52 (Reranker: -4.52, Original: 0.00)
salmon two pieces
Ingredients: salmon, rice
----------------------------------------
[Chunk 730] 305 ebi nigiri sushi
Combined Score: -4.83 (Reranker: -4.83, Original: 0.00)
cooked shrimp two pieces
Ingredients: cooked shrimp, rice
----------------------------------------


In [125]:
# Initialize variables to track hit scores
hit_scores = []

# Iterate over QnA dataset to evaluate retrieval performance
for i, row in QnA.iterrows():
    query_text = row["question"]  
    correct_chunk_id = row["chunk_id"]  

    
    # Retrieve top-k results using the hybrid retrieval system
    retrieved_results = hybrid_retrieval(
    query_text,
    chunks=chunks_enhanced[:1100],
    vectorizer=system["vectorizer"],
    tfidf_matrix=system["tfidf_matrix"],
    faiss_index=system["faiss_index"],
    faiss_index_to_chunk=system["faiss_index_to_chunk"],
    reranker=system["reranker"],
    top_k=5
)
    # Check if the correct chunk is among retrieved results
    hit = int(any(result["chunk_id"] == correct_chunk_id for result in retrieved_results))
    hit_scores.append(hit)


# Compute final hit rate
overall_hit_rate = sum(hit_scores) / len(hit_scores)
print(f"Overall Hit Rate: {overall_hit_rate * 100:.2f}%")


Overall Hit Rate: 65.77%


> Enhanced description really helped with better retrieval.  The hit rate increased by near about 22%.
> 
> Lets work on the generation end now.

> This function would help extract data from wikipedia summary page on it

> Lets work on 75% data atleast.

> I would enhance 7500 chunks out of some 10k chunks.

In [12]:

def enhance_description_with_llm(item_data, bedrock_client):
    """
    Enriches menu item descriptions using LLM to better match potential user queries.
    """
    enhancement_prompt = f"""Analyze this menu item and enhance its description to include:
1. Sensory details (texture, aroma, taste)
2. Pairing suggestions
3. Dietary information
4. Usage scenarios
5. Complementary ingredients
6. Also include cuisines, cultural/historical context around the dishes.

**Output GUIDELINES**
- Keep it concise. Less than 5 sentences.
- Maintain factual accuracy.
- Do not start with "here's an enhanced description". Just give the description.

Original Data:
{json.dumps(item_data, indent=2)}

Enhanced Description:"""

    payload = {
        "modelId": "arn:aws:bedrock:us-east-2:302263051966:inference-profile/us.anthropic.claude-3-5-haiku-20241022-v1:0",
        "contentType": "application/json",
        "accept": "application/json",
        "body": json.dumps({
            "anthropic_version": "bedrock-2023-05-31",
            "max_tokens": 150,
            "temperature": 0.2,
            "system": "You are a culinary assistant that enhances menu descriptions for better searchability while maintaining accuracy.",
            "messages": [
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": enhancement_prompt}
                    ]
                }
            ]
        })
    }

    try:
        response = bedrock_client.invoke_model(**payload)
        result = json.loads(response["body"].read().decode("utf-8"))
        return next(item["text"].strip() for item in result["content"] if item["type"] == "text")
    except Exception as e:
        print(f"Enhancement error: {e}")
        return item_data.get('menu_description', '')  # Fallback to original

def generate_enhanced_chunk(chunk, bedrock_client):
    enhanced_desc = enhance_description_with_llm(chunk['metadata'], bedrock_client)
    
    enhanced_text = (
        f"{chunk['metadata']['menu_item']} ({chunk['metadata']['menu_category']}) at {chunk['metadata']['restaurant_name']}. "
        f"Description: {enhanced_desc} "
        f"Ingredients: {', '.join(chunk['metadata']['ingredients'])}. "
        f"Pairings: {chunk['metadata'].get('pairings', 'Excellent with desserts or cheese boards')}. "
        f"Dietary: {chunk['metadata'].get('dietary', 'Contains alcohol' if 'wine' in chunk['metadata']['menu_category'].lower() else '')}. "
        f"Price: {chunk['metadata']['price_tier']}. Location: {chunk['metadata']['location']}"
    )
    
    return {
        "text": enhanced_text,
        "metadata": chunk["metadata"],
        "chunk_id": chunk["chunk_id"]
    }



In [13]:
df = pd.read_csv("Imputed_decription.csv").drop(["Unnamed: 0.1", "Unnamed: 0"], axis = 1)
df["categories"] = np.where(df["categories"].isna(), "Unknown", df["categories"])
df["rating"] = np.where(df["rating"].isna(), 0, df["rating"])
df["review_count"] = np.where(df["review_count"].isna(), 0, df["review_count"])
df["price"] = np.where(df["price"].isna(), "$", df["price"])

df = add_restaurant_id(df)

chunks = prepare_chunks(df)
len(chunks)

10571

In [14]:
for i, chunk in enumerate(chunks):
    chunk["chunk_id"] = i

In [15]:
with open('chunks_new.pkl', 'rb') as f:
    chunks_enhanced = pickle.load(f)


In [80]:
start_idx = 1174

while start_idx < len(chunks):

    chunks_enhanced.append(generate_enhanced_chunk(chunks[start_idx], bedrock_client))
    print(start_idx, end =", ")
        
    if (start_idx + 1) %500 == 0:
        embeddings_enhanced_file = "enhancing_chunks_" + str(start_idx) + '.pkl'
        with open(embeddings_enhanced_file, 'wb') as f:
            pickle.dump(chunks_enhanced, f)

    start_idx+=1



1174, 1175, 1176, 1177, 1178, 1179, 1180, 1181, 1182, 1183, 1184, 1185, 1186, 1187, 1188, 1189, 1190, 1191, 1192, 1193, 1194, 1195, 1196, 1197, 1198, 1199, 1200, 1201, 1202, 1203, 1204, 1205, 1206, 1207, 1208, 1209, 1210, 1211, 1212, 1213, 1214, 1215, 1216, 1217, 1218, 1219, 1220, 1221, 1222, 1223, 1224, 1225, 1226, 1227, 1228, 1229, 1230, 1231, 1232, 1233, 1234, 1235, 1236, 1237, 1238, 1239, 1240, 1241, 1242, 1243, 1244, 1245, 1246, 1247, 1248, 1249, 1250, 1251, 1252, 1253, 1254, 1255, 1256, 1257, 1258, 1259, 1260, Enhancement error: An error occurred (ThrottlingException) when calling the InvokeModel operation (reached max retries: 4): Too many requests, please wait before trying again.
1262, 1263, 1264, 1265, 1266, 1267, 1268, 1269, 1270, 1271, 1272, 1273, 1274, 1275, 1276, 1277, 1278, 1279, 1280, 1281, 1282, 1283, 1284, 1285, 1286, 1287, 1288, 1289, 1290, 1291, 1292, 1293, 1294, 1295, 1296, 1297, 1298, 1299, 1300, 1301, 1302, 1303, 1304, 1305, 1306, 1307, 1308, 1309, 1310, 1311, 13


KeyboardInterrupt



In [83]:
with open("embeddings_enhanced_file_1543.pkl", 'wb') as f:
    pickle.dump(chunks_enhanced, f)


In [None]:
start_idx = 1544

while start_idx < len(chunks):

    chunks_enhanced.append(generate_enhanced_chunk(chunks[start_idx], bedrock_client))
    print(start_idx, end =", ")
        
    if ((start_idx + 1) %500) == 0:
        embeddings_enhanced_file = "enhancing_chunks_" + str(start_idx) + '.pkl'
        with open(embeddings_enhanced_file, 'wb') as f:
            pickle.dump(chunks_enhanced, f)
        print("Embedding file name: ", embeddings_enhanced_file)

    start_idx+=1



1544, 1545, 1546, 1547, 1548, 1549, 1550, 1551, 1552, 1553, 1554, 1555, 1556, 1557, 1558, 1559, 1560, 1561, 1562, 1563, 1564, 1565, 1566, 1567, 1568, 1569, 1570, 1571, 1572, 1573, 1574, 1575, 1576, 1577, 1578, 1579, 1580, 1581, 1582, 1583, 1584, 1585, 1586, 1587, Enhancement error: An error occurred (ThrottlingException) when calling the InvokeModel operation (reached max retries: 4): Too many requests, please wait before trying again.
1589, 1590, 1591, 1592, 1593, 1594, 1595, 1596, 1597, 1598, 1599, 1600, 1601, 1602, 1603, 1604, 1605, 1606, 1607, 1608, 1609, 1610, 1611, 1612, 1613, 1614, 1615, 1616, 1617, 1618, 1619, 1620, 1621, 1622, 1623, 1624, 1625, 1626, 1627, 1628, 1629, 1630, 1631, 1632, 1633, 1634, 1635, 1636, 1637, 1638, 1639, 1640, 1641, 1642, 1643, 1644, 1645, 1646, 1647, 1648, 1649, 1650, 1651, 1652, 1653, 1654, 1655, 1656, 1657, 1658, 1659, 1660, 1661, 1662, Enhancement error: An error occurred (ThrottlingException) when calling the InvokeModel operation (reached max retri

> We were able to do 7500 chunks. Thats nearly 75% data

In [19]:
with open('enhancing_chunks_7499.pkl', 'rb') as f:
    enhanced_chunks_7500 = pickle.load(f)


In [17]:
enhanced_chunks_7500 = [chunk["text"] for chunk in enhanced_chunks_7500]
enhanced_chunk_embeddings_7500 = get_titan_embeddings(enhanced_chunks_7500)


1000
2000
3000
4000
5000
6000
7000


In [18]:
with open('enhanced_chunk_embeddings_7500.pkl', 'wb') as f:
    pickle.dump(enhanced_chunk_embeddings_7500, f)

with open('enhanced_chunk_embeddings_7500.pkl', 'rb') as f:
    enhanced_chunk_embeddings_7500_loaded = pickle.load(f)


In [16]:
with open('enhanced_chunk_embeddings_7500.pkl', 'rb') as f:
    enhanced_chunk_embeddings_7500_loaded = pickle.load(f)


In [17]:
chunk_copy = chunks[:7500].copy()

In [20]:
for idx, chunk in enumerate(chunk_copy):
    chunk_copy[idx]["text"] = enhanced_chunks_7500[idx]

In [53]:
with open('chunk_copy.pkl', 'wb') as f:
    pickle.dump(chunk_copy, f)

with open('chunk_copy.pkl', 'rb') as f:
    chunk_copy = pickle.load(f)


In [17]:
with open('chunk_copy.pkl', 'rb') as f:
    chunk_copy = pickle.load(f)


[{'text': {'text': "passagem porto (Dessert Wine) at 20 spot. Description: A luxurious Portuguese ruby port from the Douro Valley, the Passagem Porto offers a rich, velvety texture with deep notes of dark chocolate, caramel, and subtle almond nuances. Ideal for post-dinner indulgence, this medium-bodied dessert wine beautifully complements dark chocolate truffles, aged blue cheeses, and walnut-based desserts. Gluten-free and naturally sweet, it's perfect for intimate gatherings or as a sophisticated nightcap. Traditionally crafted using indigenous Portuguese grape varieties, this port represents centuries of winemaking heritage from the world's oldest demarcated wine region. Best served slightly chilled at 60- Ingredients: port. Pairings: Excellent with desserts or cheese boards. Dietary: Contains alcohol. Price: mid. Location: San Francisco, CA",
   'metadata': {'restaurant_id': 'ee70f1d57653',
    'restaurant_name': '20 spot',
    'menu_category': 'Dessert Wine',
    'menu_descriptio

In [24]:
def create_tfidf_pipeline(chunks):
    """Create TF-IDF vectorizer and matrix using chunk texts"""
    corpus = []
    for chunk in chunks:
        text = chunk['text']
        if isinstance(text, dict) and 'text' in text:
            corpus.append(text['text'])
        else:
            corpus.append(text)
            
    vectorizer = TfidfVectorizer(
        stop_words='english',
        ngram_range=(1, 3),
        min_df=2,
        max_df=0.9,
        sublinear_tf=True
    )
    return vectorizer, vectorizer.fit_transform(corpus)

In [25]:
system = initialize_system(chunk_copy, enhanced_chunk_embeddings_7500_loaded)


In [32]:
import pickle

with open('chunks_copy.pkl', 'wb') as f:
    pickle.dump(chunks, f)

with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(system["vectorizer"], f)

with open('tfidf_matrix.pkl', 'wb') as f:
    pickle.dump(system["tfidf_matrix"], f)

with open('faiss_index.pkl', 'wb') as f:
    pickle.dump(system["faiss_index"], f)

with open('faiss_index_to_chunk.pkl', 'wb') as f:
    pickle.dump(system["faiss_index_to_chunk"], f)

with open('reranker.pkl', 'wb') as f:
    pickle.dump(system["reranker"], f)


In [30]:
import pickle

chunks_5000 = chunk_copy[:5000]
embeddings_5000 = enhanced_chunk_embeddings_7500_loaded[:5000]


with open("chunks_5000.pkl", 'wb') as f:
    pickle.dump(chunks_5000, f)

with open("embeddings_5000.pkl", 'wb') as f:
    pickle.dump(embeddings_5000, f)


# Load each object from its respective file
with open('chunks.pkl', 'rb') as f:
    chunks = pickle.load(f)

with open('vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)

with open('tfidf_matrix.pkl', 'rb') as f:
    tfidf_matrix = pickle.load(f)

with open('faiss_index.pkl', 'rb') as f:
    faiss_index = pickle.load(f)

with open('faiss_index_to_chunk.pkl', 'rb') as f:
    faiss_index_to_chunk = pickle.load(f)

with open('reranker.pkl', 'rb') as f:
    reranker = pickle.load(f)

# The objects are now loaded and available for use


In [40]:
with open('enhanced_chunk_embeddings_7500.pkl', 'rb') as f:
    enhanced_chunk_embeddings_7500_loaded = pickle.load(f)


In [41]:
import pickle

chunks_5000 = chunk_copy[:5000]
embeddings_5000 = enhanced_chunk_embeddings_7500_loaded[:5000]


with open("chunks_5000.pkl", 'wb') as f:
    pickle.dump(chunks_5000, f)

with open("embeddings_5000.pkl", 'wb') as f:
    pickle.dump(embeddings_5000, f)


> Used a cross encoder to re-rank the chunks against the query.

> With better descriptions, BM25 and semantic retreival of embeddings. The hit rate has increased by approximately 18%. We do see the new approach over performs the standard RAG.

> Now we would be developing a generation end for this to complete the RAG. 