In [31]:
import os
import time
import pandas as pd
from datetime import datetime
import getpass
import warnings
import numpy as np
from typing import List, Dict, Any, Optional
import asyncio
import nest_asyncio
from uuid import uuid4
warnings.filterwarnings('ignore')

# Apply nest_asyncio for Jupyter compatibility
nest_asyncio.apply()

print("🔑 Setting up API Keys")
print("=" * 40)

# OpenAI API Key (required)
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("🤖 Enter your OpenAI API Key: ")
    print("✅ OpenAI API key set")
else:
    print("✅ OpenAI API key already set")

# Cohere API Key (required for reranking)
if not os.getenv("COHERE_API_KEY"):
    os.environ["COHERE_API_KEY"] = getpass.getpass("🔄 Enter your Cohere API Key: ")
    print("✅ Cohere API key set")
else:
    print("✅ Cohere API key already set")

# Tavily API Key (recommended for external search)
if not os.getenv("TAVILY_API_KEY"):
    tavily_key = getpass.getpass("🔍 Enter your Tavily API Key (or press Enter to skip): ")
    if tavily_key.strip():
        os.environ["TAVILY_API_KEY"] = tavily_key
        print("✅ Tavily API key set")
    else:
        print("⚠️ Tavily API key skipped - external search will be limited")
else:
    print("✅ Tavily API key already set")

# LangSmith API Key (optional for monitoring)
if not os.getenv("LANGSMITH_API_KEY"):
    langsmith_key = getpass.getpass("📊 Enter your LangSmith API Key (or press Enter to skip): ")
    if langsmith_key.strip():
        os.environ["LANGSMITH_API_KEY"] = langsmith_key
        os.environ["LANGSMITH_TRACING"] = "true"
        os.environ["LANGCHAIN_TRACING_V2"] = "true"
        os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"
        os.environ["LANGSMITH_ENDPOINT"] = "https://api.smith.langchain.com"
        print("✅ LangSmith API key set and tracing enabled")
    else:
        os.environ["LANGSMITH_TRACING"] = "false"
        os.environ["LANGCHAIN_TRACING_V2"] = "false"
        print("⚠️ LangSmith skipped - no monitoring/tracing")
else:
    print("✅ LangSmith API key already set")

print("\n🎯 API Key Setup Complete!")
print("✅ Ready for enhanced movie review RAG system!")


🔑 Setting up API Keys
✅ OpenAI API key already set
✅ Cohere API key already set
✅ Tavily API key already set
✅ LangSmith API key already set

🎯 API Key Setup Complete!
✅ Ready for enhanced movie review RAG system!


In [17]:
# Load the Rotten Tomatoes datasets
print("🍅 Loading Rotten Tomatoes movie review datasets...")

# Robust CSV loading function with error handling
def load_csv_robust(filepath):
    """Load CSV with robust error handling for malformed data"""
    encodings = ['utf-8', 'latin1', 'cp1252', 'iso-8859-1']
    
    for encoding in encodings:
        try:
            print(f"  Trying encoding: {encoding}")
            # Try with error handling for malformed lines
            df = pd.read_csv(
                filepath, 
                encoding=encoding,
                on_bad_lines='skip',  # Skip bad lines instead of failing
                engine='python',      # Use Python engine for better error handling
                quoting=1,           # Quote all fields
                skipinitialspace=True
            )
            print(f"  ✅ Success with {encoding}")
            return df
        except UnicodeDecodeError:
            print(f"  ❌ Failed with {encoding}")
            continue
        except Exception as e:
            print(f"  ❌ Failed with {encoding}: {str(e)}")
            continue
    
    # If all encodings fail, try with minimal options
    print("  Trying with basic fallback...")
    try:
        df = pd.read_csv(filepath, encoding='latin1', on_bad_lines='skip', engine='python')
        print("  ✅ Success with fallback method")
        return df
    except Exception as e:
        raise ValueError(f"Could not read {filepath}: {str(e)}")

# Load Rotten Tomatoes movies metadata
print("Loading Rotten Tomatoes movies metadata...")
movies_df = load_csv_robust("data/rotten_tomatoes_movies.csv")
print(f"Movies dataset: {len(movies_df)} movies")

# Load Rotten Tomatoes reviews
print("\nLoading Rotten Tomatoes reviews...")
reviews_df = load_csv_robust("data/rotten_tomatoes_movie_reviews.csv")
print(f"Reviews dataset: {len(reviews_df)} reviews")

print(f"\n📊 Dataset Statistics:")
print(f"• Total movies: {len(movies_df):,}")
print(f"• Total reviews: {len(reviews_df):,}")
print(f"• Average reviews per movie: {len(reviews_df)/len(movies_df):.1f}")


🍅 Loading Rotten Tomatoes movie review datasets...
Loading Rotten Tomatoes movies metadata...
  Trying encoding: utf-8
  ✅ Success with utf-8
Movies dataset: 143258 movies

Loading Rotten Tomatoes reviews...
  Trying encoding: utf-8
  ✅ Success with utf-8
Reviews dataset: 1444963 reviews

📊 Dataset Statistics:
• Total movies: 143,258
• Total reviews: 1,444,963
• Average reviews per movie: 10.1


In [18]:
# Clean and prepare the data
def clean_text(text):
    """Clean text data for better processing"""
    if pd.isna(text):
        return ""
    
    text = str(text)
    # Remove extra whitespace and normalize
    text = ' '.join(text.split())
    return text.strip()

# Clean movie data
print("🧹 Cleaning and preparing movie data...")

movies_df['title_clean'] = movies_df['title'].apply(clean_text)
movies_df['genre_clean'] = movies_df['genre'].apply(clean_text) 
movies_df['director_clean'] = movies_df['director'].apply(clean_text)

# Clean reviews data  
reviews_df['reviewText_clean'] = reviews_df['reviewText'].apply(clean_text)
reviews_df['criticName_clean'] = reviews_df['criticName'].apply(clean_text)
reviews_df['publicatioName_clean'] = reviews_df['publicatioName'].apply(clean_text)

print("🔗 Merging movies and reviews data...")

# Merge movies and reviews
merged_df = reviews_df.merge(
    movies_df, 
    left_on='id', 
    right_on='id', 
    how='left'
)

print(f"✅ Merged dataset: {len(merged_df)} reviews with movie metadata")
print(f"✅ Reviews with movie titles: {merged_df['title_clean'].notna().sum()} / {len(merged_df)}")


🧹 Cleaning and preparing movie data...
🔗 Merging movies and reviews data...
✅ Merged dataset: 1469840 reviews with movie metadata
✅ Reviews with movie titles: 1469543 / 1469840


In [19]:
# Create unified data structure for processing Rotten Tomatoes data
def create_review_documents(df, max_reviews=1000):
    """Convert merged DataFrame to list of review documents"""
    documents = []
    
    # Use a sample for better performance
    if len(df) > max_reviews:
        print(f"🧪 Using sample of {max_reviews} reviews...")
        df_sample = df.head(max_reviews)
    else:
        df_sample = df
    
    for idx, row in df_sample.iterrows():
        # Create comprehensive metadata
        metadata = {
            'source': 'rotten_tomatoes',
            'movie_id': row.get('id', ''),
            'movie_title': row.get('title_clean', row.get('id', 'Unknown')),
            'critic_name': row.get('criticName_clean', 'Anonymous'),
            'publication': row.get('publicatioName_clean', 'Unknown'),
            'review_date': row.get('creationDate', 'Unknown'),
            'original_score': row.get('originalScore', ''),
            'review_state': row.get('reviewState', ''),
            'sentiment': row.get('scoreSentiment', ''),
            'is_top_critic': row.get('isTopCritic', False),
            'genre': row.get('genre_clean', ''),
            'director': row.get('director_clean', ''),
            'rating': row.get('rating', ''),
            'audience_score': row.get('audienceScore', ''),
            'tomato_meter': row.get('tomatoMeter', ''),
            'release_date': row.get('releaseDateTheaters', ''),
            'runtime': row.get('runtimeMinutes', ''),
            'index': idx
        }
        
        # Create rich content for embedding
        content = f"Movie: {row.get('title_clean', row.get('id', 'Unknown'))}\n"
        
        # Add movie metadata
        if row.get('genre_clean'):
            content += f"Genre: {row.get('genre_clean')}\n"
        if row.get('director_clean'):
            content += f"Director: {row.get('director_clean')}\n"
        if row.get('rating'):
            content += f"Rating: {row.get('rating')}\n"
        if row.get('releaseDateTheaters'):
            content += f"Release Date: {row.get('releaseDateTheaters')}\n"
        
        # Add review information
        content += f"Critic: {row.get('criticName_clean', 'Anonymous')}\n"
        if row.get('publicatioName_clean'):
            content += f"Publication: {row.get('publicatioName_clean')}\n"
        if row.get('originalScore'):
            content += f"Score: {row.get('originalScore')}\n"
        if row.get('reviewState'):
            content += f"Review State: {row.get('reviewState')}\n"
        if row.get('scoreSentiment'):
            content += f"Sentiment: {row.get('scoreSentiment')}\n"
        
        # Add the main review text
        review_text = row.get('reviewText_clean', '')
        if review_text:
            content += f"Review: {review_text}"
        
        documents.append({
            'content': content,
            'metadata': metadata
        })
    
    return documents

# Create documents from merged Rotten Tomatoes data
print("🍅 Creating review documents from Rotten Tomatoes data...")
all_documents = create_review_documents(merged_df, max_reviews=1000)

print(f"✅ Created {len(all_documents)} total review documents")
print(f"   - Source: Rotten Tomatoes")
print(f"   - Reviews with movie metadata included")

# Show sample document
print("\n📄 Sample document:")
print(all_documents[0]['content'][:300] + "...")

# Basic statistics
print(f"\n📊 Document Statistics:")
unique_movies = len(set([doc['metadata']['movie_title'] for doc in all_documents]))
unique_critics = len(set([doc['metadata']['critic_name'] for doc in all_documents]))
print(f"• Unique movies: {unique_movies}")
print(f"• Unique critics: {unique_critics}")
print(f"• Average content length: {np.mean([len(doc['content']) for doc in all_documents]):.0f} characters")


🍅 Creating review documents from Rotten Tomatoes data...
🧪 Using sample of 1000 reviews...
✅ Created 1000 total review documents
   - Source: Rotten Tomatoes
   - Reviews with movie metadata included

📄 Sample document:
Movie: Beavers
Genre: Documentary
Director: Stephen Low
Rating: nan
Release Date: nan
Critic: Ivan M. Lincoln
Publication: Deseret News (Salt Lake City)
Score: 3.5/4
Review State: fresh
Sentiment: POSITIVE
Review: Timed to be just long enough for most youngsters' brief attention spans -- and it's pa...

📊 Document Statistics:
• Unique movies: 122
• Unique critics: 660
• Average content length: 333 characters


In [20]:
import tiktoken
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Qdrant
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import StrOutputParser
from langgraph.graph import START, StateGraph
from typing_extensions import TypedDict

# Token counting function
def tiktoken_len(text):
    tokens = tiktoken.encoding_for_model("gpt-4o").encode(text)
    return len(tokens)

# Convert our documents to LangChain Document format - each review is already a chunk
print("🔪 Using each review as a separate chunk...")
chunks = []
for doc in all_documents:
    langchain_doc = Document(
        page_content=doc['content'],
        metadata=doc['metadata']
    )
    chunks.append(langchain_doc)

print(f"✅ Created {len(chunks)} chunks from {len(all_documents)} reviews")
print("   Each review is treated as a separate chunk for better semantic coherence")

# Verify chunk sizes
chunk_lengths = [tiktoken_len(chunk.page_content) for chunk in chunks]
max_chunk_length = max(chunk_lengths)
avg_chunk_length = sum(chunk_lengths) / len(chunk_lengths)
print(f"📏 Maximum chunk length: {max_chunk_length} tokens")
print(f"📏 Average chunk length: {avg_chunk_length:.0f} tokens")

# Initialize embedding model
print("🧠 Initializing embedding model...")
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

# Initialize chat model
chat_model = ChatOpenAI(model="gpt-4o-mini", temperature=0.1)

print("✅ Basic RAG components initialized!")


🔪 Using each review as a separate chunk...
✅ Created 1000 chunks from 1000 reviews
   Each review is treated as a separate chunk for better semantic coherence
📏 Maximum chunk length: 144 tokens
📏 Average chunk length: 88 tokens
🧠 Initializing embedding model...
✅ Basic RAG components initialized!


In [21]:
# External search tools setup
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain.tools import Tool
from langchain_core.tools import tool
import json

# Setup external search tools
print("🔧 Setting up external search tools...")

# Option 1: Tavily Search (recommended)
try:
    tavily_search = TavilySearchResults(
        max_results=3,
        search_depth="basic",
        include_answer=True,
        include_raw_content=True
    )
    print("✅ Tavily search tool configured")
    has_tavily = True
except Exception as e:
    print(f"⚠️ Tavily not configured: {e}")
    has_tavily = False

# Create a fallback search function if no external APIs are configured
def fallback_search(query: str) -> str:
    """Fallback search when no external APIs are available"""
    return f"External search not available. Query '{query}' would require external movie database access. Please configure Tavily API key for enhanced search capabilities."

# Choose which search tool to use
if has_tavily:
    external_search_tool = tavily_search
    search_tool_name = "Tavily"
else:
    external_search_tool = Tool(
        name="fallback_search",
        description="Fallback search tool when external APIs are not configured",
        func=fallback_search
    )
    search_tool_name = "Fallback"

print(f"🔍 Using {search_tool_name} for external search")


🔧 Setting up external search tools...
✅ Tavily search tool configured
🔍 Using Tavily for external search


In [22]:
# Setup retrievers first (we'll use naive as default)
from langchain_community.vectorstores import Qdrant
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.retrievers import ParentDocumentRetriever, EnsembleRetriever
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain_cohere import CohereRerank
from langchain.storage import InMemoryStore
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_qdrant import QdrantVectorStore
from qdrant_client import QdrantClient, models

print("Setting up retrievers...")

# 1. Naive Retriever (Embedding-based) - Default
def get_base_retriever():
    """Get the base retriever (can be dynamically switched)"""
    global base_retriever
    if 'base_retriever' not in globals() or base_retriever is None:
        vectorstore = Qdrant.from_documents(
            chunks,
            embedding_model,
            location=":memory:",
            collection_name="MovieReviews_Default"
        )
        base_retriever = vectorstore.as_retriever(search_kwargs={"k": 10})
    return base_retriever

# Initialize default retriever
base_retriever = get_base_retriever()
print("✅ Default retriever initialized")


Setting up retrievers...
✅ Default retriever initialized


In [23]:
# Tool 1: Movie Review Search Tool
@tool
def search_movie_reviews(query: str) -> str:
    """
    Search through embedded movie reviews from Rotten Tomatoes.
    Use this for questions about specific movies, ratings, or review content.
    """
    try:
        # Use the current base retriever
        retriever = get_base_retriever()
        docs = retriever.invoke(query)
        
        if not docs:
            return f"No relevant movie reviews found for: {query}"
        
        # Format results
        results = f"Found {len(docs)} relevant movie reviews for '{query}':\n\n"
        
        for i, doc in enumerate(docs, 1):
            metadata = doc.metadata
            content = doc.page_content
            
            results += f"📽️ Result {i}:\n"
            results += f"Movie: {metadata.get('movie_title', 'Unknown')}\n"
            results += f"Critic: {metadata.get('critic_name', 'Unknown')}\n"
            if metadata.get('publication'):
                results += f"Publication: {metadata.get('publication')}\n"
            if metadata.get('original_score'):
                results += f"Score: {metadata.get('original_score')}\n"
            results += f"Content: {content[:200]}...\n\n"
        
        return results
        
    except Exception as e:
        return f"Error searching reviews: {str(e)}"

print("✅ Created search_movie_reviews tool")


✅ Created search_movie_reviews tool


In [24]:
# Tool 2: Movie Statistics Analysis Tool
@tool
def analyze_movie_statistics(movie_name: str = "") -> str:
    """
    Analyze statistics for a specific movie or provide general Rotten Tomatoes dataset statistics.
    Returns ratings, review counts, critic information, and other numerical insights.
    """
    try:
        if movie_name:
            # Search for specific movie in the merged dataset
            movie_data = merged_df[
                merged_df['title_clean'].str.contains(movie_name, case=False, na=False)
            ]
            
            if movie_data.empty:
                return f"No statistics found for '{movie_name}' in the Rotten Tomatoes dataset."
            
            # Get movie information
            movie_info = movie_data.iloc[0]  # Get first match for movie metadata
            movie_reviews = movie_data  # All reviews for this movie
            
            stats = f"Statistics for '{movie_info.get('title_clean', movie_name)}':\n"
            stats += f"═══════════════════════════════════\n"
            
            # Movie metadata
            if movie_info.get('genre_clean'):
                stats += f"🎭 Genre: {movie_info['genre_clean']}\n"
            if movie_info.get('director_clean'):
                stats += f"🎬 Director: {movie_info['director_clean']}\n"
            if movie_info.get('rating'):
                stats += f"🏷️ Rating: {movie_info['rating']}\n"
            if movie_info.get('runtimeMinutes'):
                stats += f"⏱️ Runtime: {movie_info['runtimeMinutes']} minutes\n"
            if movie_info.get('releaseDateTheaters'):
                stats += f"📅 Release Date: {movie_info['releaseDateTheaters']}\n"
            
            # Scores
            if pd.notna(movie_info.get('audienceScore')):
                stats += f"👥 Audience Score: {movie_info['audienceScore']}%\n"
            if pd.notna(movie_info.get('tomatoMeter')):
                stats += f"🍅 Tomatometer: {movie_info['tomatoMeter']}%\n"
            
            # Review statistics
            stats += f"\n📊 Review Analysis:\n"
            stats += f"• Total Reviews: {len(movie_reviews)}\n"
            
            # Review state distribution
            if 'reviewState' in movie_reviews.columns:
                review_states = movie_reviews['reviewState'].value_counts()
                for state, count in review_states.items():
                    stats += f"• {state.title()}: {count} reviews\n"
            
            return stats
        else:
            # General dataset statistics
            stats = f"🍅 Rotten Tomatoes Dataset Statistics:\n"
            stats += f"═══════════════════════════════════\n"
            stats += f"📊 Overview:\n"
            stats += f"• Total Movies: {len(movies_df):,}\n"
            stats += f"• Total Reviews: {len(reviews_df):,}\n"
            stats += f"• Reviews in Current Sample: {len(merged_df):,}\n"
            stats += f"• Average Reviews per Movie: {len(reviews_df)/len(movies_df):.1f}\n"
            
            return stats
            
    except Exception as e:
        return f"Error analyzing statistics: {str(e)}"

# Tool 3: External Movie Search
@tool
def search_external_movie_info(query: str) -> str:
    """
    Search external sites for reviews, ratings, or recent news about a movie.
    """
    try:
        search_string = f'movie {query} reviews ratings'

        if has_tavily:
            result = external_search_tool.invoke({"query": search_string})
            snippets = []
            for item in result[:3]:
                if isinstance(item, dict):
                    url = item.get("url", "")
                    content = (item.get("content", "") or "").strip()
                    snippets.append(f"Source: {url}\n{content[:200]}…")
            return "\n\n".join(snippets) if snippets else "No results found."
        else:
            return external_search_tool.run(search_string)

    except Exception as e:
        return f"External search error: {e}"

# Create the agent's toolbox
agent_tools = [
    search_movie_reviews,
    analyze_movie_statistics, 
    search_external_movie_info
]

print(f"✅ Created {len(agent_tools)} specialized tools:")
for tool in agent_tools:
    print(f"  - {tool.name}")

print("\n✅ All agent tools ready!")


✅ Created 3 specialized tools:
  - search_movie_reviews
  - analyze_movie_statistics
  - search_external_movie_info

✅ All agent tools ready!


In [25]:
# Enhanced Agent State with Tool Selection
from typing_extensions import TypedDict, Annotated
from langgraph.graph.message import add_messages
from langchain_core.messages import BaseMessage, HumanMessage, AIMessage, ToolMessage

class AgentState(TypedDict):
    messages: Annotated[list[BaseMessage], add_messages]
    question: str
    tool_calls: list
    final_answer: str

# Enhanced Agent with tool selection capabilities
print("🤖 Building enhanced agent with tool selection...")

# Create the agent prompt
AGENT_PROMPT = """You are an intelligent movie analysis agent with access to multiple specialized tools.

Your tools:
1. search_movie_reviews: Search embedded movie reviews from Rotten Tomatoes
2. analyze_movie_statistics: Get numerical statistics about movies and datasets  
3. search_external_movie_info: Search external sources when local data is insufficient

Guidelines:
- Start with local review data (search_movie_reviews) for most questions
- Use statistics tools for numerical analysis
- Only use external search when local data is clearly insufficient
- Always explain your reasoning and cite sources
- Provide comprehensive, insightful answers

Current question: {question}
"""

# Create enhanced chat model with tool binding
agent_model = ChatOpenAI(
    model="gpt-4o-mini", 
    temperature=0.1,
    max_tokens=1000
).bind_tools(agent_tools)

def agent_reasoning_node(state: AgentState) -> AgentState:
    """Agent reasoning and tool selection"""
    question = state["question"]
    messages = state.get("messages", [])
    
    # Create the prompt with current question
    prompt_message = HumanMessage(content=AGENT_PROMPT.format(question=question))
    
    # Get agent response with potential tool calls
    response = agent_model.invoke([prompt_message] + messages)
    
    return {
        "messages": [response],
        "tool_calls": response.tool_calls if hasattr(response, 'tool_calls') and response.tool_calls else []
    }

def tool_execution_node(state: AgentState) -> AgentState:
    """Execute selected tools"""
    tool_calls = state.get("tool_calls", [])
    messages = []
    
    for tool_call in tool_calls:
        tool_name = tool_call["name"]
        tool_args = tool_call["args"]
        
        # Find and execute the tool
        for tool in agent_tools:
            if tool.name == tool_name:
                try:
                    result = tool.invoke(tool_args)
                    # Create tool message
                    tool_message = ToolMessage(
                        content=str(result),
                        tool_call_id=tool_call["id"]
                    )
                    messages.append(tool_message)
                except Exception as e:
                    error_message = ToolMessage(
                        content=f"Error executing {tool_name}: {str(e)}",
                        tool_call_id=tool_call["id"]
                    )
                    messages.append(error_message)
                break
    
    return {"messages": messages}

def final_response_node(state: AgentState) -> AgentState:
    """Generate final response based on tool results"""
    messages = state["messages"]
    question = state["question"]
    
    # Create final prompt
    final_prompt = f"""
    Based on the tool results above, provide a comprehensive answer to the question: {question}
    
    Make sure to:
    - Synthesize information from multiple sources
    - Cite specific data points and sources
    - Provide insights beyond just raw data
    - Be conversational but informative
    """
    
    final_response = chat_model.invoke(messages + [HumanMessage(content=final_prompt)])
    
    return {
        "final_answer": final_response.content,
        "messages": [final_response]
    }

print("✅ Enhanced agent nodes defined!")


🤖 Building enhanced agent with tool selection...
✅ Enhanced agent nodes defined!


In [26]:
# Build the enhanced agent graph
print("🔗 Building agent workflow...")

from langgraph.graph import StateGraph, START, END
from langgraph.prebuilt import ToolNode

# Create agent graph
agent_graph = StateGraph(AgentState)

# Add nodes
agent_graph.add_node("agent", agent_reasoning_node)
agent_graph.add_node("tools", ToolNode(agent_tools))
agent_graph.add_node("final_response", final_response_node)

# Add edges
agent_graph.add_edge(START, "agent")

# Conditional edge: if agent makes tool calls, go to tools; otherwise go to final response
def should_continue(state: AgentState) -> str:
    tool_calls = state.get("tool_calls", [])
    if tool_calls:
        return "tools"
    else:
        return "final_response"

agent_graph.add_conditional_edges("agent", should_continue)
agent_graph.add_edge("tools", "final_response")
agent_graph.add_edge("final_response", END)

# Compile the enhanced agent
enhanced_agent = agent_graph.compile()

print("✅ Enhanced agent with tool selection ready!")

# Generate unique project ID for this session
unique_id = uuid4().hex[:8]
project_name = f"Movie-Reviews-RAG-{unique_id}"

# Configure LangSmith if available
if os.getenv("LANGSMITH_API_KEY"):
    os.environ["LANGSMITH_PROJECT"] = project_name
    print(f"🎯 LangSmith project: {project_name}")

print("🚀 Enhanced agent ready for movie analysis!")


🔗 Building agent workflow...
✅ Enhanced agent with tool selection ready!
🎯 LangSmith project: Movie-Reviews-RAG-05e3c697
🚀 Enhanced agent ready for movie analysis!


In [27]:
# # Setup different retrievers for dynamic switching
# print("Setting up different retrievers...")

# # 1. Naive Retriever (Embedding-based)
# def create_naive_retriever():
#     vectorstore = Qdrant.from_documents(
#         chunks,
#         embedding_model,
#         location=":memory:",
#         collection_name="MovieReviews_Naive"
#     )
#     return vectorstore.as_retriever(search_kwargs={"k": 5})

# naive_retriever = create_naive_retriever()
# print("✅ 1. Naive retriever ready")

# # 2. BM25 Retriever (Keyword-based)
# def create_bm25_retriever():
#     bm25 = BM25Retriever.from_documents(chunks)
#     bm25.k = 5
#     return bm25

# bm25_retriever = create_bm25_retriever()
# print("✅ 2. BM25 retriever ready")

# # 3. Multi-Query Retriever
# def create_multi_query_retriever():
#     base_retriever = create_naive_retriever()
#     return MultiQueryRetriever.from_llm(
#         retriever=base_retriever, 
#         llm=chat_model
#     )

# multi_query_retriever = create_multi_query_retriever()
# print("✅ 3. Multi-query retriever ready")

# # 4. Parent Document Retriever
# def create_parent_document_retriever():
#     # Create smaller chunks for parent document retrieval
#     child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
    
#     # Create new QdrantClient and collection for parent docs
#     client = QdrantClient(location=":memory:")
#     client.create_collection(
#         collection_name="movie_reviews_parent_docs",
#         vectors_config=models.VectorParams(size=1536, distance=models.Distance.COSINE)
#     )
    
#     parent_document_vectorstore = QdrantVectorStore(
#         collection_name="movie_reviews_parent_docs", 
#         embedding=embedding_model, 
#         client=client
#     )
    
#     store = InMemoryStore()
#     parent_retriever = ParentDocumentRetriever(
#         vectorstore=parent_document_vectorstore,
#         docstore=store,
#         child_splitter=child_splitter,
#     )
    
#     parent_retriever.add_documents(chunks, ids=None)
#     return parent_retriever

# parent_document_retriever = create_parent_document_retriever()
# print("✅ 4. Parent document retriever ready")

# # 5. Contextual Compression Retriever (with Cohere reranking)
# def create_compression_retriever():
#     base_retriever = create_naive_retriever()
#     compressor = CohereRerank(model="rerank-v3.5")
#     return ContextualCompressionRetriever(
#         base_compressor=compressor, 
#         base_retriever=base_retriever
#     )

# compression_retriever = create_compression_retriever()
# print("✅ 5. Contextual compression retriever ready")

# # 6. Ensemble Retriever (combines multiple approaches)
# def create_ensemble_retriever():
#     # Use fresh instances to avoid conflicts
#     naive = create_naive_retriever()
#     bm25 = create_bm25_retriever()
#     compression = create_compression_retriever()
    
#     retrievers = [bm25, naive, compression]
#     weights = [0.4, 0.4, 0.2]  # Slightly favor BM25 and naive
    
#     return EnsembleRetriever(
#         retrievers=retrievers, 
#         weights=weights
#     )

# ensemble_retriever = create_ensemble_retriever()
# print("✅ 6. Ensemble retriever ready")

# print(f"\n✅ All retrievers ready! You can switch between them by updating 'base_retriever'.")
# print("Available retrievers: naive_retriever, bm25_retriever, multi_query_retriever, parent_document_retriever, compression_retriever, ensemble_retriever")


In [28]:
# Create a query function for the enhanced agent with tracing
def query_enhanced_agent_with_tracing(question: str, run_name: str = None) -> str:
    """Query the enhanced agent with LangSmith tracing"""
    
    # Generate run name if not provided
    if not run_name:
        run_name = f"movie_query_{int(time.time())}"
    
    # Add tags for better organization
    tags = ["movie-reviews", "rag-agent", "multi-tool"]
    
    try:
        # Execute with tracing metadata
        start_time = time.time()
        
        result = enhanced_agent.invoke(
            {
                "question": question,
                "messages": [],
                "tool_calls": [],
                "final_answer": ""
            },
            config={
                "tags": tags,
                "metadata": {
                    "query_type": "movie_analysis",
                    "session_id": unique_id,
                    "run_name": run_name
                }
            }
        )
        
        end_time = time.time()
        execution_time = end_time - start_time
        
        # Return just the answer for simple frontend integration
        return result.get("final_answer", "No answer generated")
        
    except Exception as e:
        return f"Error: {str(e)}"

print("🚀 Enhanced agent ready for movie analysis!")
print("✅ LangSmith tracing configured!")
print("\n🎬 Ready to use! Try: query_enhanced_agent_with_tracing('What are the best rated movies?')")


🚀 Enhanced agent ready for movie analysis!
✅ LangSmith tracing configured!

🎬 Ready to use! Try: query_enhanced_agent_with_tracing('What are the best rated movies?')


In [29]:
# Example usage
print("🎬 Testing the Movie RAG System")
print("=" * 50)

# Test query
test_question = "What are some highly rated movies in the database?"
print(f"Question: {test_question}")
print("\nAnswer:")
answer = query_enhanced_agent_with_tracing(test_question)
print(answer)

print("\n" + "=" * 50)
print("🎯 System ready for frontend integration!")
print("🔧 To switch retrievers, update the 'base_retriever' variable")
print("📊 Check LangSmith for detailed tracing and analytics")


🎬 Testing the Movie RAG System
Question: What are some highly rated movies in the database?

Answer:
Based on the extensive dataset from Rotten Tomatoes, which includes a total of 143,258 movies and over 1.4 million reviews, we can identify some highly rated films that have garnered significant acclaim from both critics and audiences.

### Highly Rated Movies

1. **"The Godfather" (1972)** - Often hailed as one of the greatest films of all time, "The Godfather" boasts a high approval rating on Rotten Tomatoes, typically around 97%. Its compelling narrative and iconic performances have solidified its status in cinematic history.

2. **"Parasite" (2019)** - This South Korean film made waves globally, winning the Academy Award for Best Picture. It holds a remarkable 98% rating on Rotten Tomatoes, praised for its sharp social commentary and masterful storytelling.

3. **"Schindler's List" (1993)** - Steven Spielberg's poignant historical drama is another standout, frequently rated above 90

In [30]:
# Example: Switch to BM25 retriever
# base_retriever = bm25_retriever
# print("Switched to BM25 retriever")

# Example: Switch to Ensemble retriever
# base_retriever = ensemble_retriever
# print("Switched to Ensemble retriever")

# Example: Switch back to Naive retriever
# base_retriever = naive_retriever
# print("Switched to Naive retriever")

print("💡 Uncomment the lines above to switch retrievers")
print(f"Current retriever: {type(base_retriever).__name__}")


💡 Uncomment the lines above to switch retrievers
Current retriever: VectorStoreRetriever
