<a href="https://colab.research.google.com/github/erdincozsertel/MultiGroupGenAI_ProjectDemo_Colab/blob/main/MultiGroupGenAI_ProjectDemo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
"""
Movie/TV Recommender RAG Demo using Haystack & Gemini with Langfuse

Adapted for Movie/TV Recommendations and Langfuse logging
"""

# --- 1. Install Dependencies ---
!pip install -U -q haystack-ai google-ai-haystack sentence-transformers trafilatura langfuse-haystack hf_xet

# --- 2. Set Up API Keys & Langfuse ---
import os
import logging
from google.colab import userdata # Use Colab secrets for API keys
from getpass import getpass

# Fetch keys from Colab secrets
try:
    GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
    SERPERDEV_API_KEY = userdata.get('SERPERDEV_API_KEY')
    # Langfuse Credentials
    LANGFUSE_PUBLIC_KEY = userdata.get('LANGFUSE_PUBLIC_KEY')
    LANGFUSE_SECRET_KEY = userdata.get('LANGFUSE_SECRET_KEY')
    LANGFUSE_HOST = userdata.get('LANGFUSE_HOST')
    if LANGFUSE_HOST is None:
        LANGFUSE_HOST = "https://cloud.langfuse.com"
    HF_TOKEN = userdata.get('HF_TOKEN')

except userdata.SecretNotFoundError as e:
    print(f"API key or Langfuse credential not found in Colab Secrets: {e}. Please add them.")
    print("Secrets needed: GOOGLE_API_KEY, SERPERDEV_API_KEY, LANGFUSE_PUBLIC_KEY, LANGFUSE_SECRET_KEY (and optionally LANGFUSE_HOST)")
    raise ValueError("Required secrets not configured in Colab Secrets.")


# Set environment variables for Haystack components and Langfuse
os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY
os.environ["SERPERDEV_API_KEY"] = SERPERDEV_API_KEY
os.environ["LANGFUSE_PUBLIC_KEY"] = LANGFUSE_PUBLIC_KEY
os.environ["LANGFUSE_SECRET_KEY"] = LANGFUSE_SECRET_KEY
os.environ["LANGFUSE_HOST"] = LANGFUSE_HOST
os.environ["HF_TOKEN"] = HF_TOKEN
# Optional: Enable Haystack content tracing for debugging if needed (Langfuse also captures IO)
os.environ["HAYSTACK_CONTENT_TRACING_ENABLED"] = "true"

# --- Logging Configuration ---
logging.basicConfig(format="%(levelname)s - %(name)s -  %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.DEBUG)
print("Inital setup is completed")

Inital setup is completed


In [2]:
# --- 3. Import Haystack Components ---
from haystack import Pipeline, Document
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.fetchers import LinkContentFetcher
from haystack.components.converters import HTMLToDocument
from haystack.components.writers import DocumentWriter
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.embedders import (
    SentenceTransformersDocumentEmbedder,
    SentenceTransformersTextEmbedder
)
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.websearch import SerperDevWebSearch
from haystack.components.builders import ChatPromptBuilder
from haystack.components.routers import ConditionalRouter
from haystack.dataclasses import ChatMessage
from haystack_integrations.components.generators.google_ai.chat.gemini import GoogleAIGeminiChatGenerator
# Import Langfuse Connector
from haystack_integrations.components.connectors.langfuse import LangfuseConnector


# --- 4. Define Document Store ---
document_store = InMemoryDocumentStore()

DEBUG:haystack.core.component.component:Registering <class 'haystack.core.super_component.super_component.SuperComponent'> as a component
DEBUG:haystack.core.component.component:Registered Component <class 'haystack.core.super_component.super_component.SuperComponent'>
DEBUG:haystack.core.component.component:Registering <class 'haystack.components.fetchers.link_content.LinkContentFetcher'> as a component
DEBUG:haystack.core.component.component:Registered Component <class 'haystack.components.fetchers.link_content.LinkContentFetcher'>
DEBUG:haystack.core.component.component:Registering <class 'haystack.components.converters.html.HTMLToDocument'> as a component
DEBUG:haystack.core.component.component:Registered Component <class 'haystack.components.converters.html.HTMLToDocument'>
DEBUG:haystack.core.component.component:Registering <class 'haystack.components.writers.document_writer.DocumentWriter'> as a component
DEBUG:haystack.core.component.component:Registered Component <class 'hayst

In [3]:
# --- 3. Import Haystack Components ---

from haystack import Pipeline, Document
from haystack.document_stores.in_memory import InMemoryDocumentStore
from haystack.components.fetchers import LinkContentFetcher
from haystack.components.converters import HTMLToDocument
from haystack.components.writers import DocumentWriter
from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.embedders import (
    SentenceTransformersDocumentEmbedder,
    SentenceTransformersTextEmbedder
)
from haystack.components.retrievers.in_memory import InMemoryEmbeddingRetriever
from haystack.components.websearch import SerperDevWebSearch
from haystack.components.builders import ChatPromptBuilder
from haystack.components.routers import ConditionalRouter
from haystack.dataclasses import ChatMessage # Ensure ChatMessage is imported if not already
from haystack_integrations.components.generators.google_ai.chat.gemini import GoogleAIGeminiChatGenerator
from haystack_integrations.components.connectors.langfuse import LangfuseConnector

# --- 4. Define Document Store ---
document_store = InMemoryDocumentStore()

In [4]:
# --- 5. Build Indexing Pipeline ---
# (Indexing pipeline remains the same as before - Langfuse primarily traces query pipelines)
print("Building Indexing Pipeline...")
indexing_pipeline = Pipeline()
indexing_pipeline.add_component(instance=LinkContentFetcher(), name="fetcher")
indexing_pipeline.add_component(instance=HTMLToDocument(), name="converter")
indexing_pipeline.add_component(instance=DocumentCleaner(), name="cleaner")
indexing_pipeline.add_component(instance=DocumentSplitter(split_by="sentence", split_length=10, split_overlap=2), name="splitter")
indexing_pipeline.add_component(instance=SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-mpnet-base-v2"), name="embedder")
indexing_pipeline.add_component(instance=DocumentWriter(document_store=document_store), name="writer")

indexing_pipeline.connect("fetcher.streams", "converter.sources")
indexing_pipeline.connect("converter.documents", "cleaner")
indexing_pipeline.connect("cleaner", "splitter")
indexing_pipeline.connect("splitter", "embedder")
indexing_pipeline.connect("embedder", "writer")
print("Indexing Pipeline Built.")

# --- 6. Index Your Data Sources ---
urls_to_index = [
  "https://www.imdb.com/chart/top",
  "https://www.imdb.com/chart/toptv",
  "https://myanimelist.net/topanime.php",
  "https://myanimelist.net/topanime.php?type=airing",
  "https://myanimelist.net/topanime.php?type=upcoming",
  "https://myanimelist.net/topanime.php?type=bypopularity",
  "https://www.themoviedb.org/movie",
  "https://www.themoviedb.org/tv",
  "https://www.themoviedb.org/movie/now-playing",
  "https://www.themoviedb.org/tv/on-the-air"
]

if not document_store.count_documents():
    print(f"Indexing {len(urls_to_index)} URLs...")
    try:
        indexing_pipeline.run({"fetcher": {"urls": urls_to_index}})
        print(f"Successfully indexed {document_store.count_documents()} documents.")
    except Exception as e:
        print(f"Error during indexing: {e}")
else:
     print(f"Document store already contains {document_store.count_documents()} documents. Skipping indexing.")

Building Indexing Pipeline...


DEBUG:haystack.core.pipeline.base:Adding component 'fetcher' (<haystack.components.fetchers.link_content.LinkContentFetcher object at 0x7febbe672d50>

Inputs:
  - urls: List[str]
Outputs:
  - streams: List[ByteStream])
DEBUG:haystack.core.pipeline.base:Adding component 'converter' (<haystack.components.converters.html.HTMLToDocument object at 0x7febbe671350>

Inputs:
  - sources: List[Union[str, Path, ByteStream]]
  - meta: Union[Dict[str, Any], List[Dict[str, Any]]]
  - extraction_kwargs: Optional[Dict[str, Any]]
Outputs:
  - documents: List[Document])
DEBUG:haystack.core.pipeline.base:Adding component 'cleaner' (<haystack.components.preprocessors.document_cleaner.DocumentCleaner object at 0x7febbe1bf550>

Inputs:
  - documents: List[Document]
Outputs:
  - documents: List[Document])
DEBUG:haystack.core.pipeline.base:Adding component 'splitter' (<haystack.components.preprocessors.document_splitter.DocumentSplitter object at 0x7febc19db190>

Inputs:
  - documents: List[Document]
Outputs

Indexing Pipeline Built.
Indexing 10 URLs...


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
INFO:haystack.core.pipeline.base:Warming up component embedder...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

INFO:haystack.core.pipeline.pipeline:Running component fetcher
DEBUG:haystack.components.fetchers.link_content:Switched user agent to haystack/LinkContentFetcher/2.12.1
INFO:haystack.core.pipeline.pipeline:Running component converter
INFO:haystack.core.pipeline.pipeline:Running component cleaner
INFO:haystack.core.pipeline.pipeline:Running component splitter
INFO:haystack.core.pipeline.pipeline:Running component embedder


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:haystack.core.pipeline.pipeline:Running component writer


Successfully indexed 13 documents.


In [5]:
# --- 7. Build Advanced RAG Query Pipeline with Langfuse ---
print("Building Query Pipeline with Langfuse...")

prompt_template = """
{% if web_documents %}
    You were asked to answer the following query about movies/TV shows/anime based on the indexed documents, but the context was not enough.
    Answer the question based on the given web search context.
    Provide the sources/links you used from the web search context if possible.

    User Question: {{ query }}

    Web Search Context:
    {% for document in web_documents %}
    URL: {{document.meta.link}}
    Content: {{document.content}}
    ---
    {% endfor %}
{% else %}
    You are a helpful assistant for recommending movies, TV series, and anime.
    Answer the following query based ONLY on the documents provided below (e.g., top lists, descriptions).

    Documents:
    {% for document in documents %}
    {{document.content}}
    {% endfor %}

    Query: {{query}}

    Provide recommendations or answer the question based on the documents.
    If the documents do NOT provide enough information to answer the query or give recommendations, ONLY output the text 'N0_ANSWER'. Do not add any other explanation.
{% endif %}
"""
prompt = [ChatMessage.from_user(prompt_template)]

main_routes = [
    {
        "condition": "{{'N0_ANSWER' in replies[0].text.replace('\\n', '')}}",
        "output" :"{{query}}",
        "output_name": "go_web",
        "output_type": str,
    },
    {
        "condition": "{{'N0_ANSWER' not in replies[0].text.replace('\\n', '')}}",
        "output": "{{replies[0].text}}",
        "output_name": "answer",
        "output_type": str,
    },
]

# Define the query pipeline
query_pipeline = Pipeline(max_runs_per_component=5)

# Add Langfuse Connector FIRST (or anywhere, it hooks automatically)
# Name your Langfuse project appropriately
query_pipeline.add_component("tracer", LangfuseConnector("Movie Recommender RAG Demo 3"))

# Add the rest of the components
query_pipeline.add_component("embedder", SentenceTransformersTextEmbedder(model="sentence-transformers/all-mpnet-base-v2"))
query_pipeline.add_component("retriever", InMemoryEmbeddingRetriever(document_store=document_store, top_k=5))
query_pipeline.add_component("prompt_builder", ChatPromptBuilder(template=prompt))
query_pipeline.add_component("llm", GoogleAIGeminiChatGenerator(model="gemini-1.5-flash", generation_config={"temperature": 0.7}))
query_pipeline.add_component("web_search", SerperDevWebSearch())
query_pipeline.add_component("router", ConditionalRouter(main_routes))

# Connect components (LangfuseConnector doesn't need explicit connection)
query_pipeline.connect("embedder.embedding", "retriever.query_embedding")
query_pipeline.connect("retriever.documents", "prompt_builder.documents")
query_pipeline.connect("prompt_builder.prompt", "llm.messages")
query_pipeline.connect("llm.replies", "router.replies")
query_pipeline.connect("router.go_web", "web_search.query")
query_pipeline.connect("web_search.documents", "prompt_builder.web_documents")

print("Query Pipeline Built with Langfuse.")

DEBUG:haystack.core.pipeline.base:Adding component 'tracer' (<haystack_integrations.components.connectors.langfuse.langfuse_connector.LangfuseConnector object at 0x7febb428f450>

Inputs:
  - invocation_context: Optional[Dict[str, Any]]
Outputs:
  - name: str
  - trace_url: str
  - trace_id: str)
DEBUG:haystack.core.pipeline.base:Adding component 'embedder' (<haystack.components.embedders.sentence_transformers_text_embedder.SentenceTransformersTextEmbedder object at 0x7febb42831d0>

Inputs:
  - text: str
Outputs:
  - embedding: List[float])
DEBUG:haystack.core.pipeline.base:Adding component 'retriever' (<haystack.components.retrievers.in_memory.embedding_retriever.InMemoryEmbeddingRetriever object at 0x7febb4282890>

Inputs:
  - query_embedding: List[float]
  - filters: Optional[Dict[str, Any]]
  - top_k: Optional[int]
  - scale_score: Optional[bool]
  - return_embedding: Optional[bool]
Outputs:
  - documents: List[Document])
DEBUG:haystack.core.pipeline.base:Adding component 'prompt_bu

Building Query Pipeline with Langfuse...
Query Pipeline Built with Langfuse.


In [6]:
# --- 8. Run a Query (Traces will be sent to Langfuse) ---
print("\n--- Running Sample Query (check Langfuse for trace) ---")
query = "Recommend some highly rated sci-fi TV shows."
print(f"Query: {query}")

try:
    pipeline_input = {
        "embedder": {"text": query},
        "prompt_builder": {"query": query},
        "router": {"query": query}
    }
    result = query_pipeline.run(pipeline_input)

    print(result["llm"]["replies"])  # Inspect the replies structure

    if "answer" in result.get("router", {}):
        final_answer = result["router"]["answer"]
        if isinstance(final_answer, list):
             final_answer = final_answer[0] if final_answer else "No answer found."
        print("\nAnswer:")
        print(final_answer)
    else:
         print("\nSorry, I couldn't generate a recommendation based on the available information.")
         if "documents" in result.get("web_search", {}):
             print("\nWeb Search Results considered:")
             for doc in result["web_search"]["documents"]:
                 print(f"- {doc.meta.get('title', 'No Title')}: {doc.meta.get('link', 'No Link')}")

except Exception as e:
    print(f"\nAn error occurred while running the query: {e}")

# --- Example of another query ---
print("\n--- Running Another Query (check Langfuse for trace) ---")
query_2 = "What is the plot of The Shawshank Redemption?"
print(f"Query: {query_2}")

try:
    pipeline_input_2 = {
        "embedder": {"text": query_2},
        "prompt_builder": {"query": query_2},
        "router": {"query": query_2}
    }
    result_2 = query_pipeline.run(pipeline_input_2)

    if "answer" in result_2.get("router", {}):
        final_answer_2 = result_2["router"]["answer"]
        if isinstance(final_answer_2, list):
             final_answer_2 = final_answer_2[0] if final_answer_2 else "No answer found."
        print("\nAnswer:")
        print(final_answer_2)
    else:
         print("\nSorry, I couldn't generate an answer based on the available information.")
         if "documents" in result_2.get("web_search", {}):
             print("\nWeb Search Results considered:")
             for doc in result_2["web_search"]["documents"]:
                 print(f"- {doc.meta.get('title', 'No Title')}: {doc.meta.get('link', 'No Link')}")

except Exception as e:
    print(f"\nAn error occurred while running the query: {e}")

INFO:haystack.core.pipeline.base:Warming up component embedder...
INFO:haystack.core.pipeline.pipeline:Running component embedder



--- Running Sample Query (check Langfuse for trace) ---
Query: Recommend some highly rated sci-fi TV shows.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:haystack.core.pipeline.pipeline:Running component tracer
INFO:haystack.core.pipeline.pipeline:Running component retriever
INFO:haystack.core.pipeline.pipeline:Running component prompt_builder
INFO:haystack.core.pipeline.pipeline:Running component llm
INFO:haystack.core.pipeline.pipeline:Running component router
INFO:haystack.core.pipeline.pipeline:Running component web_search
DEBUG:haystack.components.websearch.serper_dev:Serper Dev returned 14 documents for the query 'Recommend some highly rated sci-fi TV shows.'
INFO:haystack.core.pipeline.pipeline:Running component prompt_builder
INFO:haystack.core.pipeline.pipeline:Running component llm
INFO:haystack.core.pipeline.pipeline:Running component router
INFO:haystack.core.pipeline.base:Warming up component embedder...
INFO:haystack.core.pipeline.pipeline:Running component embedder



An error occurred while running the query: 'llm'

--- Running Another Query (check Langfuse for trace) ---
Query: What is the plot of The Shawshank Redemption?


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:haystack.core.pipeline.pipeline:Running component tracer
INFO:haystack.core.pipeline.pipeline:Running component retriever
INFO:haystack.core.pipeline.pipeline:Running component prompt_builder
INFO:haystack.core.pipeline.pipeline:Running component llm
INFO:haystack.core.pipeline.pipeline:Running component router
INFO:haystack.core.pipeline.pipeline:Running component web_search
DEBUG:haystack.components.websearch.serper_dev:Serper Dev returned 14 documents for the query 'What is the plot of The Shawshank Redemption?'
INFO:haystack.core.pipeline.pipeline:Running component prompt_builder
INFO:haystack.core.pipeline.pipeline:Running component llm
INFO:haystack.core.pipeline.pipeline:Running component router



Answer:
The Shawshank Redemption tells the story of Andy Dufresne, a banker wrongly convicted of murdering his wife and her lover.  He is sentenced to two life sentences at Shawshank State Penitentiary in 1947.  Over a quarter-century, Andy maintains his innocence while forming a friendship with fellow inmate Red.  He uses his intelligence and quiet determination to navigate the brutal prison system, finding solace and eventual redemption through acts of common decency.  The plot centers on his perseverance, his unwavering hope, and the unlikely friendship that sustains him.  While the details of his escape and the ultimate revelation of his innocence are key plot points, the overarching theme is one of hope and enduring human spirit in the face of seemingly insurmountable adversity.



In [7]:
# --- Example of another query ---
print("\n--- Running Another Query (check Langfuse for trace) ---")
query_3 = "Where the meme `Aura Farming` came from?"
print(f"Query: {query_3}")

try:
    pipeline_input_3 = {
        "embedder": {"text": query_3},
        "prompt_builder": {"query": query_3},
        "router": {"query": query_3}
    }
    result_3 = query_pipeline.run(pipeline_input_3)

    if "answer" in result_3.get("router", {}):
        final_answer_3 = result_3["router"]["answer"]
        if isinstance(final_answer_3, list):
             final_answer_3 = final_answer_3[0] if final_answer_3 else "No answer found."
        print("\nAnswer:")
        print(final_answer_3)
    else:
         print("\nSorry, I couldn't generate an answer based on the available information.")
         if "documents" in result_3.get("web_search", {}):
             print("\nWeb Search Results considered:")
             for doc in result_3["web_search"]["documents"]:
                 print(f"- {doc.meta.get('title', 'No Title')}: {doc.meta.get('link', 'No Link')}")

except Exception as e:
    print(f"\nAn error occurred while running the query: {e}")

INFO:haystack.core.pipeline.base:Warming up component embedder...
INFO:haystack.core.pipeline.pipeline:Running component embedder



--- Running Another Query (check Langfuse for trace) ---
Query: Recommend some highly rated sci-fi TV shows.


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

INFO:haystack.core.pipeline.pipeline:Running component tracer
INFO:haystack.core.pipeline.pipeline:Running component retriever
INFO:haystack.core.pipeline.pipeline:Running component prompt_builder
INFO:haystack.core.pipeline.pipeline:Running component llm
INFO:haystack.core.pipeline.pipeline:Running component router
INFO:haystack.core.pipeline.pipeline:Running component web_search
DEBUG:haystack.components.websearch.serper_dev:Serper Dev returned 14 documents for the query 'Recommend some highly rated sci-fi TV shows.'
INFO:haystack.core.pipeline.pipeline:Running component prompt_builder
INFO:haystack.core.pipeline.pipeline:Running component llm
INFO:haystack.core.pipeline.pipeline:Running component router



Answer:
Based on the provided web search results, several highly-rated sci-fi TV shows are repeatedly recommended:

**High Frequency Mentions (Multiple Sources):**

* **Stranger Things:** Mentioned in IMDB, Rotten Tomatoes, Buzzfeed, CNET, and Netflix Tudum.  A very popular choice.
* **Black Mirror:** Mentioned in IMDB, CNET, and Netflix Tudum. Known for its anthology format and thought-provoking episodes.
* **Severance:** Mentioned in IMDB, Buzzfeed, and Metacritic.  Praised for its unique premise and compelling storytelling.
* **Westworld:** Mentioned in Reddit, Rotten Tomatoes, and Quora. Note that Reddit mentions only season 1 as universally good.


**Other Frequently Recommended Shows:**

* **Battlestar Galactica:** Mentioned in Rotten Tomatoes and Metacritic.  A classic sci-fi series.
* **The Expanse:** Mentioned in Rotten Tomatoes and Quora.  A critically acclaimed space opera.
* **Star Trek:** Mentioned in Rotten Tomatoes.  A long-running and influential franchise with many se