In [None]:
%load_ext autoreload
%autoreload 2

import os, sys

# Make the parent directory importable and switch the working directory to it.
# NOTE: both statements are relative to the *current* working directory, so
# re-running this cell appends yet another path entry and moves the working
# directory up one more level. Run it exactly once per kernel session.
sys.path.append(os.path.abspath(".."))
os.chdir("..")

In [None]:
os.getcwd()

In [None]:
import subprocess

In [None]:
# Launch the Bytewax batch dataflow in a child process and stream its output
# into the notebook while it runs.
print("Starting batch job...")

# Use the interpreter that runs this kernel so the child process sees the same
# environment (and installed packages) as the notebook.
python_executable = sys.executable
process = subprocess.Popen(
    [python_executable, "-m", "bytewax.run", "ingestion_pipeline:build_batch_dataflow"],
    stdout=subprocess.PIPE,
    # Merge stderr into stdout. Draining two separate pipes one after the other
    # can deadlock as soon as the child fills the pipe that is not being read.
    stderr=subprocess.STDOUT,
    text=True,
)

# Echo the child's output line by line; rstrip() drops the newline but keeps
# leading indentation so tracebacks stay readable.
with process.stdout:
    for line in process.stdout:
        print(line.rstrip())

# Popen never raises CalledProcessError, so success has to be decided from the
# exit code explicitly.
return_code = process.wait()
if return_code == 0:
    print("Batch job completed successfully.")
else:
    print(f"Batch job failed with exit code {return_code}")

In [None]:
from apscheduler.schedulers.blocking import BlockingScheduler # from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.triggers.cron import CronTrigger
import subprocess
import datetime
import logging

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def run_batch_job():
    """Run the Bytewax batch dataflow once in a child process.

    The dataflow is started with the interpreter that runs this kernel
    (``sys.executable``) instead of whatever ``python`` resolves to on PATH,
    matching the manual batch-job cell earlier in this notebook.

    A non-zero exit status is logged rather than re-raised, so the exception
    does not propagate to the scheduler that invokes this function.
    """
    command = [sys.executable, "-m", "bytewax.run", "ingestion_pipeline:build_batch_dataflow"]
    try:
        logger.info("Starting batch job...")
        # check=True turns a non-zero exit status into CalledProcessError.
        subprocess.run(command, check=True)
        logger.info("Batch job completed successfully.")
    except subprocess.CalledProcessError as e:
        logger.error(f"Batch job failed: {e}")

# Create a scheduler
scheduler = BlockingScheduler()

# trigger = CronTrigger(year="*", month="*", day="*", hour="*", minute="0", second="10")
# Schedule the job on a fixed 10-second interval (the cron trigger above is disabled).
scheduler.add_job(run_batch_job, 'interval', seconds=10)

In [None]:
# Start the scheduler.
# NOTE(review): `scheduler` is a BlockingScheduler; per the APScheduler docs its
# start() runs in the foreground, so expect this cell to keep running until the
# kernel is interrupted.

logger.info("Starting scheduler...")
scheduler.start()

In [None]:
import logging

from fire import Fire

from streaming_pipeline import constants, initialize
from streaming_pipeline.embeddings import EmbeddingModelSingleton
from streaming_pipeline.qdrant import build_qdrant_client

logger = logging.getLogger(__name__)


def search(query_string: str, limit: int = 2):
    """
    Log the points in the vector database that are closest to a query string.

    Args:
        query_string (str): The query string to search for.
        limit (int): Maximum number of closest points to retrieve. Defaults to 2.

    Returns:
        None: every hit is written to the module logger at INFO level.

    Raises:
        ValueError: If ``limit`` is smaller than 1.
    """

    # Reject a meaningless limit before any client or model is created.
    if limit < 1:
        raise ValueError(f"limit must be a positive integer, got {limit}")

    initialize()

    client = build_qdrant_client()
    model = EmbeddingModelSingleton()

    # Embed the query as a plain list so it can be sent as the query vector.
    query_embedding = model(query_string, to_list=True)

    hits = client.search(
        collection_name=constants.VECTOR_DB_OUTPUT_COLLECTION_NAME,
        query_vector=query_embedding,
        limit=limit,
    )

    for hit in hits:
        logger.info(hit)


In [None]:
# Example query; search() logs each hit instead of returning it.
search("What did Sangamo announced today?")

In [None]:
import datetime
from pathlib import Path
from typing import List, Optional

from bytewax.dataflow import Dataflow
from bytewax.testing import TestingSource
from pydantic import parse_obj_as
from qdrant_client import QdrantClient

from streaming_pipeline import mocked
# from streaming_pipeline.alpaca_batch import AlpacaNewsBatchInput
# from streaming_pipeline.alpaca_stream import AlpacaNewsStreamInput
from streaming_pipeline.embeddings import EmbeddingModelSingleton
from streaming_pipeline.models import NewsArticle, Document
from streaming_pipeline.qdrant import QdrantVectorOutput

from bytewax import operators as op

# Embedding model instance shared by the chunking, embedding and output steps below.
model = EmbeddingModelSingleton(cache_dir=None)

In [None]:
def build_input(is_input_mocked: bool = True):
    """Return the Bytewax input source for the dataflow.

    Args:
        is_input_mocked: When True (the default), feed the dataflow from
            ``mocked.financial_news`` through a ``TestingSource``.

    Returns:
        The input source to hand to ``op.input``.

    Raises:
        NotImplementedError: If ``is_input_mocked`` is False. Only the mocked
            source is wired up here; failing loudly avoids handing ``None`` to
            ``op.input`` and getting an obscure error later.
    """
    if not is_input_mocked:
        raise NotImplementedError(
            "Only the mocked input source is available in this notebook; "
            "call build_input(is_input_mocked=True)."
        )
    return TestingSource(mocked.financial_news)

In [None]:
def build_output(model: EmbeddingModelSingleton, in_memory: bool = False):
    """Create the Qdrant sink for the dataflow.

    Args:
        model: Embedding model whose ``max_input_length`` is passed on as the
            sink's ``vector_size``.
        in_memory: When True, back the sink with ``QdrantClient(":memory:")``;
            otherwise no client is passed and ``QdrantVectorOutput`` is built
            with its own default.

    Returns:
        The configured ``QdrantVectorOutput``.
    """
    # NOTE(review): vector_size comes from max_input_length; confirm this equals
    # the embedding dimension the collection expects.
    sink_kwargs = {"vector_size": model.max_input_length}
    if in_memory:
        sink_kwargs["client"] = QdrantClient(":memory:")
    return QdrantVectorOutput(**sink_kwargs)

In [None]:
from pydantic import TypeAdapter

# Validator used to parse an incoming batch as a list of NewsArticle models.
article_adapter = TypeAdapter(List[NewsArticle])

flow = Dataflow("alpaca_news_input")

# Source step: build_input() is called with its default, i.e. the mocked source.
alpaca_news_input = op.input("input", flow, build_input())

# Validate each incoming batch; flat_map then emits the validated articles one by one.
article_to_class = op.flat_map("class_to_article", alpaca_news_input, lambda messages: article_adapter.validate_python(messages))
# Inspect step for looking at the articles flowing through during development.
_ = op.inspect("articles", article_to_class)


In [None]:
# Map every article through its to_document() method.
document = op.map("document", article_to_class, lambda article: article.to_document())
_ = op.inspect("inspect_document", document)

In [None]:
# Have every document compute its chunks with the shared embedding model.
compute_chunks = op.map("chunks", document, lambda document: document.compute_chunks(model))
_ = op.inspect("inspect_chunks", compute_chunks)

In [None]:
# Have every chunked document compute its embeddings with the shared embedding model.
compute_embeddings = op.map("embeddings", compute_chunks, lambda document: document.compute_embeddings(model))
_ = op.inspect("inspect_embeddings", compute_embeddings)

In [None]:
# Sink step: build_output(model) uses in_memory=False, so the QdrantVectorOutput
# is created without an explicit in-memory client.
output = op.output("output", compute_embeddings, build_output(model))

In [1]:
import bytewax

In [None]:
from typing import List
from pathlib import Path
from langchain_core.embeddings import Embeddings

class CustomEmbeddings(Embeddings):
    """
    LangChain ``Embeddings`` adapter around ``EmbeddingModelSingleton``.
    """

    def __init__(self):
        """
        Create the adapter and obtain the embedding model.

        Takes no arguments; ``EmbeddingModelSingleton`` is instantiated with
        its own defaults and stored on ``self.embedding_model``.
        """
        self.embedding_model = EmbeddingModelSingleton()

    def embed_documents(self, texts: List[str]) -> List[List[float]]:
        """
        Embed several texts, one model call per text.

        Args:
            texts: Text strings to embed.

        Returns:
            One embedding per input text, in the same order.
        """
        return [self.embedding_model(text, to_list=True) for text in texts]

    def embed_query(self, text: str) -> List[float]:
        """
        Embed a single query string.

        Args:
            text: Text string to embed.

        Returns:
            The embedding produced by the model for ``text``.
        """
        return self.embedding_model(text, to_list=True)

In [None]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_qdrant import QdrantVectorStore, RetrievalMode
from typing import List, Dict
from langchain_core.documents import Document as LangChainDocument

class NewsRAGSystem:
    """Retrieval-augmented question answering over the news vectors stored in Qdrant."""

    def __init__(
        self,
        qdrant_url: Optional[str] = None,
        qdrant_api_key: Optional[str] = None,
        openai_api_key: Optional[str] = None,
        retrieval_mode: RetrievalMode = RetrievalMode.DENSE,
        content_payload_key: str = "text",
        metadata_payload_key: str = "payload",
    ):
        """
        Initialize the RAG system with Qdrant and LLM credentials.

        Args:
            qdrant_url: Qdrant URL, forwarded to ``build_qdrant_client``.
            qdrant_api_key: Qdrant API key, forwarded to ``build_qdrant_client``.
            openai_api_key: API key for the chat model. When omitted, the
                ``OPENROUTER_API_KEY`` and then ``OPENAI_API_KEY`` environment
                variables are consulted.
            retrieval_mode: Retrieval mode passed to ``QdrantVectorStore``.
            content_payload_key: Passed to ``QdrantVectorStore`` as the payload
                key holding the document content.
            metadata_payload_key: Passed to ``QdrantVectorStore`` as the payload
                key holding the document metadata.

        Raises:
            ValueError: If no API key is passed and none is found in the
                environment.
        """
        # Credentials must never live in source code. Any key that was
        # previously committed as a default here has to be treated as leaked
        # and rotated.
        api_key = (
            openai_api_key
            or os.environ.get("OPENROUTER_API_KEY")
            or os.environ.get("OPENAI_API_KEY")
        )
        if not api_key:
            raise ValueError(
                "No LLM API key provided: pass openai_api_key or set the "
                "OPENROUTER_API_KEY / OPENAI_API_KEY environment variable."
            )

        # Initialize Qdrant client
        self.client = build_qdrant_client(api_key=qdrant_api_key, url=qdrant_url)

        # Initialize embeddings
        self.embeddings = CustomEmbeddings()

        # Initialize vector store
        self.vectorstore = QdrantVectorStore(
            client=self.client,
            collection_name=constants.VECTOR_DB_OUTPUT_COLLECTION_NAME,
            embedding=self.embeddings,
            content_payload_key=content_payload_key,
            metadata_payload_key=metadata_payload_key,
            retrieval_mode=retrieval_mode
        )

        # Initialize retriever
        self.retriever = self.vectorstore.as_retriever(
            search_type="mmr",  # Using MMR for better diversity in results
            search_kwargs={"k": 3}
        )

        # Initialize LLM (requests go through the OpenRouter endpoint below)
        self.llm = ChatOpenAI(
            model_name="gpt-4o",
            openai_api_key=api_key,
            base_url="https://openrouter.ai/api/v1"
        )

        # Create prompt template
        self.prompt = ChatPromptTemplate.from_template("""You are a helpful assistant that answers questions about financial news articles.
        Use the following pieces of context to answer the question at the end.
        If you don't know the answer, just say that you don't know. Don't try to make up an answer.
        
        Context:
        {context}
        
        Question: {question}
        
        Helpful Answer:""")

        # Generation-only part of the pipeline: expects a dict with "context"
        # and "question" and returns the answer text.
        self._answer_chain = self.prompt | self.llm | StrOutputParser()

        # Initialize RAG chain
        self.chain = self._create_rag_chain()

    def _format_docs(self, docs: List[LangChainDocument]) -> str:
        """Join the page content of ``docs`` into one numbered context string."""
        # Logged at DEBUG instead of printed, so full documents are not dumped
        # into the notebook output on every query.
        logger.debug("Retrieved documents: %s", docs)
        return "\n\n".join(f'Content {i}:\n{doc.page_content}' for i, doc in enumerate(docs))

    def _create_rag_chain(self):
        """
        Build the end-to-end chain: retrieve, format context, prompt, LLM, parse.

        Returns:
            A runnable that takes the question string and returns the answer text.
        """
        chain = (
            RunnableParallel(
                {"context": self.retriever | self._format_docs, "question": RunnablePassthrough()}
            )
            | self.prompt
            | self.llm
            | StrOutputParser()
        )

        return chain

    def query(self, question: str) -> Dict:
        """
        Query the RAG system with a question.

        Documents are retrieved once and that same list is used both as the
        prompt context and for the returned sources, so the reported sources
        are exactly the documents the answer was generated from.

        Args:
            question: The question to ask

        Returns:
            Dict with ``"answer"`` (the generated text) and ``"sources"`` (one
            metadata dict per retrieved document).
        """
        docs = self.retriever.invoke(question)

        answer = self._answer_chain.invoke(
            {"context": self._format_docs(docs), "question": question}
        )

        # Missing metadata fields fall back to "N/A" (or an empty symbol list).
        sources = [
            {
                "headline": doc.metadata.get("headline", "N/A"),
                "url": doc.metadata.get("url", "N/A"),
                "created_at": doc.metadata.get("created_at", "N/A"),
                "symbols": doc.metadata.get("symbols", []),
                "author": doc.metadata.get("author", "N/A")
            }
            for doc in docs
        ]

        return {
            "answer": answer,
            "sources": sources
        }

    def query_by_filters(
        self,
        question: str,
        symbols: Optional[List[str]] = None,
        date_from: Optional[str] = None,
        date_to: Optional[str] = None
    ) -> Dict:
        """
        Query with additional filters for symbols and date range.

        Args:
            question: The question to ask
            symbols: List of stock symbols to filter by
            date_from: Start date in ISO format
            date_to: End date in ISO format

        Returns:
            Dict containing the answer and filtered source documents. When no
            filter argument is given this is the same as ``query(question)``.
        """
        # Build filter conditions
        filter_conditions = {}

        if symbols:
            filter_conditions["symbols"] = {"$in": symbols}

        created_at_bounds = {}
        if date_from:
            created_at_bounds["$gte"] = date_from
        if date_to:
            created_at_bounds["$lte"] = date_to
        if created_at_bounds:
            filter_conditions["created_at"] = created_at_bounds

        # Nothing to filter on: do not install an empty filter on the retriever.
        if not filter_conditions:
            return self.query(question)

        # NOTE(review): this is a Mongo-style filter dict. The langchain-qdrant
        # documentation describes the `filter` argument as a qdrant_client
        # `models.Filter`; confirm this dict is accepted, or convert it once the
        # payload schema (key paths, date representation) is confirmed.
        self.retriever.search_kwargs["filter"] = filter_conditions
        try:
            return self.query(question)
        finally:
            # Always remove the temporary filter, even if the query raised, so
            # it cannot leak into later unfiltered queries.
            self.retriever.search_kwargs.pop("filter", None)

In [None]:

# Initialize the RAG system
rag_system = NewsRAGSystem()


In [None]:

# Basic query: query() returns a dict with the generated "answer" and its "sources".
question = "What was the FDA designation given to Sangamo Therapeutics for their Fabry Disease treatment?"
result = rag_system.query(question)


In [None]:
result

In [None]:
# Query the vector store directly (bypassing the retriever and prompt chain),
# asking for the 2 most similar documents.
rag_system.vectorstore.similarity_search(
    "What was the FDA designation given to Sangamo Therapeutics for their Fabry Disease treatment?", k=2
)