In [None]:
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter, MarkdownHeaderTextSplitter
from langchain_community.document_loaders import DirectoryLoader, TextLoader

# Corpus location. Defaults to the original directory, but can be overridden
# with the BARCLAYS_DATA_DIR environment variable so the notebook is not tied
# to one machine's filesystem layout.
path = os.environ.get("BARCLAYS_DATA_DIR", "/Users/debadeepta.dey/datasets/barclays")

# Load every markdown file as raw text. Without an explicit loader_cls,
# DirectoryLoader uses its default (Unstructured-based) loader, which returns
# parsed text rather than the raw markdown; the header-based splitters below
# need the literal "#" heading markers to be present.
loader = DirectoryLoader(
    path,
    glob="**/*.md",
    loader_cls=TextLoader,
    loader_kwargs={"encoding": "utf-8"},
)
docs = loader.load()

In [None]:
# Method 1: Split by Markdown Headers (Most intelligent for markdown)
# This preserves the document structure and creates logical chunks

def split_markdown_by_headers(document_content):
    """
    Break a markdown string into chunks at its level 1-3 headings.

    Heading lines are kept inside the chunk text (``strip_headers=False``).

    Args:
        document_content: Markdown text to split.

    Returns:
        The result of ``MarkdownHeaderTextSplitter.split_text`` for the input.
    """
    # (markdown prefix, metadata key) pairs for heading depths 1 through 3
    heading_levels = [("#" * depth, f"Header {depth}") for depth in (1, 2, 3)]

    return MarkdownHeaderTextSplitter(
        headers_to_split_on=heading_levels,
        strip_headers=False,  # Keep headers in the chunks
    ).split_text(document_content)

# Demo: header-based splitting applied to the first loaded document
if docs:
    first_doc = docs[0]
    header_splits = split_markdown_by_headers(first_doc.page_content)

    print(f"Original document split into {len(header_splits)} chunks based on headers")

    # Preview at most three chunks: truncated text, header metadata, full size
    for i, chunk in enumerate(header_splits[:3]):
        print(
            f"\n--- Chunk {i+1} ---",
            f"Content: {chunk.page_content[:200]}...",
            f"Metadata: {chunk.metadata}",
            f"Full length: {len(chunk.page_content)} characters",
            sep="\n",
        )

In [None]:
# Method 2: Recursive Character Text Splitter (Good fallback)
# This method is useful when documents don't have clear header structure

def split_markdown_recursive(document_content, chunk_size=1000, chunk_overlap=200):
    """
    Cut text into pieces with ``RecursiveCharacterTextSplitter``.

    Separators are supplied in this order: paragraph break, single newline,
    space, and finally the empty string (character level).

    Args:
        document_content: Text to split.
        chunk_size: Chunk size handed to the splitter (lengths measured with ``len``).
        chunk_overlap: Overlap handed to the splitter.

    Returns:
        The result of the splitter's ``split_text`` call.
    """
    splitter_options = dict(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", " ", ""],
        length_function=len,
    )
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(document_content)

# Demo: recursive splitting applied to the first loaded document
if docs:
    first_doc = docs[0]
    recursive_chunks = split_markdown_recursive(
        first_doc.page_content,
        chunk_overlap=200,
        chunk_size=2048,  # Adjust based on your needs
    )

    print(f"\nRecursive splitting created {len(recursive_chunks)} chunks")

    # Preview at most three chunks (these are plain strings, not Documents)
    for i, chunk in enumerate(recursive_chunks[:3]):
        print(
            f"\n--- Recursive Chunk {i+1} ---",
            f"Content: {chunk[:200]}...",
            f"Length: {len(chunk)} characters",
            sep="\n",
        )

In [None]:
# Method 3: Hybrid Approach (Recommended)
# Combine header-based splitting with recursive splitting for optimal results

def smart_markdown_split(document_content, max_chunk_size=1500, chunk_overlap=200):
    """
    Split markdown by headers (levels 1-4), then re-split oversized sections.

    Sections longer than ``max_chunk_size`` characters are cut further with a
    recursive character splitter; each resulting piece keeps the section's
    header metadata plus a ``sub_chunk`` index. If the header-based path raises,
    the whole document is split recursively instead and the chunks carry empty
    metadata.

    Args:
        document_content: Markdown text to split.
        max_chunk_size: Length threshold (in characters) above which a section
            is re-split; also the recursive splitter's ``chunk_size``.
        chunk_overlap: Overlap handed to the recursive splitter.

    Returns:
        A list of ``Document`` objects.

    Raises:
        Whatever ``RecursiveCharacterTextSplitter`` or
        ``MarkdownHeaderTextSplitter`` raise on construction (e.g. for an
        invalid size/overlap combination).
    """
    # Imported locally so the function does not depend on a later cell's import
    from langchain.schema import Document

    # One recursive splitter serves both the oversized-section path and the
    # fallback path. It is built outside the try block so that invalid
    # arguments surface directly instead of first being reported as a
    # header-splitting failure.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=max_chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", " ", ""],
        length_function=len,
    )

    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[
            ("#", "Header 1"),
            ("##", "Header 2"),
            ("###", "Header 3"),
            ("####", "Header 4"),
        ],
        strip_headers=False,
    )

    try:
        final_chunks = []
        for doc in markdown_splitter.split_text(document_content):
            if len(doc.page_content) <= max_chunk_size:
                # Section already fits: keep the header-split Document as-is
                final_chunks.append(doc)
                continue

            # Section too large: cut it further, preserving header metadata
            for i, sub_chunk in enumerate(text_splitter.split_text(doc.page_content)):
                new_metadata = doc.metadata.copy()
                new_metadata['sub_chunk'] = i
                final_chunks.append(
                    Document(page_content=sub_chunk, metadata=new_metadata)
                )
        return final_chunks

    except Exception as e:
        # Deliberate best-effort: any failure in the header path degrades to
        # plain recursive splitting of the whole document.
        print(f"Header splitting failed: {e}")
        return [
            Document(page_content=chunk, metadata={})
            for chunk in text_splitter.split_text(document_content)
        ]

# Example usage with the hybrid approach
if docs:
    first_doc = docs[0]
    smart_chunks = smart_markdown_split(
        first_doc.page_content,
        max_chunk_size=1200,
        chunk_overlap=150
    )

    print(f"\nSmart splitting created {len(smart_chunks)} chunks")

    # Display statistics. Skipped when no chunks were produced (e.g. an empty
    # first document), because the mean, min and max are undefined for an
    # empty list and would raise.
    if smart_chunks:
        chunk_lengths = [len(chunk.page_content) for chunk in smart_chunks]
        print(f"Average chunk length: {sum(chunk_lengths) / len(chunk_lengths):.0f} characters")
        print(f"Min chunk length: {min(chunk_lengths)} characters")
        print(f"Max chunk length: {max(chunk_lengths)} characters")

    # Display first few chunks with metadata
    for i, chunk in enumerate(smart_chunks[:3]):
        print(f"\n--- Smart Chunk {i+1} ---")
        print(f"Metadata: {chunk.metadata}")
        print(f"Content preview: {chunk.page_content[:200]}...")
        print(f"Length: {len(chunk.page_content)} characters")

In [None]:
from langchain.schema import Document

# Utility function to process all your documents
def process_all_documents(docs, output_method='smart', **kwargs):
    """
    Process all loaded documents and return chunks

    Args:
        docs: List of loaded documents (each with ``page_content`` and ``metadata``)
        output_method: 'header', 'recursive', or 'smart'
        **kwargs: Additional parameters forwarded to ``split_markdown_recursive``
            ('recursive') or ``smart_markdown_split`` ('smart'). They are not
            used by the 'header' method.

    Returns:
        List of all chunks; each chunk's metadata gains ``source_doc_index``,
        ``chunk_index`` and ``original_source``.

    Raises:
        ValueError: If ``output_method`` is not one of the supported names.
            This is checked up front, so it is raised even when ``docs`` is empty.
    """
    # Fail fast: validate once before any work instead of inside the loop,
    # where an empty `docs` would let a misspelled method pass silently.
    if output_method not in ('header', 'recursive', 'smart'):
        raise ValueError("output_method must be 'header', 'recursive', or 'smart'")

    all_chunks = []
    total_docs = len(docs)

    for doc_idx, doc in enumerate(docs):
        source = doc.metadata.get('source', 'unknown')
        print(f"Processing document {doc_idx + 1}/{total_docs}: {source}")

        if output_method == 'header':
            chunks = split_markdown_by_headers(doc.page_content)
        elif output_method == 'recursive':
            # The recursive splitter returns plain strings; wrap them in
            # Documents that carry a copy of the source document's metadata.
            chunk_texts = split_markdown_recursive(doc.page_content, **kwargs)
            chunks = [Document(page_content=text, metadata=doc.metadata.copy()) for text in chunk_texts]
        else:  # 'smart' (validated above)
            chunks = smart_markdown_split(doc.page_content, **kwargs)

        # Add source document information to each chunk
        for chunk_idx, chunk in enumerate(chunks):
            chunk.metadata['source_doc_index'] = doc_idx
            chunk.metadata['chunk_index'] = chunk_idx
            chunk.metadata['original_source'] = source
            all_chunks.append(chunk)

    return all_chunks

# Chunk every loaded document with the hybrid ("smart") splitter
all_processed_chunks = process_all_documents(
    docs,
    output_method='smart',  # Change to 'header' or 'recursive' if preferred
    chunk_overlap=200,
    max_chunk_size=2000,
)

print(f"\nTotal chunks created from all documents: {len(all_processed_chunks)}")

# Summary statistics (only meaningful when at least one chunk exists)
if all_processed_chunks:
    chunk_lengths = [len(chunk.page_content) for chunk in all_processed_chunks]
    print(f"Average chunk length: {sum(chunk_lengths) / len(chunk_lengths):.0f} characters")
    print(f"Chunk length range: {min(chunk_lengths)} - {max(chunk_lengths)} characters")

    # Tally how many chunks each source document produced
    source_counts = {}
    for chunk in all_processed_chunks:
        source = chunk.metadata.get('original_source', 'unknown')
        source_counts.setdefault(source, 0)
        source_counts[source] += 1

    print("\nChunks per source document:")
    for source, count in source_counts.items():
        print(f"  {source}: {count} chunks")

# Initialize knowledge graph

In [None]:
from ragas.testset.graph import KnowledgeGraph
from ragas.testset.graph import Node, NodeType


# Start from an empty graph and add one DOCUMENT node per processed chunk,
# carrying the chunk's text and its metadata as node properties.
kg = KnowledgeGraph()
for doc in all_processed_chunks:
    chunk_node = Node(
        type=NodeType.DOCUMENT,
        properties=dict(
            page_content=doc.page_content,
            document_metadata=doc.metadata,
        ),
    )
    kg.nodes.append(chunk_node)

## vLLM Configuration for RAGAS

The configuration in the next cell connects RAGAS to your vLLM server. Here are some key points:

1. **Base URL**: `http://localhost:8003/v1` - your vLLM endpoint
2. **API Key**: Any placeholder string works (the next cell uses `"asdf"`), since vLLM typically doesn't require authentication
3. **Model Name**: Replace `"Qwen/Qwen2.5"` with the name of the model your vLLM server is actually serving
4. **Temperature**: Controls randomness (0.1 is relatively deterministic)
5. **Max Tokens**: Maximum response length

In [None]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings.base import embedding_factory
from langchain_openai import ChatOpenAI

# Chat client pointed at the vLLM server's OpenAI-compatible endpoint
vllm_llm = ChatOpenAI(
    model="Qwen/Qwen2.5",  # Replace with your actual model name
    base_url="http://localhost:8003/v1",
    api_key="asdf",  # placeholder value
    max_tokens=4096,
    temperature=0.1,
)

# RAGAS works with its own LLM interface, so wrap the LangChain client
llm = LangchainLLMWrapper(vllm_llm)


In [None]:
# Smoke-test the vLLM endpoint, then check the RAGAS wrapper type
print("Testing vLLM connection...")

try:
    # Round-trip a trivial prompt through the raw LangChain client
    test_response = vllm_llm.invoke("Hello, this is a test. Please respond briefly.")
    print("✅ vLLM connection successful!")
    print(f"Response: {test_response.content}")

    # Confirm the wrapped object is a RAGAS LLM
    from ragas.llms.base import BaseRagasLLM
    wrapper_status = (
        "✅ RAGAS LLM wrapper configured correctly"
        if isinstance(llm, BaseRagasLLM)
        else "⚠️  RAGAS LLM wrapper might need adjustment"
    )
    print(wrapper_status)

except Exception as e:
    # Any failure above is reported together with a short troubleshooting list
    print(f"❌ Error connecting to vLLM: {e}")
    for hint in (
        "Please check:",
        "1. vLLM server is running",
        "2. Model name is correct",
        "3. No firewall blocking the connection",
    ):
        print(hint)

# vLLM Hosted Embedding Model Configuration

Here's how to configure a vLLM hosted embedding model for use with RAGAS:

## Option 1: Using OpenAI-compatible embedding endpoint
Use this option if your vLLM server hosts an embedding model behind an OpenAI-compatible API.

In [None]:
from langchain_openai import OpenAIEmbeddings
from ragas.embeddings import LangchainEmbeddingsWrapper
import asyncio

# Embedding client pointed at the vLLM server's OpenAI-compatible endpoint
vllm_embeddings = OpenAIEmbeddings(
    model='thenlper/gte-large',
    base_url="http://localhost:8001/v1",
    api_key="asdf",  # placeholder value
    tiktoken_enabled=False,  # Disable tiktoken for vLLM
)

# Wrap for RAGAS
embedding_model = LangchainEmbeddingsWrapper(vllm_embeddings)

# async def test_embedding_model(vllm_embeddings: OpenAIEmbeddings):
#     """Async function to test the embedding model"""
    
#     # Test with a simple text first
#     print("testing query embedding...")
#     test_result = vllm_embeddings.embed_query("Risk management is crucial for financial institutions.")
#     print(f"query embedding dimensions: {len(test_result)}")

#     # Test with texts
#     print("testing text embedding...")
#     test_texts = [
#         "This is a test document about financial analysis.",
#         "Machine learning models are used in banking.",
#         "Risk management is crucial for financial institutions."
#     ]
#     test_results = vllm_embeddings.embed_documents(test_texts)
#     print(f"Text embedding dimensions: {len(test_results[0])} for {len(test_results)} texts")
    
#     # If successful, wrap for RAGAS and test it through the wrapper
#     embedding_model = LangchainEmbeddingsWrapper(vllm_embeddings)

#     print("Testing wrapped embedding model query ...")
#     embedding_result = await embedding_model.embed_query("Risk management is crucial for financial institutions.")
#     print(f"Wrapped query embedding dimensions: {len(embedding_result)}")

#     print("Testing wrapped embedding model text ...")
#     embedding_results = await embedding_model.embed_texts(test_texts, is_async=True)
#     print(f"Wrapped text embedding dimensions: {len(embedding_results[0])} for {len(embedding_results)} texts")

#     print(f"✅ Successfully configured vLLM embedding model")
#     return embedding_model

# # Run the async function
# try:
#     embedding_model = asyncio.run(test_embedding_model(vllm_embeddings))
#     print(f"🎉 Using vLLM embedding model successfully!")
# except Exception as e:
#     print(f"❌ Failed with: {str(e)[:100]}...")
#     embedding_model = None


## Option 2: Local embedding model

In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings
# Embedding model loaded in-process through LangChain's HuggingFace integration
local_embeddings = HuggingFaceEmbeddings(
    model_kwargs={"device": "cpu"},  # Or "cuda" for GPU
    model_name="all-MiniLM-L6-v2",
)

# Test local embedding model with a sample query and show the raw vector
res = local_embeddings.embed_query("Who is this?")
print(res)


# Default synthetic data generation

In [None]:
from ragas.testset.transforms import default_transforms, apply_transforms


# LLM used by the knowledge-graph transforms (the wrapped vLLM model)
transformer_llm = llm

# default_transforms expects a RAGAS embedding model, not a raw LangChain one,
# so wrap the local HuggingFace embeddings the same way the vLLM embeddings
# are wrapped.
# NOTE(review): an unwrapped embedding model may be why summary embeddings were
# missing in the later testset-generation step — confirm after re-running.
transformer_embeddings = LangchainEmbeddingsWrapper(local_embeddings)

trans = default_transforms(
    documents=all_processed_chunks,
    llm=transformer_llm,
    embedding_model=transformer_embeddings,
)
apply_transforms(kg, trans)
print(kg)

In [27]:
from ragas.testset import TestsetGenerator
from ragas.testset.synthesizers import default_query_distribution

# Query distribution from ragas' default helper, built on the wrapped LLM
query_distribution = default_query_distribution(llm)

# Generator bound to the knowledge graph populated above
generator = TestsetGenerator(
    llm=llm,
    embedding_model=embedding_model,
    knowledge_graph=kg,
)

# Generate ten samples and display them as a DataFrame
testset = generator.generate(
    testset_size=10,
    query_distribution=query_distribution,
)
testset.to_pandas()



ValueError: No nodes that satisfied the given filer. Try changing the filter.

# Custom multi-hop question answer generation

In [None]:
from ragas.testset.transforms import Parallel, apply_transforms
from ragas.testset.transforms import (
    HeadlinesExtractor,
    HeadlineSplitter,
    KeyphrasesExtractor,
    OverlapScoreBuilder,
)


# Components of the custom transform pipeline; they are applied to the
# knowledge graph in the order listed in `transforms` below.
headline_extractor = HeadlinesExtractor(llm=llm)
headline_splitter = HeadlineSplitter(max_tokens=1000, min_tokens=300)
keyphrase_extractor = KeyphrasesExtractor(
    llm=llm,
    max_num=10,
    property_name="keyphrases",
)
# Relationship builder working on the "keyphrases" property written above
relation_builder = OverlapScoreBuilder(
    property_name="keyphrases",
    new_property_name="overlap_score",
    distance_threshold=0.9,
    threshold=0.01,
)

transforms = [headline_extractor, headline_splitter, keyphrase_extractor, relation_builder]

apply_transforms(kg, transforms=transforms)

In [None]:
from ragas.testset.persona import Persona

# Personas the generated questions are written for
person1 = Persona(
    name="financial analyst at Barclays",
    role_description="A junior financial analyst at Barclays curious on workings on financial systems at Barclays ",
)
# Every persona gets a non-empty name: the multi-hop synthesizer below passes a
# theme-to-persona mapping around, and an empty name gives it nothing to
# identify this persona by (mapping presumably keyed by persona name — confirm
# against ragas).
persona2 = Persona(
    name="AI developer",
    role_description="AI developer building a financial system for Barclays",
)
persona_list = [person1, persona2]

In [None]:
from dataclasses import dataclass
import typing as t
from ragas.testset.synthesizers.multi_hop.base import (
    MultiHopQuerySynthesizer,
    MultiHopScenario,
)
from ragas.testset.synthesizers.prompts import (
    ThemesPersonasInput,
    ThemesPersonasMatchingPrompt,
)


@dataclass
class MyMultiHopQuery(MultiHopQuerySynthesizer):
    """
    Multi-hop query synthesizer driven by keyphrase overlap between node pairs.

    Scenarios are built from (node_a, relationship, node_b) triplets whose
    relationship type is "keyphrases_overlap"; the overlapped keyphrases are
    used as the themes that get matched to personas.
    """

    # Prompt used to match themes with the personas in persona_list
    theme_persona_matching_prompt = ThemesPersonasMatchingPrompt()

    async def _generate_scenarios(
        self,
        n: int,
        knowledge_graph,
        persona_list,
        callbacks,
    ) -> t.List[MultiHopScenario]:
        """
        Build multi-hop scenarios from keyphrase-overlap relationships.

        Args:
            n: Requested number of scenarios. Triplets stop being visited once
                at least ``n`` scenarios have been collected.
            knowledge_graph: Graph queried via ``find_two_nodes_single_rel``.
            persona_list: Personas offered to the theme/persona matching prompt.
            callbacks: Passed through to the prompt's ``generate`` call.

        Returns:
            The collected list of scenarios.

        Raises:
            ValueError: If the graph contains no "keyphrases_overlap"
                relationships (previously this surfaced as ZeroDivisionError).
        """
        # query and get (node_a, rel, node_b) to create multi-hop queries
        results = knowledge_graph.find_two_nodes_single_rel(
            relationship_condition=lambda rel: rel.type == "keyphrases_overlap"
        )

        # Without any matching triplet the per-triplet quota below would divide
        # by zero; fail with an actionable message instead.
        if not results:
            raise ValueError(
                "No 'keyphrases_overlap' relationships found in the knowledge graph. "
                "Run the keyphrase extraction and overlap-score transforms first."
            )

        # Spread the requested count across triplets, at least one per triplet
        num_sample_per_triplet = max(1, n // len(results))

        scenarios = []
        for triplet in results:
            if len(scenarios) >= n:
                break  # enough scenarios collected; remaining triplets are not needed

            node_a, node_b = triplet[0], triplet[-1]
            overlapped_keywords = triplet[1].properties["overlapped_items"]
            if not overlapped_keywords:
                continue

            # match the keyword with a persona for query creation
            themes = list(dict(overlapped_keywords).keys())
            prompt_input = ThemesPersonasInput(themes=themes, personas=persona_list)
            persona_concepts = await self.theme_persona_matching_prompt.generate(
                data=prompt_input, llm=self.llm, callbacks=callbacks
            )

            overlapped_keywords = [list(item) for item in overlapped_keywords]

            # prepare and sample possible combinations
            base_scenarios = self.prepare_combinations(
                [node_a, node_b],
                overlapped_keywords,
                personas=persona_list,
                persona_item_mapping=persona_concepts.mapping,
                property_name="keyphrases",
            )

            # get number of required samples from this triplet
            base_scenarios = self.sample_diverse_combinations(
                base_scenarios, num_sample_per_triplet
            )

            scenarios.extend(base_scenarios)

        return scenarios

query = MyMultiHopQuery(llm=llm)
scenarios = await query.generate_scenarios(
    n=10, knowledge_graph=kg, persona_list=persona_list
)

scenarios[4]