In [1]:
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter, MarkdownHeaderTextSplitter
from langchain_community.document_loaders import DirectoryLoader, TextLoader

path = "/Users/debadeepta.dey/datasets/barclays"
loader = DirectoryLoader(path, glob="**/*.md")
docs = loader.load()

In [2]:
# Method 1: Split by Markdown Headers (Most intelligent for markdown)
# This preserves the document structure and creates logical chunks

def split_markdown_by_headers(document_content):
    """
    Split markdown document by headers, preserving document structure
    """
    # Define headers to split on (from h1 to h3)
    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"), 
        ("###", "Header 3"),
    ]
    
    # Create the markdown header text splitter
    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=headers_to_split_on,
        strip_headers=False  # Keep headers in the chunks
    )
    
    # Split the document
    md_header_splits = markdown_splitter.split_text(document_content)
    
    return md_header_splits

# # Example usage with your loaded documents
# if docs:
#     # Take the first document as example
#     first_doc = docs[0]
#     header_splits = split_markdown_by_headers(first_doc.page_content)
    
#     print(f"Original document split into {len(header_splits)} chunks based on headers")
    
#     # Display first few chunks
#     for i, chunk in enumerate(header_splits[:3]):
#         print(f"\n--- Chunk {i+1} ---")
#         print(f"Content: {chunk.page_content[:200]}...")
#         print(f"Metadata: {chunk.metadata}")
#         print(f"Full length: {len(chunk.page_content)} characters")

In [3]:
# Method 2: Recursive Character Text Splitter (Good fallback)
# This method is useful when documents don't have clear header structure

def split_markdown_recursive(document_content, chunk_size=1000, chunk_overlap=200):
    """
    Split markdown using recursive character splitter with markdown-aware separators
    """
    # Define separators that work well for markdown
    markdown_separators = [
        "\n\n",  # Double newline (paragraph breaks)
        "\n",    # Single newline
        " ",     # Space
        ""       # Character level
    ]
    
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=markdown_separators,
        length_function=len,
    )
    
    # Split the document
    chunks = text_splitter.split_text(document_content)
    
    return chunks

# # Example usage
# if docs:
#     first_doc = docs[0]
#     recursive_chunks = split_markdown_recursive(
#         first_doc.page_content, 
#         chunk_size=2048,  # Adjust based on your needs
#         chunk_overlap=200
#     )
    
#     print(f"\nRecursive splitting created {len(recursive_chunks)} chunks")
    
#     # Display first few chunks
#     for i, chunk in enumerate(recursive_chunks[:3]):
#         print(f"\n--- Recursive Chunk {i+1} ---")
#         print(f"Content: {chunk[:200]}...")
#         print(f"Length: {len(chunk)} characters")

In [4]:
# Method 3: Hybrid Approach (Recommended)
# Combine header-based splitting with recursive splitting for optimal results

def smart_markdown_split(document_content, max_chunk_size=1500, chunk_overlap=200):
    """
    Smart markdown splitting that combines header-based and recursive approaches
    """
    from langchain.schema import Document
    
    # First, try to split by headers
    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"), 
        ("###", "Header 3"),
        ("####", "Header 4"),
    ]
    
    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=headers_to_split_on,
        strip_headers=False
    )
    
    try:
        # Split by headers first
        header_splits = markdown_splitter.split_text(document_content)
        
        # If header splits are too large, further split them recursively
        final_chunks = []
        
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=max_chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", " ", ""],
            length_function=len,
        )
        
        for doc in header_splits:
            if len(doc.page_content) > max_chunk_size:
                # Split large chunks further
                sub_chunks = text_splitter.split_text(doc.page_content)
                for i, sub_chunk in enumerate(sub_chunks):
                    # Preserve metadata from header splitting
                    new_metadata = doc.metadata.copy()
                    new_metadata['sub_chunk'] = i
                    final_chunks.append(Document(
                        page_content=sub_chunk,
                        metadata=new_metadata
                    ))
            else:
                final_chunks.append(doc)
                
        return final_chunks
        
    except Exception as e:
        print(f"Header splitting failed: {e}")
        # Fallback to recursive splitting
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=max_chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", " ", ""],
        )
        chunks = text_splitter.split_text(document_content)
        return [Document(page_content=chunk, metadata={}) for chunk in chunks]

# # Example usage with the hybrid approach
# if docs:
#     first_doc = docs[0]
#     smart_chunks = smart_markdown_split(
#         first_doc.page_content,
#         max_chunk_size=1200,
#         chunk_overlap=150
#     )
    
#     print(f"\nSmart splitting created {len(smart_chunks)} chunks")
    
#     # Display statistics
#     chunk_lengths = [len(chunk.page_content) for chunk in smart_chunks]
#     print(f"Average chunk length: {sum(chunk_lengths) / len(chunk_lengths):.0f} characters")
#     print(f"Min chunk length: {min(chunk_lengths)} characters")
#     print(f"Max chunk length: {max(chunk_lengths)} characters")
    
#     # Display first few chunks with metadata
#     for i, chunk in enumerate(smart_chunks[:3]):
#         print(f"\n--- Smart Chunk {i+1} ---")
#         print(f"Metadata: {chunk.metadata}")
#         print(f"Content preview: {chunk.page_content[:200]}...")
#         print(f"Length: {len(chunk.page_content)} characters")

In [5]:
from langchain.schema import Document

# Utility function to process all your documents
def process_all_documents(docs, output_method='smart', **kwargs):
    """
    Process all loaded documents and return chunks
    
    Args:
        docs: List of loaded documents
        output_method: 'header', 'recursive', or 'smart'
        **kwargs: Additional parameters for the splitting methods
    
    Returns:
        List of all chunks with source document information
    """
    all_chunks = []
    
    for doc_idx, doc in enumerate(docs):
        print(f"Processing document {doc_idx + 1}/{len(docs)}: {doc.metadata.get('source', 'unknown')}")
        
        if output_method == 'header':
            chunks = split_markdown_by_headers(doc.page_content)
        elif output_method == 'recursive':
            chunk_texts = split_markdown_recursive(doc.page_content, **kwargs)
            chunks = [Document(page_content=text, metadata=doc.metadata.copy()) for text in chunk_texts]
        elif output_method == 'smart':
            chunks = smart_markdown_split(doc.page_content, **kwargs)
        else:
            raise ValueError("output_method must be 'header', 'recursive', or 'smart'")
        
        # Add source document information to each chunk
        for chunk_idx, chunk in enumerate(chunks):
            chunk.metadata['source_doc_index'] = doc_idx
            chunk.metadata['chunk_index'] = chunk_idx
            chunk.metadata['original_source'] = doc.metadata.get('source', 'unknown')
            all_chunks.append(chunk)
    
    return all_chunks

# Process all your documents using the smart method
all_processed_chunks = process_all_documents(
    docs, 
    output_method='smart',  # Change to 'header' or 'recursive' if preferred
    max_chunk_size=10000,
    chunk_overlap=0 # deliberately set to 0
)

print(f"\nTotal chunks created from all documents: {len(all_processed_chunks)}")

# Show summary statistics
if all_processed_chunks:
    chunk_lengths = [len(chunk.page_content) for chunk in all_processed_chunks]
    print(f"Average chunk length: {sum(chunk_lengths) / len(chunk_lengths):.0f} characters")
    print(f"Chunk length range: {min(chunk_lengths)} - {max(chunk_lengths)} characters")
    
    # Show distribution by source document
    source_counts = {}
    for chunk in all_processed_chunks:
        source = chunk.metadata.get('original_source', 'unknown')
        source_counts[source] = source_counts.get(source, 0) + 1
    
    print(f"\nChunks per source document:")
    for source, count in source_counts.items():
        print(f"  {source}: {count} chunks")

Processing document 1/1: /Users/debadeepta.dey/datasets/barclays/rise-insights-report-making-data-count-with-ai-DIGITAL.md

Total chunks created from all documents: 13
Average chunk length: 9404 characters
Chunk length range: 2316 - 10000 characters

Chunks per source document:
  /Users/debadeepta.dey/datasets/barclays/rise-insights-report-making-data-count-with-ai-DIGITAL.md: 13 chunks


In [6]:
# Use gpt-4o-mini Azure OpenAI model
import os
from pathlib import Path
from dotenv import load_dotenv
from ragas.llms import LangchainLLMWrapper
from langchain_openai import AzureChatOpenAI

# Specify the directory containing your .env file
env_directory = "/Users/debadeepta.dey/sources/syftr/runtime-secrets"  # Change this to your desired directory
env_file_path = Path(env_directory) / "azure_openai_gpt_4o_mini.env"

# Load environment variables from the specified directory
load_dotenv(dotenv_path=env_file_path)

# Verify the .env file was found and loaded
if env_file_path.exists():
    print(f"✅ Loaded .env from: {env_file_path}")
else:
    print(f"⚠️  .env file not found at: {env_file_path}")
    print("Please create the .env file with your Azure OpenAI credentials")

# Configure Azure OpenAI GPT-4o-mini
azure_llm = AzureChatOpenAI(
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    api_version=os.getenv("AZURE_OPENAI_API_VERSION"),
    azure_deployment=os.getenv("AZURE_OPENAI_DEPLOYMENT"),
    temperature=0.0,
    max_tokens=16384,
)

# Test the model 
print(azure_llm.invoke("Who are you?"))

# Wrap for RAGAS
llm = LangchainLLMWrapper(azure_llm)

print("Azure OpenAI LLM configured successfully!")

✅ Loaded .env from: /Users/debadeepta.dey/sources/syftr/runtime-secrets/azure_openai_gpt_4o_mini.env
content='I am an AI language model created by OpenAI, designed to assist with a wide range of questions and tasks by providing information, answering queries, and engaging in conversation. How can I help you today?' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 42, 'prompt_tokens': 11, 'total_tokens': 53, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_efad92c60b', 'id': 'chatcmpl-C1KqBbH5rfDvFNFEhK8ihUh13bCTR', 'service_tier': None, 'prompt_filter_results': [{'prompt_index': 0, 'content_filter_results': {'hate': {'filtered': False, 'severity': 'safe'}, 'self_harm': {'filtered': False, 'severity': 'safe'}, 'sexual': {'fil

## VLLM Configuration for RAGAS

The configuration above connects RAGAS to your vLLM server. Here are some key points:

1. **Base URL**: `http://localhost:8003/v1` - your vLLM endpoint
2. **API Key**: Set to "not-needed" since vLLM typically doesn't require authentication
3. **Model Name**: Replace `"your-model-name"` with the actual model you're serving
4. **Temperature**: Controls randomness (0.1 is relatively deterministic)
5. **Max Tokens**: Maximum response length

In [7]:
# from ragas.llms import LangchainLLMWrapper
# from ragas.embeddings.base import embedding_factory
# from langchain_openai import ChatOpenAI

# # Configure vLLM hosted LLM
# vllm_llm = ChatOpenAI(
#     base_url="http://localhost:8014/v1",
#     api_key="asdf",
#     model="nvidia/Llama-3_3-Nemotron-Super-49B",  # Replace with your actual model name
#     temperature=0.0,
#     max_tokens=32768,
# )

# # Wrap for RAGAS
# llm = LangchainLLMWrapper(vllm_llm)


In [8]:
# # Test the vLLM connection
# print("Testing vLLM connection...")

# try:
#     # Test the LLM directly
#     test_response = vllm_llm.invoke("Hello, this is a test. Please respond briefly.")
#     print(f"✅ vLLM connection successful!")
#     print(f"Response: {test_response.content}")
    
#     # Test with RAGAS wrapper
#     from ragas.llms.base import BaseRagasLLM
#     if isinstance(llm, BaseRagasLLM):
#         print("✅ RAGAS LLM wrapper configured correctly")
#     else:
#         print("⚠️  RAGAS LLM wrapper might need adjustment")
        
# except Exception as e:
#     print(f"❌ Error connecting to vLLM: {e}")
#     print("Please check:")
#     print("1. vLLM server is running")
#     print("2. Model name is correct")
#     print("3. No firewall blocking the connection")

# vLLM Hosted Embedding Model Configuration

Here's how to configure a vLLM hosted embedding model for use with RAGAS:

## Option 1: Using OpenAI-compatible embedding endpoint
If your vLLM server hosts an embedding model with OpenAI-compatible API

In [9]:
# from langchain_openai import OpenAIEmbeddings
# from ragas.embeddings import LangchainEmbeddingsWrapper
# import asyncio

# # Try each configuration until one works
# vllm_embeddings = OpenAIEmbeddings(
#         base_url="http://localhost:8001/v1",
#         api_key="asdf",
#         model='thenlper/gte-large',
#         tiktoken_enabled=False,  # Disable tiktoken for vLLM
#     )

# embedding_model = LangchainEmbeddingsWrapper(vllm_embeddings)

# async def test_embedding_model(vllm_embeddings: OpenAIEmbeddings):
#     """Async function to test the embedding model"""
    
#     # Test with a simple text first
#     print("testing query embedding...")
#     test_result = vllm_embeddings.embed_query("Risk management is crucial for financial institutions.")
#     print(f"query embedding dimensions: {len(test_result)}")

#     # Test with texts
#     print("testing text embedding...")
#     test_texts = [
#         "This is a test document about financial analysis.",
#         "Machine learning models are used in banking.",
#         "Risk management is crucial for financial institutions."
#     ]
#     test_results = vllm_embeddings.embed_documents(test_texts)
#     print(f"Text embedding dimensions: {len(test_results[0])} for {len(test_results)} texts")
    
#     # If successful, wrap for RAGAS and test it through the wrapper
#     embedding_model = LangchainEmbeddingsWrapper(vllm_embeddings)

#     print("Testing wrapped embedding model query ...")
#     embedding_result = await embedding_model.embed_query("Risk management is crucial for financial institutions.")
#     print(f"Wrapped query embedding dimensions: {len(embedding_result)}")

#     print("Testing wrapped embedding model text ...")
#     embedding_results = await embedding_model.embed_texts(test_texts, is_async=True)
#     print(f"Wrapped text embedding dimensions: {len(embedding_results[0])} for {len(embedding_results)} texts")

#     print(f"✅ Successfully configured vLLM embedding model")
#     return embedding_model

# # Run the async function
# try:
#     embedding_model = asyncio.run(test_embedding_model(vllm_embeddings))
#     print(f"🎉 Using vLLM embedding model successfully!")
# except Exception as e:
#     print(f"❌ Failed with: {str(e)[:100]}...")
#     embedding_model = None


# Option 2: local embedding model

In [10]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from ragas.embeddings import LangchainEmbeddingsWrapper

local_embeddings = HuggingFaceEmbeddings(
    model_name="WhereIsAI/UAE-Large-V1",
    model_kwargs={"device": "mps"} # Or "cuda" for GPU, "mps" for Mac 
)
local_embeddings = LangchainEmbeddingsWrapper(local_embeddings)

res = local_embeddings.embed_query("Who is this?")  # Test local embedding model
print(res)

res = local_embeddings.embed_text("Who is this?")
print(res)



  local_embeddings = HuggingFaceEmbeddings(


[-0.22206830978393555, 0.051233645528554916, -0.491749107837677, -0.4322299361228943, 0.5002784132957458, -0.2561419606208801, -0.22091081738471985, 0.11627580225467682, 0.4728822112083435, 0.08462154865264893, 0.3242437243461609, -0.19702470302581787, -0.36262261867523193, -0.2812041640281677, -0.36116212606430054, 0.08653219789266586, 0.08922451734542847, -0.16965575516223907, -0.39888814091682434, -0.2973982095718384, 0.6447439789772034, 0.7650296092033386, -0.6067692637443542, -0.3339347541332245, 0.3801654875278473, -0.3341101408004761, -0.16917163133621216, -0.010672621428966522, 0.13589155673980713, 0.7128159999847412, -0.6217780113220215, -0.042145416140556335, -0.07324082404375076, -1.0624228715896606, -0.01020888052880764, -0.7508808374404907, 0.5139471292495728, -0.9261746406555176, -0.4445686340332031, -0.7835230827331543, -0.4164794087409973, 0.0833824947476387, 0.30117857456207275, -0.637333333492279, -0.7911930084228516, 0.09582684934139252, -0.3917580246925354, 0.086835

# Default synthetic data generation

In [11]:
from ragas.testset.transforms import (
    default_transforms, 
    apply_transforms, 
    EmbeddingExtractor, 
    SummaryExtractor, 
    TitleExtractor,
    HeadlinesExtractor,
    HeadlineSplitter,
    KeyphrasesExtractor,
    HeadlineSplitter,
    OverlapScoreBuilder,
)
from ragas.testset.graph import KnowledgeGraph
from ragas.testset.graph import Node, NodeType


# initialize your knowledge graph
kg = KnowledgeGraph()

for chunk in all_processed_chunks:
    kg.nodes.append(
        Node(
            type=NodeType.DOCUMENT,
            properties={"page_content": chunk.page_content, "metadata": chunk.metadata},
        )
    )
print(kg)

KnowledgeGraph(nodes: 13, relationships: 0)


In [12]:
# headline extractor
headline_extractor = HeadlinesExtractor(llm=llm)
apply_transforms(kg, headline_extractor)

Applying HeadlinesExtractor:   0%|          | 0/13 [00:00<?, ?it/s]

In [13]:
# get all the default transforms as well
print("Before")
print(kg)
trans = default_transforms(documents=docs, llm=llm, embedding_model=local_embeddings)
for tran in trans:
    print(f"Applying transform: {tran}")
    apply_transforms(kg, tran)
print("After")
print(kg)

Before
KnowledgeGraph(nodes: 13, relationships: 0)
Applying transform: HeadlinesExtractor(name='HeadlinesExtractor', filter_nodes=<function default_transforms.<locals>.<lambda> at 0x1544a68e0>, llm=LangchainLLMWrapper(langchain_llm=AzureChatOpenAI(...)), merge_if_possible=True, max_token_limit=32000, tokenizer=<Encoding 'o200k_base'>, property_name='headlines', prompt=HeadlinesExtractorPrompt(instruction=Extract the most important max_num headlines from the given text that can be used to split the text into independent sections.Focus on Level 2 and Level 3 headings., examples=[(TextWithExtractionLimit(text='                Introduction\n                Overview of the topic...\n\n                Main Concepts\n                Explanation of core ideas...\n\n                Detailed Analysis\n                Techniques and methods for analysis...\n\n                Subsection: Specialized Techniques\n                Further details on specialized techniques...\n\n                Future 

Applying HeadlinesExtractor:   0%|          | 0/12 [00:00<?, ?it/s]

Property 'headlines' already exists in node '73c6ae'. Skipping!
Property 'headlines' already exists in node '2e48c4'. Skipping!
Property 'headlines' already exists in node '346385'. Skipping!
Property 'headlines' already exists in node '14ef3e'. Skipping!
Property 'headlines' already exists in node '8aba8e'. Skipping!
Property 'headlines' already exists in node '509c2a'. Skipping!
Property 'headlines' already exists in node '72f538'. Skipping!
Property 'headlines' already exists in node '8c5063'. Skipping!
Property 'headlines' already exists in node '40ac1d'. Skipping!
Property 'headlines' already exists in node '5f75a7'. Skipping!
Property 'headlines' already exists in node 'dcca5e'. Skipping!
Property 'headlines' already exists in node '8111fd'. Skipping!


Applying transform: HeadlineSplitter(name='HeadlineSplitter', filter_nodes=<function default_filter at 0x30cb2aca0>, min_tokens=500, max_tokens=1000)


Applying HeadlineSplitter:   0%|          | 0/13 [00:00<?, ?it/s]

Applying transform: SummaryExtractor(name='SummaryExtractor', filter_nodes=<function default_transforms.<locals>.<lambda> at 0x121443e20>, llm=LangchainLLMWrapper(langchain_llm=AzureChatOpenAI(...)), merge_if_possible=True, max_token_limit=32000, tokenizer=<Encoding 'o200k_base'>, property_name='summary', prompt=SummaryExtractorPrompt(instruction=Summarize the given text in less than 10 sentences., examples=[(StringIO(text='Artificial intelligence\n\nArtificial intelligence is transforming various industries by automating tasks that previously required human intelligence. From healthcare to finance, AI is being used to analyze vast amounts of data quickly and accurately. This technology is also driving innovations in areas like self-driving cars and personalized recommendations.'), StringIO(text='AI is revolutionizing industries by automating tasks, analyzing data, and driving innovations like self-driving cars and personalized recommendations.'))], language=english))


Applying SummaryExtractor:   0%|          | 0/12 [00:00<?, ?it/s]

Applying transform: CustomNodeFilter(name='CustomNodeFilter', filter_nodes=<function default_transforms.<locals>.<lambda> at 0x30cb8aac0>, llm=LangchainLLMWrapper(langchain_llm=AzureChatOpenAI(...)), scoring_prompt=QuestionPotentialPrompt(instruction=Given a document summary and node content, score the content of the node in 1 to 5 range., examples=[], language=english), min_score=2, rubrics={'score1_description': 'The page content is irrelevant or does not align with the main themes or topics of the document summary.', 'score2_description': "The page content partially aligns with the document summary, but it includes unrelated details or lacks critical information related to the document's main themes.", 'score3_description': 'The page content generally reflects the document summary but may miss key details or lack depth in addressing the main themes.', 'score4_description': 'The page content aligns well with the document summary, covering the main themes and topics with minor gaps or

Applying CustomNodeFilter:   0%|          | 0/33 [00:00<?, ?it/s]

Applying transform: <ragas.testset.transforms.engine.Parallel object at 0x35786a810>


Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/72 [00:00<?, ?it/s]

Applying transform: <ragas.testset.transforms.engine.Parallel object at 0x3578c1460>


Applying [CosineSimilarityBuilder, OverlapScoreBuilder]:   0%|          | 0/2 [00:00<?, ?it/s]

After
KnowledgeGraph(nodes: 44, relationships: 124)


In [14]:
# get summary embeddings required by automatic persona generator
summary_embedding_extractor = EmbeddingExtractor(embedding_model=local_embeddings,
                                         property_name="summary_embedding",
                                         embed_property_name="summary")
apply_transforms(kg, summary_embedding_extractor)
print("Embedding extraction complete")


Applying EmbeddingExtractor:   0%|          | 0/44 [00:00<?, ?it/s]

unable to apply transformation: node.property('summary') must be a string, found '<class 'NoneType'>'
unable to apply transformation: node.property('summary') must be a string, found '<class 'NoneType'>'
unable to apply transformation: node.property('summary') must be a string, found '<class 'NoneType'>'
unable to apply transformation: node.property('summary') must be a string, found '<class 'NoneType'>'
unable to apply transformation: node.property('summary') must be a string, found '<class 'NoneType'>'
unable to apply transformation: node.property('summary') must be a string, found '<class 'NoneType'>'
unable to apply transformation: node.property('summary') must be a string, found '<class 'NoneType'>'
unable to apply transformation: node.property('summary') must be a string, found '<class 'NoneType'>'
unable to apply transformation: node.property('summary') must be a string, found '<class 'NoneType'>'
unable to apply transformation: node.property('summary') must be a string, found '

Embedding extraction complete


In [15]:
# get regular embeddings
regular_embedding_extractor = EmbeddingExtractor(
    embedding_model=local_embeddings,
)
apply_transforms(kg, regular_embedding_extractor)

Applying EmbeddingExtractor:   0%|          | 0/44 [00:00<?, ?it/s]

Property 'embedding' already exists in node '5c5bdc'. Skipping!


In [16]:
# headline extractor
headline_extractor = HeadlinesExtractor(llm=llm)
apply_transforms(kg, headline_extractor)

Applying HeadlinesExtractor:   0%|          | 0/44 [00:00<?, ?it/s]

Property 'headlines' already exists in node '8aba8e'. Skipping!
Property 'headlines' already exists in node '8c5063'. Skipping!
Property 'headlines' already exists in node '5c5bdc'. Skipping!
Property 'headlines' already exists in node '73c6ae'. Skipping!
Property 'headlines' already exists in node '346385'. Skipping!
Property 'headlines' already exists in node '2e48c4'. Skipping!
Property 'headlines' already exists in node '14ef3e'. Skipping!
Property 'headlines' already exists in node '509c2a'. Skipping!
Property 'headlines' already exists in node '8111fd'. Skipping!
Property 'headlines' already exists in node 'dcca5e'. Skipping!
Property 'headlines' already exists in node '40ac1d'. Skipping!
Property 'headlines' already exists in node '72f538'. Skipping!
Property 'headlines' already exists in node '5f75a7'. Skipping!
Property 'headlines' already exists in node '5c5bdc'. Skipping!


In [17]:
# get all the default transforms as well
print("Before")
print(kg)
trans = default_transforms(documents=docs, llm=llm, embedding_model=local_embeddings)
apply_transforms(kg, trans)
print("After")
print(kg)

Before
KnowledgeGraph(nodes: 44, relationships: 124)


Applying HeadlinesExtractor:   0%|          | 0/12 [00:00<?, ?it/s]

Property 'headlines' already exists in node '2e48c4'. Skipping!
Property 'headlines' already exists in node '73c6ae'. Skipping!
Property 'headlines' already exists in node '72f538'. Skipping!
Property 'headlines' already exists in node '8111fd'. Skipping!
Property 'headlines' already exists in node '8aba8e'. Skipping!
Property 'headlines' already exists in node '8c5063'. Skipping!
Property 'headlines' already exists in node '346385'. Skipping!
Property 'headlines' already exists in node '509c2a'. Skipping!
Property 'headlines' already exists in node '14ef3e'. Skipping!
Property 'headlines' already exists in node 'dcca5e'. Skipping!
Property 'headlines' already exists in node '40ac1d'. Skipping!
Property 'headlines' already exists in node '5f75a7'. Skipping!


Applying HeadlineSplitter:   0%|          | 0/44 [00:00<?, ?it/s]

Applying SummaryExtractor:   0%|          | 0/12 [00:00<?, ?it/s]

Property 'summary' already exists in node '2e48c4'. Skipping!
Property 'summary' already exists in node '73c6ae'. Skipping!
Property 'summary' already exists in node '346385'. Skipping!
Property 'summary' already exists in node '8aba8e'. Skipping!
Property 'summary' already exists in node '14ef3e'. Skipping!
Property 'summary' already exists in node '72f538'. Skipping!
Property 'summary' already exists in node '8c5063'. Skipping!
Property 'summary' already exists in node '8111fd'. Skipping!
Property 'summary' already exists in node 'dcca5e'. Skipping!
Property 'summary' already exists in node '40ac1d'. Skipping!
Property 'summary' already exists in node '5f75a7'. Skipping!
Property 'summary' already exists in node '509c2a'. Skipping!


Applying CustomNodeFilter:   0%|          | 0/104 [00:00<?, ?it/s]

Node 61fc50e7-2b18-4cf9-9558-c02fd3cd5bc4 does not have a summary. Skipping filtering.
Node 10d3e1fd-d012-4786-b02d-64589165eee7 does not have a summary. Skipping filtering.
Node 0f900e93-e8db-4f6e-a862-eaa7b3f555c1 does not have a summary. Skipping filtering.
Node bb7400c0-e285-46fb-8e11-182f54c0e16b does not have a summary. Skipping filtering.
Node 4a13f5a9-58cc-4f6b-9b73-0ccbf53e20d8 does not have a summary. Skipping filtering.
Node f718a707-5af2-4563-b623-5a8a9bbf877c does not have a summary. Skipping filtering.
Node 8ceb6a22-03a8-4a97-845c-e9cc0f6a76c6 does not have a summary. Skipping filtering.
Node 58657099-10ee-4fbb-bd68-34712b4595b0 does not have a summary. Skipping filtering.
Node 1b0322cd-8a3a-4587-8a29-72cb9c577631 does not have a summary. Skipping filtering.
Node 3ab3ed67-8ae8-4ff6-a9f6-f395073f7e0e does not have a summary. Skipping filtering.
Node ad0946d6-f8be-4526-bc7d-b7debb663564 does not have a summary. Skipping filtering.
Node 8060a35a-fbf8-4c69-8a72-8975cca6d8bb d

Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/216 [00:00<?, ?it/s]

Property 'summary_embedding' already exists in node '8c5063'. Skipping!
Property 'summary_embedding' already exists in node '40ac1d'. Skipping!
Property 'summary_embedding' already exists in node '2e48c4'. Skipping!
Property 'summary_embedding' already exists in node '346385'. Skipping!
Property 'summary_embedding' already exists in node '8aba8e'. Skipping!
Property 'summary_embedding' already exists in node '73c6ae'. Skipping!
Property 'summary_embedding' already exists in node '72f538'. Skipping!
Property 'summary_embedding' already exists in node '5f75a7'. Skipping!
Property 'summary_embedding' already exists in node '509c2a'. Skipping!
Property 'summary_embedding' already exists in node 'dcca5e'. Skipping!
Property 'summary_embedding' already exists in node '8111fd'. Skipping!
Property 'summary_embedding' already exists in node '14ef3e'. Skipping!
Property 'themes' already exists in node 'd1feee'. Skipping!
Property 'themes' already exists in node 'cd5862'. Skipping!
Property 'them

Applying [CosineSimilarityBuilder, OverlapScoreBuilder]:   0%|          | 0/2 [00:00<?, ?it/s]

After
KnowledgeGraph(nodes: 118, relationships: 675)


In [18]:
# Add this debugging code to see what relationships exist
print("=== Knowledge Graph Debug Info ===")
print(f"Total nodes: {len(kg.nodes)}")
print(f"Total relationships: {len(kg.relationships)}")

# Check relationship types
rel_types = set()
for rel in kg.relationships:
    rel_types.add(rel.type)
    
print(f"Relationship types found: {rel_types}")

=== Knowledge Graph Debug Info ===
Total nodes: 118
Total relationships: 675
Relationship types found: {'child', 'entities_overlap', 'cosine_similarity', 'next'}


In [19]:
print(kg)

KnowledgeGraph(nodes: 118, relationships: 675)


In [20]:
from ragas.testset import TestsetGenerator
from ragas.testset.synthesizers import default_query_distribution

generator = TestsetGenerator(llm=llm, embedding_model=local_embeddings, knowledge_graph=kg)
query_distribution = default_query_distribution(llm)
testset = generator.generate(testset_size=10, query_distribution=query_distribution)
testset_pd = testset.to_pandas()
testset_pd.to_json('barclays_synthetic_multihop.json', orient='records', indent=2)

Generating personas:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Scenarios:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Samples:   0%|          | 0/12 [00:00<?, ?it/s]

### Custom persona 

In [21]:
from ragas.testset.graph import KnowledgeGraph
from ragas.testset.graph import Node, NodeType


kg = KnowledgeGraph()
for doc in docs:
    kg.nodes.append(
        Node(
            type=NodeType.DOCUMENT,
            properties={
                "page_content": doc.page_content,
                "document_metadata": doc.metadata,
            },
        )
    )

In [22]:
from ragas.testset.transforms import Parallel, apply_transforms
from ragas.testset.transforms import (
    HeadlinesExtractor,
    HeadlineSplitter,
    KeyphrasesExtractor,
    OverlapScoreBuilder,
)


headline_extractor = HeadlinesExtractor(llm=llm)
headline_splitter = HeadlineSplitter(min_tokens=300, max_tokens=1000)
keyphrase_extractor = KeyphrasesExtractor(
    llm=llm, property_name="keyphrases", max_num=10
)
relation_builder = OverlapScoreBuilder(
    property_name="keyphrases",
    new_property_name="overlap_score",
    threshold=0.01,
    distance_threshold=0.9,
)

transforms = [
    headline_extractor,
    headline_splitter,
    keyphrase_extractor,
    relation_builder,
]

apply_transforms(kg, transforms=transforms)

Applying HeadlinesExtractor:   0%|          | 0/1 [00:00<?, ?it/s]

Applying HeadlineSplitter:   0%|          | 0/1 [00:00<?, ?it/s]

Applying KeyphrasesExtractor:   0%|          | 0/23 [00:00<?, ?it/s]

Applying OverlapScoreBuilder:   0%|          | 0/1 [00:00<?, ?it/s]

In [23]:
# Let's generate using custom personas
from ragas.testset.persona import Persona

person1 = Persona(
    name="Banking Executive",
    role_description="Explore AI and data strategies to modernize financial services and unlock new revenue streams. Asks only one question at a time.",
)
persona2 = Persona(
    name="FinTech Founder",
    role_description="Leverage AI and data innovation to build competitive, regulation-aware financial products. Asks only one question at a time.",
)
persona_list = [person1, persona2]

In [24]:
from dataclasses import dataclass
import typing as t
from ragas.testset.synthesizers.multi_hop.base import (
    MultiHopQuerySynthesizer,
    MultiHopScenario,
)
from ragas.testset.synthesizers.prompts import (
    ThemesPersonasInput,
    ThemesPersonasMatchingPrompt,
)


@dataclass
class MyMultiHopQuery(MultiHopQuerySynthesizer):

    theme_persona_matching_prompt = ThemesPersonasMatchingPrompt()

    async def _generate_scenarios(
        self,
        n: int,
        knowledge_graph,
        persona_list,
        callbacks,
    ) -> t.List[MultiHopScenario]:

        # query and get (node_a, rel, node_b) to create multi-hop queries
        results = kg.find_two_nodes_single_rel(
            relationship_condition=lambda rel: (
                True if rel.type == "keyphrases_overlap" else False
            )
        )

        num_sample_per_triplet = max(1, n // len(results))

        scenarios = []
        for triplet in results:
            if len(scenarios) < n:
                node_a, node_b = triplet[0], triplet[-1]
                overlapped_keywords = triplet[1].properties["overlapped_items"]
                if overlapped_keywords:

                    # match the keyword with a persona for query creation
                    themes = list(dict(overlapped_keywords).keys())
                    prompt_input = ThemesPersonasInput(
                        themes=themes, personas=persona_list
                    )
                    persona_concepts = (
                        await self.theme_persona_matching_prompt.generate(
                            data=prompt_input, llm=self.llm, callbacks=callbacks
                        )
                    )

                    overlapped_keywords = [list(item) for item in overlapped_keywords]

                    # prepare and sample possible combinations
                    base_scenarios = self.prepare_combinations(
                        [node_a, node_b],
                        overlapped_keywords,
                        personas=persona_list,
                        persona_item_mapping=persona_concepts.mapping,
                        property_name="keyphrases",
                    )

                    # get number of required samples from this triplet
                    base_scenarios = self.sample_diverse_combinations(
                        base_scenarios, num_sample_per_triplet
                    )

                    scenarios.extend(base_scenarios)

        return scenarios

query = MyMultiHopQuery(llm=llm)
scenarios = await query.generate_scenarios(
    n=200, knowledge_graph=kg, persona_list=persona_list
)

scenarios[4]

MultiHopScenario(
nodes=2
combinations=['artificial intelligence', 'Artificial Intelligence technologies']
style=QueryStyle.PERFECT_GRAMMAR
length=QueryLength.LONG
persona=name='FinTech Founder' role_description='Leverage AI and data innovation to build competitive, regulation-aware financial products. Asks only one question at a time.')

In [26]:
result = await query.generate_sample(scenario=scenarios[1])
print(result.user_input)
print(result.reference)
print(result.reference_contexts)


How does the concept of trust in AI and data sharing influence the relationship between consumers and financial institutions?
Trust is a fundamental principle in banking, influencing how consumers perceive the use of their data by financial institutions. As individuals engage with various online services, they often relinquish control over their personal data, relying on trust that it will be handled respectfully and appropriately. This trust is crucial, especially when considering the sensitive nature of the data involved, such as personal preferences and financial information. The emergence of transparent data-sharing protocols could empower consumers to manage their data more effectively, potentially increasing their trust in financial institutions. By allowing consumers to set their own data-sharing policies and review how their data is used, banks can foster a more trusting relationship, which is essential for the successful implementation of AI technologies in financial services.

In [29]:
for scenario in scenarios:
    result = await query.generate_sample(scenario=scenario)
    print(f"Scenario: {result.user_input}")
    print("-" * 80)

Scenario: How has the Open Banking model influenced the commercialisation of data in financial services, and what opportunities does it present for startups?
--------------------------------------------------------------------------------
Scenario: How does the concept of trust in AI and data sharing influence the relationship between consumers and financial institutions?
--------------------------------------------------------------------------------
Scenario: How can increasing trust in AI and data sharing enhance consumer relationships in financial services?
--------------------------------------------------------------------------------
Scenario: What role does the Open Banking model play in enhancing user control and consent over personal data in financial services?
--------------------------------------------------------------------------------
Scenario: How do artificial intelligence technologies enhance the customer experience in financial services, and what role does data comm

CancelledError: 