In [1]:
import os
from langchain.text_splitter import RecursiveCharacterTextSplitter, MarkdownHeaderTextSplitter
from langchain_community.document_loaders import DirectoryLoader, TextLoader

path = "/Users/debadeepta.dey/datasets/barclays"
loader = DirectoryLoader(path, glob="**/*.md")
docs = loader.load()

In [2]:
# Method 1: Split by Markdown Headers (Most intelligent for markdown)
# This preserves the document structure and creates logical chunks

def split_markdown_by_headers(document_content):
    """
    Split markdown document by headers, preserving document structure
    """
    # Define headers to split on (from h1 to h3)
    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"), 
        ("###", "Header 3"),
    ]
    
    # Create the markdown header text splitter
    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=headers_to_split_on,
        strip_headers=False  # Keep headers in the chunks
    )
    
    # Split the document
    md_header_splits = markdown_splitter.split_text(document_content)
    
    return md_header_splits

# # Example usage with your loaded documents
# if docs:
#     # Take the first document as example
#     first_doc = docs[0]
#     header_splits = split_markdown_by_headers(first_doc.page_content)
    
#     print(f"Original document split into {len(header_splits)} chunks based on headers")
    
#     # Display first few chunks
#     for i, chunk in enumerate(header_splits[:3]):
#         print(f"\n--- Chunk {i+1} ---")
#         print(f"Content: {chunk.page_content[:200]}...")
#         print(f"Metadata: {chunk.metadata}")
#         print(f"Full length: {len(chunk.page_content)} characters")

In [3]:
# Method 2: Recursive Character Text Splitter (Good fallback)
# This method is useful when documents don't have clear header structure

def split_markdown_recursive(document_content, chunk_size=1000, chunk_overlap=200):
    """
    Split markdown using recursive character splitter with markdown-aware separators
    """
    # Define separators that work well for markdown
    markdown_separators = [
        "\n\n",  # Double newline (paragraph breaks)
        "\n",    # Single newline
        " ",     # Space
        ""       # Character level
    ]
    
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=markdown_separators,
        length_function=len,
    )
    
    # Split the document
    chunks = text_splitter.split_text(document_content)
    
    return chunks

# # Example usage
# if docs:
#     first_doc = docs[0]
#     recursive_chunks = split_markdown_recursive(
#         first_doc.page_content, 
#         chunk_size=2048,  # Adjust based on your needs
#         chunk_overlap=200
#     )
    
#     print(f"\nRecursive splitting created {len(recursive_chunks)} chunks")
    
#     # Display first few chunks
#     for i, chunk in enumerate(recursive_chunks[:3]):
#         print(f"\n--- Recursive Chunk {i+1} ---")
#         print(f"Content: {chunk[:200]}...")
#         print(f"Length: {len(chunk)} characters")

In [4]:
# Method 3: Hybrid Approach (Recommended)
# Combine header-based splitting with recursive splitting for optimal results

def smart_markdown_split(document_content, max_chunk_size=1500, chunk_overlap=200):
    """
    Smart markdown splitting that combines header-based and recursive approaches
    """
    from langchain.schema import Document
    
    # First, try to split by headers
    headers_to_split_on = [
        ("#", "Header 1"),
        ("##", "Header 2"), 
        ("###", "Header 3"),
        ("####", "Header 4"),
    ]
    
    markdown_splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=headers_to_split_on,
        strip_headers=False
    )
    
    try:
        # Split by headers first
        header_splits = markdown_splitter.split_text(document_content)
        
        # If header splits are too large, further split them recursively
        final_chunks = []
        
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=max_chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", " ", ""],
            length_function=len,
        )
        
        for doc in header_splits:
            if len(doc.page_content) > max_chunk_size:
                # Split large chunks further
                sub_chunks = text_splitter.split_text(doc.page_content)
                for i, sub_chunk in enumerate(sub_chunks):
                    # Preserve metadata from header splitting
                    new_metadata = doc.metadata.copy()
                    new_metadata['sub_chunk'] = i
                    final_chunks.append(Document(
                        page_content=sub_chunk,
                        metadata=new_metadata
                    ))
            else:
                final_chunks.append(doc)
                
        return final_chunks
        
    except Exception as e:
        print(f"Header splitting failed: {e}")
        # Fallback to recursive splitting
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=max_chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", " ", ""],
        )
        chunks = text_splitter.split_text(document_content)
        return [Document(page_content=chunk, metadata={}) for chunk in chunks]

# # Example usage with the hybrid approach
# if docs:
#     first_doc = docs[0]
#     smart_chunks = smart_markdown_split(
#         first_doc.page_content,
#         max_chunk_size=1200,
#         chunk_overlap=150
#     )
    
#     print(f"\nSmart splitting created {len(smart_chunks)} chunks")
    
#     # Display statistics
#     chunk_lengths = [len(chunk.page_content) for chunk in smart_chunks]
#     print(f"Average chunk length: {sum(chunk_lengths) / len(chunk_lengths):.0f} characters")
#     print(f"Min chunk length: {min(chunk_lengths)} characters")
#     print(f"Max chunk length: {max(chunk_lengths)} characters")
    
#     # Display first few chunks with metadata
#     for i, chunk in enumerate(smart_chunks[:3]):
#         print(f"\n--- Smart Chunk {i+1} ---")
#         print(f"Metadata: {chunk.metadata}")
#         print(f"Content preview: {chunk.page_content[:200]}...")
#         print(f"Length: {len(chunk.page_content)} characters")

In [5]:
from langchain.schema import Document

# Utility function to process all your documents
def process_all_documents(docs, output_method='smart', **kwargs):
    """
    Process all loaded documents and return chunks
    
    Args:
        docs: List of loaded documents
        output_method: 'header', 'recursive', or 'smart'
        **kwargs: Additional parameters for the splitting methods
    
    Returns:
        List of all chunks with source document information
    """
    all_chunks = []
    
    for doc_idx, doc in enumerate(docs):
        print(f"Processing document {doc_idx + 1}/{len(docs)}: {doc.metadata.get('source', 'unknown')}")
        
        if output_method == 'header':
            chunks = split_markdown_by_headers(doc.page_content)
        elif output_method == 'recursive':
            chunk_texts = split_markdown_recursive(doc.page_content, **kwargs)
            chunks = [Document(page_content=text, metadata=doc.metadata.copy()) for text in chunk_texts]
        elif output_method == 'smart':
            chunks = smart_markdown_split(doc.page_content, **kwargs)
        else:
            raise ValueError("output_method must be 'header', 'recursive', or 'smart'")
        
        # Add source document information to each chunk
        for chunk_idx, chunk in enumerate(chunks):
            chunk.metadata['source_doc_index'] = doc_idx
            chunk.metadata['chunk_index'] = chunk_idx
            chunk.metadata['original_source'] = doc.metadata.get('source', 'unknown')
            all_chunks.append(chunk)
    
    return all_chunks

# Process all your documents using the smart method
all_processed_chunks = process_all_documents(
    docs, 
    output_method='smart',  # Change to 'header' or 'recursive' if preferred
    max_chunk_size=10000,
    chunk_overlap=0 # deliberately set to 0
)

print(f"\nTotal chunks created from all documents: {len(all_processed_chunks)}")

# Show summary statistics
if all_processed_chunks:
    chunk_lengths = [len(chunk.page_content) for chunk in all_processed_chunks]
    print(f"Average chunk length: {sum(chunk_lengths) / len(chunk_lengths):.0f} characters")
    print(f"Chunk length range: {min(chunk_lengths)} - {max(chunk_lengths)} characters")
    
    # Show distribution by source document
    source_counts = {}
    for chunk in all_processed_chunks:
        source = chunk.metadata.get('original_source', 'unknown')
        source_counts[source] = source_counts.get(source, 0) + 1
    
    print(f"\nChunks per source document:")
    for source, count in source_counts.items():
        print(f"  {source}: {count} chunks")

Processing document 1/1: /Users/debadeepta.dey/datasets/barclays/rise-insights-report-making-data-count-with-ai-DIGITAL.md

Total chunks created from all documents: 13
Average chunk length: 9404 characters
Chunk length range: 2316 - 10000 characters

Chunks per source document:
  /Users/debadeepta.dey/datasets/barclays/rise-insights-report-making-data-count-with-ai-DIGITAL.md: 13 chunks


## VLLM Configuration for RAGAS

The configuration above connects RAGAS to your vLLM server. Here are some key points:

1. **Base URL**: `http://localhost:8003/v1` - your vLLM endpoint
2. **API Key**: Set to "not-needed" since vLLM typically doesn't require authentication
3. **Model Name**: Replace `"your-model-name"` with the actual model you're serving
4. **Temperature**: Controls randomness (0.1 is relatively deterministic)
5. **Max Tokens**: Maximum response length

In [6]:
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings.base import embedding_factory
from langchain_openai import ChatOpenAI

# Configure vLLM hosted LLM
vllm_llm = ChatOpenAI(
    base_url="http://localhost:8002/v1",
    api_key="asdf",
    model="deepseek-ai/DeepSeek-R1-Distill-Llama-70B",  # Replace with your actual model name
    temperature=0.0,
    max_tokens=32768,
)

# Wrap for RAGAS
llm = LangchainLLMWrapper(vllm_llm)


In [7]:
# Test the vLLM connection
print("Testing vLLM connection...")

try:
    # Test the LLM directly
    test_response = vllm_llm.invoke("Hello, this is a test. Please respond briefly.")
    print(f"✅ vLLM connection successful!")
    print(f"Response: {test_response.content}")
    
    # Test with RAGAS wrapper
    from ragas.llms.base import BaseRagasLLM
    if isinstance(llm, BaseRagasLLM):
        print("✅ RAGAS LLM wrapper configured correctly")
    else:
        print("⚠️  RAGAS LLM wrapper might need adjustment")
        
except Exception as e:
    print(f"❌ Error connecting to vLLM: {e}")
    print("Please check:")
    print("1. vLLM server is running")
    print("2. Model name is correct")
    print("3. No firewall blocking the connection")

Testing vLLM connection...
✅ vLLM connection successful!
Response: 

Hello! Got your test message.
✅ RAGAS LLM wrapper configured correctly


# vLLM Hosted Embedding Model Configuration

Here's how to configure a vLLM hosted embedding model for use with RAGAS:

## Option 1: Using OpenAI-compatible embedding endpoint
If your vLLM server hosts an embedding model with OpenAI-compatible API

In [8]:
# from langchain_openai import OpenAIEmbeddings
# from ragas.embeddings import LangchainEmbeddingsWrapper
# import asyncio

# # Try each configuration until one works
# vllm_embeddings = OpenAIEmbeddings(
#         base_url="http://localhost:8001/v1",
#         api_key="asdf",
#         model='thenlper/gte-large',
#         tiktoken_enabled=False,  # Disable tiktoken for vLLM
#     )

# embedding_model = LangchainEmbeddingsWrapper(vllm_embeddings)

# async def test_embedding_model(vllm_embeddings: OpenAIEmbeddings):
#     """Async function to test the embedding model"""
    
#     # Test with a simple text first
#     print("testing query embedding...")
#     test_result = vllm_embeddings.embed_query("Risk management is crucial for financial institutions.")
#     print(f"query embedding dimensions: {len(test_result)}")

#     # Test with texts
#     print("testing text embedding...")
#     test_texts = [
#         "This is a test document about financial analysis.",
#         "Machine learning models are used in banking.",
#         "Risk management is crucial for financial institutions."
#     ]
#     test_results = vllm_embeddings.embed_documents(test_texts)
#     print(f"Text embedding dimensions: {len(test_results[0])} for {len(test_results)} texts")
    
#     # If successful, wrap for RAGAS and test it through the wrapper
#     embedding_model = LangchainEmbeddingsWrapper(vllm_embeddings)

#     print("Testing wrapped embedding model query ...")
#     embedding_result = await embedding_model.embed_query("Risk management is crucial for financial institutions.")
#     print(f"Wrapped query embedding dimensions: {len(embedding_result)}")

#     print("Testing wrapped embedding model text ...")
#     embedding_results = await embedding_model.embed_texts(test_texts, is_async=True)
#     print(f"Wrapped text embedding dimensions: {len(embedding_results[0])} for {len(embedding_results)} texts")

#     print(f"✅ Successfully configured vLLM embedding model")
#     return embedding_model

# # Run the async function
# try:
#     embedding_model = asyncio.run(test_embedding_model(vllm_embeddings))
#     print(f"🎉 Using vLLM embedding model successfully!")
# except Exception as e:
#     print(f"❌ Failed with: {str(e)[:100]}...")
#     embedding_model = None


# Option 2: local embedding model

In [9]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from ragas.embeddings import LangchainEmbeddingsWrapper

local_embeddings = HuggingFaceEmbeddings(
    model_name="WhereIsAI/UAE-Large-V1",
    model_kwargs={"device": "mps"} # Or "cuda" for GPU, "mps" for Mac 
)
local_embeddings = LangchainEmbeddingsWrapper(local_embeddings)

res = local_embeddings.embed_query("Who is this?")  # Test local embedding model
print(res)

res = local_embeddings.embed_text("Who is this?")
print(res)



  local_embeddings = HuggingFaceEmbeddings(


[-0.22206830978393555, 0.051233645528554916, -0.491749107837677, -0.4322299361228943, 0.5002784132957458, -0.2561419606208801, -0.22091081738471985, 0.11627580225467682, 0.4728822112083435, 0.08462154865264893, 0.3242437243461609, -0.19702470302581787, -0.36262261867523193, -0.2812041640281677, -0.36116212606430054, 0.08653219789266586, 0.08922451734542847, -0.16965575516223907, -0.39888814091682434, -0.2973982095718384, 0.6447439789772034, 0.7650296092033386, -0.6067692637443542, -0.3339347541332245, 0.3801654875278473, -0.3341101408004761, -0.16917163133621216, -0.010672621428966522, 0.13589155673980713, 0.7128159999847412, -0.6217780113220215, -0.042145416140556335, -0.07324082404375076, -1.0624228715896606, -0.01020888052880764, -0.7508808374404907, 0.5139471292495728, -0.9261746406555176, -0.4445686340332031, -0.7835230827331543, -0.4164794087409973, 0.0833824947476387, 0.30117857456207275, -0.637333333492279, -0.7911930084228516, 0.09582684934139252, -0.3917580246925354, 0.086835

# Default synthetic data generation

In [10]:
from ragas.testset.transforms import (
    default_transforms, 
    apply_transforms, 
    EmbeddingExtractor, 
    SummaryExtractor, 
    TitleExtractor,
    HeadlinesExtractor,
    HeadlineSplitter,
    KeyphrasesExtractor,
    HeadlineSplitter,
    OverlapScoreBuilder,
)
from ragas.testset.graph import KnowledgeGraph
from ragas.testset.graph import Node, NodeType


# initialize your knowledge graph
kg = KnowledgeGraph()

for chunk in all_processed_chunks:
    kg.nodes.append(
        Node(
            type=NodeType.DOCUMENT,
            properties={"page_content": chunk.page_content, "metadata": chunk.metadata},
        )
    )
print(kg)

KnowledgeGraph(nodes: 13, relationships: 0)


In [12]:
# headline extractor
headline_extractor = HeadlinesExtractor(llm=llm)
apply_transforms(kg, headline_extractor)

Applying HeadlinesExtractor:   0%|          | 0/13 [00:00<?, ?it/s]

Property 'headlines' already exists in node '7c0fe5'. Skipping!
Property 'headlines' already exists in node '0dc2e0'. Skipping!
Property 'headlines' already exists in node 'fde44f'. Skipping!
Property 'headlines' already exists in node 'a0d61a'. Skipping!
Property 'headlines' already exists in node '469e64'. Skipping!
Property 'headlines' already exists in node 'bd6a12'. Skipping!
Property 'headlines' already exists in node 'c5e01c'. Skipping!
Property 'headlines' already exists in node '25fd44'. Skipping!
Property 'headlines' already exists in node 'bfeff3'. Skipping!
Property 'headlines' already exists in node 'ef0551'. Skipping!
Property 'headlines' already exists in node '1019b3'. Skipping!
Property 'headlines' already exists in node '6148ee'. Skipping!
Property 'headlines' already exists in node '0239dd'. Skipping!


In [13]:
# get all the default transforms as well
print("Before")
print(kg)
trans = default_transforms(documents=docs, llm=llm, embedding_model=local_embeddings)
for tran in trans:
    print(f"Applying transform: {tran}")
    apply_transforms(kg, tran)
print("After")
print(kg)

Before
KnowledgeGraph(nodes: 13, relationships: 0)
Applying transform: HeadlinesExtractor(name='HeadlinesExtractor', filter_nodes=<function default_transforms.<locals>.<lambda> at 0x35075b100>, llm=LangchainLLMWrapper(langchain_llm=ChatOpenAI(...)), merge_if_possible=True, max_token_limit=32000, tokenizer=<Encoding 'o200k_base'>, property_name='headlines', prompt=HeadlinesExtractorPrompt(instruction=Extract the most important max_num headlines from the given text that can be used to split the text into independent sections.Focus on Level 2 and Level 3 headings., examples=[(TextWithExtractionLimit(text='                Introduction\n                Overview of the topic...\n\n                Main Concepts\n                Explanation of core ideas...\n\n                Detailed Analysis\n                Techniques and methods for analysis...\n\n                Subsection: Specialized Techniques\n                Further details on specialized techniques...\n\n                Future Direc

Applying HeadlinesExtractor:   0%|          | 0/12 [00:00<?, ?it/s]

Property 'headlines' already exists in node '25fd44'. Skipping!
Property 'headlines' already exists in node 'bd6a12'. Skipping!
Property 'headlines' already exists in node 'a0d61a'. Skipping!
Property 'headlines' already exists in node '1019b3'. Skipping!
Property 'headlines' already exists in node '0239dd'. Skipping!
Property 'headlines' already exists in node '7c0fe5'. Skipping!
Property 'headlines' already exists in node '469e64'. Skipping!
Property 'headlines' already exists in node '0dc2e0'. Skipping!
Property 'headlines' already exists in node 'c5e01c'. Skipping!
Property 'headlines' already exists in node 'bfeff3'. Skipping!
Property 'headlines' already exists in node 'fde44f'. Skipping!
Property 'headlines' already exists in node '6148ee'. Skipping!


Applying transform: HeadlineSplitter(name='HeadlineSplitter', filter_nodes=<function default_filter at 0x3097f7380>, min_tokens=500, max_tokens=1000)


Applying HeadlineSplitter:   0%|          | 0/13 [00:00<?, ?it/s]

Applying transform: SummaryExtractor(name='SummaryExtractor', filter_nodes=<function default_transforms.<locals>.<lambda> at 0x1314ebe20>, llm=LangchainLLMWrapper(langchain_llm=ChatOpenAI(...)), merge_if_possible=True, max_token_limit=32000, tokenizer=<Encoding 'o200k_base'>, property_name='summary', prompt=SummaryExtractorPrompt(instruction=Summarize the given text in less than 10 sentences., examples=[(StringIO(text='Artificial intelligence\n\nArtificial intelligence is transforming various industries by automating tasks that previously required human intelligence. From healthcare to finance, AI is being used to analyze vast amounts of data quickly and accurately. This technology is also driving innovations in areas like self-driving cars and personalized recommendations.'), StringIO(text='AI is revolutionizing industries by automating tasks, analyzing data, and driving innovations like self-driving cars and personalized recommendations.'))], language=english))


Applying SummaryExtractor:   0%|          | 0/12 [00:00<?, ?it/s]

Applying transform: CustomNodeFilter(name='CustomNodeFilter', filter_nodes=<function default_transforms.<locals>.<lambda> at 0x350771f80>, llm=LangchainLLMWrapper(langchain_llm=ChatOpenAI(...)), scoring_prompt=QuestionPotentialPrompt(instruction=Given a document summary and node content, score the content of the node in 1 to 5 range., examples=[], language=english), min_score=2, rubrics={'score1_description': 'The page content is irrelevant or does not align with the main themes or topics of the document summary.', 'score2_description': "The page content partially aligns with the document summary, but it includes unrelated details or lacks critical information related to the document's main themes.", 'score3_description': 'The page content generally reflects the document summary but may miss key details or lack depth in addressing the main themes.', 'score4_description': 'The page content aligns well with the document summary, covering the main themes and topics with minor gaps or mini

Applying CustomNodeFilter:   0%|          | 0/31 [00:00<?, ?it/s]

Applying transform: <ragas.testset.transforms.engine.Parallel object at 0x357c829f0>


Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/74 [00:00<?, ?it/s]

Applying transform: <ragas.testset.transforms.engine.Parallel object at 0x35671fd40>


Applying [CosineSimilarityBuilder, OverlapScoreBuilder]:   0%|          | 0/2 [00:00<?, ?it/s]

After
KnowledgeGraph(nodes: 45, relationships: 114)


In [14]:
print(kg.nodes[0])

Node(id: bfeff3, type: NodeType.DOCUMENT, properties: ['page_content', 'metadata', 'headlines', 'summary', 'summary_embedding'])


In [None]:
summary_extractor = SummaryExtractor(llm=llm)
apply_transforms(kg, summary_extractor)
print("Summary extraction complete")

In [None]:
title_extractor = TitleExtractor(llm=llm)
apply_transforms(kg, title_extractor)
print("Title extraction complete")


In [None]:
keyphrases_extractor = KeyphrasesExtractor(llm=llm)
apply_transforms(kg, keyphrases_extractor)
print("Keyphrases extraction complete")

In [None]:
# get summary embeddings required by automatic persona generator
summary_embedding_extractor = EmbeddingExtractor(embedding_model=local_embeddings,
                                         property_name="summary_embedding",
                                         embed_property_name="summary")
apply_transforms(kg, summary_embedding_extractor)
print("Embedding extraction complete")


In [None]:
# get regular embeddings
regular_embedding_extractor = EmbeddingExtractor(
    embedding_model=local_embeddings,
)
apply_transforms(kg, regular_embedding_extractor)

In [None]:
# headline extractor
headline_extractor = HeadlinesExtractor(llm=llm)
apply_transforms(kg, headline_extractor)

In [None]:
from ragas.testset.transforms import ( 
    CosineSimilarityBuilder,
)
print("Before")
print(kg)
cosine_similarity_builder = CosineSimilarityBuilder(threshold=0.5)
apply_transforms(kg, cosine_similarity_builder)
print("After")
print(kg)

In [None]:
from ragas.testset.transforms import ( 
    OverlapScoreBuilder,
)

print(f"Before")
print(kg)
overlap_score_builder = OverlapScoreBuilder(property_name="keyphrases", 
                                            threshold=0.3)
apply_transforms(kg, overlap_score_builder)
print(kg)

In [None]:
# get all the default transforms as well
print("Before")
print(kg)
trans = default_transforms(documents=docs, llm=llm, embedding_model=local_embeddings)
apply_transforms(kg, trans)
print("After")
print(kg)

In [15]:
# Add this debugging code to see what relationships exist
print("=== Knowledge Graph Debug Info ===")
print(f"Total nodes: {len(kg.nodes)}")
print(f"Total relationships: {len(kg.relationships)}")

# Check relationship types
rel_types = set()
for rel in kg.relationships:
    rel_types.add(rel.type)
    
print(f"Relationship types found: {rel_types}")

=== Knowledge Graph Debug Info ===
Total nodes: 45
Total relationships: 114
Relationship types found: {'entities_overlap', 'cosine_similarity', 'next', 'child'}


In [None]:
print(kg)

In [17]:
from ragas.testset import TestsetGenerator
from ragas.testset.synthesizers import default_query_distribution

generator = TestsetGenerator(llm=llm, embedding_model=local_embeddings, knowledge_graph=kg)
query_distribution = default_query_distribution(llm)
testset = generator.generate(testset_size=10, query_distribution=query_distribution)
testset.to_pandas()



Generating personas:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Scenarios:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Samples:   0%|          | 0/12 [00:00<?, ?it/s]

Unnamed: 0,user_input,reference_contexts,reference,synthesizer_name
0,How Rise Connect help innovators in finance se...,[Data commercialisation\n\n\n| Page | Title |\...,Rise Connect provide platform for innovators t...,single_hop_specifc_query_synthesizer
1,How has Open Banking contributed to innovation...,"[""\n-\n\n\nRise Insights report \#HomeofFinTec...",Open Banking has enabled fintech companies to ...,single_hop_specifc_query_synthesizer
2,How does rise.barclays approach the commercial...,[may find that customers grant them equal or g...,Rise.barclays finds that customers often grant...,single_hop_specifc_query_synthesizer
3,What role did the University of Edinburgh play...,[The impact of data commerce\n\n\nresilience t...,The University of Edinburgh established the De...,single_hop_specifc_query_synthesizer
4,How data commercialisation help with data ethics?,[<1-hop>\n\nrich-widmann-a816a54b\n\n\n17 / ri...,Data commercialisation help with data ethics b...,multi_hop_abstract_query_synthesizer
5,How is Barclays ensuring the ethical use of AI...,"[<1-hop>\n\nEthics in AI\n\n\nDavid Bholat, Ba...",Barclays is ensuring the ethical use of AI in ...,multi_hop_abstract_query_synthesizer
6,What are the key ethical considerations in AI ...,"[<1-hop>\n\nEthics in AI\n\n\nDavid Bholat, Ba...",The key ethical considerations in AI include t...,multi_hop_abstract_query_synthesizer
7,How doe's artifical intelijence help investors...,"[<1-hop>\n\nBanks, for example, would be able ...",Artificial Intelligence (AI) significantly enh...,multi_hop_abstract_query_synthesizer
8,What strategies can financial institutions use...,[<1-hop>\n\nThe risks of AI in investments\n\n...,Financial institutions can commercialize data ...,multi_hop_specific_query_synthesizer
9,What does Michael Payne believe about AI's pot...,"[<1-hop>\n\nEthics in AI\n\n\nDavid Bholat, Ba...",Michael Payne is excited about AI's mainstream...,multi_hop_specific_query_synthesizer


In [19]:
testset_pd = testset.to_pandas()
testset_pd.to_json('barclays_synthetic_multihop.json', orient='records', indent=2)

