In [1]:
# 1. Retrieve data (Optional - can use your own dataset / files instead)

!git clone https://huggingface.co/datasets/explodinggradients/Sample_Docs_Markdown

Cloning into 'Sample_Docs_Markdown'...
remote: Enumerating objects: 31, done.[K
remote: Counting objects: 100% (21/21), done.[K
remote: Compressing objects: 100% (21/21), done.[K
remote: Total 31 (delta 4), reused 0 (delta 0), pack-reused 10 (from 1)[K
Unpacking objects: 100% (31/31), 132.02 KiB | 6.00 MiB/s, done.


In [4]:
# 2. Load data into document objects

from langchain_community.document_loaders import DirectoryLoader

path = "Sample_Docs_Markdown/"
loader = DirectoryLoader(path, glob="**/*.md", exclude="README.md")
docs = loader.load()

In [None]:
# 3. Construct knowledge graph

"""
The knowledge graph in RAGAs consists of Node objects which hold information.
They have a type and a set of properties.
Additionally, a knowledge graph could contain relationships.
The graph is crucial when it comes to generation of synthetic test data, since 
it serves as a context for the generation of personas and scenarios.
"""

from ragas.testset.graph import (
    Node,
    NodeType,
    KnowledgeGraph,
)

kg = KnowledgeGraph()

for doc in docs:
    kg.add(
        Node(
            type=NodeType.DOCUMENT,
            properties={"page_content": doc.page_content, "document_metadata": doc.metadata}
        )
    )

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
# 4. Instantiate required objects

# Objects we will use to interact with our running instance of Ollama
from langchain_ollama.llms import OllamaLLM
from langchain_ollama.embeddings import OllamaEmbeddings

# Configuration where one can set a timeout and additional parameters
from ragas.run_config import RunConfig

# A cache which saves data for future re-runs on disk
from ragas.cache import DiskCacheBackend

# Wrappers around Langchain objects required by prompts when generating 
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper

run_config = RunConfig(
    timeout=14400, # 4 hours (Note since llama3.1:8b is not particularly strong you may need more time)
    max_retries=15,
    max_wait=60,
    log_tenacity=True
)

# Caching which makes the applying of transformations much faster and 
# saves output for future re-runs.
cacher = DiskCacheBackend(cache_dir=".cache")

ollama_llm = OllamaLLM(
    model="llama3.1",
    base_url="http://localhost:11434",
    temperature=0.1,
    num_ctx=24000,
    format="json"
)

ollama_embeddings = OllamaEmbeddings(
    model="mxbai-embed-large",
    base_url="http://localhost:11434"
)

langchain_llm = LangchainLLMWrapper(
    langchain_llm=ollama_llm,
    run_config=run_config,
    cache=cacher
)

langchain_embeddings = LangchainEmbeddingsWrapper(
    embeddings=ollama_embeddings,
    run_config=run_config,
    cache=cacher
)

In [None]:
# 5. Create the transformation pipeline

from ragas.testset.transforms.engine import Parallel
from ragas.testset.transforms import default_transforms
from ragas.testset.transforms.extractors.llm_based import (
    KeyphrasesExtractor,
    TopicDescriptionExtractor
)

"""
default_transforms()

This function defines a default set of transformations for processing a knowledge graph.  
It extracts key information from documents, splits them into smaller chunks if necessary,  
and computes relationships between these chunks.

The sequence of transformations works as follows:

1. Document Length Analysis
    - Documents are categorized into three bins based on token length:
    - (0-100 tokens): Too short → raises an error.
    - (101-500 tokens): Medium-length processing.
    - (501+ tokens): Long documents undergo additional processing.

2. Headline Extraction & Splitting (for long documents, 501+ tokens)  
    - Extracts headlines from large documents to create logical sections.  
    - Splits long documents into smaller chunks at headline boundaries.

3. Summary Extraction  
    - Generates a concise summary for each document to facilitate quick understanding.

4. Named Entity Recognition (NER) & Theme Extraction  
    - NERExtractor identifies and extracts named entities (e.g., people, organizations, locations).  
    - ThemesExtractor detects overarching topics/themes in each chunk.

5. Embedding Generation
    - Uses an embedding model to convert summaries into vector representations for similarity-based retrieval.

6. Cosine Similarity Computation  
    - Measures semantic similarity between documents based on their summary embeddings.  
    - Creates relationships between similar documents using a threshold (0.7 for long docs, 0.5 for medium docs).

7. NER-Based Overlap Score Computation
    - Computes overlap scores between extracted named entities in different chunks.  
    - Helps detect if two chunks talk about similar entities.

8. Custom Node Filtering 
    - Filters nodes to keep only relevant chunks for processing.

9. Parallel Processing for Efficiency
    - Certain transformations run in parallel to improve performance:
      - Summary embeddings, theme extraction, and NER run together.
      - Cosine similarity and entity overlap scoring run together.

-> Final Outcome:
    - A structured set of document transformations that extracts headlines, summaries, key entities, themes, embeddings, 
        and relationships between different chunks/documents
    - Used to construct a knowledge graph for downstream retrieval-augmented generation (RAG) tasks.
"""
transforms = default_transforms(
    docs,
    langchain_llm,
    langchain_embeddings
)

"""
Using 2 additional extractors for keyphrases and topic description.
Both of those extractors are going to be performed in parallel.
Finally, we extend the default transformation by 2 additional ones
    by adding them before the cosine distance similarity transformation.
"""
keyphrase_extractor = KeyphrasesExtractor(
    llm=langchain_llm,
    max_num=15
)

topic_description_extractor = TopicDescriptionExtractor(
    llm=langchain_llm
)

parallel_transforms = Parallel(
    keyphrase_extractor,
    topic_description_extractor
)

transforms.insert(-1, parallel_transforms)

In [16]:
# 6. Apply the transformations to the knowledge graph

from ragas.testset.transforms import apply_transforms

apply_transforms(kg, transforms, run_config)

Applying HeadlinesExtractor:   0%|          | 0/6 [00:00<?, ?it/s]Property 'headlines' already exists in node 'cf944d'. Skipping!
Property 'headlines' already exists in node '872d4b'. Skipping!
Property 'headlines' already exists in node '45230e'. Skipping!
Property 'headlines' already exists in node 'd41106'. Skipping!
Property 'headlines' already exists in node 'fb8933'. Skipping!
Property 'headlines' already exists in node 'cf944d'. Skipping!
Applying HeadlineSplitter:   0%|          | 0/24 [00:00<?, ?it/s] unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformati

In [28]:
# 7. Generate personas

from persona import generate_personas_from_kg

"""
A persona is an entity/role which interacts with the system.
It groups people together under a common theme and goals.
Example: a Senior DevOps engineer, a Junior Data Scientist, a Marketing Manager in the context of an IT company 

Persona object consists of a name and a description.
The name is used to identify the persona and the description is used to describe the role of the persona.
"""

personas = generate_personas_from_kg(
    kg=kg,
    llm=langchain_llm,
    num_personas=10
)

Generating personas: 100%|██████████| 10/10 [01:38<00:00,  9.87s/it]


In [29]:
personas

[Persona(name='Human Resources Manager', role_description='Oversees diversity, inclusion, and equality initiatives within the organization.'),
 Persona(name='DIB Team Lead', role_description='Oversees diversity, inclusion, and belonging initiatives, ensuring effective communication and team member engagement.'),
 Persona(name='Diversity and Inclusion Program Manager', role_description='Focuses on creating an equitable work environment by promoting allyship, diversity, and inclusion strategies.'),
 Persona(name='Diana Rodriguez', role_description='Serves as a member of the Advisory Group to foster diversity and promote inclusive practices within an organization.'),
 Persona(name='Liam McNally', role_description='Curates and promotes programs of events that celebrate diversity, inclusion, and belonging within the workplace.'),
 Persona(name='Global Diversity and Inclusion Director', role_description='Oversees initiatives to promote diversity, equity, and inclusion across various regions 

In [26]:
personas

[Persona(name='DIB Team Member', role_description='Participates in quarterly roundtables organized by the Diversity, Inclusion & Belonging (DIB) Team to discuss and build connections related to diversity, inclusion, and belonging issues within teams.'),
 Persona(name='DIB Team Member', role_description='Participates in quarterly roundtables organized by the Diversity, Inclusion & Belonging (DIB) Team to discuss and build connections related to diversity, inclusion, and belonging issues within teams.'),
 Persona(name='DIB Team Member', role_description='Participates in quarterly roundtables organized by the Diversity, Inclusion & Belonging (DIB) Team to discuss and build connections related to diversity, inclusion, and belonging issues within teams.'),
 Persona(name='DIB Team Member', role_description='Participates in quarterly roundtables organized by the Diversity, Inclusion & Belonging (DIB) Team to discuss and build connections related to diversity, inclusion, and belonging issues w

In [None]:
from ragas.testset.synthesizers.single_hop.specific import SingleHopSpecificQuerySynthesizer
from ragas.testset.synthesizers.multi_hop.specific import MultiHopSpecificQuerySynthesizer
from ragas.testset.synthesizers.multi_hop.abstract import MultiHopAbstractQuerySynthesizer

""" 
There are two main types of queries in RAGAs:
    -> SingleHopQuery where the context relevant for answering a question lies in a single document/chunk
    -> MultiHopQuery where the context relevant for answering a question lies in multiple documents/chunks
Additionally, for each of those queries there's a Specific or Abstract query variant:
    -> Specific one which pertains to a fact. 
        Example: When did WW1 break out? (Can be precisely answered, there's no room for guessing/interpretation)
    -> Abstract one which is more about testing the reasoning capabilities of the LLM. 
        Example: Why did WW1 break out? (There's room for interpretation in this case)

"""

"""
What `SingleHopSpecificQuerySynthesizer` provides:
    -> Can filter out all nodes which match/contain the selected property
    -> Generate a scenorio (hidden function, marked with underscore)
        If no nodes have been found with the specified entity a ValueError is thrown
"""



single_hop_specific_entities = SingleHopSpecificQuerySynthesizer(
    llm=langchain_llm,
    property_name="entities"
)

single_hop_specific_keyphrases = SingleHopSpecificQuerySynthesizer(
    llm=langchain_llm,
    property_name="keyphrases"
)

single_hop_specific_headlines = SingleHopSpecificQuerySynthesizer(
    llm=langchain_llm,
    property_name="headlines"
)

single_hop_specific_themes = SingleHopSpecificQuerySynthesizer(
    llm=langchain_llm,
    property_name="themes"
)

multi_hop_specific_entities = MultiHopSpecificQuerySynthesizer(
    llm=langchain_llm
)

multi_hop_abstract_entities = MultiHopAbstractQuerySynthesizer(
    llm=langchain_llm
)

query_distribution = [
    (single_hop_specific_entities, 0.125),
    (single_hop_specific_keyphrases, 0.125),
    (single_hop_specific_headlines, 0.125),
    (single_hop_specific_themes, 0.125),
    (multi_hop_specific_entities, 0.25),
    (multi_hop_abstract_entities, 0.25),
]

In [None]:
from ragas.testset.synthesizers import default_query_distribution

In [9]:
from dotenv import load_dotenv
from ragas.testset import TestsetGenerator

load_dotenv()

generator = TestsetGenerator(
    llm=langchain_llm,
    embedding_model=langchain_embeddings,
    knowledge_graph=kg
)

dataset = generator.generate_with_langchain_docs(
    docs,
    testset_size=50,
    query_distribution=query_distribution,
    run_config=run_config,
    with_debugging_logs=True,
)

Applying SummaryExtractor:   0%|          | 0/14 [00:00<?, ?it/s] Property 'summary' already exists in node 'd09b23'. Skipping!
Property 'summary' already exists in node 'ba784b'. Skipping!
Property 'summary' already exists in node 'e76f93'. Skipping!
Property 'summary' already exists in node '506bfd'. Skipping!
Property 'summary' already exists in node '818e1a'. Skipping!
Property 'summary' already exists in node '11314d'. Skipping!
Applying [EmbeddingExtractor, ThemesExtractor, NERExtractor]:   0%|          | 0/22 [00:00<?, ?it/s]Property 'summary_embedding' already exists in node 'd09b23'. Skipping!
Property 'summary_embedding' already exists in node 'ba784b'. Skipping!
Property 'summary_embedding' already exists in node 'e76f93'. Skipping!
Property 'summary_embedding' already exists in node '506bfd'. Skipping!
Property 'summary_embedding' already exists in node '818e1a'. Skipping!
Property 'summary_embedding' already exists in node '11314d'. Skipping!
Generating personas: 100%|████

ValueError: No clusters found in the knowledge graph. Try changing the relationship condition.

In [27]:
for node in kg.nodes:
    if node.type == NodeType.DOCUMENT:
        print(node.properties.keys())

dict_keys(['page_content', 'document_metadata', 'keyphrases', 'topic_description'])
dict_keys(['page_content', 'document_metadata', 'headlines', 'summary', 'summary_embedding', 'keyphrases', 'topic_description'])
dict_keys(['page_content', 'document_metadata', 'headlines', 'summary', 'summary_embedding', 'keyphrases', 'topic_description'])
dict_keys(['page_content', 'document_metadata', 'keyphrases', 'topic_description'])
dict_keys(['page_content', 'document_metadata', 'keyphrases', 'topic_description'])
dict_keys(['page_content', 'document_metadata', 'keyphrases', 'topic_description'])
dict_keys(['page_content', 'document_metadata', 'headlines', 'summary', 'summary_embedding', 'keyphrases', 'topic_description'])
dict_keys(['page_content', 'document_metadata', 'keyphrases', 'topic_description'])
dict_keys(['page_content', 'document_metadata', 'headlines', 'summary', 'summary_embedding', 'keyphrases', 'topic_description'])
dict_keys(['page_content', 'document_metadata', 'keyphrases', 't

In [13]:
for node in kg.nodes:
    print(node.properties.get("themes", None))

None
None
None
None
None
None
None
None
None
None
None
None
['Ally', 'Diversity and inclusion', 'Belonging', 'Self-education', 'Active listening', 'Empathy and emotional intelligence', 'Humility', 'Non-defensive', 'Willingness to take feedback', 'Courage', 'Self-awareness', 'Privilege', 'Oppression', 'Power', 'Marginalized groups', 'Performative allyship']
['Allyship', 'Empathy', 'Privilege', 'Oppression', 'Social capital', 'Marginalized groups', 'Power dynamics', 'Authority', 'Education', 'Personal growth']
['Diversity on teams', 'Allyship', 'Recruiting and hiring', 'Sourcing', 'Interviewing', 'Compensation', 'Guidance and support', 'Mentorship', 'Difficult conversations', 'Performance conversations']
['Allyship', 'Diversity', 'Inclusion', 'Belonging', 'Equality', 'Empathy', 'Effective Listening', 'Building Trust', 'Anti-racism', ' Ally Training']
['Sales Sponsorship Pilot Program', 'Mentorship', 'Sponsorship', 'GitLab', 'Career advancement', 'Leadership development', 'Communication s