In [None]:
# 1. Retrieve data (Optional - can use your own dataset / files instead)

!git clone https://huggingface.co/datasets/explodinggradients/Sample_Docs_Markdown

Cloning into 'Sample_Docs_Markdown'...
remote: Enumerating objects: 31, done.[K
remote: Counting objects: 100% (27/27), done.[K
remote: Compressing objects: 100% (27/27), done.[K
remote: Total 31 (delta 5), reused 0 (delta 0), pack-reused 4 (from 1)[K
Unpacking objects: 100% (31/31), 131.87 KiB | 1.45 MiB/s, done.


In [2]:
# 2. Load data

from langchain_community.document_loaders import DirectoryLoader

# Folder holding the markdown corpus; files are selected with the `**/*.md` glob.
path = "Sample_Docs_Markdown/"
loader = DirectoryLoader(
    path,
    glob="**/*.md",
)

# `docs` is the input later handed to the testset generator.
docs = loader.load()

In [5]:
# 3. Construct objects

from langchain_ollama.llms import OllamaLLM
from langchain_ollama.embeddings import OllamaEmbeddings

from ragas.run_config import RunConfig
from ragas.cache import DiskCacheBackend
from ragas.llms import LangchainLLMWrapper
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.testset.synthesizers import default_query_distribution

# Disk-backed cache shared by the LLM wrapper and the embeddings wrapper.
cacher = DiskCacheBackend(cache_dir=".cache")

# Execution settings shared by both wrappers.
run_config = RunConfig(
    timeout=7200,  # 2 hours
    max_retries=15,
    max_wait=60,
    log_tenacity=True,
)

# LangChain clients pointing at the Ollama server on localhost.
ollama_embeddings = OllamaEmbeddings(
    base_url="http://localhost:11434",
    model="mxbai-embed-large",
)

ollama_llm = OllamaLLM(
    base_url="http://localhost:11434",
    model="llama3.1",
    format="json",
    num_ctx=24000,
    temperature=0.1,
)

# Ragas wrappers around the LangChain clients, each using the same run
# configuration and cache backend.
langchain_embeddings = LangchainEmbeddingsWrapper(
    embeddings=ollama_embeddings,
    run_config=run_config,
    cache=cacher,
)

langchain_llm = LangchainLLMWrapper(
    langchain_llm=ollama_llm,
    run_config=run_config,
    cache=cacher,
)

# Default mix of query synthesizers, driven by the wrapped LLM.
query_distribution = default_query_distribution(langchain_llm)

In [6]:
from ragas.testset import TestsetGenerator

# 4. Generate the synthetic test dataset

# The generator uses the wrapped LLM and the wrapped embedding model.
generator = TestsetGenerator(
    embedding_model=langchain_embeddings,
    llm=langchain_llm,
)

# Request 30 samples from the loaded documents, with debugging logs enabled.
dataset = generator.generate_with_langchain_docs(
    docs,
    with_debugging_logs=True,
    run_config=run_config,
    query_distribution=query_distribution,
    testset_size=30,
)

Applying HeadlineSplitter:   0%|          | 0/12 [00:00<?, ?it/s] unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
unable to apply transformation: 'headlines' property not found in this node
Applying SummaryExtractor:   0%|          | 0/6 [00:00<?, ?it/s] Property 'summary' already exists in node '7fc061'. Skipping!
Applying CustomNodeFilter:   0%|          | 0/12 [00:00<?, ?it/s]Prompt fix_output_format failed to parse output: The output parser failed to parse the output including retries.
Prompt fix_output_format failed to parse output: The output parser failed to parse the output inc

In [17]:
# 5. Inspect the synthetic testset (Optional)

# Show only the first few samples: displaying the full list dumps every
# reference context into the cell output and buries the rest of the notebook.
dataset.samples[:3]

[TestsetSample(eval_sample=SingleTurnSample(user_input='What does it mean to be an ally at GitLab?', retrieved_contexts=None, reference_contexts=['What is an ally? A diversity, inclusion and belonging "ally" is someone who is willing to take action in support of another person, in order to remove barriers that impede that person from contributing their skills and talents in the workplace or community. Being an ally is a verb, this means that you proactively and purposefully take action and is not something forced upon you. How to be an ally It is not required to be an ally to work at GitLab. At GitLab it is required to be inclusive. Being an ally goes a step beyond being inclusive to taking action to support marginalized groups. The first step in being an ally is self-educating. This ally lab will provide you with some of the tools, resources and learning activities to help you grow as an ally. Skills and Behaviors of allies To be an effective ally it is important to understand some of

In [None]:
# 6. Ingest documents in R2R if not already done (Optional)

import nest_asyncio
from pathlib import Path
from r2r import R2RClient, R2RException

nest_asyncio.apply()

client = R2RClient(
    timeout=600,
    base_url="http://localhost:7272",
)

dir_path = Path("Sample_Docs_Markdown")
for entry in dir_path.iterdir():
    # Skip anything that is not a regular file with a ".md" suffix.
    if not entry.is_file() or entry.suffix != ".md":
        continue

    # One create call per file; a failure is reported and the loop moves on
    # to the next file instead of aborting the whole ingestion.
    try:
        client.documents.create(
            file_path=str(entry),
            ingestion_mode="custom",
            run_with_orchestration=True,
        )
        print(f"Ingested file: {entry.name}")
    except R2RException as r2r_error:
        print(f"Couldn't ingest file: {entry.name} due to {str(r2r_error)}")
    except Exception as error:
        print(f"Couldn't ingest file: {entry.name} due to {str(error)}")


In [20]:
# 7. Fill out the retrieved_contexts and response fields

import re
import ollama
import nest_asyncio
from r2r import R2RClient, R2RException

nest_asyncio.apply()

client = R2RClient(
    timeout=600,
    base_url="http://localhost:7272",
)

# Search options sent along with each RAG request.
search_settings = dict(
    use_semantic_search=True,
    limit=5,
    offset=0,
    include_metadatas=False,
    include_scores=True,
    search_strategy="vanilla",
)

# Generation options sent along with each RAG request.
rag_generation_config = dict(
    temperature=0.1,
    top_p=1,
    max_tokens_to_sample=512,
)

# Prompt text handed to the RAG call in this cell as `task_prompt_override`.
# It asks for short answers that rely only on the supplied context, and it
# places the `{query}` placeholder both before and after `{context}`.
# NOTE(review): the placeholders are presumably substituted by R2R, since this
# notebook never formats the string itself — confirm.
# The whitespace inside the string is part of the prompt, so it is left as is.
template = """
## Task:
 
Answer the query given below using the provided context. Keep your answer very short and concise!
     
 - Aim to answer in 2-3 sentences whenever possible
 - If a longer answer is needed, make it as concise as possible focusing on the relevant
 - For step-by-step guides, use numbered steps with each step on a new line
 - If there're multiple answers, use numbered steps with each step on a new line
 - DO NOT use line item references for the context
 - If there is no context available locally to answer, inform the user of insufficient information
 - NEVER provide an answer if there's no context that discusses it
 - NEVER reason about a possible answer! If no context can answer the query there should be NO answer
 
 ### Query:
 
 {query}
 
 
 ### Context:
 
 {context}
 
 
 ### Query:
 
 {query}
 
 
 # Reminder: Provide short and concise answers and NEVER answer something that is not in the provided context!
 
 ## Response:
 """
    
def summarize_ctx_template(context: str) -> str:
    """Build the prompt that asks the LLM to summarize ``context``.

    Args:
        context: Text to embed in the prompt.

    Returns:
        The prompt: a leading newline, the instructions with ``context``
        inserted between them, every line indented by four spaces.
    """
    # Each entry is one line of the prompt; the whitespace-only entries are
    # the indented blank lines that separate the sections.
    prompt_lines = [
        "",
        "    Summarize the following context while preserving all key information:",
        "    ",
        f"    {context}",
        "    ",
        "    Provide a concise summary that includes all essential facts, data points, and information.",
        "    Try to stay under 4 sentences. Only provide the summary and no further explanation or details.",
        "    Don't mention things like: Here is a concise summary of the key information.",
        "    ",
    ]
    return "\n".join(prompt_lines)
    
import copy

# Fill a deep copy so that `dataset` keeps its original, unfilled samples if
# something goes wrong half-way through. A plain assignment would only alias
# the same object and every write below would also land in `dataset`.
final_dataset = copy.deepcopy(dataset)
total_samples = len(final_dataset.samples)

for i, sample in enumerate(final_dataset.samples):
    try:
        # Submit a query
        response = client.retrieval.rag(
            query=sample.eval_sample.user_input,
            search_mode="custom",
            search_settings=search_settings,
            rag_generation_config=rag_generation_config,
            task_prompt_override=template,
        ).results

        # Collapse runs of newlines inside each retrieved chunk, then join the
        # chunks into a single text, one chunk per line group.
        full_ctx = "\n".join(
            re.sub(r"\n+", "\n", chunk.text)
            for chunk in response.search_results.chunk_search_results
        )

        # Generate the summary of the retrieved context by using the LLM
        summary_ctx = ollama.generate(
            model="llama3.1",
            prompt=summarize_ctx_template(full_ctx),
            options={
                "temperature": 0.1,
                "num_predict": 512,
            },
        )["response"]

        # `sample` is the very object stored in `final_dataset.samples[i]`, so
        # assigning through it updates the copy in place.
        sample.eval_sample.response = response.completion
        sample.eval_sample.retrieved_contexts = [summary_ctx]

        print(f"Configured sample: {i + 1} out of {total_samples}")

    # Several exception types must be listed as a tuple. `A | B` is not a valid
    # exception specification and raises TypeError as soon as any exception
    # reaches this handler, which would abort the whole loop.
    except (ollama.RequestError, ollama.ResponseError) as oe:
        print(f"Something went wrong when submitting query: {sample.eval_sample.user_input} due to {str(oe)}")
    except R2RException as r2re:
        print(f"Something went wrong when submitting query: {sample.eval_sample.user_input} due to {str(r2re)}")
    except Exception as e:
        print(f"Something went wrong when submitting query: {sample.eval_sample.user_input} due to {str(e)}")

Configured sample: 1 out of 30
Configured sample: 2 out of 30
Configured sample: 3 out of 30
Configured sample: 4 out of 30
Configured sample: 5 out of 30
Configured sample: 6 out of 30
Configured sample: 7 out of 30
Configured sample: 8 out of 30
Configured sample: 9 out of 30
Configured sample: 10 out of 30
Configured sample: 11 out of 30
Configured sample: 12 out of 30
Configured sample: 13 out of 30
Configured sample: 14 out of 30
Configured sample: 15 out of 30
Configured sample: 16 out of 30
Configured sample: 17 out of 30
Configured sample: 18 out of 30
Configured sample: 19 out of 30
Configured sample: 20 out of 30
Configured sample: 21 out of 30
Configured sample: 22 out of 30
Configured sample: 23 out of 30
Configured sample: 24 out of 30
Configured sample: 25 out of 30
Configured sample: 26 out of 30
Configured sample: 27 out of 30
Configured sample: 28 out of 30
Configured sample: 29 out of 30
Configured sample: 30 out of 30


In [28]:
# 8. Save the dataset

# Write the filled-in testset twice: once as CSV and once as JSON Lines.
# The two exports are independent of each other.
final_dataset.to_csv("dataset.csv")
final_dataset.to_jsonl("dataset.jsonl")