### Ragas: Synthetic Test Data generation

In [1]:
# %pip install ragas

In [14]:
import os
import json
from dotenv import load_dotenv
from ragas.testset.generator import TestsetGenerator
from ragas.testset.evolutions import simple, reasoning, multi_context
from pathlib import Path
from langchain.document_loaders import TextLoader
from langchain.schema import Document

# Read settings from a local .env file; override=True is passed so that values
# from the file are preferred over variables already set in the process.
load_dotenv(override=True)

# Data locations are resolved relative to the parent of the working directory.
PARENT_PATH = Path.cwd().parent
CHUNK_PATH = PARENT_PATH.joinpath('data', 'temp')

# Re-export the internal embedding key under the OPENAI_API_KEY name.
# Raises KeyError if the internal key is not defined.
os.environ.update({"OPENAI_API_KEY": os.environ["OPENAI_API_EMBEDDING_KEY_INTERNAL"]})

def _line_to_document(line):
    """Convert one JSON-lines record into a LangChain Document.

    The record's 'text' field becomes ``page_content``; every other field is
    kept as metadata.

    Args:
        line: A single non-empty line holding one JSON object.

    Returns:
        Document: The converted record.

    Raises:
        ValueError: If the line is not valid JSON, is not a JSON object, or
            has no 'text' value (json.JSONDecodeError is a ValueError).
    """
    record = json.loads(line)
    if not isinstance(record, dict):
        raise ValueError("record is not a JSON object")

    # Separate the 'text' field from the rest, which is treated as metadata
    text_content = record.pop('text', None)
    if text_content is None:
        raise ValueError("record has no 'text' field")

    if isinstance(text_content, str):
        # 'text' may itself be stringified JSON. Normalise it only when it
        # decodes to an object/array; a string such as "123" or "true" would
        # otherwise be turned into a non-string page_content.
        try:
            parsed = json.loads(text_content)
        except json.JSONDecodeError:
            parsed = None  # Plain text: keep the string as it is
        if isinstance(parsed, (dict, list)):
            text_content = json.dumps(parsed)
    else:
        # Ensure that page_content is always a string
        text_content = json.dumps(text_content)

    return Document(page_content=text_content, metadata=record)


def get_documents(file_path=None):
    """Load a JSON-lines file and convert each record to a LangChain Document.

    Blank lines are ignored. A record that cannot be converted is reported
    and skipped, so one bad line does not discard the records after it.

    Args:
        file_path: Optional path of the file to read. Defaults to 'temp.txt'
            inside CHUNK_PATH.

    Returns:
        list: The Documents that were converted successfully. Empty when the
        file could not be read or contained no usable record.
    """
    if file_path is None:
        file_path = os.path.join(CHUNK_PATH, 'temp.txt')  # Specify your actual file name here

    docs = []

    try:
        # Use TextLoader to load the content of the file
        loader = TextLoader(file_path, autodetect_encoding=True)
        loaded = loader.load()
    except Exception as e:
        # Best effort: report the problem and hand back an empty list
        print(f"Error reading the file: {e}")
        return docs

    # The whole file is read from the first loaded document and split into
    # lines, each of which is parsed as a JSON record.
    content = loaded[0].page_content if loaded else ""

    for line_number, line in enumerate(content.splitlines(), start=1):
        if not line.strip():
            continue  # Skip empty lines

        try:
            docs.append(_line_to_document(line))
        except (ValueError, TypeError) as e:
            print(f"Skipping line {line_number}: {e}")

    return docs



documents = get_documents()

# get_documents() reports problems by printing and returns whatever it managed
# to load (possibly nothing). Stop with a clear message here rather than
# failing with an IndexError on documents[0] or calling the generator with an
# empty corpus.
if not documents:
    raise RuntimeError("No documents were loaded; check the input file under CHUNK_PATH.")

# Show one converted document as a sanity check
print(documents[0])

generator = TestsetGenerator.with_openai()

# Ask for 20 samples; the evolution weights below sum to 1.0
testset = generator.generate_with_langchain_docs(
    documents,
    test_size=20,
    distributions={simple: 0.25, reasoning: 0.45, multi_context: 0.30},
)

page_content='{"Type": "PMDG", "Country": "India", "Crop": "Rice/Paddy", "PAN": "20187800870", "Language": "English", "PMDGVersionID": "105687", "PMDGParentID": "105209", "Title": "Bakanae disease of paddy", "CropCommonNames": "Foolish Seedling", "ProblemScientificName": "Fusarium moniliforme", "Image1Caption": "Root growth from infected node (O.P. Sharma, Bugwood.org)"}' metadata={'Country': 'India', 'Language': 'English', 'Last_Updated': '2024-08-30T11:06:38.065529+00:00', 'Source': '20187800870', 'Title': 'Bakanae disease of paddy', 'Type': 'PMDG'}


  generator = TestsetGenerator.with_openai()


embedding nodes:   0%|          | 0/92 [00:00<?, ?it/s]

Filename and doc_id are the same for all nodes.


Generating:   0%|          | 0/20 [00:00<?, ?it/s]

In [15]:
# Convert the generated test set to a DataFrame and preview the first rows
test_df = testset.to_pandas()
test_df.head(n=5)

Unnamed: 0,question,contexts,ground_truth,evolution_type,metadata,episode_done
0,What is the active ingredient in the bio-contr...,"[{""Type"": ""BioControl-Product"", ""Country"": ""Ke...",The active ingredient in the bio-control produ...,simple,"[{'Country': 'Kenya', 'Language': 'English', '...",True
1,What is the active ingredient in the bio-contr...,"[{""Type"": ""BioControl-Product"", ""Country"": ""Ke...",The active ingredient in the bio-control produ...,simple,"[{'Country': 'Kenya', 'Language': 'English', '...",True
2,What are the restrictions and guidelines for u...,"[{""Pesticides(Chemical)"": {""YellowRestrictions...",The restrictions and guidelines for using pest...,simple,"[{'Country': 'Kenya', 'Language': 'English', '...",True
3,What crops is Bio-cure B used for in India?,"[{""Type"": ""Countrywise-Biocontrol-Use"", ""Produ...",Bio-cure B is used for rice in India.,simple,"[{'Country': 'India', 'Crops': '['rice']', 'La...",True
4,What are some methods of biocontrol for managi...,"[{""Biocontrol(Natural)"": {""GreenDirectControlD...",Some methods of biocontrol for managing diseas...,simple,"[{'Country': 'Kenya', 'Language': 'English', '...",True


In [16]:
# Write the test set next to the input data, leaving out the DataFrame index
csv_file_path = os.path.join(CHUNK_PATH, 'temp_output.csv')
test_df.to_csv(path_or_buf=csv_file_path, index=False)

# Report where the file was written
print(f"Test set exported to {csv_file_path}")

Test set exported to C:\Users\MallickD\CABIGitProjects\PlantwisePlus.GAIAPoc.Microservice.Web\data\temp\temp_output.csv
