# Synthetic Dataset Generation

In [1]:
from langchain_aws import ChatBedrock

# Chat model client for Amazon Titan Text Premier, accessed through Bedrock.
# NOTE(review): the sampling options are forwarded verbatim via model_kwargs;
# confirm that these key names are the ones the Titan model accepts.
llm = ChatBedrock(
    model_id="amazon.titan-text-premier-v1:0",
    model_kwargs=dict(max_tokens=1000, temperature=0.5, top_p=0.9),
)

In [14]:
from typing import List, Optional
from langchain_huggingface import HuggingFaceEmbeddings
from deepeval.models import DeepEvalBaseEmbeddingModel

from greencompute_backend.services.llm.svc import EMBEDDINGS_MODEL

class CustomEmbeddingModel(DeepEvalBaseEmbeddingModel):
    """DeepEval embedding adapter backed by ``HuggingFaceEmbeddings``.

    The underlying embeddings object is built from ``EMBEDDINGS_MODEL`` the
    first time it is needed and reused afterwards, so repeated embed calls do
    not construct a new model each time.
    """

    def __init__(self):
        # As in the original, the base-class initializer is not invoked; only
        # the lazily-filled cache slot is set up here.
        self._model = None

    def load_model(self):
        """Return the ``HuggingFaceEmbeddings`` instance, creating it on first use."""
        if self._model is None:
            self._model = HuggingFaceEmbeddings(model_name=EMBEDDINGS_MODEL)
        return self._model

    def embed_text(self, text: str) -> List[float]:
        """Return the embedding vector for a single piece of text."""
        return self.load_model().embed_query(text)

    def embed_texts(self, texts: List[str]) -> List[List[float]]:
        """Return one embedding vector per entry in ``texts``."""
        return self.load_model().embed_documents(texts)

    async def a_embed_text(self, text: str) -> List[float]:
        """Async counterpart of :meth:`embed_text`."""
        return await self.load_model().aembed_query(text)

    async def a_embed_texts(self, texts: List[str]) -> List[List[float]]:
        """Async counterpart of :meth:`embed_texts`."""
        return await self.load_model().aembed_documents(texts)

    def get_model_name(self) -> str:
        """Return a human-readable name for this embedding model."""
        # The original had a bare string expression here and therefore
        # returned None; the name is now actually returned.
        return "Custom HuggingFace Embeddings Model"

In [28]:
from deepeval.models.base_model import DeepEvalBaseLLM
from pydantic import BaseModel

class AmazonTitan(DeepEvalBaseLLM):
    """DeepEval LLM adapter around an already-constructed chat model.

    The wrapped ``model`` must expose ``invoke`` / ``ainvoke`` returning a
    message object with a ``content`` attribute.
    """

    def __init__(
        self,
        model
    ):
        # As in the original, the base-class initializer is not invoked; the
        # caller-supplied chat model is simply stored.
        self.model = model

    def load_model(self):
        """Return the wrapped chat model."""
        return self.model

    @staticmethod
    def _coerce(text, schema):
        """Turn the raw reply ``text`` into ``schema`` (or return it unchanged).

        When ``schema`` is None the text is returned as-is. Otherwise the span
        from the first ``{`` to the last ``}`` is validated as JSON against
        ``schema``; this tolerates prose or code fences around the object.

        Raises:
            ValueError: if the reply contains no JSON object, or (via
                pydantic's validation error) if it does not match ``schema``.
        """
        if schema is None:
            return text
        # assumes ``text`` is a str (plain-text message content) — TODO confirm
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end < start:
            raise ValueError("Model response does not contain a JSON object")
        return schema.model_validate_json(text[start:end + 1])

    def generate(
        self, prompt: str, schema: type[BaseModel] | None = None
    ) -> BaseModel | str:
        """Run ``prompt`` through the model.

        Args:
            prompt: Text sent to the chat model.
            schema: Optional pydantic model class the reply must conform to.

        Returns:
            An instance of ``schema`` when one is given, else the raw reply
            content.
        """
        # The original routed this through an undefined ``LMFormatEnforcer``
        # name and returned raw content despite the ``BaseModel`` annotation.
        reply = self.load_model().invoke(prompt).content
        return self._coerce(reply, schema)

    async def a_generate(
        self, prompt: str, schema: type[BaseModel] | None = None
    ) -> BaseModel | str:
        """Async counterpart of :meth:`generate`, awaiting ``ainvoke``."""
        response = await self.load_model().ainvoke(prompt)
        return self._coerce(response.content, schema)

    def get_model_name(self):
        """Return a human-readable name for this model wrapper."""
        return "Custom Bedrock Model"

# Replace these with real values.
# Builds the Bedrock chat client and wraps it in the DeepEval adapter defined above.
custom_model = ChatBedrock(model_id="amazon.titan-text-premier-v1:0", max_tokens=1000, temperature=0.7)
amazon_titan = AmazonTitan(model=custom_model)
# print(amazon_titan.generate("Write me a joke"))

In [None]:
from deepeval.synthesizer import Synthesizer
from dotenv import load_dotenv; load_dotenv()

# Synthesizer wired to the custom Titan wrapper (as both generator and critic)
# and the custom embedding model, with async mode turned off.
synthesizer = Synthesizer(
    async_mode=False,
    embedder=CustomEmbeddingModel(),
    critic_model=amazon_titan,
    model=amazon_titan,
)
# Generate goldens from the single input document; the call is left as a bare
# expression so the notebook displays its result.
synthesizer.generate_goldens_from_docs(document_paths=["output.txt"])

In [None]:
from deepeval.synthesizer import Synthesizer
from dotenv import load_dotenv; load_dotenv()

# Same generation run, but with a Synthesizer built entirely from its defaults
# (no custom model, critic or embedder passed in).
synthesizer = Synthesizer()
synthesizer.generate_goldens_from_docs(document_paths=["output.txt"])