# Load PDF

In [1]:
from langchain_community.document_loaders import FileSystemBlobLoader
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.parsers import PyPDFParser

# Location of the source PDFs. The path is relative, so it is resolved against
# the kernel's current working directory.
PDF_DIR = "../datasets"
PDF_GLOB = "**/*.pdf"  # recursive pattern: PDF_DIR and any nested subdirectory

# The blob loader selects the files on disk; the parser is applied to each
# selected file to produce the documents used by the cells below.
loader = GenericLoader(
    blob_loader=FileSystemBlobLoader(
        path=PDF_DIR,
        glob=PDF_GLOB,
    ),
    blob_parser=PyPDFParser(),
)
docs = loader.load()

# Fail fast: a glob that matches nothing (e.g. the kernel was started from a
# different directory) leaves `docs` empty, and the problem would otherwise
# only show up, if at all, in the later test-generation cells.
if not docs:
    raise ValueError(
        f"No documents were loaded from {PDF_DIR!r} with glob {PDF_GLOB!r}; "
        "check the working directory and that the PDF files exist."
    )

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.
Ignoring wrong pointing object 68 0 (offset 0)
Ignoring wrong pointing object 72 0 (offset 0)


In [2]:
# Populate the process environment (presumably from a local .env file — confirm
# its location) before the next cell reads its settings via os.getenv.
# The call is left as the last expression so its return value is shown as the
# cell output.
from dotenv import load_dotenv
load_dotenv()

True

# Test Generation

In [3]:
from pprint import pp
import os
from llm import ModelGardenLLM
from embeddings import ModelGardenEmbeddings, OllamaRagasEmbeddings
from langchain_ollama import OllamaLLM, OllamaEmbeddings

# Backend selection is driven entirely by environment variables.
# Note that MODEL_GARDEN_MODEL supplies the model name for *both* backends.
llm_type = os.environ.get('LLM_TYPE')
model = os.environ.get('MODEL_GARDEN_MODEL')
embedding = os.environ.get('EMBEDDING_MODEL')

# Guard clause: reject anything that is not one of the two known backends
# (this includes LLM_TYPE being unset, in which case llm_type is None).
if llm_type not in ("model_garden", "ollama"):
    raise ValueError(f"Unsupported LLM type: {llm_type}")

if llm_type == "ollama":
    # Ollama backend: only the model names are needed.
    llm = OllamaLLM(model=model, temperature=0.4)
    embeds = OllamaRagasEmbeddings(model=embedding)
else:
    # Model Garden backend: the LLM and the embedding service each have
    # their own endpoint URL.
    url = os.environ.get('MODEL_GARDEN_URL')
    embed_url = os.environ.get('EMBEDDING_URL')
    llm = ModelGardenLLM(api_url=url, model=model)
    embeds = ModelGardenEmbeddings(api_url=embed_url, model=embedding)

# Echo which backend was picked so the run records it.
pp(llm_type)

'model_garden'


In [4]:
from ragas.testset import TestsetGenerator

# Wire the LLM and embedding model chosen in the previous cell into the
# ragas test-set generator.
generator = TestsetGenerator(
    llm=llm,
    embedding_model=embeds,
)

# Build a synthetic test set from the loaded PDF documents.
# NOTE(review): the recorded run produced 3 samples even though
# testset_size=1 — presumably at least one per query synthesizer; confirm
# against the ragas documentation before relying on the size.
dataset = generator.generate_with_langchain_docs(
    docs,
    testset_size=1,
)

Applying SummaryExtractor:   0%|          | 0/4 [00:00<?, ?it/s]

Applying CustomNodeFilter:   0%|          | 0/4 [00:00<?, ?it/s]

Applying EmbeddingExtractor:   0%|          | 0/4 [00:00<?, ?it/s]

Applying ThemesExtractor:   0%|          | 0/4 [00:00<?, ?it/s]

Applying NERExtractor:   0%|          | 0/4 [00:00<?, ?it/s]

Applying CosineSimilarityBuilder:   0%|          | 0/1 [00:00<?, ?it/s]

Applying OverlapScoreBuilder:   0%|          | 0/1 [00:00<?, ?it/s]

Generating personas:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Scenarios:   0%|          | 0/3 [00:00<?, ?it/s]

Generating Samples:   0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
# Convert the generated test set into a DataFrame for inspection.
df = dataset.to_pandas()
# Bare trailing expression: rendered as this cell's output.
df

Unnamed: 0,user_input,reference_contexts,reference,persona_name,query_style,query_length,synthesizer_name
0,"According to the provided text, what is the pr...",[H Ho ow w w we e b bu ui il lt t ‘ ‘B BA A...,The primary purpose of building ‘BARITO’ is to...,DevOps Engineer - GO-JEK,PERFECT_GRAMMAR,MEDIUM,single_hop_specific_query_synthesizer
1,How did GoPay address the challenges of a comp...,"[<1-hop>\n\nbigger, the ecosystem has also bec...","GoPay, as part of GoTo Financial, tackled the ...",,,,multi_hop_abstract_query_synthesizer
2,According to a RedSeer Industry Report release...,"[<1-hop>\n\nGoPay, as part of Indonesia’s tech...",According to a RedSeer Industry Report release...,,,,multi_hop_specific_query_synthesizer


In [6]:
# Keep only the three columns that are exported; the persona, query style,
# query length and synthesizer columns are left out.
df_final = df.loc[:, ['user_input', 'reference_contexts', 'reference']]

# Persist the trimmed test set without the index column.
# NOTE(review): `reference_contexts` appears to hold list values, which CSV
# presumably stores as their string form — confirm how consumers parse it.
df_final.to_csv(path_or_buf='test_dataset.csv', index=False)

# Bare trailing expression: rendered as this cell's output.
df_final

Unnamed: 0,user_input,reference_contexts,reference
0,"According to the provided text, what is the pr...",[H Ho ow w w we e b bu ui il lt t ‘ ‘B BA A...,The primary purpose of building ‘BARITO’ is to...
1,How did GoPay address the challenges of a comp...,"[<1-hop>\n\nbigger, the ecosystem has also bec...","GoPay, as part of GoTo Financial, tackled the ..."
2,According to a RedSeer Industry Report release...,"[<1-hop>\n\nGoPay, as part of Indonesia’s tech...",According to a RedSeer Industry Report release...
