In [None]:
from langchain_community.document_loaders import PyPDFLoader

# Point the loader at the sample PDF (the path is relative to the working directory).
loader = PyPDFLoader("example_data/layout-parser-paper.pdf")
# Load the PDF and split it into chunks in one call.
# NOTE(review): `pages` is not referenced by the code visible below - confirm
# whether a later cell consumes it.
pages = loader.load_and_split()

In [None]:
import os
from langchain_ollama import Ollama
from langchain.vectorstores import Chroma
from langchain.embeddings import OllamaEmbeddings
from langchain.schema import Document
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate
import pandas as pd

# Configure Ollama
# Two different models are used: one to embed text for retrieval and one to
# generate text (questions and answers).
embedding_model = "mxbai-embed-large"
generation_model = "llama3.1"

# The vector store needs an embeddings object (embed_documents / embed_query),
# not a text-generation LLM, so the embedding model is wrapped in
# OllamaEmbeddings rather than Ollama.
ollama_embeddings = OllamaEmbeddings(model=embedding_model)
# NOTE(review): the `langchain_ollama` package publishes its LLM wrapper as
# `OllamaLLM`; confirm that the `Ollama` import above resolves in the
# installed version.
ollama_llm = Ollama(model=generation_model)

# Initialize vector store (ChromaDB)
def initialize_vector_store(docs, persist_directory="vector_store"):
    """Create a Chroma vector store from ``docs``.

    Args:
        docs: Documents to embed and index.
        persist_directory: Directory handed to Chroma as its persistence
            location.

    Returns:
        The vector store produced by ``Chroma.from_documents``, embedded with
        the module-level ``ollama_embeddings``.
    """
    return Chroma.from_documents(
        documents=docs,
        embedding=ollama_embeddings,
        persist_directory=persist_directory,
    )

# Load and Embed Data into Vector Store
def prepare_vector_store(data):
    """Wrap each raw text in a ``Document`` and index them all.

    Args:
        data: Iterable of context strings.

    Returns:
        The vector store returned by ``initialize_vector_store``.
    """
    return initialize_vector_store(
        [Document(page_content=text) for text in data]
    )

# Generate Synthetic Data
def generate_synthetic_data(vector_store, user_inputs, num_samples=1):
    """Build question / context / answer records for RAG evaluation.

    For every question the single best-matching document is retrieved from
    ``vector_store`` and the module-level ``ollama_llm`` is asked to answer
    the question from that context.

    Args:
        vector_store: Vector store exposing ``as_retriever``.
        user_inputs: Iterable of question strings.
        num_samples: Currently unused; accepted so existing callers that pass
            it keep working.

    Returns:
        A list of dicts with the keys ``"user_input"``,
        ``"reference_context"`` and ``"ground_truth"``. Questions for which
        the retriever returns no document are skipped.
    """
    # The retriever does not depend on the question, so build it once.
    retriever = vector_store.as_retriever(search_kwargs={"k": 1})
    synthetic_data = []

    for user_input in user_inputs:
        # Retrieve reference context
        retrieved_docs = retriever.invoke(user_input)
        if not retrieved_docs:
            # Without a context there is nothing to ground an answer on.
            continue
        context = retrieved_docs[0].page_content

        # Generate ground truth answer using the retrieved context
        answer_prompt = f"Context:\n{context}\n\nQuestion:\n{user_input}\n\nProvide a concise and accurate answer based on the context:"
        # `invoke` takes one prompt string and returns the completion text;
        # `generate` expects a list of prompts and returns a result object.
        answer = ollama_llm.invoke(answer_prompt)

        # Store the synthetic dataset
        synthetic_data.append({
            "user_input": user_input,
            "reference_context": context,
            "ground_truth": answer,
        })

    return synthetic_data

# Example Contexts (Replace with your dataset)
example_contexts = [
    "The Eiffel Tower is located in Paris, France. It was completed in 1889 and is one of the most recognizable structures in the world.",
    "The Great Wall of China stretches over 13,000 miles and was built to protect against invasions. It is a UNESCO World Heritage site."
]

# Embed contexts into the vector store
vector_store = prepare_vector_store(example_contexts)

# Generate User Inputs (Highly Automatic)
user_input_prompts = [
    "Generate a question about the Eiffel Tower.",
    "Generate a question about the Great Wall of China."
]

# `invoke` takes a single prompt string and returns the generated text, so
# each entry of `user_inputs` is a plain question string that can be used
# both as a retrieval query and inside the answer prompt.
user_inputs = [ollama_llm.invoke(prompt) for prompt in user_input_prompts]

# Generate Synthetic Dataset
synthetic_dataset = generate_synthetic_data(vector_store, user_inputs, num_samples=5)

# Save Dataset to CSV
# One row per generated record; the DataFrame index is not written.
df = pd.DataFrame(synthetic_dataset)
df.to_csv("synthetic_rag_evaluation_data.csv", index=False)
print("Synthetic data saved to synthetic_rag_evaluation_data.csv.")
