# Generate Sythetic Data

### Setup

In [2]:
%pip install --upgrade --quiet  langchain langchain_experimental

Note: you may need to restart the kernel to use updated packages.


In [6]:
from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from langchain_experimental.tabular_synthetic_data.base import SyntheticDataGenerator
from langchain_experimental.tabular_synthetic_data.openai import OPENAI_TEMPLATE
from langchain_experimental.tabular_synthetic_data.prompts import (
    SYNTHETIC_FEW_SHOT_PREFIX,
    SYNTHETIC_FEW_SHOT_SUFFIX,
)
from langchain_ollama import ChatOllama
from pydantic import BaseModel

## 1. Define your data model

**schema**: a structure of a dataset

In [7]:
class MedicalBilling(BaseModel):
    patient_id: int
    patient_name: str
    diagnosis_code: str
    procedure_code: str
    total_charge: float
    insurance_claim_amount: float

## 2. Sample data

**seed**: representative of the kind of data you want

In [8]:
examples = [
    {
        "example": """Patient ID: 123456, Patient Name: John Doe, Diagnosis Code: 
        J20.9, Procedure Code: 99203, Total Charge: $500, Insurance Claim Amount: $350"""
    },
    {
        "example": """Patient ID: 789012, Patient Name: Johnson Smith, Diagnosis 
        Code: M54.5, Procedure Code: 99213, Total Charge: $150, Insurance Claim Amount: $120"""
    },
    {
        "example": """Patient ID: 345678, Patient Name: Emily Stone, Diagnosis Code: 
        E11.9, Procedure Code: 99214, Total Charge: $300, Insurance Claim Amount: $250"""
    },
]

## 3. Craft a prompt template

In [9]:
OPENAI_TEMPLATE = PromptTemplate(input_variables=["example"], template="{example}")

prompt_template = FewShotPromptTemplate(
    prefix=SYNTHETIC_FEW_SHOT_PREFIX,
    examples=examples,
    suffix=SYNTHETIC_FEW_SHOT_SUFFIX,
    input_variables=["subject", "extra"],
    example_prompt=OPENAI_TEMPLATE,
)

## 4. Creating the data generator

In [20]:
llm=ChatOllama( model="llama3.1", temperature=1)
synthetic_data_generator = SyntheticDataGenerator(
    llm=llm,
    template=prompt_template
)

## 5. Generate synthetic data

In [21]:
synthetic_results = synthetic_data_generator.generate(
    subject="medical_billing",
    extra="the name must be chosen at random. Make it something you wouldn't normally choose.",
    runs=10,
)

In [22]:
print(synthetic_results)

['Here are five examples of synthetic medical billing data:\n\nPatient ID: 901234, Patient Name: Balthazar McSnodgrass, Diagnosis Code: R05.9, Procedure Code: 99201, Total Charge: $1200, Insurance Claim Amount: $900\n\nPatient ID: 678901, Patient Name: Zephyr Wimplebottom, Diagnosis Code: F02.2, Procedure Code: 99204, Total Charge: $2500, Insurance Claim Amount: $2000\n\nPatient ID: 135791, Patient Name: Bertram Pocketwatch, Diagnosis Code: M60.3, Procedure Code: 99210, Total Charge: $800, Insurance Claim Amount: $600\n\nPatient ID: 943215, Patient Name: Clarissa Quizzlethorpe, Diagnosis Code: J30.9, Procedure Code: 99211, Total Charge: $1500, Insurance Claim Amount: $1200\n\nPatient ID: 278491, Patient Name: Godfrey Snoodlewhomper, Diagnosis Code: K55.4, Procedure Code: 99212, Total Charge: $1000, Insurance Claim Amount: $800', 'Here are five examples of synthetic medical billing data with randomly generated names:\n\nPatient ID: 7654321, Patient Name: Thorold Quargsplore, Diagnosis C

### Other implementations

## Generating examplary dataset for extraction benchmarking purposes

## Extraction from generated examples

### Parsers

### Extractors