In [2]:
from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.pydantic_v1 import BaseModel
from langchain_experimental.tabular_synthetic_data.base import SyntheticDataGenerator
from langchain_experimental.tabular_synthetic_data.openai import create_openai_data_generator, OPENAI_TEMPLATE
from langchain_experimental.tabular_synthetic_data.prompts import SYNTHETIC_FEW_SHOT_SUFFIX, SYNTHETIC_FEW_SHOT_PREFIX

In [4]:

class MedicalBilling(BaseModel):
    patient_id: int
    patient_name: str
    diagnosis_code: str
    procedure_code: str
    total_charge: float
    insurance_claim_amount: float

examples = [
    {"example": """Patient ID: unique interger, Patient Name: John Doe, Diagnosis Code:
        J20.9, Procedure Code: 99203, Total Charge: $500, Insurance Claim Amount: $350"""},
    {"example": """Patient ID: 789012, Patient Name: Johnson Smith, Diagnosis
        Code: M.5, 54Procedure Code: 99213, Total Charge: $150, Insurance Claim Amount: $120"""},
    {"example": """Patient ID: 345678, Patient Name: Emily Stone, Diagnosis Code:
        E11.9, Procedure Code: 99214, Total Charge: $300, Insurance Claim Amount: $250"""},
]
OPENAI_TEMPLATE = PromptTemplate(input_variables=["example"], template="{example}")

prompt_template = FewShotPromptTemplate(
    prefix=SYNTHETIC_FEW_SHOT_PREFIX,
    examples=examples,
    suffix=SYNTHETIC_FEW_SHOT_SUFFIX,
    input_variables=["subject", "extra"],
    example_prompt=OPENAI_TEMPLATE,
)
synthetic_data_generator = create_openai_data_generator(
    output_schema=MedicalBilling,
    llm=ChatOpenAI(temperature=0.3),
    prompt=prompt_template,
)
synthetic_results =  synthetic_data_generator.generate(
    subject="medical_billing",
    extra="the name must be chosen at random. Make it something you wouldn't normally choose. choose distinct Patient ID in each record.",
    runs=10,
)
import pandas as pd

# Create a list of dictionaries from the objects
synthetic_data = []
for item in synthetic_results:
    synthetic_data.append({
        'patient_id': item.patient_id,
        'patient_name': item.patient_name,
        'diagnosis_code': item.diagnosis_code,
        'procedure_code': item.procedure_code,
        'total_charge': item.total_charge,
        'insurance_claim_amount': item.insurance_claim_amount
    })

# Create a Pandas DataFrame from the list of dictionaries
synthetic_df = pd.DataFrame(synthetic_data)
print(synthetic_df.shape, synthetic_df.columns,)
synthetic_df

(10, 6) Index(['patient_id', 'patient_name', 'diagnosis_code', 'procedure_code',
       'total_charge', 'insurance_claim_amount'],
      dtype='object')


Unnamed: 0,patient_id,patient_name,diagnosis_code,procedure_code,total_charge,insurance_claim_amount
0,123456,Alice Johnson,A09.9,99204,400.0,300.0
1,987654,Sophia Rodriguez,G47.0,99203,250.0,200.0
2,456789,Oliver Smith,I10,99213,350.0,275.0
3,123456,Eleanor Thompson,B20.1,99205,450.0,350.0
4,987654,Aloysius Jenkins,F32.9,99204,300.0,240.0
5,789012,Zephyr Rodriguez,M12.5,99214,400.0,320.0
6,345678,Octavia Patel,G47.0,99213,350.0,280.0
7,123456,Xavier Thompson,I10,99203,250.0,200.0
8,987654,Aurora Singh,F32.9,99215,450.0,360.0
9,987654,Ezekiel Rodriguez,M54.5,99214,400.0,320.0


In [37]:
synthetic_results = synthetic_data_generator.generate(
    subject="medical_billing",
    extra="the name must be chosen at random. Make it something you wouldn't normally choose.",
    runs=5,
)

TypeError: object list can't be used in 'await' expression

In [5]:
from typing import List, Dict
import pandas as pd
from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from langchain.chat_models import ChatOpenAI
from pydantic import BaseModel, Field
from langchain_experimental.tabular_synthetic_data.openai import create_openai_data_generator

from pydantic import BaseModel, Field, ValidationError, validator
from typing import Set

# Global registry to keep track of used patient_ids
_patient_id_registry: Set[int] = set()


class MedicalBilling(BaseModel):
    patient_id: int = Field(...)
    patient_name: str
    diagnosis_code: str
    procedure_code: str
    total_charge: float
    insurance_claim_amount: float
    class Config:
        arbitrary_types_allowed = True

    @validator('patient_id')
    def check_patient_id_uniqueness(cls, v):
        if v in _patient_id_registry:
            raise ValueError('patient_id must be unique')
        _patient_id_registry.add(v)
        return v


# Example usage
try:
    bill1 = MedicalBilling(patient_id=1, patient_name="John Doe", diagnosis_code="E11.9", procedure_code="99214", total_charge=300, insurance_claim_amount=250)
    bill2 = MedicalBilling(patient_id=1, patient_name="Jane Doe", diagnosis_code="B34.2", procedure_code="99213", total_charge=200, insurance_claim_amount=150)  # This should raise a ValidationError
except ValidationError as e:
    print(e)

# Utility class for managing synthetic data
class SyntheticDataUtils:
    @staticmethod
    def convert_to_dataframe(data: List[MedicalBilling]) -> pd.DataFrame:
        data_dicts = [item.dict(by_alias=True) for item in data]
        return pd.DataFrame(data_dicts)

# Main class for generating synthetic medical billing data
class SyntheticMedicalBillingGenerator:
    def __init__(self, examples: List[Dict], temperature: float = 1.0):
        self.generate_prompt_template(examples)
        self.synthetic_data_generator = create_openai_data_generator(
            output_schema=MedicalBilling,
            llm=ChatOpenAI(temperature=temperature),
            prompt=self.prompt_template,
        )

    def generate_prompt_template(self, examples: List[Dict]):
        self.prompt_template = FewShotPromptTemplate(
            prefix="Please generate synthetic medical billing data. Examples are:",
            examples=examples,
            suffix="Now generate a new entry based on the pattern observed.",
            input_variables=["subject", "extra"],
            example_prompt=PromptTemplate(input_variables=["example"], template="{example}")
        )

    def generate(self, subject: str, extra: str, runs: int = 100) -> List[MedicalBilling]:
        return self.synthetic_data_generator.generate(
            subject=subject,
            extra=extra,
            runs=runs,
        )

# Example usage
examples = [
    {"example": """Patient ID: 123456, Patient Name: John Doe, Diagnosis Code:
        J20.9, Procedure Code: 99203, Total Charge: $500, Insurance Claim Amount: $350"""},
    {"example": """Patient ID: 789012, Patient Name: Johnson Smith, Diagnosis
        Code: M54.5, Procedure Code: 99213, Total Charge: $150, Insurance Claim Amount: $120"""},
    {"example": """Patient ID: 345678, Patient Name: Emily Stone, Diagnosis Code:
        E11.9, Procedure Code: 99214, Total Charge: $300, Insurance Claim Amount: $250"""},
]

generator = SyntheticMedicalBillingGenerator(examples=examples)

# Generate
synthetic_data = generator.generate(
    subject="medical_billing",
    extra="the name must be chosen at random. Make it something you wouldn't normally choose.",
    runs=100
)

# Convert to DataFrame
synthetic_df = SyntheticDataUtils.convert_to_dataframe(synthetic_data)

# Display the DataFrame
print(synthetic_df.head())
print(synthetic_df.shape)

C:\Users\jdamodhar\AppData\Local\Temp\ipykernel_27088\395335319.py:25: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.6/migration/
  @validator('patient_id')


1 validation error for MedicalBilling
patient_id
  Value error, patient_id must be unique [type=value_error, input_value=1, input_type=int]
    For further information visit https://errors.pydantic.dev/2.6/v/value_error


RuntimeError: no validator found for <class '__main__.MedicalBilling'>, see `arbitrary_types_allowed` in Config

In [8]:
from typing import List, Dict
import pandas as pd
from pydantic import BaseModel, Field, ValidationError, root_validator
from random import randint
from faker import Faker

fake = Faker()

# Global registry to keep track of used patient_ids
_patient_id_registry: set[int] = set()

class MedicalBilling(BaseModel):
    patient_id: int 
    patient_name: str
    diagnosis_code: str
    procedure_code: str
    total_charge: float
    insurance_claim_amount: float

    class Config:
        arbitrary_types_allowed = True

    @root_validator(pre=True)
    def check_patient_id_uniqueness(cls, values):
        patient_id = values.get('patient_id')
        if patient_id in _patient_id_registry:
            raise ValueError('patient_id must be unique')
        _patient_id_registry.add(patient_id)
        return values

def generate_synthetic_data_entry() -> dict:
    return {
        "patient_id": randint(100000, 999999),  # random int as a placeholder
        "patient_name": fake.name(),
        "diagnosis_code": fake.lexify(text="????.?", letters="ABCDEFGHIJKLMNOPQRSTUVWXYZ"),
        "procedure_code": fake.bothify(text="#####", letters="ABCDEFGHIJKLMNOPQRSTUVWXYZ"),
        "total_charge": round(fake.pyfloat(right_digits=2, positive=True, min_value=100, max_value=500), 2),
        "insurance_claim_amount": round(fake.pyfloat(right_digits=2, positive=True, min_value=50, max_value=450), 2)
    }
def generate_unique_patient_id():
    while True:
        new_id = randint(100000, 100009)
        if new_id not in _patient_id_registry:
            return new_id

def generate_and_validate_data(num_entries: int) -> List[MedicalBilling]:
    entries = []
    for _ in range(num_entries):
        data_entry = generate_synthetic_data_entry()
        # data_entry['patient_id'] = generate_unique_patient_id()  # Ensure a unique patient_id
        try:
            validated_entry = MedicalBilling(**data_entry)
            entries.append(validated_entry)
        except ValidationError as e:
            print(e)
    return entries

# Example usage
num_entries = 100  # Specify the number of synthetic entries to generate
synthetic_data = generate_and_validate_data(num_entries)

# Convert to DataFrame
synthetic_df = pd.DataFrame([entry.dict() for entry in synthetic_data])

# Display the DataFrame
print(f"DataFrame shape: {synthetic_df.shape}")
(synthetic_df.head())

C:\Users\jdamodhar\AppData\Local\Temp\ipykernel_29980\4237496820.py:23: PydanticDeprecatedSince20: Pydantic V1 style `@root_validator` validators are deprecated. You should migrate to Pydantic V2 style `@model_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.6/migration/
  @root_validator(pre=True)


DataFrame shape: (100, 6)


Unnamed: 0,patient_id,patient_name,diagnosis_code,procedure_code,total_charge,insurance_claim_amount
0,725015,Penny Welch,NTZK.H,88469,456.7,258.26
1,496524,Lisa Rojas,EEOY.G,92238,262.83,280.44
2,165223,Patrick Nelson,OQIK.M,50720,346.1,252.23
3,713955,Robert Tate,PCKX.X,83744,388.78,396.31
4,462565,Edward Tucker,FMEU.V,55415,456.14,167.97
