# Generate Sythetic Data

### Setup

In [2]:
%pip install --upgrade --quiet  langchain langchain_experimental

Note: you may need to restart the kernel to use updated packages.


In [6]:
from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from langchain_experimental.tabular_synthetic_data.base import SyntheticDataGenerator
from langchain_experimental.tabular_synthetic_data.openai import OPENAI_TEMPLATE
from langchain_experimental.tabular_synthetic_data.prompts import (
    SYNTHETIC_FEW_SHOT_PREFIX,
    SYNTHETIC_FEW_SHOT_SUFFIX,
)
from langchain_ollama import ChatOllama
from pydantic import BaseModel

## 1. Define your data model

**schema**: a structure of a dataset

In [7]:
class MedicalBilling(BaseModel):
    patient_id: int
    patient_name: str
    diagnosis_code: str
    procedure_code: str
    total_charge: float
    insurance_claim_amount: float

## 2. Sample data

**seed**: representative of the kind of data you want

In [8]:
examples = [
    {
        "example": """Patient ID: 123456, Patient Name: John Doe, Diagnosis Code: 
        J20.9, Procedure Code: 99203, Total Charge: $500, Insurance Claim Amount: $350"""
    },
    {
        "example": """Patient ID: 789012, Patient Name: Johnson Smith, Diagnosis 
        Code: M54.5, Procedure Code: 99213, Total Charge: $150, Insurance Claim Amount: $120"""
    },
    {
        "example": """Patient ID: 345678, Patient Name: Emily Stone, Diagnosis Code: 
        E11.9, Procedure Code: 99214, Total Charge: $300, Insurance Claim Amount: $250"""
    },
]

## 3. Craft a prompt template

In [9]:
OPENAI_TEMPLATE = PromptTemplate(input_variables=["example"], template="{example}")

prompt_template = FewShotPromptTemplate(
    prefix=SYNTHETIC_FEW_SHOT_PREFIX,
    examples=examples,
    suffix=SYNTHETIC_FEW_SHOT_SUFFIX,
    input_variables=["subject", "extra"],
    example_prompt=OPENAI_TEMPLATE,
)

## 4. Creating the data generator

In [20]:
llm=ChatOllama( model="llama3.1", temperature=1)
synthetic_data_generator = SyntheticDataGenerator(
    llm=llm,
    template=prompt_template
)

## 5. Generate synthetic data

In [21]:
synthetic_results = synthetic_data_generator.generate(
    subject="medical_billing",
    extra="the name must be chosen at random. Make it something you wouldn't normally choose.",
    runs=10,
)

In [22]:
print(synthetic_results)

['Here are five examples of synthetic medical billing data:\n\nPatient ID: 901234, Patient Name: Balthazar McSnodgrass, Diagnosis Code: R05.9, Procedure Code: 99201, Total Charge: $1200, Insurance Claim Amount: $900\n\nPatient ID: 678901, Patient Name: Zephyr Wimplebottom, Diagnosis Code: F02.2, Procedure Code: 99204, Total Charge: $2500, Insurance Claim Amount: $2000\n\nPatient ID: 135791, Patient Name: Bertram Pocketwatch, Diagnosis Code: M60.3, Procedure Code: 99210, Total Charge: $800, Insurance Claim Amount: $600\n\nPatient ID: 943215, Patient Name: Clarissa Quizzlethorpe, Diagnosis Code: J30.9, Procedure Code: 99211, Total Charge: $1500, Insurance Claim Amount: $1200\n\nPatient ID: 278491, Patient Name: Godfrey Snoodlewhomper, Diagnosis Code: K55.4, Procedure Code: 99212, Total Charge: $1000, Insurance Claim Amount: $800', 'Here are five examples of synthetic medical billing data with randomly generated names:\n\nPatient ID: 7654321, Patient Name: Thorold Quargsplore, Diagnosis C

### Other implementations

In [1]:
from langchain_experimental.synthetic_data import (
    DatasetGenerator,
    create_data_generation_chain,
)
from langchain_ollama import ChatOllama

In [2]:
# LLM
model = ChatOllama(model="llama3.1", temperature=0.7)
chain = create_data_generation_chain(model)

In [3]:
chain({"fields": ["blue", "yellow"], "preferences":{}})

  chain({"fields": ["blue", "yellow"], "preferences":{}})


{'fields': ['blue', 'yellow'],
 'preferences': {},
 'text': 'In the vibrant town square, a brilliant yellow street performer juggled three bright balls while wearing a stunning blue top hat adorned with intricate golden trimmings and delicate white flowers.'}

In [4]:
chain(
    {
        "fields": {"colors": ["blue", "yellow"]},
        "preferences": {"style": "Make it in a style of a weather forecast."},
    }
)

{'fields': {'colors': ['blue', 'yellow']},
 'preferences': {'style': 'Make it in a style of a weather forecast.'},
 'text': 'Here\'s a detailed and interesting sentence about the fields, crafted in the style of a weather forecast:\n\n"Good morning, folks! We\'ve got a real treat for you today - a brilliant blue sky with just a hint of warmth, courtesy of a gentle yellow sun shining brightly overhead. A high-pressure system is dominating our region, bringing clear conditions and a refreshing breeze to keep things cool and comfortable. As we head into the afternoon, expect a slight increase in temperature, but no worries there, as the pleasant yellow hues will continue to brighten up our day."'}

In [5]:
chain(
    {
        "fields": [
            {"actor": "Tom Hanks", "movies": ["Forrest Gump", "Green Mile"]},
            {"actor": "Mads Mikkelsen", "movies": ["Hannibal", "Another round"]},
        ],
        "preferences": {"minimum_length": 200, "style": "gossip"},
    }
)

{'fields': [{'actor': 'Tom Hanks', 'movies': ['Forrest Gump', 'Green Mile']},
  {'actor': 'Mads Mikkelsen', 'movies': ['Hannibal', 'Another round']}],
 'preferences': {'minimum_length': 200, 'style': 'gossip'},
 'text': 'Here\'s a detailed and interesting sentence about the given fields:\n\n"Tom Hanks\' Oscar-worthy performances in films like Forrest Gump and The Green Mile have long been the stuff of Hollywood legend, but little do people know that his co-star in those movies was actually a method actor who would go on to star in another iconic role as a cannibalistic serial killer in the TV series Hannibal. Meanwhile, across the pond, Danish actor Mads Mikkelsen was busy stealing scenes with his breakout performance in Thomas Vinterberg\'s critically acclaimed drama Another Round, which would cement his status as one of Europe\'s most sought-after leading men - and all this time, he had been quietly admiring Hanks\' work from afar."\n\nThis sentence meets the minimum length requireme

## Generating examplary dataset for extraction benchmarking purposes

In [6]:
inp = [
    {
        "Actor": "Tom Hanks",
        "Film": [
            "Forrest Gump",
            "Saving Private Ryan",
            "The Green Mile",
            "Toy Story",
            "Catch Me If You Can",
        ],
    },
    {
        "Actor": "Tom Hardy",
        "Film": [
            "Inception",
            "The Dark Knight Rises",
            "Mad Max: Fury Road",
            "The Revenant",
            "Dunkirk",
        ],
    },
]

generator = DatasetGenerator(model, {"style": "informal", "minimal length": 500})
dataset = generator(inp)

In [7]:
dataset

[{'fields': {'Actor': 'Tom Hanks',
   'Film': ['Forrest Gump',
    'Saving Private Ryan',
    'The Green Mile',
    'Toy Story',
    'Catch Me If You Can']},
  'preferences': {'style': 'informal', 'minimal length': 500},
  'text': 'Here\'s a detailed and interesting sentence about Tom Hanks using every given field:\n\n"Tom Hanks, the actor behind some of Hollywood\'s most iconic roles in films like Forrest Gump, Saving Private Ryan, The Green Mile, Toy Story, and Catch Me If You Can, is a master of versatility who can seamlessly transition from playing a lovable simpleton to a tough-as-nails soldier, all while making you laugh with his memorable performances as Woody in the Toy Story franchise."\n\nI\'ve used every given field and tried to create an engaging sentence that meets the style and minimal length preferences.'},
 {'fields': {'Actor': 'Tom Hardy',
   'Film': ['Inception',
    'The Dark Knight Rises',
    'Mad Max: Fury Road',
    'The Revenant',
    'Dunkirk']},
  'preferences