In [29]:
from langchain_core.prompts import FewShotPromptTemplate, PromptTemplate
from langchain_core.pydantic_v1 import BaseModel
from langchain_experimental.tabular_synthetic_data.openai import (
    OPENAI_TEMPLATE,
    create_openai_data_generator,
)
from langchain_experimental.tabular_synthetic_data.prompts import (
    SYNTHETIC_FEW_SHOT_PREFIX,
    SYNTHETIC_FEW_SHOT_SUFFIX,
)
from langchain_openai import AzureChatOpenAI

from dotenv import load_dotenv
import os

# Load settings via python-dotenv, then read the OpenAI / Azure OpenAI values
# from the process environment. Any variable that is not set comes back as None.
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
OPENAI_DEPLOYMENT_ENDPOINT = os.environ.get("OPENAI_DEPLOYMENT_ENDPOINT")
OPENAI_GPT35_DEPLOYMENT_NAME = os.environ.get("OPENAI_GPT35_DEPLOYMENT_NAME")
OPENAI_GPT4_DEPLOYMENT_NAME = os.environ.get("OPENAI_GPT4_DEPLOYMENT_NAME")

# API version string handed to the Azure chat client further down.
api_version = "2023-09-01-preview"

In [33]:
# Data model for one synthetic bank transaction. It is passed as the
# `output_schema` when the data generator is created, so the field names and
# types below define the shape of every generated record.
class BankTransaction(BaseModel):
    # NOTE(review): an int cannot keep leading zeros (e.g. ZIP 02134 becomes
    # 2134); consider `str` if such ZIP codes must round-trip — confirm with
    # downstream consumers before changing the schema.
    us_zip_code: int
    # Free-text description of the transaction as it would appear on a statement.
    bank_transaction_description: str
    # Short description of the product or service the transaction paid for.
    product_description: str
    # Transaction amount (the few-shot examples express it in dollars).
    amount: float

In [34]:
# Few-shot examples for the synthetic-data prompt. Every entry is a dict with
# a single "example" key, matching the "{example}" slot of the example prompt.
examples = [
    {"example": text}
    for text in (
        "US ZIP CODE: 77449, Bank Transaction Description: British Telecom, product_description: Annual fee, Amount: $350",
        "US ZIP CODE: 90011, Bank Transaction Description: Uber BV Toll Free Amsterdam NLD, product_description: toll, Amount: $54",
        "US ZIP CODE: 10458, Bank Transaction Description: THE AMERICAN HEART ASSOCIATION DALLAS TX, product_description: Subscription,Amount: $850",
        "US ZIP CODE: 95123, Bank Transaction Description: Transferencia Banco Santander Madrid, product_description: Transferencia bancaria, Amount: $550",
    )
]

In [36]:
# Prompt that renders one few-shot example verbatim. It is bound to its own
# name so the OPENAI_TEMPLATE imported from langchain_experimental at the top
# of the notebook is no longer silently shadowed by this cell.
example_prompt = PromptTemplate(input_variables=["example"], template="{example}")

# Few-shot prompt: library-provided prefix/suffix wrapped around the examples.
# "subject" and "extra" are filled in when the generator is invoked.
prompt_template = FewShotPromptTemplate(
    prefix=SYNTHETIC_FEW_SHOT_PREFIX,
    examples=examples,
    suffix=SYNTHETIC_FEW_SHOT_SUFFIX,
    input_variables=["subject", "extra"],
    example_prompt=example_prompt,
)

In [37]:

# Azure-hosted chat model. The GPT-4 deployment name is supplied both as the
# deployment and as the model identifier, exactly as configured above.
llm = AzureChatOpenAI(
    azure_endpoint=OPENAI_DEPLOYMENT_ENDPOINT,
    openai_api_version=api_version,
    deployment_name=OPENAI_GPT4_DEPLOYMENT_NAME,
    model=OPENAI_GPT4_DEPLOYMENT_NAME,
    temperature=1,
)

# Generator that combines the few-shot prompt, the chat model and the
# BankTransaction schema the outputs must conform to.
synthetic_data_generator = create_openai_data_generator(
    output_schema=BankTransaction,
    llm=llm,
    prompt=prompt_template,
)

In [43]:
# Request 10 generation runs. `subject` and `extra` fill the matching slots of
# the few-shot prompt; the `extra` text is written with explicit escapes so its
# embedded line break and indentation are visible rather than hidden in a
# triple-quoted literal.
synthetic_results = synthetic_data_generator.generate(
    runs=10,
    subject="Generate real bank transactions",
    extra=(
        "Choose real US ZIP codes for the us_zip_code. \n"
        "    Choose real company names and purchases or payments for the bank_transaction_description"
    ),
)
display(synthetic_results)

[BankTransaction(us_zip_code=10001, bank_transaction_description='Spotify Premium', product_description='Music Streaming Subscription', amount=9.99),
 BankTransaction(us_zip_code=94105, bank_transaction_description='Whole Foods Market Purchase San Francisco CA', product_description='Groceries', amount=127.45),
 BankTransaction(us_zip_code=90015, bank_transaction_description='Netflix Monthly Subscription', product_description='Streaming Service', amount=15.99),
 BankTransaction(us_zip_code=13210, bank_transaction_description='AT&T Bill Payment', product_description='Mobile Service Bill', amount=65.25),
 BankTransaction(us_zip_code=60601, bank_transaction_description='Amazon Prime Membership', product_description='Online Services Subscription', amount=14.99),
 BankTransaction(us_zip_code=10007, bank_transaction_description='Starbucks Coffee Purchase New York NY', product_description='Coffee and Snacks', amount=23.76),
 BankTransaction(us_zip_code=94105, bank_transaction_description='Appl

In [44]:
from pathlib import Path

import pandas as pd

# One row per generated transaction, one column per model field.
df = pd.DataFrame([result.dict() for result in synthetic_results])

# A bare `df.head()` in the middle of a cell is evaluated and discarded; only
# the last expression of a cell is rendered. Display the preview explicitly.
display(df.head())

# `to_csv` does not create missing directories, so make sure the target folder
# exists before writing (a fresh checkout may not have ./data yet).
output_path = Path("./data/synthetic_bank_transactions.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path, index=False)

## Generating an example dataset for extraction benchmarking

In [None]:
# DatasetGenerator is not imported anywhere else in this notebook; import it
# here so the cell works on a fresh kernel instead of raising NameError.
from langchain_experimental.synthetic_data import DatasetGenerator

# Ground-truth records: each actor with a list of films. The generated text is
# later compared against these exact values.
inp = [
    {
        "Actor": "Tom Hanks",
        "Film": [
            "Forrest Gump",
            "Saving Private Ryan",
            "The Green Mile",
            "Toy Story",
            "Catch Me If You Can",
        ],
    },
    {
        "Actor": "Tom Hardy",
        "Film": [
            "Inception",
            "The Dark Knight Rises",
            "Mad Max: Fury Road",
            "The Revenant",
            "Dunkirk",
        ],
    },
]

# `model` was never defined in this notebook. Reuse the chat model created
# earlier as `llm` so this cell (and the extraction cell that also expects
# `model`) runs top to bottom.
# NOTE(review): confirm the Azure chat deployment is the intended model here.
model = llm

# The second argument carries the sentence preferences for the generated text.
generator = DatasetGenerator(model, {"style": "informal", "minimal length": 500})
dataset = generator(inp)

In [None]:
# Show the generated records. Per the saved output, each record holds the
# input `fields`, the `preferences` given to the generator, and the generated
# `text`.
dataset

[{'fields': {'Actor': 'Tom Hanks',
   'Film': ['Forrest Gump',
    'Saving Private Ryan',
    'The Green Mile',
    'Toy Story',
    'Catch Me If You Can']},
  'preferences': {'style': 'informal', 'minimal length': 500},
  'text': 'Tom Hanks, the versatile and charismatic actor, has graced the silver screen in numerous iconic films including the heartwarming and inspirational "Forrest Gump," the intense and gripping war drama "Saving Private Ryan," the emotionally charged and thought-provoking "The Green Mile," the beloved animated classic "Toy Story," and the thrilling and captivating true story adaptation "Catch Me If You Can." With his impressive range and genuine talent, Hanks continues to captivate audiences worldwide, leaving an indelible mark on the world of cinema.'},
 {'fields': {'Actor': 'Tom Hardy',
   'Film': ['Inception',
    'The Dark Knight Rises',
    'Mad Max: Fury Road',
    'The Revenant',
    'Dunkirk']},
  'preferences': {'style': 'informal', 'minimal length': 500}

## Extraction from generated examples
Now let's check whether we can extract structured output from the generated text, and how the extracted values compare with the original input records.

In [None]:
from typing import List

from langchain.chains import create_extraction_chain_pydantic
from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import OpenAI
from pydantic import BaseModel, Field

In [None]:
# Target schema for extraction: an actor's name plus the films they starred in.
# The field names match the keys of the `inp` records so that extracted values
# can be compared with the inputs directly.
class Actor(BaseModel):
    # Actor's name (this attribute shares its name with the enclosing class).
    Actor: str = Field(description="name of an actor")
    # Film titles as a list of strings.
    Film: List[str] = Field(description="list of names of films they starred in")

### Parsers

In [None]:
# NOTE: this rebinds `llm`, which an earlier cell set to the Azure chat model.
llm = OpenAI()

# The parser supplies the format instructions for the prompt and later
# converts the model's text reply into an `Actor` instance.
parser = PydanticOutputParser(pydantic_object=Actor)

prompt = PromptTemplate(
    input_variables=["text"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
    template=(
        "Extract fields from a given text.\n"
        "{format_instructions}\n"
        "{text}\n"
    ),
)

# Render the prompt for the first generated record and send it to the model.
_input = prompt.format_prompt(text=dataset[0]["text"])
output = llm.invoke(_input.to_string())

# Parse the raw reply into the schema and show the result.
parsed = parser.parse(output)
parsed

Actor(Actor='Tom Hanks', Film=['Forrest Gump', 'Saving Private Ryan', 'The Green Mile', 'Toy Story', 'Catch Me If You Can'])

In [None]:
# True only when both the actor name and the film list match the first input record.
parsed.Actor == inp[0]["Actor"] and parsed.Film == inp[0]["Film"]

True

### Extractors

In [None]:
# Build an extraction chain for the Actor schema and run it on the text of the
# second generated record.
extractor = create_extraction_chain_pydantic(
    llm=model,
    pydantic_schema=Actor,
)
extracted = extractor.run(dataset[1]["text"])
extracted

[Actor(Actor='Tom Hardy', Film=['Inception', 'The Dark Knight Rises', 'Mad Max: Fury Road', 'The Revenant', 'Dunkirk'])]

In [None]:
# True only when the first extracted item matches the second input record on both fields.
extracted[0].Actor == inp[1]["Actor"] and extracted[0].Film == inp[1]["Film"]

True