In [None]:
from langchain_core.prompts import FewShotPromptTemplate, PromptTemplate
from langchain_openai import AzureChatOpenAI
from openai import AzureOpenAI
import textwrap

from dotenv import load_dotenv
import os

# Pull configuration from the environment; load_dotenv() is called first so
# that values kept in a local .env file (python-dotenv) are visible to
# os.getenv. Secrets are never hard-coded in the notebook.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_DEPLOYMENT_ENDPOINT = os.getenv("OPENAI_DEPLOYMENT_ENDPOINT")
OPENAI_GPT35_DEPLOYMENT_NAME = os.getenv("OPENAI_GPT35_DEPLOYMENT_NAME")
OPENAI_GPT4_DEPLOYMENT_NAME = os.getenv("OPENAI_GPT4_DEPLOYMENT_NAME")
# Single source of truth for the Azure OpenAI REST API version.
api_version = "2023-09-01-preview"

# Shared Azure OpenAI client used by the helper functions below.
# The version is taken from `api_version` rather than repeating the literal,
# so changing it in one place cannot leave the client on a stale version.
client = AzureOpenAI(
    azure_endpoint=OPENAI_DEPLOYMENT_ENDPOINT,
    api_key=OPENAI_API_KEY,
    api_version=api_version,
)

In [None]:
def call_openAI(request):
    """Ask the chat deployment to produce synthetic data for ``request``.

    A fixed system prompt (synthetic-data assistant, 2022 date range, CSV-only
    output) is sent together with the caller's request as the user message.

    Args:
        request: Text of the user message describing the data to generate.

    Returns:
        The ``content`` of the first choice in the chat completion response.
    """
    system_message = """
    You are an AI assistant that helps generate synthetic data for a data science project. 
   
    1. The data should be generated for a period of 1 year, starting from January 1, 2022, to December 31, 2022.
    2. Do not provide any additional examples to the output, just the CSV format using newlines at the end of each row.

    """

    # System instructions first, then the caller's request.
    conversation = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": request},
    ]

    # Sampling / length settings passed straight through to the API call.
    sampling_options = {
        "temperature": 0.7,
        "max_tokens": 800,
        "top_p": 0.95,
        "frequency_penalty": 0,
        "presence_penalty": 0,
        "stop": None,
    }

    completion = client.chat.completions.create(
        model=OPENAI_GPT4_DEPLOYMENT_NAME,
        messages=conversation,
        **sampling_options,
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

def prettyprint(text: str, width: int = 60) -> None:
    """Print ``text`` re-wrapped so that no line exceeds ``width`` characters.

    Args:
        text: The string to display.
        width: Maximum line length passed to ``textwrap.fill`` (default 60,
            the value that was previously hard-coded).

    Returns:
        None. The wrapped text is written to standard output.
    """
    print(textwrap.fill(text, width))

In [None]:
# User prompt: the columns wanted in the CSV plus a few example records to
# steer the style of the generated descriptions. (A stray trailing double
# quote after the field list and a missing space in one example were removed
# so the prompt text is clean and the examples are formatted consistently.)
request = """
            Generate a CSV file with 10 rows of synthetic data for a data science project.
            Each row should include the following fields: 
            Transaction ID, Account ID, Transaction Date, Transaction Amount, Transaction Type (Credit/Debit), Merchant Name, Merchant Category, US Zip Code and Transaction Description
            Examples =
            US ZIP CODE: 77449, Bank Transaction Description: British Telecom, product_description: Annual fee, Amount: $350
            US ZIP CODE: 90011, Bank Transaction Description: Uber BV Toll Free Amsterdam NLD, product_description: toll, Amount: $54
            US ZIP CODE: 10458, Bank Transaction Description: THE AMERICAN HEART ASSOCIATION DALLAS TX, product_description: Subscription, Amount: $850
            US ZIP CODE: 95123, Bank Transaction Description: Transferencia Banco Santander Madrid, product_description: Transferencia bancaria, Amount: $550
"""
# Send the prompt to the model; `results` holds the raw text of the reply.
results = call_openAI(request)

In [None]:
# Show the raw model response, wrapped to a fixed width by prettyprint.
# NOTE(review): prettyprint relies on textwrap.fill, which by default re-flows
# whitespace (including newlines), so CSV rows will not appear one per line —
# confirm this is the intended display.
prettyprint(results)

In [None]:
import io
import json

import pandas as pd

# The model may wrap its answer in a Markdown code fence (```csv or
# ```plaintext). Remove the language-tagged openers before the bare fence so
# that no stray "csv"/"plaintext" token is left behind as a bogus first row.
for fence in ("```plaintext", "```csv", "```"):
    results = results.replace(fence, "")
results = results.strip()

# pd.read_csv interprets a plain string as a file path / URL, so the CSV text
# has to be wrapped in an in-memory text buffer to be parsed as content.
df = pd.read_csv(io.StringIO(results), sep=",")
df.head()
# df.to_csv("./data/synthetic_bank_transactions.csv", index=False)

## Generating an example dataset for extraction benchmarking

In [None]:
# Ground-truth records: each one pairs an actor with five of their films.
# They are fed to the generator and later compared against the extraction
# results.
inp = [
    {"Actor": actor_name, "Film": film_titles}
    for actor_name, film_titles in (
        (
            "Tom Hanks",
            [
                "Forrest Gump",
                "Saving Private Ryan",
                "The Green Mile",
                "Toy Story",
                "Catch Me If You Can",
            ],
        ),
        (
            "Tom Hardy",
            [
                "Inception",
                "The Dark Knight Rises",
                "Mad Max: Fury Road",
                "The Revenant",
                "Dunkirk",
            ],
        ),
    )
]

# NOTE(review): `DatasetGenerator` and `model` are not defined or imported in
# the cells visible here — confirm they are set up before this cell runs.
generator = DatasetGenerator(
    model,
    {"style": "informal", "minimal length": 500},
)
dataset = generator(inp)

In [None]:
# Display the object returned by the generator call so it can be inspected.
dataset

## Extraction from generated examples
Okay, let's see whether we can extract structured output from the generated data, and how it compares with the original input!

In [None]:
from typing import List

from langchain.chains import create_extraction_chain_pydantic
from langchain.output_parsers import PydanticOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_openai import OpenAI
from pydantic import BaseModel, Field

In [None]:
# Target schema for extraction: one actor together with the films attributed
# to them. The field names match the keys of the records in `inp`, which is
# what the comparison cells rely on.
# NOTE(review): documented with comments rather than a class docstring on
# purpose — pydantic presumably copies a model docstring into the generated
# schema, which would alter the format instructions sent to the LLM; confirm
# before adding one.
class Actor(BaseModel):
    # Actor's name; the description string is attached as field metadata.
    Actor: str = Field(description="name of an actor")
    # Film titles for that actor, as a list of strings.
    Film: List[str] = Field(description="list of names of films they starred in")

### Parsers

In [None]:
# LLM client (constructed with its defaults) and a parser that turns the
# model's text reply into an `Actor` instance.
llm = OpenAI()
parser = PydanticOutputParser(pydantic_object=Actor)

# The parser supplies the instructions describing the expected output format;
# they are bound into the template up front so only `text` varies per call.
format_instructions = parser.get_format_instructions()
prompt = PromptTemplate(
    template=(
        "Extract fields from a given text.\n"
        "{format_instructions}\n"
        "{text}\n"
    ),
    input_variables=["text"],
    partial_variables={"format_instructions": format_instructions},
)

# Run the first generated example through the prompt -> LLM -> parser path.
first_example_text = dataset[0]["text"]
_input = prompt.format_prompt(text=first_example_text)
prompt_text = _input.to_string()
output = llm.invoke(prompt_text)

parsed = parser.parse(output)
parsed

In [None]:
# True only when both the actor name and the film list match the first input record.
parsed.Actor == inp[0]["Actor"] and parsed.Film == inp[0]["Film"]

### Extractors

In [None]:
# Build an extraction chain targeting the `Actor` schema and run it on the
# second generated example.
# NOTE(review): `model` is not defined in the cells visible here — confirm it
# is created before this cell runs.
extractor = create_extraction_chain_pydantic(
    pydantic_schema=Actor,
    llm=model,
)
second_example_text = dataset[1]["text"]
extracted = extractor.run(second_example_text)
extracted

In [None]:
# True only when the first extracted item matches the second input record on both fields.
extracted[0].Actor == inp[1]["Actor"] and extracted[0].Film == inp[1]["Film"]