In [1]:
import pandas as pd
import yaml
import importlib
import os
from typing import Optional, List

from langchain_core.pydantic_v1 import BaseModel, Field, validator
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.output_parsers import PydanticOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.messages import HumanMessage
from langchain_openai import AzureChatOpenAI
from langchain_google_vertexai import ChatVertexAI


import address_formatter_rule

In [2]:
%env ENV=dev

with open(f"./config/config.{os.getenv('ENV')}.yml", 'r') as stream:
    config = yaml.safe_load(stream)

os.environ["AZURE_OPENAI_API_KEY"] = config['OPENAI']['AZURE_OPENAI_API_KEY']
os.environ["AZURE_OPENAI_ENDPOINT"] = config['OPENAI']['AZURE_OPENAI_ENDPOINT']
os.environ["AZURE_OPENAI_API_VERSION"] = config['OPENAI']['API_VERSION']
os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"] = config['OPENAI']['GPT3_MODEL']

env: ENV=dev


#### Input

In [3]:
# Sample requests for trying out the formatter. Each one carries the raw
# one-line address, a three-letter country code and a language tag.
SAMPLE_INPUTS = [
    {
        'address': '8/F Blk 2, Kodak Hse 39 Healthy St E North Point, Hong Kong',
        'country_code': 'HKG',
        'language': 'en'
    },
    {
        'address': '香港上環 蘇杭街19-25號 永昌商業大廈16樓B室',
        'country_code': 'HKG',
        'language': 'zh-hk'
    },
    {
        'address': '510410 中國廣東省廣州市 白雲區祥崗外街5號 福豐大廈606室',
        'country_code': 'CHN',
        'language': 'zh-hk'
    },
]

# Exactly one sample is active per run; change the index to try another one.
# Previously the three dicts were assigned to the same name in sequence, so the
# first two were silently discarded and only the last one was ever used.
# NOTE: `input` shadows the Python builtin of the same name; the name is kept
# because the later cells read `input[...]`.
input = SAMPLE_INPUTS[-1]

### Prompt

In [4]:



# Choose the per-language table of address-line descriptions: the English table
# for 'en', the Chinese table for every other language tag.
line_format = (
    address_formatter_rule.line_format_en
    if input['language'] == 'en'
    else address_formatter_rule.line_format_zh
)

# Within that table, use the entry for the requested country and fall back to
# the table's 'default' entry when the country has none.
line_format_input = line_format.get(input['country_code'], line_format['default'])

class AddressResult(BaseModel):
    """A postal address split into up to four formatted address lines."""
    # The class docstring and the field descriptions end up in the JSON schema
    # that the output parser embeds in the prompt, so they should describe the
    # address-formatting task (the previous docstring, "output of the
    # transcript", was a leftover from an unrelated example).
    #
    # Each field description is taken from `line_format_input`, i.e. the
    # per-country / per-language line layout selected before this class is
    # defined. Every line is optional and defaults to None.
    address_line1: Optional[str] = Field(
        default=None, description=line_format_input['address_line1'])
    address_line2: Optional[str] = Field(
        default=None, description=line_format_input['address_line2'])

    address_line3: Optional[str] = Field(
        default=None, description=line_format_input['address_line3'])

    address_line4: Optional[str] = Field(
        default=None, description=line_format_input['address_line4'])




   

In [5]:
# Look up the formatting rule for the requested country; when there is no entry
# (or the entry is None) use the 'default' rule instead.
address_rule = address_formatter_rule.rule.get(input['country_code'])
if address_rule is None:
    address_rule = address_formatter_rule.rule.get('default')

In [7]:
# Parser that converts the model reply into an AddressResult and provides the
# format instructions embedded in the system prompt below.
parser = PydanticOutputParser(pydantic_object=AddressResult)

# Prompt
# - The rule text is bound through .partial() rather than spliced in with an
#   f-string, so literal braces inside the rule text are not parsed as template
#   placeholders.
# - Adjacent string literals are concatenated verbatim, so every sentence ends
#   with an explicit space or newline.
# - The only runtime variable is still `query` (the one-line address).
# NOTE(review): the system message mentions a country code, but only the address
# is sent in the human message; the country is reflected solely through the
# selected rule text. Confirm this is intended.
prompt = ChatPromptTemplate.from_messages(
    [
        (
            "system",
            "You are given a one line address and a country code. "
            "Your task is to format the address into multiple lines according to the country code.\n"
            "Here are the rules for the address format:\n"
            "{address_rule}\n"
            "Wrap the output in `json` tags\n{format_instructions}\n"
            "If you do not know the value of an attribute, "
            "return null for the attribute's value."
        ),
        ("human", "{query}"),
    ]
).partial(
    # str() mirrors what the previous f-string interpolation produced.
    address_rule=str(address_rule),
    format_instructions=parser.get_format_instructions(),
)

In [8]:

# Chat model used by the chain. The API version and the deployment name are read
# from the environment variables populated from the YAML config earlier.
gpt_model = AzureChatOpenAI(
    openai_api_version=os.environ["AZURE_OPENAI_API_VERSION"],
    azure_deployment=os.environ["AZURE_OPENAI_CHAT_DEPLOYMENT_NAME"],
    # presumably 0 to keep the formatted output as repeatable as possible — confirm
    temperature=0,
)

# Alternative backend (disabled): swap in Vertex AI by uncommenting this block.
# gpt_model = ChatVertexAI(
#     # model="chat-bison"
#     model="gemini-1.5-pro",
#     temperature=0,
#     max_tokens=None,
#     max_retries=6,
#     stop=None,
# )

In [9]:
# Compose the pipeline: the prompt feeds the chat model, whose reply is handed to the parser.
chain = prompt | gpt_model | parser

In [10]:

# Run the chain on the active sample address and show the raw address followed
# by the four formatted lines.
clean_out = chain.invoke({"query": input['address']})
print(input['address'])

# Keys add1..add4 map to the result's address_line1..address_line4 attributes.
address_lines = {
    f"add{line_no}": getattr(clean_out, f"address_line{line_no}")
    for line_no in range(1, 5)
}
print(address_lines)

In [12]:
# Display the parsed AddressResult object itself.
clean_out

AddressResult(address_line1='510410', address_line2='中國廣東省廣州市白雲區', address_line3='祥崗外街5號', address_line4='福豐大廈606室')