In [1]:
# Enable the autoreload extension so that edits to imported modules are picked
# up live while working in the notebook.
%load_ext autoreload
%autoreload 2

# Import dependencies


In [2]:
import os

from langchain_google_vertexai import ChatVertexAI, VertexAIEmbeddings
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_core.runnables import chain
from pydantic import BaseModel, Field
from enum import Enum
from langchain_google_genai import ChatGoogleGenerativeAI


from IPython.display import display, Markdown, Latex

from typing import List, Optional

  from .autonotebook import tqdm as notebook_tqdm


# Configure Google credentials

- **NOTE**: Remember to change `GOOGLE_APPLICATION_CREDENTIALS` to the path of your own Google credentials file.


In [3]:
# Location of the Google credentials file on the author's machine.
# This is an absolute, machine-specific path: point it at your own file.
GOOGLE_CREDENTIALS_FILE = "/home/cuongdm/git-cuongpiger/secret/work/vngcloud/ai-platform/vertex-ai-credential.json"
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = GOOGLE_CREDENTIALS_FILE

# The Schema


In [4]:
class Person(BaseModel):
    """Information about a person."""

    # The docstring above doubles as the schema description: it is sent to the
    # LLM together with the schema and can help improve extraction results.
    # Treat it as prompt text, not just documentation.
    #
    # Field conventions:
    # 1. Every field is Optional with default=None, which allows the model to
    #    decline to extract a value it cannot find.
    # 2. Every field carries a `description`, which is also used by the LLM;
    #    a good description can help improve extraction results.
    name: Optional[str] = Field(default=None, description="The name of the person")
    hair_color: Optional[str] = Field(
        default=None, description="The color of the person's hair if known"
    )
    # NOTE(review): typed as a string although it holds a measurement; the
    # outputs recorded in this notebook show values such as '1.68'. Confirm
    # whether a numeric type was intended.
    height_in_meters: Optional[str] = Field(
        default=None, description="Height measured in meters"
    )

# The extractor

In [5]:
# Prompt used for extraction. Two ways to tune it:
#   - add worked examples to the template to improve extraction quality;
#   - introduce extra input variables for context, e.g. metadata about the
#     document the text was extracted from.
system_instructions = (
    "You are an expert extraction algorithm. Only extract relevant information from the text. "
    "If you do not know the value of an attribute asked to extract, return null for the attribute's value."
)
prompt_template = ChatPromptTemplate.from_messages(
    [
        ("system", system_instructions),
        # Reference examples could be slotted in here, for instance with
        # MessagesPlaceholder('examples') -- see the how-to about improving
        # performance with reference examples.
        ("human", "{text}"),
    ]
)

# LLM model

In [6]:
# Chat model shared by every extraction cell in this notebook.
# NOTE(review): the notebook imports ChatVertexAI and sets
# GOOGLE_APPLICATION_CREDENTIALS, but the model actually used here is
# ChatGoogleGenerativeAI -- confirm which backend and auth method is intended.
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash")
# Bind the Person schema to the model for single-entity extraction.
# The name `structured_llm` is rebound to a different schema further down, so
# the result of later cells depends on which binding was executed last.
structured_llm = llm.with_structured_output(schema=Person)

# Test it out

In [15]:
# Sample sentence (Vietnamese), roughly: "Cường is 1.68 m tall, has black curly
# hair, and is a little chubby."
text = "Cường cao 1.68m, tóc đen và xoăn, hơi mủm mỉm"
# Fill the prompt template with the sample text, then run the extraction.
prompt = prompt_template.invoke({"text": text})
# NOTE(review): the output recorded below is wrapped in Data(people=[...]),
# while `structured_llm` is bound to schema=Person at this point in the
# notebook. The non-sequential execution counts suggest this cell was last run
# after `structured_llm` had been rebound to the Data schema -- re-run from a
# fresh kernel to confirm the expected output.
structured_llm.invoke(prompt)

Data(people=[Person(name='Cường', hair_color='đen và xoăn', height_in_meters='1.68')])

In [16]:
# Sample sentence (Vietnamese), roughly: "Vân is a girl who is 1.8 m tall, with
# long black hair and a very sturdy build."
text = "Vân là một bạn nữ cao 1m8, tóc đen và dài, rất đô con"
# Fill the prompt template with the sample text, then run the extraction.
prompt = prompt_template.invoke({"text": text})
# NOTE(review): as with the previous test cell, the recorded output below is a
# Data(people=[...]) wrapper rather than a bare Person, which points to an
# out-of-order run -- verify on a fresh kernel.
structured_llm.invoke(prompt)

Data(people=[Person(name='Vân', hair_color='đen', height_in_meters='1.8')])

# Multiple entities

In [17]:
class Data(BaseModel):
    """Extracted data about people."""

    # Wrapper schema: holding a list of Person lets a single extraction call
    # return several people mentioned in the same text.
    people: List[Person]

In [18]:
# Rebind `structured_llm` to the multi-entity Data schema. This replaces the
# earlier Person binding, so any cell executed after this one extracts Data.
structured_llm = llm.with_structured_output(schema=Data)

In [21]:
# Sample text (Vietnamese) that mentions two people, roughly: "I am Cường, a
# handsome guy, 1.68 m tall, with black curly hair. Cường has a friend named
# Tứ, who is 1.6 m tall and bald."
text = "Mình là Cường, một bạn nam cao 1.68m đẹp trai tóc đen và xoăn. Cường có một anh bạn tên Tứ, cao 1.6m đầu hói"
# Fill the prompt template with the sample text, then run the extraction with
# the Data-bound model; the recorded output lists one Person per individual.
prompt = prompt_template.invoke({"text": text})
structured_llm.invoke(prompt)

Data(people=[Person(name='Cường', hair_color='đen và xoăn', height_in_meters='1.68'), Person(name='Tứ', hair_color='hói', height_in_meters='1.6')])