<a href="https://colab.research.google.com/github/enya-yx/LangChain-Courses/blob/main/Tagging_and_extraction_py.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install "langchain-google-genai" "langchain" "langchain-core" "langgraph-prebuilt" "google-generativeai" "langchain_community" "docarray" "langchain_experimental"

In [3]:
import google.generativeai as genai
import os
from google.colab import userdata

os.environ["GOOGLE_API_KEY"] = userdata.get('google_api_key')
# Configure the generative AI library with your API key
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])


In [4]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import PromptTemplate, ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Define llm
llm = ChatGoogleGenerativeAI(
    model="gemini-2.5-flash",
    temperature=0,
    verbose=True
)


In [5]:
from typing import List
from pydantic import BaseModel, Field
from langchain.output_parsers import PydanticOutputParser
# define pydantic classes and convert to tagging functions
class Tagging(BaseModel):
  """Tag the piece of text with particular information."""
  sentiment: str = Field(description="sentiment of text, should be `pos`, `neg`, or `neutral`")
  language: str = Field(description="language of text (should be ISO 639-1 code)")


In [6]:
from langchain.utils.openai_functions import convert_pydantic_to_openai_function
tagging_function = convert_pydantic_to_openai_function(Tagging)
tagging_function


  tagging_function = convert_pydantic_to_openai_function(Tagging)


{'name': 'Tagging',
 'description': 'Tag the piece of text with particular information.',
 'parameters': {'properties': {'sentiment': {'description': 'sentiment of text, should be `pos`, `neg`, or `neutral`',
    'type': 'string'},
   'language': {'description': 'language of text (should be ISO 639-1 code)',
    'type': 'string'}},
  'required': ['sentiment', 'language'],
  'type': 'object'}}

In [12]:
prompt = ChatPromptTemplate.from_messages([
    ("system", "Think carefully and then tag the text as instructed"),
    ("human", "{input}")
])
llm_with_function = llm.bind(
    functions=[tagging_function]
)
chain = prompt | llm_with_function
chain.invoke({"input": "It's a beautiful day"})
chain.invoke({"input": "冬天有点儿冷"})

AIMessage(content='', additional_kwargs={'function_call': {'name': 'Tagging', 'arguments': '{"language": "zh", "sentiment": "neg"}'}}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'safety_ratings': []}, id='run--c4ee32f4-0d8c-4b10-ba60-17bd8f5e3cf2-0', tool_calls=[{'name': 'Tagging', 'args': {'language': 'zh', 'sentiment': 'neg'}, 'id': '9de1dfbc-4370-43d6-91f0-5e8a685d00bc', 'type': 'tool_call'}], usage_metadata={'input_tokens': 94, 'output_tokens': 19, 'total_tokens': 251, 'input_token_details': {'cache_read': 0}})

In [17]:
# Apply json output parser
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
chain_output = prompt | llm_with_function | JsonOutputFunctionsParser()
res = chain_output.invoke({"input": "It's a beautiful day"})
for k,v in res.items():
  print(f"{k}: {v}")

language: en
sentiment: pos


In [21]:
from typing import List, Optional
from pydantic import BaseModel, Field
from langchain.output_parsers import PydanticOutputParser
from langchain.utils.openai_functions import convert_pydantic_to_openai_function
from langchain.prompts import ChatPromptTemplate

# Define pydantic classes and convert to extraction functions
class Person(BaseModel):
  """Information about a person."""
  name: str = Field(description="person's name")
  age: Optional[int] = Field(description="person's age")

class Information(BaseModel):
  """Information to extract."""
  people: List[Person] = Field(description="List of info about people")

extraction_function = convert_pydantic_to_openai_function(Information)
extraction_llm = llm.bind(functions=[extraction_function])
extraction_prompt = ChatPromptTemplate.from_messages([
    ("system", "Extract the relevant information, if not explicitly provided do not guess. Extract partial info"),
    ("human", "{input}")
])
extraction_chain = extraction_prompt | extraction_llm | JsonOutputFunctionsParser()

In [23]:
message = "Han Meimei is 36. Her friend is Li Lei."
extraction_chain.invoke(message)

{'people': [{'age': 36.0, 'name': 'Han Meimei'}, {'name': 'Li Lei'}]}

In [24]:
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser
extraction_key_chain = extraction_prompt | extraction_llm | JsonKeyOutputFunctionsParser(key_name="people")
extraction_key_chain.invoke(message)

[{'age': 36.0, 'name': 'Han Meimei'}, {'name': 'Li Lei'}]

In [45]:
# Extract from a doc
from langchain_community.document_loaders import WebBaseLoader
from langchain.indexes import VectorstoreIndexCreator
from langchain.vectorstores import DocArrayInMemorySearch
from langchain_google_genai import GoogleGenerativeAIEmbeddings

loader = WebBaseLoader("https://en.wikipedia.org/wiki/Keigo_Higashino")
docs = loader.load()[0]
page_content = docs.page_content

In [65]:
class Summary(BaseModel):
  """Summary of books mentioned in the document"""
  book_name: str = Field(description="Title of the book")
  description: str = Field(description="Description of the book")

summary_function = convert_pydantic_to_openai_function(Summary)
summary_llm = llm.bind(functions=[summary_function])
summary_prompt = ChatPromptTemplate.from_messages([
    ("system", "Summarize each book mentioned in the document one by one."),
    ("human", "{input}")
])
summary_chain = summary_prompt | summary_llm

In [66]:
res = summary_chain.invoke({"input": page_content})
res
#Pending: add a fitted output parser

AIMessage(content='', additional_kwargs={'function_call': {'name': 'Summary', 'arguments': '{"description": "A 4-volume comic series.", "book_name": "HE\\u2200DS (Heads)"}'}}, response_metadata={'prompt_feedback': {'block_reason': 0, 'safety_ratings': []}, 'finish_reason': 'STOP', 'safety_ratings': []}, id='run--fa4add25-c505-4055-9a4f-22b62afdb55f-0', tool_calls=[{'name': 'Summary', 'args': {'description': 'A mystery novel that inverts the classical whodunit structure, revealing the murderer early on and focusing on the feelings of loyalty and the oppressive weight of human relations as catalysts for murder.', 'book_name': 'The Devotion of Suspect X'}, 'id': 'c440dd01-6fa8-4800-852e-d9849c2e6f89', 'type': 'tool_call'}, {'name': 'Summary', 'args': {'description': 'A mystery novel in the Detective Galileo series.', 'book_name': 'Salvation of a Saint'}, 'id': 'f0420491-915b-4aeb-adb4-4f62ae694c00', 'type': 'tool_call'}, {'name': 'Summary', 'args': {'description': 'A mystery novel in the 