In [1]:
from langchain import PromptTemplate
from pydantic import BaseModel, Field
from langchain.output_parsers import PydanticOutputParser
from typing import List



In [2]:
class ExtractionOutput(BaseModel):
    people: List[str] = Field(description="People mentioned in the body of text")
    organizations: List[str] = Field(description="Organizations mentioned in the body of text")
    countries: List[str] = Field(description="Countries mentioned in the body of text")
    technologies: List[str] = Field(description="Technologies mentioned in the body of text")
    author: List[str] = Field(description="Author of the body of text")

In [3]:
class Relationship(BaseModel):
    initiator: str = Field('Initiator of action in text')
    action: str = Field('Action of initiator towards recipient in text')
    recipient: str = Field('Recipient of action from initiator in text')

In [4]:
class Relationships(BaseModel):
    relationships: List[Relationship] = Field("List of all relationships from text")

In [5]:
extraction_template = """\
You are an Entity Extraction Algorithm. Your job is to extract all fields listed. Extract people, organizations, countries, technologies, and author (if present) from this body of text: {text} {format_instructions}
"""

In [6]:
extraction_parser = PydanticOutputParser(pydantic_object=ExtractionOutput)

In [7]:
rel_template = """
You are a Summarization and Extraction Algorithm. Summarize each paragraph in the body of text: {text}
From that summary, extract the essential relationships and structure then as
initiator, action, and recipient. The action is a two word summaries of the action between the initiator and the recipient. If an entry has none for either initiator or recipient , do not include it in the output. {format_instructions}
"""

In [8]:
rel_parser = PydanticOutputParser(pydantic_object=Relationships)

In [9]:
extraction_prompt = PromptTemplate(
    template=extraction_template,
    input_variables=["text"],
    partial_variables={"format_instructions": extraction_parser.get_format_instructions()},
)

In [10]:
rel_prompt = PromptTemplate(
    template=rel_template,
    input_variables=["text"],
    partial_variables={"format_instructions": rel_parser.get_format_instructions()},
)

In [11]:
rel_prompt.format(text="Chris is a software engineer at Thomson Reuters")

'\nYou are a Summarization and Extraction Algorithm. Summarize each paragraph in the body of text: Chris is a software engineer at Thomson Reuters\nFrom that summary, extract the essential relationships and structure then as\ninitiator, action, and recipient. The action is a two word summaries of the action between the initiator and the recipient. If an entry has none for either initiator or recipient , do not include it in the output. The output should be formatted as a JSON instance that conforms to the JSON schema below.\n\nAs an example, for the schema {"properties": {"foo": {"title": "Foo", "description": "a list of strings", "type": "array", "items": {"type": "string"}}}, "required": ["foo"]}}\nthe object {"foo": ["bar", "baz"]} is a well-formatted instance of the schema. The object {"properties": {"foo": ["bar", "baz"]}} is not well-formatted.\n\nHere is the output schema:\n```\n{"properties": {"relationships": {"title": "Relationships", "default": "List of all relationships fro

In [31]:
from langchain.llms import OpenAI
from configparser import ConfigParser

In [33]:
config = ConfigParser()
config.read('../config.ini')
model_name = "text-davinci-003"
temperature = 0.0
model = OpenAI(
    model_name=model_name,
    temperature=temperature,
    openai_api_key=config["openai"]["key"],
    streaming=True,
    max_tokens=1024
)

In [34]:
from langchain.chains import LLMChain

In [15]:
extraction_chain = LLMChain(llm=model, prompt=extraction_prompt)
relationship_chain = LLMChain(llm=model, prompt=rel_prompt)

In [16]:
resp = extraction_chain.run("Chris Smith is a software engineer at Thomson Reuters and Alex Smith works at Deloitte. Russia does not like United States. Nvidia ships to America. Russia and Ukraine have been experiencing intense combat on the eastern front.")

In [17]:
extraction_parser.parse(resp)

ExtractionOutput(people=['Chris Smith', 'Alex Smith'], organizations=['Thomson Reuters', 'Deloitte'], countries=['Russia', 'United States', 'Ukraine'], technologies=['Nvidia'], author=[])

In [18]:
import requests
from bs4 import  BeautifulSoup

In [19]:
soup = requests.get("https://www.reuters.com/business/energy/renewables-growth-did-not-dent-fossil-fuel-dominance-2022-statistical-review-2023-06-25/").text
text = BeautifulSoup(soup).get_text()

In [20]:
from langchain.text_splitter import CharacterTextSplitter

In [21]:
splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=100, chunk_overlap=0
)

In [22]:
texts = splitter.split_text(text)

In [23]:
texts[1]

'Skip to main contentExclusive news, data and analytics for financial market professionalsLearn more aboutRefinitivReuters homeWorldBusinessMarketsSustainabilityLegalBreakingviewsTechnologyInvestigationsMoreMy ViewRegisterSustainable MarketsExploration & ProductionLNGGasClimate ChangeRenewables growth did not dent fossil fuel dominance in 2022, report saysReutersJune 26, 20238:02 AM UTCUpdated  agoCompaniesBp PlcFollowLONDON, June 26 (Reuters) - Global energy demand rose 1% last year and record renewables growth did nothing to shift the dominance of fossil fuels, which still accounted for 82% of supply, the industry\'s Statistical Review of World Energy report said on Monday.Last year was marked by turmoil in the energy markets after Russia\'s invasion of Ukraine, which helped to boost gas and coal prices to record levels in Europe and Asia.The stubborn lead of oil, gas and coal products in covering most energy demand cemented itself in 2022 despite the largest ever increase in renewab

In [24]:
responses = []

In [25]:
len(texts)

2

In [26]:
for chunk in  texts:
    responses.append(extraction_chain.run(chunk))

In [27]:
extraction_parser.parse(responses[1])

ExtractionOutput(people=['Juliet Davenport', 'Kuba Stezycki'], organizations=['Energy Institute', 'KPMG', 'Kearny', 'BP', 'Thomson Reuters', 'Chevron', 'Tetra Technologies', 'EDF', 'Savers'], countries=['Russia', 'Ukraine', 'Europe', 'Asia', 'North America', 'China', 'India', 'Japan', 'Indonesia', 'South and Central America'], technologies=['LNG', 'Solar', 'Wind Power', 'Nuclear', 'Coal', 'Lithium Carbonate', 'Cobalt'], author=['Shadia Nasralla'])

### Evaluation

In [28]:
### See how an agent improves over time when analyzing the task three times

In [None]:
meta_template = """
Entity Extraction has just extracted the following from the below body of text. Your job is to critique Entity Extraction's performance and revise the Entity Extraction's Instructions so that Entity Extraction will more accurately extract entities in the future.

####
{hist}
####

Please reflect on this extraction.

What are ways the Entity Extraction could be better? How can we better describe our objective?
"""