In [1]:
import os
import openai

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file
# Raises KeyError if OPENAI_API_KEY is not present in the environment after loading .env.
openai.api_key = os.environ['OPENAI_API_KEY']

In [2]:
from typing import List
from pydantic import BaseModel, Field
from langchain.utils.openai_functions import convert_pydantic_to_openai_function

In [3]:
# Schema for the tagging function. The class docstring and the Field descriptions
# end up in the generated function definition (see the converted output below),
# so they act as instructions to the model and should be edited with that in mind.
class Tagging(BaseModel):
    """Tag the piece of text with particular info."""
    # Plain string: the allowed labels are only suggested through the description, not enforced.
    sentiment: str = Field(description="sentiment of text, should be `pos`, `neg`, or `neutral`")
    # Plain string as well: the ISO 639-1 format is requested in the description, not validated.
    language: str = Field(description="language of text (should be ISO 639-1 code)")

# Display the OpenAI function definition generated from the pydantic model.
convert_pydantic_to_openai_function(Tagging)

{'name': 'Tagging',
 'description': 'Tag the piece of text with particular info.',
 'parameters': {'title': 'Tagging',
  'description': 'Tag the piece of text with particular info.',
  'type': 'object',
  'properties': {'sentiment': {'title': 'Sentiment',
    'description': 'sentiment of text, should be `pos`, `neg`, or `neutral`',
    'type': 'string'},
   'language': {'title': 'Language',
    'description': 'language of text (should be ISO 639-1 code)',
    'type': 'string'}},
  'required': ['sentiment', 'language']}}

In [7]:
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI

# temperature=0 -- presumably chosen to keep the tagging output as repeatable as possible.
model = ChatOpenAI(temperature=0)
tagging_functions = [convert_pydantic_to_openai_function(Tagging)]

# One system instruction plus a single user turn filled from the `input` key.
prompt = ChatPromptTemplate.from_messages([
    ("system", "Think carefully, and then tag the text as instructed"),
    ("user", "{input}")
])

# Force the model to use the tagging function by naming it in `function_call`.
model_with_functions = model.bind(
    functions=tagging_functions,
    function_call={"name": "Tagging"}
)

# Pipe the formatted prompt into the function-bound model.
tagging_chain = prompt | model_with_functions

In [8]:
# English input; the function-call arguments come back inside the message's `additional_kwargs`.
tagging_chain.invoke({"input": "I love langchain"})

AIMessage(content='', additional_kwargs={'function_call': {'name': 'Tagging', 'arguments': '{\n  "sentiment": "pos",\n  "language": "en"\n}'}})

In [10]:
# Same chain on Italian text, to exercise the `language` field with a non-English input.
tagging_chain.invoke({"input": "non mi piace questo cibo"})

AIMessage(content='', additional_kwargs={'function_call': {'name': 'Tagging', 'arguments': '{\n  "sentiment": "neg",\n  "language": "it"\n}'}})

In [11]:
# Append JsonOutputFunctionsParser so the chain returns only the function-call
# arguments as a dict instead of the whole AIMessage.

from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser
# NOTE: this rebinds `tagging_chain` to the parsed variant.
tagging_chain = prompt | model_with_functions | JsonOutputFunctionsParser()
tagging_chain.invoke({"input": "non mi piace questo cibo"})

{'sentiment': 'neg', 'language': 'it'}

### Extraction
Extraction is similar to tagging, but used for extracting multiple pieces of information.

In [13]:
from typing import Optional
# Schemas for the extraction function. As with the tagging schema, the class
# docstrings and Field descriptions are part of what the model is shown.
class Person(BaseModel):
    """Information about a person."""
    name: str = Field(description="person's name")
    # Explicit `default=None`: without it the field is only *implicitly* optional
    # under pydantic v1 and becomes required under pydantic v2, which would push
    # the model to invent an age when none is stated in the text.
    age: Optional[int] = Field(default=None, description="person's age")


class Information(BaseModel):
    """Information to extract."""
    # Wrapper object: one function call can return several people at once.
    people: List[Person] = Field(description="List of info about people")

In [15]:
extraction_functions = [convert_pydantic_to_openai_function(Information)]
# As in the tagging section, name the function in `function_call` so the model has to call it.
extraction_model = model.bind(functions=extraction_functions, function_call={"name": "Information"})

In [20]:
# Only Julie's age is stated in this sentence; Joe and Dan have none.
age_prompt_str = "Joe and his best friend Dan just met Julie, who is 19."
# Invoked with the raw string, i.e. without any prompt template or system message.
extraction_model.invoke(age_prompt_str)

AIMessage(content='', additional_kwargs={'function_call': {'name': 'Information', 'arguments': '{\n  "people": [\n    {\n      "name": "Joe",\n      "age": 25\n    },\n    {\n      "name": "Dan",\n      "age": 27\n    },\n    {\n      "name": "Julie",\n      "age": 19\n    }\n  ]\n}'}})

In [29]:
# The previous call returned ages for Joe and Dan that are not in the text, so add
# a system prompt telling the model not to guess missing values.
# NOTE: this rebinds the module-level `prompt` that the tagging cells above also use.

prompt = ChatPromptTemplate.from_messages([
    ("system", "Extract the relevant information, if not explicitly provided do not guess. Extract partial info.  Do not default any optional values.  Zero should never be used for age unless listed explicitly as such in the text."),
    ("human", "{input}")
])
extraction_chain = prompt | extraction_model
# In the recorded output the model still fills in 0 for the unknown ages.
extraction_chain.invoke({"input": age_prompt_str})

AIMessage(content='', additional_kwargs={'function_call': {'name': 'Information', 'arguments': '{\n  "people": [\n    {\n      "name": "Joe",\n      "age": 0\n    },\n    {\n      "name": "Dan",\n      "age": 0\n    },\n    {\n      "name": "Julie",\n      "age": 19\n    }\n  ]\n}'}})

In [24]:
# Same import as in the tagging section above.
from langchain.output_parsers.openai_functions import JsonOutputFunctionsParser

jsonParser = JsonOutputFunctionsParser()
# Rebinds `extraction_chain`, now with the JSON parser appended.
extraction_chain = prompt | extraction_model | jsonParser
extraction_chain.invoke({"input": age_prompt_str})

# The JSON parser returns only the function arguments, but the result is still wrapped
# in the top-level "people" key. What we actually want is the bare list of people.

{'people': [{'name': 'Joe', 'age': 0},
  {'name': 'Dan', 'age': 0},
  {'name': 'Julie', 'age': 19}]}

In [30]:
# Use JsonKeyOutputFunctionsParser instead: it returns just the value stored
# under `key_name`, i.e. the list of people without the wrapping dict.
from langchain.output_parsers.openai_functions import JsonKeyOutputFunctionsParser
peopleKeyParser = JsonKeyOutputFunctionsParser(key_name="people")
extraction_chain = prompt | extraction_model | peopleKeyParser
extraction_chain.invoke({"input": age_prompt_str})

[{'name': 'Joe', 'age': 0},
 {'name': 'Dan', 'age': 0},
 {'name': 'Julie', 'age': 19}]

### Doing it on a large body of text

In [31]:
from langchain.document_loaders import WebBaseLoader
# Load the blog post from its URL (requires network access when this cell runs).
loader = WebBaseLoader("https://lilianweng.github.io/posts/2023-06-23-agent/")
# The result is indexed as a sequence below; the first entry is the article.
documents = loader.load()

In [None]:
# Take a peek at the content
doc = documents[0]
# First 10,000 characters only; this slice is the input for the single-call examples below.
page_content = doc.page_content[:10000]
# Last 10,000 characters; not referenced by any later cell visible in this notebook.
end_page_content = doc.page_content[-10000:]
# Print just the first 1,000 characters to keep the cell output short.
print(page_content[:1000])

In [39]:
# Schema for tagging a longer passage. The docstring and Field descriptions are
# forwarded to the model as part of the function definition.
class Overview(BaseModel):
    """Overview of a section of text."""
    summary: str = Field(description="Provide a concise summary of the content.")
    language: str = Field(description="Provide the language that the content is written in.")
    keywords: str = Field(description="Provide keywords related to the content.")


overview_tagging_function = convert_pydantic_to_openai_function(Overview)

# Force the model to answer through the `Overview` function.
tagging_model = model.bind(
    functions=[overview_tagging_function],
    function_call={"name": "Overview"}
)

# Use a dedicated tagging prompt. The global `prompt` has been rebound to the
# people-extraction instructions by this point, which are the wrong system
# message for a tagging task, and it changes again further down the notebook.
overview_prompt = ChatPromptTemplate.from_messages([
    ("system", "Think carefully, and then tag the text as instructed"),
    ("user", "{input}")
])

tagging_chain = overview_prompt | tagging_model | JsonOutputFunctionsParser()
output = tagging_chain.invoke({"input": page_content})
# Show the result; previously it was computed but never displayed.
output

In [49]:
# Let's extract the references to papers that are listed in this article
# Schemas for extracting paper references. Class docstrings are forwarded to the
# model as part of the function definition.
class Paper(BaseModel):
    """Information about papers mentioned."""
    title: str
    # Explicit `None` default: a bare `Optional[str]` is only implicitly optional
    # under pydantic v1 and is a required field under pydantic v2.
    author: Optional[str] = None


class Info(BaseModel):
    """Information to extract"""
    # Wrapper object so a single function call can return several papers.
    papers: List[Paper]


paper_extraction_function = convert_pydantic_to_openai_function(Info)

# NOTE: rebinds `extraction_model`, which previously pointed at the `Information` function.
extraction_model = model.bind(
    functions=[paper_extraction_function], 
    function_call={"name":"Info"}
)
# Assuming top-to-bottom execution, `prompt` is still the people-extraction system
# prompt defined earlier; a paper-specific prompt is introduced in a later cell.
extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="papers")
extraction_chain.invoke({"input": page_content})


[{'title': 'Chain of thought prompting elicits reasoning in large language models',
  'author': 'Wei et al.'},
 {'title': 'Tree of Thoughts: Deliberate Problem Solving with Large Language Models',
  'author': 'Yao et al.'},
 {'title': 'Chain of Hindsight Aligns Language Models with Feedback',
  'author': 'Liu et al.'},
 {'title': 'LLM+P: Empowering Large Language Models with Optimal Planning Proficiency',
  'author': 'Liu et al.'},
 {'title': 'ReAct: Synergizing reasoning and acting in language models',
  'author': 'Yao et al.'}]

In [50]:
# Paper-specific system prompt: exclude the article's own title, explicitly allow
# an empty result, and forbid invented details.
sys_msg = """A article will be passed to you. Extract from it all papers that are mentioned by this article. 

Do not extract the name of the article itself. If no papers are mentioned that's fine - you don't need to extract any! Just return an empty list.

Do not make up or guess ANY extra information. Only extract what exactly is in the text."""

# NOTE: rebinds the module-level `prompt` and `extraction_chain` once more.
prompt = ChatPromptTemplate.from_messages([
    ("system", sys_msg),
    ("human", "{input}")
])
extraction_chain = prompt | extraction_model | JsonKeyOutputFunctionsParser(key_name="papers")
# Still only the first 10,000 characters of the article (`page_content`).
extraction_chain.invoke({"input": page_content})

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised ServiceUnavailableError: The server is overloaded or not ready yet..


[{'title': 'Chain of thought (CoT; Wei et al. 2022)', 'author': 'Wei et al.'},
 {'title': 'Tree of Thoughts (Yao et al. 2023)', 'author': 'Yao et al.'},
 {'title': 'LLM+P (Liu et al. 2023)', 'author': 'Liu et al.'},
 {'title': 'ReAct (Yao et al. 2023)', 'author': 'Yao et al.'},
 {'title': 'Reflexion (Shinn & Labash 2023)', 'author': 'Shinn & Labash'},
 {'title': 'Chain of Hindsight (CoH; Liu et al. 2023)',
  'author': 'Liu et al.'},
 {'title': 'Algorithm Distillation (AD; Laskin et al. 2023)',
  'author': 'Laskin et al.'}]

### Extracting on splits
This article is huge, though. We want to run extraction on the whole thing, not just the first part,
so we'll need to:
- Split the article
- Pass those splits to the LLM one by one
- Combine all the results at the end

#### Split

In [52]:


from langchain.text_splitter import RecursiveCharacterTextSplitter
# No overlap between neighbouring chunks; the chunk size is left at the splitter's default.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0)

# Split the full article text, not just the 10,000-character `page_content` slice.
splits = text_splitter.split_text(doc.page_content)
len(splits)

14

In [54]:
def flatten(matrix):
    """Collapse a list of lists into a single flat list.

    Items keep their original order: everything from the first sublist,
    then everything from the second, and so on. Only one level of nesting
    is removed, so elements that are themselves lists are kept as they are.
    """
    return [element for row in matrix for element in row]

In [55]:
# Quick sanity check of flatten() on a small nested list.
flatten([[1,2],[3,4]])

[1, 2, 3, 4]

#### Pass each split to the LLM

In [63]:
from langchain.schema.runnable import RunnableLambda
# Turn the full article text into one {"input": <chunk>} dict per split, which is
# the input shape the extraction chain's prompt expects.
prep = RunnableLambda(
    lambda article_text: [
        {"input": chunk}
        for chunk in text_splitter.split_text(article_text)
    ]
)


In [65]:
# NOTE: `prep` returns a list of inputs, so the next step has to run once per element;
# `extraction_chain.map()` applies the extraction chain to each item of that list.
# Each run returns its own list of papers, so `flatten` merges those lists into one.
chain = prep | extraction_chain.map() | flatten

In [68]:
# Run the split -> extract -> flatten pipeline on the full article text.
chain.invoke(doc.page_content)

Retrying langchain.chat_models.openai.ChatOpenAI.completion_with_retry.<locals>._completion_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).


[{'title': 'Chain of thought (CoT; Wei et al. 2022)', 'author': 'Wei et al.'},
 {'title': 'Tree of Thoughts (Yao et al. 2023)', 'author': 'Yao et al.'},
 {'title': 'LLM+P (Liu et al. 2023)', 'author': 'Liu et al.'},
 {'title': 'ReAct (Yao et al. 2023)', 'author': 'Yao et al.'},
 {'title': 'Reflexion (Shinn & Labash 2023)', 'author': 'Shinn & Labash'},
 {'title': 'Reflexion framework', 'author': 'Shinn & Labash'},
 {'title': 'Chain of Hindsight', 'author': 'Liu et al.'},
 {'title': 'Algorithm Distillation', 'author': 'Laskin et al.'},
 {'title': 'Algorithm Distillation', 'author': 'Laskin et al. 2023'},
 {'title': 'ED (expert distillation)', 'author': ''},
 {'title': 'RL^2', 'author': 'Duan et al. 2017'},
 {'title': 'LSH: Locality-Sensitive Hashing', 'author': ''},
 {'title': 'ANNOY: Approximate Nearest Neighbors Oh Yeah', 'author': ''},
 {'title': 'HNSW: Hierarchical Navigable Small World', 'author': ''},
 {'title': 'FAISS: Facebook AI Similarity Search', 'author': ''},
 {'title': 'Sca