In [5]:
import re
import os
import requests
from typing import List,Optional
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import BSHTMLLoader
from langchain_core.pydantic_v1 import BaseModel,Field
from langchain_core.prompts import ChatPromptTemplate,MessagesPlaceholder
from langchain_text_splitters import TokenTextSplitter
from langchain_core.documents import Document
from langchain_core.runnables import RunnableLambda
from langchain_text_splitters import CharacterTextSplitter
from langchain_groq import ChatGroq

In [6]:
# Look up the Groq credential in the process environment (None when it is not set).
groq_api_key = os.getenv("GROQ_API_KEY")

In [15]:
# Download the Wikipedia article used as the extraction corpus.
# Alternative URL left by the author: https://en.wikipedia.org/wiki/Constitution_of_India
# The timeout stops the cell from hanging forever on a stalled connection.
response = requests.get("https://en.wikipedia.org/wiki/Car", timeout=30)
# Fail loudly on an HTTP error instead of saving, and later parsing, an error page.
response.raise_for_status()
# Write it to a file, because the loader below is given a file path.
with open("car.html", "w", encoding="utf-8") as f:
    f.write(response.text)
# Load it with an HTML parser and keep the first (index 0) document it returns.
loader = BSHTMLLoader("car.html", open_encoding="utf-8")
document = loader.load()[0]
# Clean up: replace consecutive new lines with a single new line.
document.page_content = re.sub("\n\n+", "\n", document.page_content)

In [16]:
# Size of the cleaned article text, in characters.
character_count = len(document.page_content)
print(character_count)

79227


In [18]:
class KeyDevelopment(BaseModel):
    """Information about a development in the history of cars."""

    # Year the development is attributed to.
    year: int = Field(
        ...,
        description="The year when there was an important historic development.",
    )
    # Free-text account of what the development was.
    description: str = Field(
        ...,
        description="What happened in this year? What was the development?",
    )
    # Sentence(s) the model is asked to quote from the input as support.
    evidence: str = Field(
        ...,
        description="Repeat in verbatim the sentence(s) from which the year and description information were extracted",
    )

# Top-level schema passed to `llm.with_structured_output`: one list that wraps
# every KeyDevelopment extracted from a single input text.
class ExtractionData(BaseModel):
    """Extracted information about key developments in the history of cars."""

    # One entry per extracted development.
    key_developments: List[KeyDevelopment]

In [19]:
# Two-message chat prompt: fixed extraction instructions, then the text to analyse
# (filled in through the `text` template variable).
prompt = ChatPromptTemplate.from_messages([
    (
        "system",
        "You are an expert at identifying key historic development in text. "
        "Only extract important historic developments. "
        "Extract nothing if no important information can be found in the text.",
    ),
    ("human", "{text}"),
])

In [20]:
# Chat model used for every extraction call in this notebook.
# temperature=0 makes the extraction as repeatable as the model allows; without it
# the library's default sampling temperature applies and results drift between runs.
# NOTE(review): max_tokens=500 caps each completion; a chunk that yields many
# developments could have its structured output cut short. Confirm this is enough.
llm = ChatGroq(model="llama3-8b-8192", temperature=0, max_tokens=500)

In [21]:
# Bind the ExtractionData schema to the model so it returns parsed objects only
# (include_raw=False), then feed that model from the prompt.
structured_llm = llm.with_structured_output(
    schema=ExtractionData,
    include_raw=False,
)
extractor = prompt | structured_llm

In [22]:
# Cut the article into chunks of 2000 tokens, with 50 tokens of overlap between chunks.
text_spliter = TokenTextSplitter(
    chunk_size=2000,
    chunk_overlap=50,
)
texts = text_spliter.split_text(document.page_content)

In [23]:
# Only the first three chunks are sent to the model.
first_few = texts[:3]
# One extractor call per chunk, with at most five running concurrently.
extractions = extractor.batch(
    [{"text": chunk} for chunk in first_few],
    config={"max_concurrency": 5},
)

In [24]:
# Flatten the per-chunk results into a single list of developments.
key_developments = [
    development
    for extraction in extractions
    for development in extraction.key_developments
]

# Show the first ten entries as the cell output.
key_developments[:10]

[KeyDevelopment(year=1966, description='Toyota Corolla', evidence='Best-selling series of automobile in history'),
 KeyDevelopment(year=1885, description='The original Benz Patent-Motorwagen was the first modern car, built in 1885 and awarded the patent for the concept.', evidence='It was a groundbreaking achievement in the history of cars.'),
 KeyDevelopment(year=1888, description='Bertha Benz, the first long distance driver, drove the Benz Patent-Motorwagen over a distance of 65 kilometers.', evidence='This was a significant milestone in the development of cars.'),
 KeyDevelopment(year=1886, description='Gottlieb Daimler and Wilhelm Maybach worked at the Daimler Motoren Gesellschaft in Stuttgart.', evidence='They were pioneers in the automotive industry.'),
 KeyDevelopment(year=1893, description='The first running, petrol-driven American car was built and road-tested by the Duryea brothers of Springfield, Massachusetts.', evidence='21 September 1893, on Taylor Street in Metro Center 

In [26]:
# `texts` already holds the 2000-token chunks (50-token overlap) produced by the
# earlier splitting cell. Re-tokenising the same article with identical settings
# was redundant, so the vector store is built from those existing chunks.
assert texts, "Run the text-splitting cell first: `texts` must contain the article chunks."

In [29]:
from langchain_community.embeddings import GPT4AllEmbeddings

# Embed every chunk with GPT4AllEmbeddings and index the result in FAISS.
vectorstore = FAISS.from_texts(texts, embedding=GPT4AllEmbeddings())
# k=1: the retriever is configured to return one chunk per query.
retriever = vectorstore.as_retriever(search_kwargs={"k": 1})

In [30]:
# Retrieval-backed extraction: the query goes to the retriever, the text of the
# first returned document becomes the prompt's `text`, and the extractor runs on it.
rag_extractor = {
    "text": retriever | RunnableLambda(lambda hits: hits[0].page_content),
} | extractor

In [31]:
# Run the retrieval-backed extractor with a single natural-language query.
results = rag_extractor.invoke(
    "Key developments associated with cars"
)

In [32]:
# Print each extracted development on its own line.
for development in results.key_developments:
    print(development)

year=1869 description='The rise of cars as a major cause of injury-related deaths worldwide.' evidence='Traffic collisions are the largest cause of injury-related deaths worldwide.[10]'
year=2020 description='The introduction of electric cars' evidence='Many governments use fiscal policies, such as road tax, to discourage the purchase and use of more polluting cars;'
