# Agentic Chunking

In [None]:
!pip install --upgrade --quiet  langchain-core langchain-community langchain-openai

In [None]:
!pip install --quiet pypdf

In [19]:
!pip install --quiet langchain-mistralai

In [20]:
# PyPDFLoader is imported from langchain_community (the package this notebook
# installs); the older `langchain.document_loaders` path is deprecated.
from google.colab import userdata
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_mistralai import ChatMistralAI

**Loading Data**

In [4]:
# Parse the climate-change PDF into a list of documents
# (the recorded run produced a list of 33 items).
pdf_path = "/content/data/Understanding_Climate_Change.pdf"
loader = PyPDFLoader(pdf_path)
docs = loader.load()
(len(docs), type(docs))

(33, list)

In [7]:
# Show which attributes the first loaded document carries.
docs[0].__dict__.keys()

dict_keys(['id', 'metadata', 'page_content', 'type'])

In [11]:
# Read the Mistral API key from Colab's user secrets instead of hard-coding it.
MISTRAL_API_KEY = userdata.get("MISTRAL_API_KEY")

**Creating Prompt for Agentic Chunking**

In [13]:
# Instruction template for the proposition-based ("agentic") chunker.
# The adjacent string literals below are joined by Python into one newline-free
# string; `{content}` is the only placeholder in it.
agentic_prompt = (
    'Role: You are an agentic chunker. You will be provided with a content.  '
    'What you have to do:  '
    '```  '
    '1. Decompose the content into clear and simple propositions, ensuring they are  '
    'interpretable out of context.  '
    '2. Split compound sentence into simple sentences. Maintain the original phrasing from the input whenever possible.  '
    '3. For any named entity that is accompanied by additional descriptive information, separate this information into its own distinct proposition.  '
    '4. Decontextualize the proposition by adding necessary modifier to nouns or entire  '
    'sentence and replacing pronouns (e.g., "it", "he", "she", "they", "this", "that") with the full name of the entities they refer to.  '
    '5. Present the results as a list of strings, formatted in JSON.  '
    '```  '
    '  '
    'Here is the content: {content}  '
    'Strictly follow the instructions provided and output in the desired format only.'
)

**Testing on the first 3 pages of the PDF**

In [14]:
# Use the text of the first few pages as a small test set. Slicing (rather than
# indexing 0..2) also works when the PDF has fewer pages than requested.
NUM_TEST_PAGES = 3
testing_docs = [doc.page_content for doc in docs[:NUM_TEST_PAGES]]
len(testing_docs), type(testing_docs)

(3, list)

**Creating `Runnable` chains for agentic chunk generation.**

In [21]:
# Chat model that turns a page of text into a JSON list of propositions.
# - api_key is passed explicitly: the key loaded into MISTRAL_API_KEY was
#   otherwise never handed to the client.
# - No stop sequence: the prompt asks for a JSON list, and stopping at the first
#   "\n" would cut off any multi-line answer.
# - max_tokens is large enough that a page's worth of propositions is not truncated.
# - The zero-valued frequency/presence penalties and stream=False were dropped;
#   they only restated "no penalty" / "no streaming".
# NOTE(review): `request_timeout` and `use_cache` do not appear to be ChatMistralAI
# fields (the documented timeout field is `timeout`), so they were renamed/removed;
# confirm against the installed langchain-mistralai version.
llm = ChatMistralAI(
    model="mistral-large-latest",
    api_key=MISTRAL_API_KEY,
    temperature=0,  # deterministic decomposition
    max_retries=2,
    max_tokens=4096,
    top_p=1,
    timeout=60,  # seconds; longer answers need more than the previous 15 s budget
)

In [22]:
# Prefix the chunking instructions with a short task preamble. The combined text
# is used as the template, so the `{content}` placeholder inside `agentic_prompt`
# stays a template variable.
prompt = ChatPromptTemplate.from_template(
    "Complete task as decribed below. \n" + agentic_prompt
)