# Agentic Chunking

In [None]:
!pip install --upgrade --quiet  langchain-core langchain-community langchain-openai

In [None]:
!pip install --quiet pypdf

In [None]:
!pip install --quiet llama-index

In [19]:
!pip install --quiet langchain-mistralai

In [58]:
import re
from google.colab import userdata
from langchain.document_loaders import PyPDFLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_mistralai import ChatMistralAI

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from langchain.document_loaders import PyPDFLoader
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import SimpleDirectoryReader
from langchain_core.documents import Document
from llama_index.core import Document as LlamaIndexDocument
from langchain.embeddings import HuggingFaceEmbeddings

**Loading Data**

In [4]:
loader = PyPDFLoader("/content/data/Understanding_Climate_Change.pdf")
docs = loader.load()
len(docs), type(docs)

(33, list)

In [7]:
vars(docs[0]).keys()

dict_keys(['id', 'metadata', 'page_content', 'type'])

In [11]:
MISTRAL_API_KEY = userdata.get('MISTRAL_API_KEY')

**Creating Prompt for Agentic Chunking**

In [13]:
agentic_prompt = 'Role: You are an agentic chunker. You will be provided with a content.  \
What you have to do:  \
```  \
1. Decompose the content into clear and simple propositions, ensuring they are  \
interpretable out of context.  \
2. Split compound sentence into simple sentences. Maintain the original phrasing from the input whenever possible.  \
3. For any named entity that is accompanied by additional descriptive information, separate this information into its own distinct proposition.  \
4. Decontextualize the proposition by adding necessary modifier to nouns or entire  \
sentence and replacing pronouns (e.g., "it", "he", "she", "they", "this", "that") with the full name of the entities they refer to.  \
5. Present the results as a list of strings, formatted in JSON.  \
```  \
  \
Here is the content: {content}  \
Strictly follow the instructions provided and output in the desired format only.'

**Testing on first 3 pages of PDF**

In [14]:
testing_docs = [docs[i].page_content for i in range(3)]
len(testing_docs), type(testing_docs)

(3, list)

**Creating `Runnable` chains for agentic chunk generation.**

In [37]:
llm = ChatMistralAI(
    mistral_api_key = MISTRAL_API_KEY,
    model="mistral-large-latest",
    temperature=0,
    max_retries=2
)

In [38]:
prompt = ChatPromptTemplate.from_template(f"Complete task as decribed below. \n{agentic_prompt}")

In [39]:
output_parser = StrOutputParser()

In [40]:
chain = prompt | llm | output_parser

In [41]:
chain.invoke({"content": testing_docs[0]})

'```json\n[\n  "Climate change refers to significant, long-term changes in the global climate.",\n  "The term \'global climate\' encompasses the planet\'s overall weather patterns.",\n  "The term \'global climate\' includes temperature.",\n  "The term \'global climate\' includes precipitation.",\n  "The term \'global climate\' includes wind patterns.",\n  "The term \'global climate\' includes these patterns over an extended period.",\n  "Over the past century, human activities have significantly contributed to climate change.",\n  "Human activities include the burning of fossil fuels.",\n  "Human activities include deforestation.",\n  "The Earth\'s climate has changed throughout history.",\n  "Over the past 650,000 years, there have been seven cycles of glacial advance and retreat.",\n  "The abrupt end of the last ice age about 11,700 years ago marked the beginning of the modern climate era.",\n  "The abrupt end of the last ice age about 11,700 years ago marked the beginning of human c

In [43]:
para_props = []
for doc in testing_docs:
    chain.invoke({"content": doc})
    para_props.append(chain.invoke({"content": doc}))

In [45]:
for props in para_props:
    print(props[:200])
    print()

```json
[
  "Climate change refers to significant, long-term changes in the global climate.",
  "The term 'global climate' encompasses the planet's overall weather patterns.",
  "The term 'global clim

```json
[
  "Coal is the most carbon-intensive fossil fuel.",
  "The use of coal for electricity generation is a major source of CO2 emissions.",
  "There is a decline in coal use in some regions.",
 

```json
[
  "Ruminant animals produce methane during digestion.",
  "Cows are ruminant animals.",
  "Sheep are ruminant animals.",
  "Manure management practices contribute to methane emissions.",
  "



In [51]:
props = []
for prop in para_props:
    props.extend(re.findall(r'"([^"]*)"', prop))
len(props), type(props)

(179, list)

In [54]:
for p in props[:10]:
  print(p)

Climate change refers to significant, long-term changes in the global climate.
The term 'global climate' encompasses the planet's overall weather patterns.
The term 'global climate' includes temperature.
The term 'global climate' includes precipitation.
The term 'global climate' includes wind patterns.
The term 'global climate' includes these patterns over an extended period.
Over the past century, human activities have significantly contributed to climate change.
Human activities that have contributed to climate change include the burning of fossil fuels.
Human activities that have contributed to climate change include deforestation.
The Earth's climate has changed throughout history.


*We've obtained all the propostions. Now, we just have to apply any chunking like `semantic` or `recursive` to achieve the desired embeddings.*  
*We can also use LLM for the same task to combine sementically coherent chunks but it will incease the time complexity.*

In [60]:
llama_docs = []
for p in props:
    llama_docs.append(
        LlamaIndexDocument(
            text = p
        )
    )
print("Docs: ", len(docs), type(docs))
print("Llama Docs: ", len(llama_docs), type(llama_docs))

Docs:  33 <class 'list'>
Llama Docs:  179 <class 'list'>


In [61]:
llama_docs[0]

Document(id_='1a672b3d-4c15-49f2-8d3e-1d56547eecd8', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, metadata_template='{key}: {value}', metadata_separator='\n', text_resource=MediaResource(embeddings=None, data=None, text='Climate change refers to significant, long-term changes in the global climate.', path=None, url=None, mimetype=None), image_resource=None, audio_resource=None, video_resource=None, text_template='{metadata_str}\n\n{content}')

In [62]:
index_docs = [{'index': i, 'sentence': llama_docs[i].text} for i, sentence in enumerate(llama_docs)]
len(index_docs), type(index_docs)

(179, list)

**Creating Chunks**

In [None]:
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
embedded_query = embeddings.embed_query('Hugging Face is a life saver! Arigatho!')
len(embedded_query), type(embedded_query)

In [64]:
chunks = []
distances = []
doc_embeddings = []

for i, d in tqdm(enumerate(index_docs)):
    embedded_d = embeddings.embed_query(d['sentence'])
    doc_embeddings.append(np.array(embedded_d))

    if i == 0:
        chunks.append([d])
    else:
      css = cosine_similarity(doc_embeddings[-1].reshape(1, -1), doc_embeddings[-2].reshape(1, -1))
      distances.append(1 - css)
      if css < 0.7:
        chunks[-1].append(d)
      else:
        chunks.append([d])

179it [00:02, 62.03it/s]


In [66]:
print("Number of document embeddings: ", len(doc_embeddings))
print("Number of chunks: ", len(chunks), type(chunks))

Number of document embeddings:  179
Number of chunks:  84 <class 'list'>


In [83]:
for c in chunks[0]:
    print(c)
    # print(c.sentence)

{'index': 0, 'sentence': 'Climate change refers to significant, long-term changes in the global climate.'}
{'index': 1, 'sentence': "The term 'global climate' encompasses the planet's overall weather patterns."}


In [85]:
final_chunks = [
    {
        'index': i,
        'chunk': ". ".join(c['sentence'] for c in chunk),
        'count': len(chunk)
    }
    for i, chunk in enumerate(chunks)
]
len(final_chunks), type(final_chunks)

(84, list)

In [87]:
for chunk in final_chunks[:10]:
    print(chunk)

{'index': 0, 'chunk': "Climate change refers to significant, long-term changes in the global climate.. The term 'global climate' encompasses the planet's overall weather patterns.", 'count': 2}
{'index': 1, 'chunk': "The term 'global climate' includes temperature.", 'count': 1}
{'index': 2, 'chunk': "The term 'global climate' includes precipitation.", 'count': 1}
{'index': 3, 'chunk': "The term 'global climate' includes wind patterns.", 'count': 1}
{'index': 4, 'chunk': "The term 'global climate' includes these patterns over an extended period.. Over the past century, human activities have significantly contributed to climate change.", 'count': 2}
{'index': 5, 'chunk': 'Human activities that have contributed to climate change include the burning of fossil fuels.', 'count': 1}
{'index': 6, 'chunk': "Human activities that have contributed to climate change include deforestation.. The Earth's climate has changed throughout history.. Over the past 650,000 years, there have been seven cycle