[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pinecone-io/examples/blob/master/generation/langchain/handbook/xx-langchain-chunking.ipynb) [![Open nbviewer](https://raw.githubusercontent.com/pinecone-io/examples/master/assets/nbviewer-shield.svg)](https://nbviewer.org/github/pinecone-io/examples/blob/master/generation/langchain/handbook/xx-langchain-chunking.ipynb)

## Required Libraries

In [None]:
%pip install -qU langchain tiktoken matplotlib seaborn tqdm

## Preparing Data

In [None]:
from bs4 import BeautifulSoup
from pydantic import BaseModel, Field
from pathlib import Path
import re


class Document(BaseModel):
    """A piece of page text together with free-form metadata about it."""
    # Plain-text content of the document.
    page_content: str
    # Arbitrary metadata; every instance gets its own fresh dict by default.
    metadata: dict = Field(default_factory=dict)

class ReadDocLoader():
    """Load every file under a directory tree as a plain-text ``Document``.

    Each file is parsed as HTML, reduced to its text with all whitespace runs
    collapsed to single spaces, and tagged with its path under the
    ``"source"`` metadata key.
    """

    def __init__(self, path: str):
        """Remember the root directory to scan.

        Args:
            path: Directory that is searched recursively for files.
        """
        self.file_path = path

    @staticmethod
    def _clean_data(data: str) -> str:
        """Strip HTML markup from *data* and normalise its whitespace."""
        soup = BeautifulSoup(data, "html.parser")
        text = soup.get_text().strip()
        # Collapse every run of spaces/newlines into one space. The text was
        # stripped first, so no leading/trailing space can remain afterwards.
        return re.sub(r"\s+", " ", text)

    def load(self):
        """Read and clean every file below ``self.file_path``.

        Returns:
            A list of ``Document`` objects, one per file, ordered by path so
            that positional indexing (``docs[5]``) is reproducible across
            runs and machines.

        Raises:
            UnicodeDecodeError: If a file is not valid UTF-8.
        """
        docs = []
        # rglob yields entries in arbitrary filesystem order; sort it.
        for p in sorted(Path(self.file_path).rglob("*")):
            if not p.is_file():
                continue
            # Decode explicitly as UTF-8 instead of the platform default,
            # which differs between operating systems.
            text = self._clean_data(p.read_text(encoding="utf-8"))
            docs.append(
                Document(page_content=text, metadata={"source": str(p)})
            )
        return docs

In [None]:
# Load every file found under the local 'saved_pages' directory.
loader = ReadDocLoader('saved_pages')
docs = loader.load()
# Number of documents loaded (shown as the cell output).
len(docs)

In [None]:
# Inspect one loaded Document (content and metadata) as a sanity check.
docs[10]

We access the plaintext page content like so:

In [None]:
# Cleaned text of the first loaded document.
print(docs[0].page_content)

In [None]:
# Cleaned text of another document, for comparison.
print(docs[5].page_content)

We can also find the source of each document:

In [None]:
# Swap the local folder prefix for the URL scheme to turn the stored file
# path back into a URL string.
docs[5].metadata['source'].replace('saved_pages/', 'https://')

In [None]:
import tiktoken

tokenizer = tiktoken.get_encoding('cl100k_base')


def tiktoken_len(text):
    """Return how many tokens the module-level ``tokenizer`` produces for *text*.

    ``disallowed_special=()`` is passed so that no special-token text is
    treated as disallowed during encoding.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

In [None]:
# Show which encoding tiktoken maps gpt-3.5-turbo to — presumably a check
# that it matches the 'cl100k_base' encoding chosen above.
tiktoken.encoding_for_model('gpt-3.5-turbo')

In [None]:
# Token length of every loaded page, in the same order as `docs`.
token_counts = [
    tiktoken_len(page.page_content)
    for page in docs
]

In [None]:
# Summary statistics of the per-document token counts, one per line.
print(
    f"Min: {min(token_counts)}",
    f"Avg: {int(sum(token_counts) / len(token_counts))}",
    f"Max: {max(token_counts)}",
    sep="\n",
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot the distribution of per-document token counts.
# Global seaborn styling must be set before the figure is created.
sns.set_style("whitegrid")
sns.set_palette("muted")

# 12x6 inch figure; plain 50-bin histogram without a density curve
plt.figure(figsize=(12, 6))
sns.histplot(token_counts, kde=False, bins=50)

# title and axis labels
plt.title("Token Counts Histogram")
plt.xlabel("Token Count")
plt.ylabel("Frequency")

plt.show()

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunk sizes are measured with tiktoken_len, i.e. in tokens rather than
# characters.
# NOTE: text produced by ReadDocLoader has every whitespace run collapsed to a
# single space, so for those documents the '\n\n' and '\n' separators cannot
# match and splitting effectively happens on ' ' (then '').
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,  # maximum chunk length, in units of length_function
    chunk_overlap=20,  # number of tokens overlap between chunks
    length_function=tiktoken_len,
    separators=['\n\n', '\n', ' ', '']
)

In [None]:
# Split a single document as a trial run and show how many chunks result.
chunks = text_splitter.split_text(docs[5].page_content)
len(chunks)

In [None]:
# Token lengths of the first two chunks, to compare against chunk_size.
tiktoken_len(chunks[0]), tiktoken_len(chunks[1])

In [None]:
import hashlib
# NOTE: a hashlib object accumulates every update() call. If `m` is reused for
# another URL, the digest covers all inputs fed so far, not that URL alone.
m = hashlib.md5()  # this will convert URL into unique ID

# rebuild the page URL from the stored file path
url = docs[5].metadata['source'].replace('saved_pages/', 'https://')
print(url)

# convert URL to unique ID: first 12 hex characters of its md5 digest
m.update(url.encode('utf-8'))
uid = m.hexdigest()[:12]
print(uid)

In [None]:
# One record per chunk of the sample document; the chunk's position is
# appended to the document ID so each record ID is distinct.
data = [
    {'id': f'{uid}-{position}', 'text': piece, 'source': url}
    for position, piece in enumerate(chunks)
]
data

In [None]:
import hashlib

from tqdm.auto import tqdm

# Chunk every loaded document into records of the form
# {'id': '<url-hash>-<chunk index>', 'text': <chunk>, 'source': <url>}.
documents = []

for doc in tqdm(docs):
    # Replace the folder prefix *including its slash*; dropping only
    # 'saved_pages' would leave 'https:///...' with three slashes.
    url = doc.metadata['source'].replace('saved_pages/', 'https://')
    # Use a fresh md5 object per URL. A shared hasher accumulates all earlier
    # update() calls, so the ID would depend on processing order and on
    # previously executed cells instead of on this URL alone.
    uid = hashlib.md5(url.encode('utf-8')).hexdigest()[:12]
    chunks = text_splitter.split_text(doc.page_content)
    documents.extend(
        {'id': f'{uid}-{i}', 'text': chunk, 'source': url}
        for i, chunk in enumerate(chunks)
    )

len(documents)

In [None]:
import json

# Persist the records as JSON Lines: one JSON object per line.
with open('train.jsonl', 'w') as f:
    f.writelines(json.dumps(record) + '\n' for record in documents)

In [None]:
# Reload the records from disk, parsing one JSON object per line.
documents = []

with open('train.jsonl', 'r') as f:
    documents.extend(json.loads(raw_line) for raw_line in f)

len(documents)

In [None]:
# Peek at the first reloaded record.
documents[0]

In [None]:
# Reshape each record, moving the source URL under a nested 'metadata' dict.
# NOTE: this rebinds `documents`; the new records no longer have a 'source'
# key, so running this cell a second time raises KeyError.
documents = [
    {
        'id': record['id'],
        'text': record['text'],
        'metadata': {'url': record['source']},
    }
    for record in documents
]

documents[0]

In [None]:
import os

# Read the API token from the environment.
# NOTE(review): os.environ.get returns None when the variable is unset, and
# the Authorization header would then be built as "Bearer None".
BEARER_TOKEN = os.environ.get("BEARER_TOKEN")

In [None]:
# HTTP headers sent with every request to the endpoint (bearer-token auth).
headers = {
    "Authorization": f"Bearer {BEARER_TOKEN}",
}

In [None]:
import requests
from requests.adapters import HTTPAdapter, Retry
from tqdm.auto import tqdm

batch_size = 100
endpoint_url = "https://plankton-app-6cv28.ondigitalocean.app"
s = requests.Session()

# Retry transient 5xx responses. POST has to be listed explicitly because
# urllib3 only retries idempotent methods on status codes by default.
# Re-sending a batch is presumably safe since records are upserted by id.
retries = Retry(
    total=5,  # number of retries before raising error
    backoff_factor=0.1,
    status_forcelist=[500, 502, 503, 504],
    allowed_methods=["POST"],  # keyword requires urllib3 >= 1.26
)
# The endpoint is served over HTTPS, so the retrying adapter must be mounted
# for that scheme as well; mounting only 'http://' never applies to it.
adapter = HTTPAdapter(max_retries=retries)
s.mount('http://', adapter)
s.mount('https://', adapter)

for i in tqdm(range(0, len(documents), batch_size)):
    # slicing clamps at the end of the list, so the last batch may be shorter
    res = s.post(
        f"{endpoint_url}/upsert",
        headers=headers,
        json={
            "documents": documents[i:i + batch_size]
        }
    )
    # Fail loudly on an error response instead of silently losing a batch.
    res.raise_for_status()

In [None]:
# Send a list of query objects to the /query route in one request.
queries = [
    {'query': "What are the rate limits for Exchange/Pro?"},
]

# Uses a plain requests.post (not the retrying session `s`).
res = requests.post(
    f"{endpoint_url}/query",
    headers=headers,
    json={
        'queries': queries
    }
)
# Display the response object (status) as the cell output.
res

In [None]:
# Raw decoded JSON body of the query response.
print(res.json())

In [None]:
# Pretty-print each query followed by its hits as "<score>: <text>" lines,
# framed by 70-dash rules.
rule = "-" * 70
for query_result in res.json()['results']:
    hits = [
        (hit['text'], round(hit['score'], 2))
        for hit in query_result['results']
    ]
    listing = "\n".join(f"{score}: {text}" for text, score in hits)
    print(rule + "\n" + query_result['query'] + "\n\n" + listing + "\n" + rule + "\n\n")