In [None]:
import numpy as np
import pandas as pd
import re
import torch
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
import matplotlib.pyplot as plt
import nltk
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel


In [None]:
pip install hdbscan

In [None]:

import hdbscan
class TextChunker:
    """Group the sentences of a text into chunks by clustering their embeddings.

    Sentences are split with NLTK, stripped of punctuation, embedded with a
    SentenceTransformer model and clustered with HDBSCAN. Every cluster
    becomes one chunk.
    """

    # Matches every character that is neither a word character nor whitespace.
    _PUNCTUATION = re.compile(r'[^\w\s]')

    def __init__(self, embedding_model='thenlper/gte-large'):
        """Load the sentence-embedding model.

        Args:
            embedding_model: Model name or path handed to
                ``SentenceTransformer``.
        """
        self.embedding_model = SentenceTransformer(embedding_model)

    def _get_embeddings(self, texts):
        """Return the embeddings of ``texts`` as produced by the model's ``encode``."""
        return self.embedding_model.encode(texts)

    def _preprocess_text(self, text):
        """Split ``text`` into sentences and strip punctuation from each one.

        Args:
            text: A single string.

        Returns:
            A list of cleaned, non-empty sentences. Empty or whitespace-only
            input yields an empty list without invoking the tokenizer.
        """
        if not text or not text.strip():
            return []
        # The file imports ``nltk`` but never ``sent_tokenize`` itself, so the
        # tokenizer is reached through the package namespace.
        # NOTE(review): NLTK's sentence tokenizer needs its Punkt data to be
        # downloaded beforehand -- confirm this happens elsewhere.
        stripped = (
            self._PUNCTUATION.sub('', sentence).strip()
            for sentence in nltk.sent_tokenize(text)
        )
        # Filter *after* cleaning: a sentence made only of punctuation is
        # empty once stripped and must not reach the embedding model.
        return [sentence for sentence in stripped if sentence]

    def chunk_text(self, text, max_clusters=35, min_cluster_size=2, min_samples=3):
        """Cluster the sentences of ``text`` into chunks.

        Args:
            text: A string or an iterable of strings.
            max_clusters: Unused. HDBSCAN determines the number of clusters
                itself; the parameter is kept so existing callers keep
                working.
            min_cluster_size: ``min_cluster_size`` passed to HDBSCAN.
            min_samples: ``min_samples`` passed to HDBSCAN.

        Returns:
            A dict with three keys:

            * ``'chunks'``: one string per label, in order of first
              appearance; the sentences of a label are joined with ``'. '``
              and terminated with ``'.'``. Sentences labelled as noise
              (``-1``) are collected into a chunk of their own so that no
              text is dropped.
            * ``'n_chunks'``: number of real clusters, *excluding* the noise
              label. It is therefore one less than ``len(chunks)`` whenever
              noise is present.
            * ``'sentences_per_chunk'``: sentence count of each entry in
              ``'chunks'``, in the same order.
        """
        if isinstance(text, str):
            text = [text]

        sentences = []
        for part in text:
            sentences.extend(self._preprocess_text(part))

        # Nothing to embed or cluster.
        if not sentences:
            return {'chunks': [], 'n_chunks': 0, 'sentences_per_chunk': []}

        if len(sentences) < min_cluster_size:
            # Too few sentences to form even one cluster: treat them all as
            # noise instead of handing a degenerate input to the clusterer.
            cluster_labels = [-1] * len(sentences)
        else:
            embeddings = self._get_embeddings(sentences)
            clusterer = hdbscan.HDBSCAN(
                min_cluster_size=min_cluster_size,
                min_samples=min_samples,
                metric='euclidean',
                cluster_selection_method='eom',
            )
            # Convert to plain Python ints so they make clean dict keys.
            cluster_labels = clusterer.fit_predict(embeddings).tolist()

        # Group sentences by label, preserving first-appearance order.
        clustered_sentences = {}
        for sentence, label in zip(sentences, cluster_labels):
            clustered_sentences.setdefault(label, []).append(sentence)

        groups = list(clustered_sentences.values())
        return {
            'chunks': ['. '.join(group) + '.' for group in groups],
            # Count real clusters only; -1 is HDBSCAN's noise label.
            'n_chunks': sum(1 for label in clustered_sentences if label != -1),
            'sentences_per_chunk': [len(group) for group in groups],
        }
