In [1]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import re
import pandas as pd
import numpy as np
from typing import List, Dict, Any
import spacy
from transformers import AutoTokenizer, AutoModel
import torch

# Fetch the NLTK resources into the local nltk_data directory. Each call is a
# no-op when the package is already up to date.
# 'punkt' provides the sentence-tokenizer models behind sent_tokenize.
# NOTE(review): newer NLTK releases look up 'punkt_tab' instead of 'punkt' for
# sent_tokenize/word_tokenize — confirm against the installed NLTK version.
nltk.download('punkt')
# NOTE(review): no POS tagging is visible in this part of the notebook;
# presumably the tagger is used by a later cell — verify before removing.
nltk.download('averaged_perceptron_tagger')

  return torch._C._cuda_getDeviceCount() > 0
[nltk_data] Downloading package punkt to /home/dhruv/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/dhruv/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [2]:
class ContentPreprocessor:
    """Turn raw text into cleaned chunks, document metadata and embeddings.

    The pipeline is: sentence-based chunking -> per-chunk text cleaning ->
    one embedding vector per cleaned chunk, plus spaCy-derived metadata
    computed on the original (uncleaned) text.
    """

    def __init__(self):
        """Load the spaCy pipeline and the BERT tokenizer/model.

        Loading pulls 'en_core_web_sm' and 'bert-base-uncased', so building
        an instance is expensive; create it once and reuse it.
        """
        self.nlp = spacy.load('en_core_web_sm')
        self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
        self.model = AutoModel.from_pretrained('bert-base-uncased')

    def process_content(self, text: str) -> Dict[str, Any]:
        """Run the full preprocessing pipeline on ``text``.

        Args:
            text: Raw input document.

        Returns:
            A dict with three keys:
              * ``'chunks'``: list of cleaned chunk strings,
              * ``'metadata'``: the dict built by ``metadata_extraction``,
              * ``'embeddings'``: array with one row per cleaned chunk
                (zero rows when ``text`` yields no chunks).
        """
        chunks = self.content_chunking(text)
        cleaned_chunks = self.text_cleaning(chunks)
        # Metadata is computed on the raw text so that entities keep their
        # original casing and punctuation.
        metadata = self.metadata_extraction(text)

        return {
            'chunks': cleaned_chunks,
            'metadata': metadata,
            'embeddings': self.generate_embeddings(cleaned_chunks)
        }

    def content_chunking(self, text: str, chunk_size: int = 512) -> List[str]:
        """Group consecutive sentences into chunks of at most ``chunk_size`` words.

        Sentences are never split. A single sentence that is longer than
        ``chunk_size`` therefore becomes its own, oversized chunk.

        Args:
            text: Raw input document.
            chunk_size: Maximum number of ``word_tokenize`` tokens per chunk.
                This counts NLTK word tokens, not the tokens produced by
                ``self.tokenizer``.

        Returns:
            The chunks in document order; each chunk is its sentences joined
            by a single space. Empty when ``text`` contains no sentences.
        """
        chunks: List[str] = []
        current_chunk: List[str] = []
        current_length = 0

        for sentence in sent_tokenize(text):
            sentence_length = len(word_tokenize(sentence))
            # Flush only when something is buffered: an oversized sentence
            # arriving on an empty buffer must not emit an empty '' chunk.
            if current_chunk and current_length + sentence_length > chunk_size:
                chunks.append(' '.join(current_chunk))
                current_chunk = []
                current_length = 0
            current_chunk.append(sentence)
            current_length += sentence_length

        # Emit whatever is left in the buffer after the last sentence.
        if current_chunk:
            chunks.append(' '.join(current_chunk))

        return chunks

    def text_cleaning(self, chunks: List[str]) -> List[str]:
        """Normalise every chunk: strip punctuation, squeeze whitespace, lowercase.

        Args:
            chunks: Chunk strings to clean.

        Returns:
            A new list of the same length and order. A chunk made only of
            punctuation/whitespace becomes the empty string.
        """
        cleaned_chunks = []
        for chunk in chunks:
            # Drop every character that is neither a word character nor whitespace.
            cleaned = re.sub(r'[^\w\s]', '', chunk)
            # Collapse whitespace runs (including newlines) into one space.
            cleaned = re.sub(r'\s+', ' ', cleaned)
            cleaned_chunks.append(cleaned.strip().lower())
        return cleaned_chunks

    def metadata_extraction(self, text: str) -> Dict[str, Any]:
        """Collect document-level metadata from the raw text.

        Args:
            text: Raw input document.

        Returns:
            A dict with:
              * ``'entities'``: ``(text, label)`` tuples for each entity in
                the spaCy ``doc.ents``,
              * ``'word_count'``: number of ``word_tokenize`` tokens,
              * ``'sentence_count'``: number of ``sent_tokenize`` sentences,
              * ``'language'``: the value of ``doc.lang_``.
        """
        doc = self.nlp(text)

        return {
            'entities': [(ent.text, ent.label_) for ent in doc.ents],
            'word_count': len(word_tokenize(text)),
            'sentence_count': len(sent_tokenize(text)),
            'language': doc.lang_
        }

    def generate_embeddings(self, chunks: List[str]) -> np.ndarray:
        """Embed each chunk with ``self.model`` and stack the vectors.

        For every chunk the vector at sequence position 0 of the model's
        ``last_hidden_state`` is used as the chunk embedding.

        Args:
            chunks: Chunk strings to embed. Chunks are tokenized with
                ``truncation=True``, so the tokenizer may cut long chunks.

        Returns:
            Array of shape ``(len(chunks), hidden_size)``. For an empty
            ``chunks`` list an empty ``(0, hidden_size)`` float32 array is
            returned instead of letting ``np.vstack`` raise on no input.
        """
        if not chunks:
            # np.vstack([]) raises ValueError; keep the 2-D contract instead.
            return np.empty((0, self.model.config.hidden_size), dtype=np.float32)

        embeddings = []

        # Inference only: no gradients are needed.
        with torch.no_grad():
            for chunk in chunks:
                inputs = self.tokenizer(chunk, return_tensors='pt',
                                        padding=True, truncation=True)
                outputs = self.model(**inputs)
                # Position 0 of each sequence; shape (1, hidden_size) per chunk.
                embedding = outputs.last_hidden_state[:, 0, :].numpy()
                embeddings.append(embedding)

        return np.vstack(embeddings)