In [10]:
import os 
import json 
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import chromadb 
from chromadb.utils import embedding_functions 
from pathlib import Path

In [11]:
# filesystem locations used by the rest of the notebook
PROCESSED_DIR = Path('data') / 'processed'
CLEANED_SECTIONS_JSON = PROCESSED_DIR / 'cleaned_sections.json'

# the vector store directory is created up front (no error if it already exists)
VECTORSTORE_DIR = Path('vectorstore')
VECTORSTORE_DIR.mkdir(parents=True, exist_ok=True)

In [12]:
# read the cleaned, sectioned text from disk and parse it as JSON
cleaned_sections = json.loads(CLEANED_SECTIONS_JSON.read_text(encoding='utf-8'))

In [13]:
# names of the embedding model and the Chroma collection, kept in one place
EMBEDDING_MODEL_NAME = 'all-MiniLM-L6-v2'
COLLECTION_NAME = 'academic_papers'

# initialize embeddings function for chromadb
embedding_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name=EMBEDDING_MODEL_NAME,
)

# initialize chromadb
# PersistentClient documents `path` as a string, so convert the Path explicitly
# rather than relying on a given chromadb version accepting os.PathLike.
chroma_client = chromadb.PersistentClient(path=str(VECTORSTORE_DIR))

# reuse the collection if it already exists in the persistent store,
# otherwise create it with the embedding function above
collection = chroma_client.get_or_create_collection(
    name=COLLECTION_NAME,
    embedding_function=embedding_fn,
)

In [14]:
# create chunk function
def chunk_text(text, chunk_size=200, overlap=50):
    '''
    Split text into overlapping chunks of whitespace-separated words.

    Consecutive chunks start ``chunk_size - overlap`` words apart, so each
    chunk repeats the last ``overlap`` words of the one before it. Chunking
    stops as soon as a chunk reaches the final word, so no trailing chunk is
    produced that would merely repeat the tail of the previous chunk.

    Parameters
    ----------
    text : str
        Text to split; words are obtained with ``str.split()``.
    chunk_size : int
        Maximum number of words per chunk. Must be positive.
    overlap : int
        Number of words shared between consecutive chunks.
        Must satisfy ``0 <= overlap < chunk_size``.

    Returns
    -------
    list[str]
        The chunks, each a single-space-joined run of words. Empty when
        ``text`` contains no words.

    Raises
    ------
    ValueError
        If ``chunk_size`` is not positive, or ``overlap`` is negative or not
        smaller than ``chunk_size`` (the window would never advance).
    '''
    if chunk_size <= 0:
        raise ValueError('chunk_size must be a positive integer')
    if not 0 <= overlap < chunk_size:
        raise ValueError('overlap must satisfy 0 <= overlap < chunk_size')

    words = text.split()
    step = chunk_size - overlap  # guaranteed >= 1 by the checks above
    chunks = []

    for start in range(0, len(words), step):
        end = start + chunk_size
        chunks.append(' '.join(words[start:end]))
        if end >= len(words):
            # this chunk already covers the last word; a further window
            # would only be a suffix of it
            break
    return chunks

In [15]:
# generate embeddings and populate chroma
for fname, sections in tqdm(cleaned_sections.items()):
    for section_name, text in sections.items():
        chunks = chunk_text(text)
        if not chunks:
            # a section with no words yields no chunks; Chroma rejects
            # empty id/document lists, so there is nothing to store
            continue

        # ids are derived from file name, section name and chunk position
        ids = [f'{fname}_{section_name}_{i}' for i in range(len(chunks))]
        metadatas = [{'source': fname, 'section': section_name} for _ in chunks]

        # upsert instead of add: the collection lives in a persistent store,
        # so re-running this cell must overwrite existing ids rather than
        # collide with (or silently keep stale) earlier entries
        collection.upsert(
            documents=chunks,
            metadatas=metadatas,
            ids=ids,
        )

  0%|          | 0/31 [00:00<?, ?it/s]

100%|██████████| 31/31 [00:02<00:00, 10.36it/s]
