In [None]:
import shutil
import sys
import os

from langchain_experimental.text_splitter import SemanticChunker

# Make the 'src' directory under the current working directory importable.
src_path = os.path.join(os.getcwd(), 'src')
if sys.path.count(src_path) == 0:
    sys.path.append(src_path)

print("Updated Python path:", sys.path)

# Turn off bytecode (.pyc) generation, both through the environment variable
# and through the flag on the running interpreter.
os.environ['PYTHONDONTWRITEBYTECODE'] = "1"
sys.dont_write_bytecode = True

def ensure_no_pycache(path):
    """Delete every ``__pycache__`` directory found under ``path``.

    Also sets ``sys.dont_write_bytecode`` to ``True`` so the running
    interpreter stops producing new ``.pyc`` files.

    Args:
        path: Root directory to walk. If it does not exist, nothing is deleted.

    Only real directory entries named ``__pycache__`` are removed; a regular
    file that happens to carry that name is left untouched.
    """
    sys.dont_write_bytecode = True  # Disable the creation of .pyc files
    for dirpath, dirnames, _filenames in os.walk(path):
        if '__pycache__' not in dirnames:
            continue
        # Prune the entry in place so os.walk does not try to descend into
        # the directory that is about to be removed.
        dirnames.remove('__pycache__')
        pycache_path = os.path.join(dirpath, '__pycache__')
        shutil.rmtree(pycache_path)
        print(f"Deleted {pycache_path}")

from dotenv import load_dotenv
import os

# Load environment variables from a '.env' file in the current working
# directory, when one is present.
dotenv_path = '.env'
if os.path.exists(dotenv_path):
    # Pass the path explicitly so the file that gets loaded is the same one
    # whose existence was just checked, rather than whatever load_dotenv's
    # own search would locate.
    load_dotenv(dotenv_path=dotenv_path)

# Remove cached bytecode directories under '../app'.
ensure_no_pycache('../app')

In [None]:
import re

import fitz
import tiktoken
# from llama_index.core.text_splitter import SentenceSplitter
from langchain_experimental.text_splitter import SemanticChunker

from src.utils.llm import LLM
from src.modules.knowledge_base.kb_service import KBService
from src.modules.knowledge_base.kb_repository import KBRepository
from src.modules.embedding_service import EmbeddingService
from src.modules.pdf_parser import PdfParser
from src.modules.text_cleaning_service import CleaningService
from src.modules.text_partition_service import TextPartitionService

In [None]:
# Sample search phrases for the knowledge base.
query_examples = [
    "AI Development",
    "Cosine similarity",
    "Best embedding models",
]

In [None]:
def evaluate_tokens(model, text):
    """Return the number of tokens ``text`` encodes to for ``model``.

    Args:
        model: Model name handed to ``tiktoken.encoding_for_model``.
        text: String to tokenize.

    Returns:
        int: Length of the token sequence produced by the selected encoder.
    """
    try:
        encoder = tiktoken.encoding_for_model(model)
    except KeyError:
        # tiktoken raises KeyError for model names it cannot map to an
        # encoding; use the "o200k_base" encoding in that case instead of
        # failing.
        encoder = tiktoken.get_encoding("o200k_base")

    return len(encoder.encode(text))
        

In [None]:
# Display whatever LLM.get_model() returns as this cell's output.
# NOTE(review): presumably this is the configured model (or its name) — confirm
# against src.utils.llm.
LLM.get_model()

In [None]:
pdf_parser = PdfParser()
text_cleaner = CleaningService()

pages = await pdf_parser.process('ai_adoption_framework_whitepaper.pdf', start_page=2)
page_list = list(pages)
pages = await pdf_parser.process('ai_adoption_framework_whitepaper.pdf', start_page=2)

In [None]:
# Take the page at index 15 of the parsed pages as the working sample and
# clean its content.
page = page_list[15]
text = text_cleaner.clean(page.content)

# repr() makes whitespace and escape characters in the cleaned text visible.
repr(text)

In [None]:
# Split the cleaned sample text with the sentence-based strategy.
partitioner = TextPartitionService()
chunks = [*partitioner.split_by_sentences(text)]
print(chunks)

In [None]:
# Split the same cleaned sample text with the semantic strategy.
partitioner = TextPartitionService()
chunks = [*partitioner.split_semantically(text)]
print(chunks)

In [None]:
# Build each collaborator first (same order as before), then wire them into
# the knowledge-base service.
text_cleaner = CleaningService()
text_splitter = TextPartitionService()
embedder = EmbeddingService()
pdf_parser = PdfParser()
repo = KBRepository()

kb = KBService(
    pdf_parser=pdf_parser,
    text_cleaner=text_cleaner,
    text_partitioner=text_splitter,
    embedder=embedder,
    repository=repo,
)

In [None]:
# Build the documents for the whitepaper, starting at page 2, and display them.
# NOTE(review): this calls the underscore-prefixed `_create_documents` directly,
# presumably to inspect an intermediate step of `kb.create` — confirm.
docs = await kb._create_documents('ai_adoption_framework_whitepaper.pdf', start_page=2)
docs


In [None]:
# NOTE(review): presumably clears the knowledge base's stored entries before
# re-ingesting — confirm against KBService.truncate, since re-running this cell
# would then be destructive.
kb.truncate()

In [None]:
# Run knowledge-base creation for the whitepaper and display the result.
# NOTE(review): no start_page is passed here, unlike the other calls on this
# file in this notebook, which use start_page=2 — confirm this is intentional.
res = await kb.create('ai_adoption_framework_whitepaper.pdf')
res

In [None]:
# Query the knowledge base with a sample phrase and display what comes back.
res = await kb.search("Hero manager")
res