In [1]:
import requests
from config import OPENAI_API_KEY, EMBEDDING_MODEL, EMBEDDING_URL


def local_embedding(inputs, timeout=30):
    """Get embeddings from the embedding service.

    Posts ``inputs`` together with ``EMBEDDING_MODEL`` as a JSON body to
    ``EMBEDDING_URL``, sending ``OPENAI_API_KEY`` as a bearer token.

    Args:
        inputs: Value sent as the request's ``"input"`` field (the notebook
            passes a list of strings).
        timeout: Seconds handed to ``requests.post``. ``requests`` applies no
            timeout on its own, so without it a stalled service would block
            the caller indefinitely.

    Returns:
        A list with the ``"embedding"`` value of every item in the response's
        ``"data"`` list, in the order the service returned them.

    Raises:
        requests.HTTPError: If the service replies with an error status.
        requests.Timeout: If the service does not answer within ``timeout``.
    """

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {OPENAI_API_KEY}"
    }
    data = {
        "input": inputs,
        "model": EMBEDDING_MODEL
    }

    response = requests.post(EMBEDDING_URL, headers=headers, json=data, timeout=timeout)
    # Surface HTTP errors here rather than as a KeyError/JSON error when the
    # body of an error response is indexed below.
    response.raise_for_status()
    return [item['embedding'] for item in response.json()['data']]

In [6]:
# Two short sample sentences to send to the embedding endpoint.
inputs = ["Hello world!", "How are you?"]

# Request headers and JSON body for a manual call to the endpoint.
headers = {"Content-Type": "application/json", "Authorization": f"Bearer {OPENAI_API_KEY}"}
data = {"input": inputs, "model": EMBEDDING_MODEL}

In [7]:
# requests applies no timeout by default; bound the call so a stalled service
# cannot hang the kernel.
response = requests.post(EMBEDDING_URL, headers=headers, json=data, timeout=30)
# Fail in this cell on an HTTP error status instead of later when the JSON
# body is indexed.
response.raise_for_status()

In [16]:
# Print the Python type of the first component of each returned embedding.
for embed in response.json()['data']:
    first_component = embed['embedding'][0]
    print(type(first_component))

<class 'float'>
<class 'float'>


In [3]:
from __future__ import annotations

import os
from io import BytesIO
import fitz  # PyMuPDF
import tiktoken
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from typing import List, IO, Callable
import streamlit as st
from config import TEST_PDFS_DIR


def load_documents(uploaded_files: List[IO]) -> List[Document]:
    """Turn uploaded PDF files into text documents.

    Each item of ``uploaded_files`` (e.g. the objects produced by Streamlit's
    file_uploader) is read fully into memory, opened with PyMuPDF, and the
    text of all its pages is concatenated without a separator.

    Args:
        uploaded_files: File-like objects exposing ``read()`` and ``name``.

    Returns:
        One ``Document`` per file that yielded any text, with the file's name
        stored under the ``"file_name"`` metadata key. Files whose extracted
        text is empty are left out.
    """
    loaded = []
    for upload in uploaded_files:
        buffer = BytesIO(upload.read())
        with fitz.open(stream=buffer, filetype="pdf") as pdf:
            page_texts = [page.get_text() for page in pdf]
        full_text = "".join(page_texts)
        if not full_text:
            # Nothing extractable from this file; skip it.
            continue
        loaded.append(Document(page_content=full_text, metadata={"file_name": upload.name}))
    return loaded


def _load_local_documents(pdf_directory: str) -> List[Document]:
    """
    For testing: Load multiple PDF files directly from a local folder.

    Returns the same type as load_documents → List[Document]
    """
    documents = []
    pdf_files = os.listdir(pdf_directory)

    if not pdf_files:
        print(f"No PDF files found in '{pdf_directory}'")
        return documents

    for file_name in pdf_files:
        try:
            file_path = os.path.join(pdf_directory, file_name)
            print(file_path)
            with fitz.open(file_path) as doc:
                text = "".join(page.get_text() for page in doc)
            if text:
                documents.append(Document(page_content=text, metadata={"file_name": file_name}))
        except Exception as e:
            print(f"Error processing {file_name}: {e}")

    print(f"Loaded {len(documents)} document(s) from '{pdf_directory}'")
    return documents


def num_tokens_from_string(string: str) -> int:
    """
    Count the tokens in ``string`` under tiktoken's ``cl100k_base`` encoding.

    Meant to be used as a length function for text splitting, so that chunk
    sizes are measured in tokens rather than characters.
    """
    tokenizer = tiktoken.get_encoding('cl100k_base')
    token_ids = tokenizer.encode(string)
    return len(token_ids)


def split_documents_to_text_chunks(
    documents: List[Document],
    *,
    chunk_size: int = 512,
    chunk_overlap: int = 64,
    length_function: Callable[[str], int] = num_tokens_from_string,
) -> List[Document]:
    """
    Split a list of documents into chunks using LangChain's
    RecursiveCharacterTextSplitter.

    Args:
        documents: Documents to split.
        chunk_size: Maximum chunk length, measured by ``length_function``.
        chunk_overlap: Overlap between neighbouring chunks, measured by
            ``length_function``.
        length_function: Maps a string to its length; defaults to a token
            count, so sizes are in tokens rather than characters.

    Returns: List[Document] — ``split_documents`` yields ``Document`` chunks
    (not bare strings); read ``.page_content`` for a chunk's text.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=length_function
    )
    return splitter.split_documents(documents)

In [4]:
# Load the test PDFs, then split them into chunks.
# length_function is left at its default, which is num_tokens_from_string.
documents = _load_local_documents(TEST_PDFS_DIR)
chunks = split_documents_to_text_chunks(documents)

C:\DAHOU\Business\go_tech\chat-pdf\data\test_data\AttentionIsAllYouNeed.pdf
C:\DAHOU\Business\go_tech\chat-pdf\data\test_data\TrainingLanguageModelsToFollowInstructionsWithHumanFeedback.pdf
Loaded 2 document(s) from 'C:\DAHOU\Business\go_tech\chat-pdf\data\test_data'


In [6]:
# Display the first chunk produced by the splitter.
chunks[0]

Document(metadata={'file_name': 'AttentionIsAllYouNeed.pdf'}, page_content='Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗†\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz Kaiser∗\nGoogle Brain\nlukaszkaiser@google.com\nIllia Polosukhin∗‡\nillia.polosukhin@gmail.com\nAbstract\nThe dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks that include an encoder and a decoder. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the Tra

In [8]:
# Show the first chunk's attributes as a dict.
vars(chunks[0])

{'id': None,
 'metadata': {'file_name': 'AttentionIsAllYouNeed.pdf'},
 'page_content': 'Provided proper attribution is provided, Google hereby grants permission to\nreproduce the tables and figures in this paper solely for use in journalistic or\nscholarly works.\nAttention Is All You Need\nAshish Vaswani∗\nGoogle Brain\navaswani@google.com\nNoam Shazeer∗\nGoogle Brain\nnoam@google.com\nNiki Parmar∗\nGoogle Research\nnikip@google.com\nJakob Uszkoreit∗\nGoogle Research\nusz@google.com\nLlion Jones∗\nGoogle Research\nllion@google.com\nAidan N. Gomez∗†\nUniversity of Toronto\naidan@cs.toronto.edu\nŁukasz Kaiser∗\nGoogle Brain\nlukaszkaiser@google.com\nIllia Polosukhin∗‡\nillia.polosukhin@gmail.com\nAbstract\nThe dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks that include an encoder and a decoder. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architect