In [1]:
import os
import json
import ollama
import numpy as np
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from tqdm import tqdm
from transformers import AutoTokenizer
from typing import List

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Maximum chunk length handed to the text splitter as `chunk_size` in split_docs.
# NOTE(review): presumably measured in tokens of the Huggingface tokenizer, since the splitter
# is built with from_huggingface_tokenizer — TODO confirm.
CHUNK_SIZE = 384
# Overlap between consecutive chunks, handed to the text splitter as `chunk_overlap`.
CHUNK_OVERLAP = 96

In [3]:
def load_docs(file_path: str) -> List[Document]:
    """
    Load pdfs from the given file path. Each pdf is loaded and converted to a Document object.

    The text of all pages of a pdf is joined with newlines into a single Document whose
    ``source`` metadata is the path of that pdf. Only directory entries ending in ".pdf"
    are considered, and they are processed in sorted order.

    Args:
        file_path (str): Path to the folder containing pdfs

    Returns:
        List[Document]: List of Document objects, one per pdf
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the document order (and
    # every positional index derived from it later) stable between runs and machines.
    pdfs = sorted(
        os.path.join(file_path, name)
        for name in os.listdir(file_path)
        if name.endswith(".pdf")
    )

    docs = []
    for pdf in tqdm(pdfs):
        pages = PyPDFLoader(pdf).load()
        text = "\n".join(page.page_content for page in pages)

        # Use the pdf path itself as the source instead of reading it from the last page:
        # that loop variable is undefined (or stale from the previous pdf) when a file
        # yields no pages.
        # NOTE(review): assumes PyPDFLoader reports the path it was given as "source" — confirm.
        docs.append(Document(page_content=text, metadata={"source": pdf}))

    return docs


def split_docs(docs: List[Document], tokenizer: AutoTokenizer) -> List[Document]:
    """
    Break every document into chunks with a RecursiveCharacterTextSplitter.

    The splitter is built from the given Huggingface tokenizer using the module-level
    CHUNK_SIZE and CHUNK_OVERLAP settings. Each resulting chunk gets a ``chunk_idx``
    metadata entry holding its position within the document it came from.

    Args:
        docs (List[Document]): Documents to split
        tokenizer (AutoTokenizer): Huggingface tokenizer used to build the splitter

    Returns:
        List[Document]: All chunks of all documents, in input order
    """
    splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
        tokenizer, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
    )

    chunks_all = []
    for document in docs:
        # Split one document at a time so chunk_idx restarts at 0 for each document.
        pieces = splitter.split_documents([document])
        for position, piece in enumerate(pieces):
            piece.metadata["chunk_idx"] = position
            chunks_all.append(piece)

    return chunks_all

In [4]:
# Folder that load_docs scans for files ending in ".pdf" (relative path).
file_path = "../../data/pdfs"
# Tokenizer passed to split_docs, where it is used to build the text splitter.
# NOTE(review): the recorded output of this cell shows a "Token indices sequence length is
# longer than the specified maximum sequence length" warning; presumably harmless because
# the tokenizer only measures chunk length here — TODO confirm.
tokenizer = AutoTokenizer.from_pretrained("hkunlp/instructor-xl")
# One Document per pdf, with all pages joined into a single text.
docs = load_docs(file_path)
# Chunked Documents; each carries a "chunk_idx" metadata entry added by split_docs.
docs_all = split_docs(docs, tokenizer)

 99%|█████████▊| 74/75 [01:08<00:01,  1.64s/it]Ignoring wrong pointing object 22 0 (offset 0)
Ignoring wrong pointing object 24 0 (offset 0)
Ignoring wrong pointing object 45 0 (offset 0)
Ignoring wrong pointing object 48 0 (offset 0)
Ignoring wrong pointing object 53 0 (offset 0)
Ignoring wrong pointing object 63 0 (offset 0)
Ignoring wrong pointing object 69 0 (offset 0)
Ignoring wrong pointing object 71 0 (offset 0)
Ignoring wrong pointing object 75 0 (offset 0)
Ignoring wrong pointing object 84 0 (offset 0)
Ignoring wrong pointing object 89 0 (offset 0)
Ignoring wrong pointing object 91 0 (offset 0)
Ignoring wrong pointing object 97 0 (offset 0)
Ignoring wrong pointing object 102 0 (offset 0)
Ignoring wrong pointing object 111 0 (offset 0)
100%|██████████| 75/75 [01:08<00:00,  1.09it/s]
Token indices sequence length is longer than the specified maximum sequence length for this model (791 > 512). Running this sequence through the model will result in indexing errors


In [5]:
def save_document_as_json(doc, filename):
    """
    Write a document's text and metadata to a JSON file.

    The file contains an object with the keys "page_content" and "metadata", encoded as
    UTF-8 and indented by 4 spaces. An existing file is overwritten.

    Args:
        doc: Object exposing ``page_content`` and ``metadata`` attributes (e.g. a Document)
        filename (str): Path of the JSON file to write; missing parent folders are created

    Raises:
        TypeError: If the metadata holds values the json module cannot serialize
    """
    doc_data = {
        "page_content": doc.page_content,
        "metadata": doc.metadata
    }

    # Create the output folder on demand so a fresh checkout does not fail with
    # FileNotFoundError; a bare file name has no parent folder to create.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, "w", encoding="utf-8") as f:
        json.dump(doc_data, f, indent=4)

In [6]:
# Pick the positions (into docs_all) of the chunks that will be rephrased.
# A seeded generator makes the draw repeatable after a kernel restart, and replace=False
# guarantees distinct chunks (np.random.randint samples with replacement, so the same
# chunk could be selected more than once).
N_SAMPLES = 100
SAMPLE_SEED = 42
rng = np.random.default_rng(SAMPLE_SEED)
# Cap the sample size so the draw also works when fewer than N_SAMPLES chunks exist.
idx = rng.choice(len(docs_all), size=min(N_SAMPLES, len(docs_all)), replace=False)

In [8]:
# Rephrase every sampled chunk through ollama.chat and store each rewrite as its own JSON file.
progress = tqdm(enumerate(idx), total=len(idx), desc="Modifying Documents")
for cnt, i in progress:
    doc = docs_all[i]

    # Single-turn request: the instruction and the chunk text travel in one user message.
    prompt = f'Slightly change and rephrase the provided piece of scientific paper. PLEASE RESPOND ONLY WITH THE MODIFIED DOCUMENT, NO OTHER TEXT!!! Document: {doc.page_content}'
    messages = [{'role': 'user', 'content': prompt}]
    response = ollama.chat(model='llama3.1:latest', messages=messages)

    # The rewrite keeps the original chunk's metadata; the file name uses the loop counter.
    rewritten_text = response["message"]["content"]
    doc_modified = Document(page_content=rewritten_text, metadata=doc.metadata)
    save_document_as_json(doc_modified, f"../../data/mod/doc_{cnt}.json")

Modifying Documents: 100%|██████████| 100/100 [14:26<00:00,  8.66s/it]


In [8]:
# Fixed set of 100 chunk positions; assigning it replaces any previously drawn `idx`.
# NOTE(review): presumably the values produced by the random sampling cell in the recorded
# run, pinned here so the selection can be reproduced — TODO confirm.
# The values 923 and 3068 each occur twice in this array.
idx = np.array([2677, 1188, 2608, 2097,  417, 2196, 1442,  486,  481,  873, 2693,
       2519, 3068, 2472,  760, 1111, 1672, 1062,   63,  896, 1227, 2376,
       1043,   69,  411,  730, 2064, 1009, 3164, 1912,  814,    1, 2826,
       2041, 2363, 3073, 2447,  448, 2484,  879, 2945,  465,   95, 2798,
       1522, 1128, 2251,  741, 2552,  923, 1755,  686,  424,  136, 1333,
       2636, 1076, 2254, 2722,  160, 2971, 2224, 1540,  447,  243, 2786,
       1604, 3147,  796,  607, 2576, 1341, 2200, 2077,  923, 1148, 2988,
       1357,  211, 1243, 3100, 1649, 2278,   10, 2842, 2243, 2803,  998,
        800,  490, 2600, 3068,  174, 2548, 1684, 1180, 2936, 1306, 1401,
         17])