In [1]:
from pyalm.models.openai import OpenAI
import pyalm
from pyalm import ConversationRoles
import json
from tqdm import tqdm
import time
import os

import re

In [3]:
# Placeholder: assign a configured model instance here before running the other cells
# (this notebook imports `OpenAI` from `pyalm.models.openai` for that purpose).
# The later cells call `reset_tracker`, `set_system_message`, `add_tracker_entry` and
# `create_completion` on this object, so the literal `...` must be replaced.
# Do not hard-code API keys in the notebook; read them from the environment instead.
llm = ...

In [6]:
# Final step: run this cell last, once you have read and executed the rest of the notebook
# (the cell defining `parse_and_store_doc` and the cell that writes "base_tracker.yaml").
# It reads the document at the given path (via tika unless read_directly=True), splits it into
# chunks and asks the model to convert each chunk into JSON entries. The result is written to
# "<input file name without extension>.json" relative to the current working directory.
# The metadata dict is stored as the first element of that JSON list; filling it in correctly is up to you.
# NOTE(review): the file name below (PEER_final.pdf) and the metadata (CMB lecture notes) do not
# obviously describe the same document, and the path is an absolute local path — confirm both.

parse_and_store_doc("/home/finn/Downloads/PEER_final.pdf",
                    {"document_title":"Lecture Notes on CMB Theory: From Nucleosynthesis to Recombination",
                     "source_url":"https://arxiv.org/abs/0802.3688" ,
                     "authors":"Wayne Hu",
                     "publisher":"arxiv.org",
                     "tags":["physics","astrophysics"]},
                    read_directly=False
                    )

Document size: 20003 chars


100%|██████████| 2/2 [01:12<00:00, 36.39s/it]

Finished. Took 73s and a total of 8581 tokens to generate 16 entries.
Size in chars: 12224





In [5]:
# Module-level handles on the most recent run. `full_list` holds the metadata plus every entry
# parsed so far (presumably so a partial result can be inspected from the notebook after an
# interrupted run); `file` is kept only for backward compatibility and refers to the (closed)
# output file of the last write.
full_list = None
file = None


def _split_into_chunks(content, chunk_size):
    """Cut ``content`` into consecutive slices of roughly ``chunk_size`` characters."""
    doc_len = len(content)
    if doc_len == 0:
        return []
    # A document shorter than one chunk is still processed, as a single chunk.
    n_chunks = max(1, doc_len // chunk_size)
    # Spread the remainder over all chunks; the +1 guarantees n_chunks * step > doc_len,
    # so the tail of the document is never dropped.
    step = chunk_size + (doc_len % chunk_size) // n_chunks + 1
    return [content[i * step:(i + 1) * step] for i in range(n_chunks)]


def _parse_entries(cleaned):
    """Turn a model reply (code fences already removed) into a list of entries or raise ValueError."""
    candidates = (
        # First choice: escape every backslash, so unescaped LaTeX such as \frac survives parsing.
        cleaned.replace("\\", "\\\\"),
        # Fallback: same, but leave \" alone. Escaping it as well would terminate the JSON string
        # early and make the whole reply unparsable.
        re.sub(r'\\(?!")', r"\\\\", cleaned),
    )
    first_error = None
    for candidate in candidates:
        try:
            # strict=False additionally accepts raw newlines/tabs inside string values.
            parsed = json.loads(candidate, strict=False)
        except ValueError as err:  # json.JSONDecodeError is a ValueError
            if first_error is None:
                first_error = err
            continue
        if isinstance(parsed, dict):
            # A single object instead of an array: keep it as one entry rather than
            # splicing its keys into the result list.
            return [parsed]
        if isinstance(parsed, list):
            return parsed
        raise ValueError(f"expected a JSON array of entries, got {type(parsed).__name__}")
    raise first_error


def parse_and_store_doc(filepath, metadata, read_directly=False, chunk_size=10000, max_tokens=3500):
    """Segment a document with the LLM and store the resulting entries as JSON.

    The document text is split into chunks. For every chunk the conversation is reset to the
    template stored in "base_tracker.yaml", the chunk is sent as a user message and the reply
    is parsed as JSON. The output file "<basename of filepath up to the first dot>.json" is
    written to the current working directory. It is rewritten after every successfully parsed
    chunk, so a partial result survives an interruption.

    Relies on the notebook-level names ``llm``, ``pyalm``, ``ConversationRoles`` and ``tqdm``.

    Args:
        filepath: Path of the document to process.
        metadata: JSON-serialisable object stored as the first element of the output list.
        read_directly: If True, read ``filepath`` as plain text (e.g. LaTeX, raw text, HTML).
            Otherwise extract the text with tika (PDF, DOCX, ...).
        chunk_size: Approximate number of characters sent to the model per request.
            Large documents are split to avoid forgetfulness issues.
        max_tokens: Completion token limit passed to ``llm.create_completion``.

    Returns:
        None. Results are written to disk and mirrored in the module-level ``full_list``.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    global full_list, file
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number of characters")

    if read_directly:
        # You can use read_directly to e.g. read from LaTeX, raw text, HTML etc.
        with open(filepath, "r") as f:
            content = f.read()
    else:
        # This uses tika to read from a PDF, DOCX etc. Imported lazily so that tika is only
        # required when it is actually used.
        # You can also extract the text yourself, store it in a file and use read_directly=True.
        import tika
        tika.initVM()
        from tika import parser
        parsed = parser.from_file(filepath, xmlContent=True)
        # NOTE(review): tika appears to return None as content when nothing could be
        # extracted; treat that as an empty document instead of failing on len(None).
        content = parsed["content"] or ""

    # 577 tokens for system message
    start = time.time()
    with open("base_tracker.yaml", "r") as f:
        base_yaml = f.read()

    doc_title = os.path.basename(filepath).split(".")[0]
    out_path = f"{doc_title}.json"

    # Write the metadata immediately so the output file exists even if every chunk fails.
    full_list = [metadata]
    with open(out_path, "w") as file:
        file.write(json.dumps(full_list))

    print(f"Document size: {len(content)} chars")
    chunks = _split_into_chunks(content, chunk_size)
    total_tokens = 0
    total_chars = 0
    for i, chunk in enumerate(tqdm(chunks)):
        # Start every chunk from the pristine system-message-only conversation.
        llm.conversation_history = pyalm.internal.state.ConversationTracker.from_yaml(base_yaml)
        llm.add_tracker_entry(chunk, ConversationRoles.USER)
        txt = llm.create_completion(max_tokens=max_tokens, chat=True, temperature=0)
        total_tokens += llm.finish_meta["tokens"]["total_tokens"]
        # The model sometimes wraps its answer in a markdown code fence despite the instructions.
        cleaned = txt.replace("```json", "").replace("```", "")
        # Counted for every reply (parsed or not), measured on the backslash-escaped text.
        total_chars += len(cleaned.replace("\\", "\\\\"))
        try:
            entries = _parse_entries(cleaned)
        except ValueError as e:
            print(f"Error during turning text into json!\nEntry {i} will be skipped!")
            print("--------")
            print(e)
            print(txt)
            print("--------")
            continue
        full_list += entries
        with open(out_path, "w") as file:
            file.write(json.dumps(full_list))
    end = time.time()
    print(f"Finished. Took {round(end - start)}s and a total of {total_tokens} tokens to generate {len(full_list)-1} entries.\nSize in chars: {total_chars}")

In [None]:
# You can use this to directly query the model for completions.
# gen = llm.create_generator("MESSAGE",max_tokens=3500, chat=True)
# for i in gen:
#     print(i[0],end="")

In [4]:
# This is the core of the system: the system prompt tells the model how to actually split the text.
# Run this cell at least once before calling `parse_and_store_doc`: it writes "base_tracker.yaml",
# the conversation template (system message only) that function loads for every chunk.

# Raw string on purpose: the prompt contains LaTeX such as \frac and \beta, which a normal
# string literal would silently turn into form-feed / backspace control characters.
instruct = r"""You are a bot for content extraction for an embeddings based knowledge retrieval system.
A user will prompt you with some form of extracted or otherwise obtained document, usually in the form of html, xml etc.

It is your job to transform the entire user input into a json file, fitting for database of "knowledge snippets".

That means you will transform the text into segments, that each provide meaninguf information.
This will be used in the end for knowledge retrieval so each snippet should contain a coherent block of info, but be easily readable.
A single sentence or two sentences are not very valuable on their own!

It is important that the entirety of the input will be transformed into such segments, so that if one would append all of them together, the document would be restored.

You can orient yourself for splitting at headings, however you don't have to.

As this will be presented to a human reader the entries need to be cleaned up appropriately. That means that all control sequences from html, xml etc. are removed.
And that all math will be turned into latex.

It may happen that in the extraction process info may be lost. For example "A proportion of $3\frac{4}{2}$ of..." may be seen by you as "A proportion of 3 42 of...".
In cases where you see something like this, replace parts with question marks. Incorrect information is far more dangerous as missing information as people will rely on this!

For each content entry, try to add infos on where to find the sequence in the original document. This could be the page number, header, subheader etc.
Ideally with that a user is able to find the text block quickly in the document.

Should the document stop too early to finish a block, do not add it! Only add a coherent block of info!

Do not include the source section of a document.

An ideal output would look like this (this has been shortened to only one entry).

[
{
"header":"Cabibbo–Kobayashi–Maskawa Matrix of Flavor Mixing",
"subheader":"History",
"page":1,
"content":"By 1950s, physicists have noted that the Fermi constant GF inferred from the $\beta$–decays of nuclei is a couple of percent smaller than the GF inferred from the muon decay.
At the same time, a bunch of strange particles were discovered in cosmic rays and accelerator labs; these particles were created by the strong interactions but decayed only by the weak interactions, hence the name “strange”.
Moreover, the effective Fermi constant responsible for the strange particle decays was about $4 \frac{1}{2}$ times weaker than the regular GF responsible or the nuclear $\beta$–decays or the pion decays."
},
..
]
Note especially the length of the content. Do no output entries with less content!

Do not respond with anything else but the JSON!
Nothing like ```json, no comments, nothing!
Your output will be parsed immediately, you would cause parsing errors with this!

Do not rewrite any content! You write down content adhering precisely to original formulations.
Do not write summaries, interprerations or any of the sort. Your job is to make a parsed document readable and segment it.
You do not alter anything!"""

# Start from an empty conversation that contains only the system message ...
llm.reset_tracker()
llm.set_system_message(instruct)

# ... and persist it so every chunk can be processed from the same clean state.
with open("base_tracker.yaml", "w") as f:
    f.write(llm.conversation_history.to_yaml())