In [None]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so edits to library code on disk are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from llamabot.zotero.library import ZoteroLibrary

In [None]:
# Open the Zotero library, passing articles_only=True.
# NOTE(review): presumably this restricts the library to article entries — confirm against ZoteroLibrary.
library = ZoteroLibrary(articles_only=True)

In [None]:
from pdfminer.high_level import extract_text
from pathlib import Path


def convert_to_markdown(text: str) -> str:
    """Promote short all-caps lines of plain text to Markdown level-2 headings.

    The text is processed line by line. A line whose stripped content is
    entirely upper-case (per ``str.isupper``) and shorter than 50 characters
    is replaced with ``## <stripped content>``; every other line is kept
    unchanged.

    :param text: Plain text to scan for heading-like lines.
    :returns: The text with heading-like lines rewritten, with lines joined
        by newline characters.
    """
    # Split on real newline characters (not an escaped backslash sequence) so
    # that each line of the document is examined on its own.
    lines = text.split("\n")
    for i, line in enumerate(lines):
        stripped = line.strip()
        if stripped.isupper() and len(stripped) < 50:
            lines[i] = f"## {stripped}"
    return "\n".join(lines)


# Download the PDF for the library entry keyed "DFFPEADJ" into /tmp, extract
# its text with pdfminer, and run the heading heuristic over the result.
# NOTE(review): presumably download_pdf returns the path of the saved file — confirm.
fpath = library.library["DFFPEADJ"].download_pdf(Path("/tmp"))
text = extract_text(fpath)
md_text = convert_to_markdown(text)

In [None]:
from pathlib import Path
from llamabot.doc_processor import magic_load_doc, split_document

# Same download call as in the previous cell (same key, same target directory).
fpath = library.library["DFFPEADJ"].download_pdf(Path("/tmp"))

# Load the PDF with llamabot's loader and split it, passing chunk_size=5_000.
# NOTE(review): the unit of chunk_size (characters vs. tokens) is not visible here — confirm.
doc = magic_load_doc(fpath)
split_docs = split_document(doc, chunk_size=5_000)

In [None]:
import os
from dotenv import load_dotenv
from llamabot import SimpleBot
import litellm

# litellm.drop_params = True

# Load variables from a .env file; OLLAMA_SERVER is read from the environment below.
load_dotenv()

# Bot for parsing scientific literature, served by an Ollama host.
# NOTE: os.getenv returns None when OLLAMA_SERVER is unset, in which case
# api_base becomes "http://None:11434".
literature_parser = SimpleBot(
    model_name="ollama/mistral",  # Specifying Ollama via the model_name argument is necessary!
    system_prompt="You are an expert in parsing scientific literature.",
    stream_target="stdout",  # this is the default!
    api_base=f"http://{os.getenv('OLLAMA_SERVER')}:11434",
)

# System prompt for keyword extraction: asks for a JSON object with a
# "keywords" list and nothing else. The literal opens with exactly three
# quotes so the prompt text does not begin with a stray '"' character.
keywords_sysprompt = """Generate keywords for the document provided to you.
Please return JSON of format:

    {'keywords': ['keyword', 'keyword', 'keyword',...]}.


Keywords should be one or two words, separated by a space.
Return only keywords, nothing else.
Do not add your own commentary.
"""

# Keyword-extraction bot backed by a Mixtral model on the Ollama host, using
# the multi-line keywords_sysprompt. The JSON-mode options are left disabled.
# NOTE: os.getenv returns None when OLLAMA_SERVER is unset, in which case
# api_base becomes "http://None:11434".
keyword_generator_ollama = SimpleBot(
    model_name="ollama/mixtral:8x7b-instruct-v0.1-q4_0",  # Specifying Ollama via the model_name argument is necessary!
    system_prompt=keywords_sysprompt,
    stream_target="stdout",  # this is the default!
    api_base=f"http://{os.getenv('OLLAMA_SERVER')}:11434",
    # json_mode=True,
    # format="json",
)

# Keyword-extraction bot using a GPT-4 model with json_mode enabled. Its
# system prompt is an inline single-line string, not keywords_sysprompt, and
# it omits the "Do not add your own commentary." instruction.
keyword_generator_gpt = SimpleBot(
    model_name="gpt-4-0125-preview",  # Not an Ollama model, so no "ollama/" prefix and no api_base.
    system_prompt="Generate keywords for the document provided to you. Please return JSON of format: {'keywords': ['keyword', 'keyword', 'keyword',...]}. Keywords should be one or two words, separated by a space. Return only keywords, nothing else.",
    stream_target="stdout",  # this is the default!
    # api_base=f"http://{os.getenv('OLLAMA_SERVER')}:11434",
    # format="json",
    json_mode=True,
)

In [None]:
# Bot that labels a text chunk as 'main body' or 'references', served by the
# Ollama host.
# NOTE: os.getenv returns None when OLLAMA_SERVER is unset, in which case
# api_base becomes "http://None:11434".
chunk_classifier_ollama = SimpleBot(
    model_name="ollama/mistral:instruct",  # Specifying Ollama via the model_name argument is necessary!
    system_prompt="Please help me classify the following text as being part of the 'main body' of a paper or being part of the 'references'.",
    stream_target="stdout",  # this is the default!
    api_base=f"http://{os.getenv('OLLAMA_SERVER')}:11434",
)

In [None]:
# System prompt for the repair bot: given a reply that mixes malformed JSON
# with free text, return only the JSON, corrected, with nothing dropped.
json_fixer_sysprompt = """
Please help me fix the following string to be valid JSON.
Parts of the string are JSON (as a dictionary) with some formatting errors,
while others are just free text.
Using the content provided below, strip out free text, extract only the valid JSON,
and return just the JSON as a string.
Do not add any commentary of your own.
Ensure that everything that was within the JSON block is still present.
"""

# Repair bot on the Ollama host; temperature=0.0 is passed through to SimpleBot.
# NOTE(review): presumably this makes the repair output as deterministic as the backend allows — confirm.
# NOTE: os.getenv returns None when OLLAMA_SERVER is unset, in which case
# api_base becomes "http://None:11434".
json_fixer = SimpleBot(
    model_name="ollama/mixtral:8x7b-instruct-v0.1-q4_0",
    system_prompt=json_fixer_sysprompt,
    stream_target="stdout",
    api_base=f"http://{os.getenv('OLLAMA_SERVER')}:11434",
    temperature=0.0,
)

In [None]:
# Smoke-test input for json_fixer: a JSON-like object with unquoted list
# values and a trailing comma, followed by free text.
dummy_text = """
{
    "stuff": [stuff1, stuff2],
}

And some free text.
"""

json_fixer(dummy_text)

In [None]:
# Second smoke test for json_fixer: mixes single-quoted, unquoted, and
# double-quoted list values plus a trailing comma, followed by chatty text.
dummy_text2 = """
{
    "keywords": ['keyword1', 'stuff2', stuffy3, "stuffy4"],
}

Thank you for helping me fix this JSON. I am very grateful for your help.
You are helping me to save a ton of money and time.
"""
json_fixer(dummy_text2)

In [None]:
# Number of chunks produced by split_document; the keyword loop makes one bot call per chunk.
len(split_docs)

In [None]:
import json
from collections import Counter

# Tally keywords across all document chunks. `responses` keeps the raw bot
# reply for every chunk so that failures can be inspected afterwards.
keywords = Counter()
responses = []
for document in split_docs:
    response = keyword_generator_ollama(document)
    responses.append(response)
    try:
        # The prompt's example format uses single-quoted strings, which are
        # not valid JSON, so quotes are normalised before parsing.
        kws = json.loads(response.content.replace("'", '"'))
    except json.decoder.JSONDecodeError as e:
        print("\n\n")
        print(response.content)
        print(e)
        # Ask the JSON-fixer bot to repair the reply, then parse once more.
        response = json_fixer(response.content)
        try:
            kws = json.loads(response.content.replace("'", '"'))
        except json.decoder.JSONDecodeError as fix_error:
            # Skip this chunk instead of aborting the loop, so one
            # unrepairable reply does not discard the remaining chunks.
            print(f"Skipping chunk: JSON repair failed ({fix_error})")
            continue

    if isinstance(kws, dict):
        # A dict reply without a "keywords" entry contributes nothing.
        keywords.update(kws.get("keywords", []))
    elif isinstance(kws, list):
        keywords.update(kws)

In [None]:
# Display the keyword counts accumulated by the loop above.
keywords

In [None]:
# Same display as the preceding cell (duplicate).
keywords

In [None]:
# Use

In [None]:
# NOTE(review): `zotero_ollama` is not defined in any cell visible here; unless
# it is defined elsewhere, this raises NameError on a fresh kernel — confirm.
zotero_ollama("What is a protein?")