In [None]:
import tiktoken
import re
import os
import json

from llamabot import QueryBot
from nltk.corpus import stopwords
from pathlib import Path
from llamabot.zotero.utils import load_zotero
from llamabot.zotero.library import ZoteroLibrary
from llamabot.prompt_library.zotero import retrieverbot_sysprompt
from pyzotero import zotero
from dotenv import load_dotenv

%load_ext autoreload
%autoreload 2
%config InlineBackend.figure_format = 'retina'


In [None]:
# I found a bug: sometimes I get back the arXiv full-text PDF or the arXiv snapshot, not the entry itself.
# I think the fix is to individually embed just the entries that are not PDFs as JSON and then query them.

In [None]:
# Directory under the user's home that holds the JSON files used as the retrieval index.
ZOTERO_JSON_DIR = Path.home() / ".llamabot/zotero/zotero_index/"
# ZOTERO_JSON_DIR.mkdir(parents=True, exist_ok=True)
# NOTE(review): directory creation is disabled above; presumably the directory
# already exists or is created elsewhere -- confirm before running on a fresh machine.

# Load the Zotero library.
# NOTE(review): `articles_only=True` presumably restricts it to article entries -- confirm.
library = ZoteroLibrary(articles_only=True)

In [None]:
# Spot-check: show the title stored for one hard-coded item key.
library.library["PD98W5BW"].info["data"]["title"]

In [None]:
# Use set intersection of terms for searching


# Function that eliminates any non-alphanumeric characters from a string
def remove_non_alphanumeric(s: str) -> str:
    r"""Strip punctuation and symbols from ``s``.

    Every character that is neither a regex word character (``\w``: letters,
    digits, underscore) nor whitespace is deleted; nothing is put in its place.

    :param s: Text to clean.
    :returns: ``s`` with all such characters removed.
    """
    not_word_or_space = re.compile(r"[^\w\s]")
    return not_word_or_space.sub("", s)


# Remove stop words from a string
def remove_stop_words(s: str) -> str:
    """Drop English stop words from ``s``.

    The text is split on whitespace and each word is compared
    case-insensitively against NLTK's English stop-word list. The list is
    lowercase, so a case-sensitive check would let capitalised stop words
    (e.g. "The" at the start of a title) slip through.

    :param s: Text to filter.
    :returns: The surviving words, in their original casing and order,
        joined by single spaces.
    """
    stop_words = set(stopwords.words("english"))
    return " ".join(word for word in s.split() if word.lower() not in stop_words)


def process_words(s: str) -> set:
    """Turn free text into a set of lowercase keywords.

    The text is lowercased first, so that the stop-word filter (which works
    on a lowercase word list) also catches capitalised words. Punctuation is
    then stripped and stop words are removed.

    :param s: Text to process, e.g. a title or an abstract.
    :returns: The set of remaining words; an empty set when nothing remains.
    """
    s = remove_non_alphanumeric(s.lower())
    s = remove_stop_words(s)
    # split() with no argument never yields "", so empty input gives set()
    # rather than {""}.
    return set(s.split())


# Process a ZoteroItem's title, abstract, and authors by removing non-alphanumeric characters and stop words
# and then storing them inside a "keywords" set.
# def process_zotero_item(zotero_item):
#     keywords = set()
#     keywords.add(remove_non_alphanumeric(zotero_item.title))
#     keywords.add(remove_non_alphanumeric(zotero_item.abstract))

In [None]:
# Build a keyword set from the title and abstract of a library item.
items = library.library.items()
keywords = set()
for _item_key, item in items:
    item_data = item["data"]
    for field in ("title", "abstractNote"):
        keywords |= process_words(item_data[field])
    # Stop after the first item: only one entry is processed here.
    break

In [None]:
# Display the keyword set built in the previous cell.
keywords

In [None]:
# Write the library out as JSON under ZOTERO_JSON_DIR.
# NOTE(review): `has_pdf=True` presumably limits the export to items that have a PDF -- confirm.
library.to_json(ZOTERO_JSON_DIR, has_pdf=True)

In [None]:
# Collect the JSON documents to index. The glob order is not guaranteed, so
# sort the paths to get the same document order on every run.
doc_paths = sorted(ZOTERO_JSON_DIR.glob("*.json"))
# doc_paths = [ZOTERO_JSON_DIR / "BNGBSLPD.json"]
doc_paths

In [None]:
# Use the token count of the largest document as the chunk size.
# The scan runs over `doc_paths` -- exactly the files handed to the bot -- so
# the chunk size always matches the indexed set, even when `doc_paths` has
# been narrowed by hand.
encoding = tiktoken.encoding_for_model("gpt-4-32k")
max_tokens = max(
    (len(encoding.encode(json_path.read_text())) for json_path in doc_paths),
    default=0,  # no documents: keep the value the counter would have started at
)

retrieverbot = QueryBot(
    retrieverbot_sysprompt(),
    doc_paths=doc_paths,
    stream=True,
    use_cache=False,
    chunk_size=max_tokens,
)

In [None]:
# Retrieve the nodes for the query "antibody" and display them.
nodes = retrieverbot.retrieve("antibody")
nodes

In [None]:
# Check that each node retrieved for "alphafold" contains parseable JSON and
# print the nodes whose text does not parse.
nodes = retrieverbot.retrieve("alphafold")
for node in nodes:
    try:
        json.loads(node.node.text)
    except json.JSONDecodeError:
        # Only malformed JSON is reported here; any other error (for example
        # a node without text) is left to surface instead of being hidden.
        print(node)
    print()

In [None]:
from llamabot.prompt_library.zotero import get_key

In [None]:
# Send the `get_key` prompt for "alphafold paper" to the retriever bot.
retrieverbot(get_key("alphafold paper"))

In [None]:
# Print the key of every library item that reports having a PDF.
for item_key, zotero_item in library.library.items():
    if not zotero_item.has_pdf():
        continue
    print(item_key)

In [None]:
# Rebuild the retriever bot on the single-file index (zotero_index.json).
# This rebinds `retrieverbot`, replacing the bot built from the JSON directory.
ZOTERO_JSON_PATH = Path.home() / ".llamabot/zotero/zotero_index.json"

retrieverbot = QueryBot(
    retrieverbot_sysprompt(),
    doc_paths=[ZOTERO_JSON_PATH],
)

In [None]:
# CLI: llamabot zotero configure --library-id 12345 --library-type "user" --api-key 1p84325f

# Then we store the library ID and the library type in the config file.

In [None]:
# Send the `get_key` prompt for a free-text description of a paper to the retriever bot.
retrieverbot(get_key("A paper on machine learning for engineering GFP"))

In [None]:
# Load the library with default settings and download the PDF of one
# hard-coded item key into /tmp.
lib = ZoteroLibrary()
lib["BV68IW7P"].download_pdf(Path("/tmp"))

In [None]:
# llamabot zotero sync

In [None]:
# Load variables from a .env file, then read the Zotero settings from the
# environment. Each value is None when its variable is unset.
load_dotenv()

zotero_library_id = os.getenv("ZOTERO_LIBRARY_ID")
zotero_library_type = os.getenv("ZOTERO_LIBRARY_TYPE")
zotero_api_key = os.getenv("ZOTERO_API_KEY")

# Build the pyzotero client and fetch every item in the library.
zot = zotero.Zotero(
    library_id=zotero_library_id,
    library_type=zotero_library_type,
    api_key=zotero_api_key,
)
items = zot.everything(zot.items())

In [None]:
# Number of items fetched from Zotero.
len(items)

In [None]:
# We only want the parent items. They don't have a "parent" key.

# items[3]
# [item for item in items if "up" not in item["links"]]
# Keep only the entries that have a "data" payload containing an "abstractNote" field.
items_with_abstracts = []
for entry in items:
    if "data" not in entry:
        continue
    if "abstractNote" in entry["data"]:
        items_with_abstracts.append(entry)
len(items_with_abstracts)

In [None]:
# QueryBot

# from llamabot import QueryBot
# Pull the title, abstract, and author-type creators out of the first item,
# then display that item.
first_item = items_with_abstracts[0]
first_data = first_item["data"]
title = first_data["title"]
abstract = first_data["abstractNote"]
authors = [
    creator for creator in first_data["creators"] if creator["creatorType"] == "author"
]
first_item

Save all of the JSON files to disk under `/tmp/zotero_jsons`.


In [None]:
# Desired CLI:
# llamabot zotero chat --title "Title" --author "any author"

# Step 1: Retrieve the PDF key from Zotero. Raise error if there is no PDF key.

In [None]:
# Point the retriever bot at the single-file Zotero index.
# NOTE(review): this repeats an earlier cell that builds the same bot from the same path.
ZOTERO_JSON_PATH = Path.home() / ".llamabot/zotero/zotero_index.json"

retrieverbot = QueryBot(
    retrieverbot_sysprompt(),
    doc_paths=[ZOTERO_JSON_PATH],
)

In [None]:
from llamabot.prompt_library.zotero import get_key

# Send the `get_key` prompt for this paper title to the retriever bot and keep the reply.
response = retrieverbot(
    get_key(title="A Connection Between Score Matching and Denoising Autoencoders")
)

In [None]:
# Parse the bot's reply as JSON and take its "key" field.
key = json.loads(response.content)["key"]


# Rebinds `zot`; it is used again further down to fetch a file.
zot = load_zotero()

# library = zot.everything(zot.items())
# Rebinds `library`, this time constructed from the single-file index path.
library = ZoteroLibrary(ZOTERO_JSON_PATH)
# Download the PDF of the selected entry into /tmp and show the returned value.
fpath = library[key].download_pdf(Path("/tmp"))
fpath

In [None]:
# Look up the selected entry's title via the dotted "data.title" key.
library[key]["data.title"]

In [None]:
# Build a QueryBot over the downloaded file so the paper can be queried.
docchat = QueryBot(
    system_message="You are an expert paper reader.",
    doc_paths=[fpath],
)

In [None]:
# Ask the paper bot for a summary of the document.
docchat("Summarize this paper for me.")

In [None]:
# Ask the paper bot about a specific implementation detail.
docchat("What implementation of Gaussian Processes did the author use?")

In [None]:
# Ask the paper bot for the citation behind a named method.
docchat("What is the reference for COMBO? Is there a paper?")

In [None]:
# Ask the paper bot for a specific quantity reported in the document.
docchat("How big was the initial small library that they tested?")

In [None]:
# Ask the paper bot to define a term.
docchat("Define 'point saturation mutagenesis' for me please.")

In [None]:
# NOTE(review): this value is not displayed -- a notebook cell only shows its last expression.
len(items)

# Show the attachment link information of the second item.
items[1]["links"]["attachment"]

In [None]:
# Show the attachment URL ("href") of the first item.
items[0]["links"]["attachment"]["href"]

In [None]:
# Fetch the file stored under a hard-coded key and write it, as bytes, to /tmp/article.pdf.
with open("/tmp/article.pdf", "wb") as f:
    f.write(zot.file("A4BK56EA"))

In [None]:
# We can parse the href to get the item key

# I think we need to