In [1]:
from tqdm import tqdm
import json
from pathlib import Path

import spacy
import scispacy
from scispacy.umls_linking import UmlsEntityLinker

In [2]:
ner_dir = Path('ner')
split_dir = Path('dataset') / 'scispacy_split'

In [3]:
with open(ner_dir / 'emetophobia_posts_ner_scispacy.json', 'r') as f:
    all_posts_scispacy = json.load(f)
len(all_posts_scispacy)

986

In [4]:
with open(ner_dir / 'emetophobia_posts_ner_biobert.json', 'r') as f:
    all_posts_biobert = json.load(f)
len(all_posts_biobert)

986

In [5]:
linker = UmlsEntityLinker(resolve_abbreviations=True, filter_for_definitions=False)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [6]:
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain_community import embeddings
from langchain.llms import Ollama
from langchain.chains import RetrievalQA

In [7]:
docs_no_ner = []
docs_scispacy_ner = []
docs_biobert_ner = []

for i in tqdm(range(len(all_posts_scispacy))):
    scispacy_post = all_posts_scispacy[i]
    biobert_post = all_posts_biobert[i]

    title = scispacy_post['title']
    content = scispacy_post['content']
    label = scispacy_post['labels']
    ents_scispacy = [e for e in scispacy_post['entities'] if e[1] is not None]
    ents_biobert = [e for e in biobert_post['entities'] if e[1] is not None]

    if len(ents_scispacy) == 0 or len(ents_biobert) == 0:
        continue

    for i in range(len(ents_scispacy)):
        ents_scispacy[i] = f"Entity: {ents_scispacy[i][0]} Canonical Name: {linker.kb.cui_to_entity[ents_scispacy[i][1]].canonical_name} Definition: {linker.kb.cui_to_entity[ents_scispacy[i][1]].definition}"
    
    for i in range(len(ents_biobert)):
        ents_biobert[i] = f"Entity: {ents_biobert[i][0]} Group: {ents_biobert[i][0]}"
    

    # docs_no_ner.append(Document(page_content=title + '\n' + content, metadata={'label': label, 'source': 'no_ner'}))
    # docs_scispacy_ner.append(Document(page_content=title + '\n' + content + f"\nDictionary\n" + "\n".join(ents_scispacy), metadata={'label': label, 'source': 'scispacy'}))
    docs_biobert_ner.append(Document(page_content=title + '\n' + content + f"\nDictionary\n" + "\n".join(ents_biobert), metadata={'label': label, 'source': 'biobert'}))

# docs_no_ner[0], docs_scispacy_ner[0], docs_biobert_ner[0]

100%|██████████| 986/986 [00:00<00:00, 49352.41it/s]


In [8]:
def process_question(query, docs):
    embedding = embeddings.OllamaEmbeddings(model="nomic-embed-text")
    vectordb = Chroma.from_documents(docs, embedding)

    llm = Ollama(model="mistral-nemo")

    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=vectordb.as_retriever(),
        return_source_documents=True
    )

    return qa_chain(query)


In [9]:
query = "What are the top 5 most common medical or psychological ideas that the documents refer to? Make a numbered list. Respond with just the list, avoid other text."

In [10]:
process_question(query, docs_biobert_ner)

  embedding = embeddings.OllamaEmbeddings(model="nomic-embed-text")
  llm = Ollama(model="mistral-nemo")
  return qa_chain(query)


{'query': 'What are the top 5 most common medical or psychological ideas that the documents refer to? Make a numbered list. Respond with just the list, avoid other text.',
 'result': '1. Exposure Therapy\n2. Emetophobia (Fear of Vomiting)\n3. Panic Attacks\n4. Reassurance Seeking (Harmful in Context)\n5. Anxiety Disorder',
 'source_documents': [Document(metadata={'label': 'Positive Reminder', 'source': 'biobert'}, page_content='Yes, it is perfectly possible to recover from emetophobia.\nBeen seeing a couple of "Is it even possible to recover?"-posts recently, and I\'m here to tell you that yes, it is.\n\nTenish years ago, I was down to below 80lbs. I was constantly worried about overeating; sometimes, I\'d struggle eating a whole oreo. I lived off crackers and other snacks, because warm meals were my worst enemy.\n\nI couldn\'t leave the house by myself. Not even to check the mail or take out the trash. I always needed someone with me to calm me down and help me rantionalize in case of