In [1]:
from tqdm import tqdm
import json
from pathlib import Path

import spacy
import scispacy
from scispacy.umls_linking import UmlsEntityLinker

In [2]:
ner_dir = Path('ner')
split_dir = Path('dataset') / 'scispacy_split'

In [3]:
with open(ner_dir / 'emetophobia_posts_ner_scispacy.json', 'r') as f:
    all_posts_scispacy = json.load(f)
len(all_posts_scispacy)

986

In [4]:
with open(ner_dir / 'emetophobia_posts_ner_biobert.json', 'r') as f:
    all_posts_biobert = json.load(f)
len(all_posts_biobert)

986

In [5]:
linker = UmlsEntityLinker(resolve_abbreviations=True, filter_for_definitions=False)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [6]:
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain_community import embeddings
from langchain.llms import Ollama
from langchain.chains import RetrievalQA

In [7]:
docs_no_ner = []
docs_scispacy_ner = []
docs_biobert_ner = []

for i in tqdm(range(len(all_posts_scispacy))):
    scispacy_post = all_posts_scispacy[i]
    biobert_post = all_posts_biobert[i]

    title = scispacy_post['title']
    content = scispacy_post['content']
    label = scispacy_post['labels']
    ents_scispacy = [e for e in scispacy_post['entities'] if e[1] is not None]
    ents_biobert = [e for e in biobert_post['entities'] if e[1] is not None]

    if len(ents_scispacy) == 0 or len(ents_biobert) == 0:
        continue

    for i in range(len(ents_scispacy)):
        ents_scispacy[i] = f"Entity: {ents_scispacy[i][0]} Canonical Name: {linker.kb.cui_to_entity[ents_scispacy[i][1]].canonical_name} Definition: {linker.kb.cui_to_entity[ents_scispacy[i][1]].definition}"
    
    for i in range(len(ents_biobert)):
        ents_biobert[i] = f"Entity: {ents_biobert[i][0]} Group: {ents_biobert[i][0]}"
    

    # docs_no_ner.append(Document(page_content=title + '\n' + content, metadata={'label': label, 'source': 'no_ner'}))
    docs_scispacy_ner.append(Document(page_content=title + '\n' + content + f"\nDictionary\n" + "\n".join(ents_scispacy), metadata={'label': label, 'source': 'scispacy'}))
    # docs_biobert_ner.append(Document(page_content=title + '\n' + content + f"\nDictionary\n" + "\n".join(ents_biobert), metadata={'label': label, 'source': 'biobert'}))

# docs_no_ner[0], docs_scispacy_ner[0], docs_biobert_ner[0]

100%|██████████| 986/986 [00:00<00:00, 50167.21it/s]


In [13]:
def process_question(query, docs):
    embedding = embeddings.OllamaEmbeddings(model="nomic-embed-text")
    vectordb = Chroma.from_documents(docs, embedding)

    llm = Ollama(model="mistral-nemo")

    qa_chain = RetrievalQA.from_chain_type(
        llm=llm,
        retriever=vectordb.as_retriever(),
        return_source_documents=True
    )

    return qa_chain(query)


In [9]:
query = "What are the top 5 most common medical or psychological ideas that the documents refer to? Make a numbered list. Respond with just the list, avoid other text."

In [14]:
process_question(query, docs_scispacy_ner)

  llm = Ollama(model="mistral-nemo")
  return qa_chain(query)


{'query': 'What are the top 5 most common medical or psychological ideas that the documents refer to? Make a numbered list. Respond with just the list, avoid other text.',
 'result': '1. Panic Attacks\n2. Nausea/Vomiting\n3. PTSD\n4. Zofran (an antinausea medication)\n5. Mental Suffering/Distress',
 'source_documents': [Document(metadata={'source': 'scispacy', 'label': 'It Happened (TW)'}, page_content='got fp from sushi LMAOOO laughing not to cry\ngot sushi at this restaurant i ALWAYS go to. like always. nothing ever happened. tonight though … literallt less than 40 minutes passed since i finished eating and my stomach just went: NOPEEEEE and i got d* like FOUR times in a row. i got up from the toilet and had to use it again IMMEDIATELY. then i started g*gging and boom panic attack. yhing is i’m at my bf’s whicj only made the anxiety WORSE. lmao .\n\nfelt like i was getting better after an hour but no. started feeling like i was passing out and g*gged again. tu* a little too nothing c