In [100]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.insert(0, "../../scrapemed")

import scrapemed.paper as paper
import pandas as pd
import lxml
import chromadb
from chromadb.utils import embedding_functions
from langchain.text_splitter import CharacterTextSplitter
import re
from typing import Union, List, Dict
from difflib import SequenceMatcher

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [101]:
embedding_functions.DefaultEmbeddingFunction()

<chromadb.utils.embedding_functions.ONNXMiniLM_L6_V2 at 0x7fdcc7ec87c0>

In [102]:
PMCID = 7067710
email = "danielfrees247@gmail.com"

p = paper.Paper.from_pmc(PMCID, email, download=False)





In [103]:
client = chromadb.Client()

chromadb.api.models.Collection.Collection

In [29]:
collection = client.get_or_create_collection("paper-test")
collection.peek()
type(collection)

{'ids': ['7067710'],
 'embeddings': [[0.0860835611820221,
   -0.011491809971630573,
   -0.057480357587337494,
   0.031779590994119644,
   -0.03433350473642349,
   0.018448898568749428,
   -0.037901006639003754,
   0.13125614821910858,
   0.05952084809541702,
   -0.007475802209228277,
   -0.04355744644999504,
   0.02744288556277752,
   0.057396236807107925,
   0.01491390261799097,
   -0.023976394906640053,
   -0.007901456207036972,
   0.11673666536808014,
   0.02848568744957447,
   -0.0011698374291881919,
   0.04883452504873276,
   -0.015924280509352684,
   -0.033605869859457016,
   0.04215315356850624,
   -0.00034386469633318484,
   -0.12131091207265854,
   0.05767066404223442,
   0.0010608189040794969,
   -0.012912094593048096,
   -0.05820910632610321,
   0.06912665069103241,
   0.11672540754079819,
   -0.011653769761323929,
   0.07213318347930908,
   -0.03191077336668968,
   -0.0003668710414785892,
   0.0555034764111042,
   -0.06669610738754272,
   0.04117043316364288,
   0.027029007

## Add paper as a whole

In [20]:
collection.add(
    documents = [p.full_text()],
    metadatas = [{"pmcid": p.article_id['pmc']}],
    ids = [p.article_id['pmc']]
)

In [23]:
collection.delete(ids = ['id1'])

In [31]:
collection.query(query_texts=["absorption"],include=["documents"], n_results=1)

{'ids': [['7067710']],
 'distances': None,
 'metadatas': None,
 'embeddings': None,
 'documents': [['Abstract: \n\nSECTION: Introduction:\n\nA fixed-dose combination (FDC) of ibuprofen and acetaminophen has been developed that provides greater analgesic efficacy than either agent alone at the same doses without increasing the risk for adverse events.\n\nSECTION: Methods:\n\nWe report three clinical phase I studies designed to assess the pharmacokinetics (PK) of the FDC of ibuprofen/acetaminophen 250/500\xa0mg (administered as two tablets of ibuprofen 125\xa0mg/acetaminophen 250\xa0mg) in comparison with its individual components administered alone or together, and to determine the effect of food on the PK of the FDC. Two studies in healthy adults aged 18–55\xa0years used a crossover design in which subjects received a single dose of each treatment with a 2-day washout period between each. In the third study, the bioavailability of ibuprofen and acetaminophen from a single oral dose of 

## Add Paper in Chunks 

Langchain text splitter

In [104]:
chunk_model = CharacterTextSplitter(
    separator="\\n\\n|\\n|\\.|\\s", 
    is_separator_regex=True, 
    chunk_size = 100,
    chunk_overlap = 20,
    length_function = len,
    keep_separator = True)

In [105]:
def generate_chunk_id(pmcid:str, index:Union[int,str]):
    """
    Generate id for a PMC text chunk, using pmcid and index of the chunk.
    The chunk indices should be unique. Recommended to use indexes from the result
    of chunk model. 
    """
    return f"pmcid-{pmcid}-chunk-{str(index)}"

def get_chunk_index_from_chunk_id(chunk_id:str)->str:
    """
    Given a PMCID Chunk ID, in the format generated by generate_pmcid_chunk_id,
    gather the index of the chunk.
    """
    pattern = re.compile(r"chunk-(\d+)")  # Compile the regex pattern
    match = pattern.search(chunk_id)
    index = None
    if match:
        index = match.group(1)
    return index    

def get_pmcid_from_chunk_id(chunk_id:str)->str:
    """
    Given a PMCID Chunk ID, in the format generated by generate_pmcid_chunk_id,
    gather the PMCID of the chunk.
    """
    pattern = re.compile(r"pmcid-(\d+)")  # Compile the regex pattern
    match = pattern.search(chunk_id)
    pmcid = None
    if match:
        pmcid = match.group(1)
    return pmcid    

In [106]:
p_chunks = chunk_model.split_text(p.full_text())
p_metadatas = [{"pmcid": p.article_id['pmc']}] * len(p_chunks)
pmcid = p.article_id['pmc']
p_ids = [f"pmcid-{pmcid}-chunk-{i}" for i in range(len(p_chunks))]

In [107]:
collection.add(
    documents = p_chunks,
    metadatas = p_metadatas,
    ids = p_ids
)

Insert of existing embedding ID: pmcid-7067710-chunk-0
Insert of existing embedding ID: pmcid-7067710-chunk-1
Insert of existing embedding ID: pmcid-7067710-chunk-2
Insert of existing embedding ID: pmcid-7067710-chunk-3
Insert of existing embedding ID: pmcid-7067710-chunk-4
Insert of existing embedding ID: pmcid-7067710-chunk-5
Insert of existing embedding ID: pmcid-7067710-chunk-6
Insert of existing embedding ID: pmcid-7067710-chunk-7
Insert of existing embedding ID: pmcid-7067710-chunk-8
Insert of existing embedding ID: pmcid-7067710-chunk-9
Insert of existing embedding ID: pmcid-7067710-chunk-10
Insert of existing embedding ID: pmcid-7067710-chunk-11
Insert of existing embedding ID: pmcid-7067710-chunk-12
Insert of existing embedding ID: pmcid-7067710-chunk-13
Insert of existing embedding ID: pmcid-7067710-chunk-14
Insert of existing embedding ID: pmcid-7067710-chunk-15
Insert of existing embedding ID: pmcid-7067710-chunk-16
Insert of existing embedding ID: pmcid-7067710-chunk-17
In

In [56]:
result = collection.query(
    query_texts=["absorption"],
    include=["documents"], 
    n_results=1)
result

{'ids': [['pmcid-7067710-chunk-26']],
 'distances': None,
 'metadatas': None,
 'embeddings': None,
 'documents': [['effects. Similar to previous findings for the individual components, the rates of absorption of']]}

In [108]:
def expanded_query(collection:chromadb.api.models.Collection.Collection, query:str, n_results:int=1, n_before:int = 2, n_after:int = 2):
    result = collection.query(
        query_texts=[query],
        include=["documents"], 
        n_results=n_results)

    expanded_results = {}
    for id in result['ids'][0]:
        chunk_index = get_chunk_index_from_chunk_id(id)
        pmcid = get_pmcid_from_chunk_id(id)
        #get the texts before and after the result chunk
        expanded_ids = []
        for i in range(1, n_before+1):
            expanded_ids.append(generate_chunk_id(pmcid, int(chunk_index) - i))
        expanded_ids.append(id)
        for i in range(1, n_after+1):
            expanded_ids.append(generate_chunk_id(pmcid, int(chunk_index) + i))

        expanded_results[f"Match on {id}"] = collection.get(
                ids=expanded_ids,
            )['documents']

    cleaned_results = {}
    #append docs together two at a time, removing overlap
    for match, docs in expanded_results.items():
        combined_result = ""
        #combined docs together
        if len(docs) == 0:
            combined_result = None
        elif len(docs) == 1:
            combined_result = docs[0]
        else:
            #combine first two docs, removing overlap, to start the combined result
            substring_match = SequenceMatcher(None, docs[0], docs[1]).find_longest_match(0, len(docs[0]), 0, len(docs[1]))
            combined_docs = docs[0][:substring_match.a]+docs[1][substring_match.b:]
            combined_result += combined_docs
            #eat these first two docs
            if len(docs) >= 3:
                docs = docs[2:]
            else:
                docs = []
            #continue eating the rest one by one
            while len(docs) >= 1:
                substring_match = SequenceMatcher(None, combined_result, docs[0]).find_longest_match(0, len(combined_result), 0, len(docs[0]))
                combined_result = combined_result[:substring_match.a]+docs[0][substring_match.b:]
                #eat the processed doc
                if len(docs) >=2:
                    docs = docs[1:]
                else:
                    docs = []
                
            cleaned_results[match] = "..." + combined_result + "..."
    
    return cleaned_results

In [92]:
cleaned_results

{'pmcid-7067710-chunk-26': ['similar to its monocomponent constituents when administered separately or coadministered,',
  'or coadministered, indicating no drug–drug interactions and no formulation effects. Similar to',
  'effects. Similar to previous findings for the individual components, the rates of absorption of',
  'of absorption of ibuprofen and acetaminophen from the FDC were slightly delayed in the presence of',
  'in the presence of food. Overall, adolescents had similar exposures to acetaminophen and ibuprofen']}

In [111]:
result = expanded_query(
    collection = collection,
    query="absorption",
    n_before = 1, 
    n_after = 2,
    n_results = 2
    )
result

{'Match on pmcid-7067710-chunk-26': '...or coadministered, indicating no drug–drug interactions and no formulation effects. Similar to previous findings for the individual components, the rates of absorption of ibuprofen and acetaminophen from the FDC were slightly delayed in the presence of food. Overall, adolescents had similar exposures to acetaminophen and ibuprofen...',
 'Match on pmcid-7067710-chunk-242': '...respectively. The 90% CIs for these AUC values (Table\xa03) were within the acceptance range for bioequivalence of 80–125%. However, the rate of absorption was delayed with food compared with fasting; the ratio for dose-normalized C_max for ibuprofen when...'}