# Search Where

In [1]:
%load_ext autoreload
%autoreload 2

In [15]:
import pandas as pd
import numpy as np
from cord.core import DOCUMENT_VECTOR_PATH, cord_support_dir
from cord import ResearchPapers
from cord.vectors import *
from pathlib import Path, PurePath

## 1. Load Document Vectors

In [3]:
# Read the precomputed document vectors. reset_index() turns the frame's index
# into a regular column; later cells refer to that column as `index` and use it
# as the key linking metadata rows to vector rows.
document_vectors = pd.read_parquet(DOCUMENT_VECTOR_PATH).reset_index()

In [4]:
# Identifier -> `index` value dictionaries, built only from rows where the
# identifier is present.
sha_lookup = (
    document_vectors
    .dropna(subset=['sha'])
    .set_index('sha')['index']
    .to_dict()
)
pmcid_lookup = (
    document_vectors
    .dropna(subset=['pmcid'])
    .set_index('pmcid')['index']
    .to_dict()
)

In [5]:
# Spot-check both lookups with one sha and one pmcid; the cell displays the
# pair of `index` values they resolve to.
sha_lookup['eb0d926b1fa6dc7d9f86a001f8ccd69407c5ed86'], pmcid_lookup['PMC3783479']

(8630, 11654)

## 2. Load Metadata

In [89]:
# Load the metadata table through the project's ResearchPapers helper.
metadata = ResearchPapers.load_metadata()
# Disabled: optional narrowing of the metadata to the columns listed below.
#metadata = metadata[['cord_uid', 'sha', 'pmcid', 'full_text_file', 'covid_related', ]]

Loading metadata from data\CORD-19-research-challenge
Cleaning metadata
Applying tags to metadata


## 3. Match Metadata to Document Vectors

In [7]:
# Sentinel for "caller did not pass not_found". A dedicated object (rather than
# None) keeps an explicitly passed None working as a legitimate fallback value.
_NOT_FOUND_UNSET = object()


def lookup_by_sha(shas, sha_map, not_found=_NOT_FOUND_UNSET):
    '''
        Lookup a value in the map by the sha, handling cases of multiple shas.

        :param shas: a single sha, or several shas separated by ';'.
                     Anything that is not a str (e.g. NaN / None) counts as "no sha".
        :param sha_map: mapping of sha -> value; only its .get() method is used.
        :param not_found: value returned when nothing matches.
                          When omitted, a new empty list is returned on every call.
        :return: the value of the first sha (left to right) that maps to a
                 non-None value, otherwise not_found.
    '''
    if not_found is _NOT_FOUND_UNSET:
        # Build the default per call: a shared `[]` default would be handed to
        # every caller, so one caller mutating it would corrupt later results.
        not_found = []
    if not isinstance(shas, str):
        return not_found
    for sha in shas.split(';'):
        # Multi-sha strings may have spaces around the ';' separator.
        sha_value = sha_map.get(sha.strip())
        if sha_value is not None:
            return sha_value
    return not_found

# Resolve every metadata row to an `index` value of the document vectors:
# try the sha(s) first, then fall back to the pmcid.
index_from_sha = metadata.sha.apply(lambda sha: lookup_by_sha(sha, sha_lookup, np.nan))
index_from_pmcid = metadata.pmcid.apply(lambda pmcid: pmcid_lookup.get(pmcid, np.nan))
index_combined = index_from_sha.fillna(index_from_pmcid)

# Rows that matched neither way get -1 so the column can be cast to int.
# NOTE(review): this presumes -1 never occurs as a real `index` value — confirm.
metadata['index'] = index_combined.fillna(-1).astype(int)

# Diagnostic, shown as the cell output:
# (rows unmatched by sha alone, rows still unmatched after the pmcid fallback)
index_from_sha.isnull().sum(), index_combined.isnull().sum()

In [8]:
# Columns taken from the document-vector table.
VECTOR_COLS = [
    'document_vector',
    'document_vector_2d',
    'document_vector_1d',
    'cluster_id',
]
# Left join on `index` so metadata rows without a matching vector are retained
# (their vector columns come back empty); keep only the vector columns.
metadata_vectors = pd.merge(
    metadata,
    document_vectors,
    how='left',
    on='index',
).loc[:, VECTOR_COLS]

In [9]:
# Put the vector columns next to the metadata, discard rows that have no
# document vector, and derive plotting coordinates from the 2D vector.
metadata_with_vectors = (
    pd.concat([metadata, metadata_vectors], axis=1)
    .drop(columns=['index'])
    .dropna(subset=['document_vector'])
    .loc[:, ['cord_uid'] + VECTOR_COLS]
    .assign(
        # First / second component of the 2D vector become the x / y coordinates.
        x=lambda frame: frame.document_vector_2d.apply(lambda point: point[0]),
        y=lambda frame: frame.document_vector_2d.apply(lambda point: point[1]),
        cluster_id=lambda frame: frame.cluster_id.astype(int),
    )
)

# Slim table (id, 1D value, cluster, x, y) persisted to the support directory
# and displayed as the cell output.
metadata_coords = metadata_with_vectors.loc[
    :, ['cord_uid', 'document_vector_1d', 'cluster_id', 'x', 'y']
]
metadata_coords.to_parquet(PurePath(cord_support_dir()) / 'MetadataCoords.pq')
metadata_coords

Unnamed: 0,cord_uid,document_vector_1d,cluster_id,x,y
3,zp9k1k3z,15.571824,3,5.154543,-60.681904
5,wwf90zxt,13.811303,3,18.092474,-30.416946
6,dlh93ax6,-18.650908,3,41.136578,-71.717049
7,i94lyfsh,30.520699,3,-6.935116,-89.103371
8,vs5yondw,31.640532,3,-14.407829,-88.229309
...,...,...,...,...,...
47293,3b8sz7t5,21.519590,5,-22.232004,36.251747
47294,0gqdq1ga,-60.511009,5,34.019108,87.516403
47295,si91es0i,54.562248,4,-48.364540,-8.895615
47296,m6j0us2e,27.212635,0,-7.816017,-39.591602


## 4. Sample the Metadata Coordinates

In [10]:
SAMPLE_SIZE = 4900  # number of rows kept in the sample
SAMPLE_SEED = 42    # fixed seed so a full re-run writes the same sample file

# Draw a reproducible random subset of the coordinates and persist it.
metadata_coord_sample = metadata_coords.sample(
    # sample() raises when n exceeds the number of rows, so clamp it.
    n=min(SAMPLE_SIZE, len(metadata_coords)),
    random_state=SAMPLE_SEED,
)
metadata_coord_sample.to_parquet(PurePath(cord_support_dir() ) / 'MetadataCoordsSample.pq')

In [11]:
# Display the sampled coordinates (the rendered frame is truncated by pandas).
metadata_coord_sample

Unnamed: 0,cord_uid,document_vector_1d,cluster_id,x,y
8041,scb7pz87,-23.226923,1,18.438417,-16.114279
18757,uofygmeu,-20.360249,1,54.035545,-66.429520
31155,wbjszxi2,56.755390,0,-39.761169,0.990101
11272,cuyboyqc,-24.693068,5,17.166658,-7.197783
2665,9s15qwxo,69.504303,0,-89.185394,11.911603
...,...,...,...,...,...
30340,umfzrawo,63.415733,4,-60.548874,14.931207
46844,99f4zaqv,39.347988,4,-35.157738,-52.049980
13434,czmnkzw2,-87.473907,4,-57.242111,-74.766258
11437,3cad6j89,-5.963532,1,3.460091,16.435886


In [12]:
import altair as alt
def chartEmbeddings2D(embeddings, width=500, height=350, color_column='color', title=''):
    '''
        Build an Altair point chart of 2D embeddings.

        :param embeddings: data handed to alt.Chart; the encodings reference
                           fields named 'x' and 'y' plus the color field.
        :param width: chart width passed to .properties()
        :param height: chart height passed to .properties()
        :param color_column: Altair encoding string for the point color
        :param title: chart title
        :return: the configured chart
    '''
    encodings = dict(x='x', y='y', color=color_column)
    layout = dict(title=title, width=width, height=height)
    points = alt.Chart(embeddings).mark_point()
    return points.encode(**encodings).properties(**layout)

# Plot the sampled papers, colouring each point by its cluster id
# (':N' marks the field as nominal for Altair).
chartEmbeddings2D(
    embeddings=metadata_coord_sample,
    color_column='cluster_id:N',
)

In [93]:
# Collect the ids of the sampled papers assigned to cluster 2, then show the
# full metadata rows for those papers.
cord_ids = metadata_coord_sample.loc[
    metadata_coord_sample.cluster_id == 2, 'cord_uid'
].tolist()
metadata.loc[metadata.cord_uid.isin(cord_ids)]

Unnamed: 0,cord_uid,sha,source,title,doi,pmcid,pubmed_id,license,abstract,published,...,WHO #Covidence,has_pdf_parse,has_pmc_xml_parse,full_text_file,url,when,covid_related,virus,coronavirus,sars
66,qo74f00i,d52f2394a56cd777e6e1b4cf337864f593b7c2ec,Elsevier,Stop making sense or Regulation at the level o...,10.1016/0014-5793(88)81225-0,,3042454,els-covid,An increasing number of examples of translati...,1988-08-01,...,,True,False,custom_license,https://doi.org/10.1016/0014-5793(88)81225-0,32 years ago,False,True,False,False
130,ctqt544y,32e462b91128830abd0694d2348bef2c671444d4,Elsevier,Temporal events in the invasion of the codling...,10.1016/0022-2011(87)90108-x,,,els-covid,The replication cycle of the granulosis virus...,1987-09-30,...,,True,False,custom_license,https://doi.org/10.1016/0022-2011(87)90108-x,32 years ago,False,True,False,False
240,x6ehkp6t,c347c33ebbdc6fa40f150a2b3c7ce045a09e37ec,Elsevier,Reversible restriction of vesicular stomatitis...,10.1016/0042-6822(84)90191-0,,6330977,els-covid,"Indomethacin, a potent nonsteroidal inhibitor...",1984-06-30,...,,True,False,custom_license,https://doi.org/10.1016/0042-6822(84)90191-0,36 years ago,False,True,False,False
267,ls7rpzf2,088fa94e5689d1c83f8e8b407e340eb1ac3c3929,Elsevier,RNA-binding proteins of coronavirus MHV: Detec...,10.1016/0042-6822(86)90305-3,,3083580,els-covid,RNA-binding proteins of coronavirus MHV-A59 w...,1986-04-30,...,,True,False,custom_license,https://doi.org/10.1016/0042-6822(86)90305-3,34 years ago,False,True,True,False
281,ck7eto16,771006c821f2bad4ea3eb592173208e7fb51834e,Elsevier,Analysis of intracellular small RNAs of mouse ...,10.1016/0042-6822(87)90414-4,,3027983,els-covid,We have previously shown the presence of mult...,1987-02-28,...,,True,False,custom_license,https://doi.org/10.1016/0042-6822(87)90414-4,33 years ago,False,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46953,m52qnhtm,91928facf53b5a9b8dea0f42a2ff0767dd2b94d0,Elsevier,Calmodulin interacts with angiotensin-converti...,10.1016/j.febslet.2007.11.085,PMC7094239,18070603,els-covid,Angiotensin-converting enzyme-2 (ACE2) is a r...,2008-01-23,...,,True,False,custom_license,https://doi.org/10.1016/j.febslet.2007.11.085,12 years ago,False,True,True,True
47031,ar6wyv3u,f5180b6877e751cc23cc6f4ce44a32fcea79feca,Elsevier,Chinese hamster ovary cell lines selected for ...,10.1016/j.virol.2012.05.018,PMC3402687,22726751,els-covid,Ebolavirus causes severe hemorrhagic fever in...,2012-10-10,...,,True,False,custom_license,https://doi.org/10.1016/j.virol.2012.05.018,7 years ago,False,True,True,True
47040,f32rcce3,f678918637d6949ca22b1d7cd5982309f8ad6dbe; f13e...,Elsevier,Role of host reticulon proteins in rearranging...,10.1016/j.mib.2012.04.007,PMC3670673,22621853,els-covid,Positive-strand RNA [(+)RNA] viruses are respo...,2012-08-31,...,,True,False,custom_license,https://doi.org/10.1016/j.mib.2012.04.007,8 years ago,False,True,False,False
47045,lql07dqa,654c290db8070c13849ca52d90a7b7672c722050,Elsevier,MCPIP1 negatively regulate cellular antiviral ...,10.1016/j.bbrc.2018.06.083,PMC7092953,29920243,els-covid,IFNβ innate immune plays an essential role in...,2018-09-05,...,,True,False,custom_license,https://doi.org/10.1016/j.bbrc.2018.06.083,2 years ago,False,True,False,False


In [13]:
# Load the ResearchPapers object used for searching below. Per the logged
# output, this loads the metadata again and builds the search index.
papers = ResearchPapers.load()

Loading metadata from data\CORD-19-research-challenge
Cleaning metadata
Applying tags to metadata

Indexing research papers
Creating the BM25 index from the abstracts of the papers
Use index="text" if you want to index the texts of the paper instead
Finished Indexing in 73.0 seconds


In [95]:
# Free-text query passed to papers.search_2d; the result renders as the cell output.
# NOTE(review): the parenthesis opened before "e.g." is never closed in the
# query text — confirm whether that is intentional.
query = """
Physical science of the coronavirus (e.g., charge distribution, adhesion to hydrophilic/phobic surfaces,
environmental survival to inform decontamination efforts for affected areas and provide information about viral shedding
"""
papers.search_2d(query)

HTML(value='<h4>\nPhysical science of the coronavirus (e.g., charge distribution, adhesion to hydrophilic/phob…

In [96]:
# Second free-text query passed to papers.search_2d; the result renders as the cell output.
query ="""
Efforts to identify the underlying drivers of fear, anxiety and stigma that
fuel misinformation and rumor, particularly through social media.
"""
papers.search_2d(query)

HTML(value='<h4>\nEfforts to identify the underlying drivers of fear, anxiety and stigma that\nfuel misinforma…

In [97]:
# Third query: a longer passage that reads like a paper abstract, passed to
# papers.search_2d in the same way as the queries above.
query ="""
An increasing number of examples of translational regulation at the level of termination has been recently reported in eukaryotes. This paper reviews our present knowledge on this topic and proposes an understanding of these regulations by relating the study of viral gene expression to a comprehensive view of the mechanisms and components of the translational process.
"""
papers.search_2d(query)

HTML(value='<h4>\nAn increasing number of examples of translational regulation at the level of termination has…