# Search Where

In [1]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# re-imported before each cell executes, so edits to library code take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
from cord.core import DOCUMENT_VECTOR_PATH, cord_support_dir
from cord import ResearchPapers
from cord.vectors import *
from pathlib import Path, PurePath

## 1. Load Document Vectors

In [3]:
# Read the pre-computed document vectors and turn the stored row index into an
# ordinary 'index' column, which is used as the join key further down.
document_vectors = (
    pd.read_parquet(DOCUMENT_VECTOR_PATH)
    .reset_index()
)

In [4]:
# Build identifier -> vector-row 'index' dictionaries, skipping rows where the
# identifier is missing. One map is keyed by sha, the other by pmcid.
sha_lookup = (
    document_vectors.loc[document_vectors.sha.notna(), ['sha', 'index']]
    .set_index('sha')['index']
    .to_dict()
)
pmcid_lookup = (
    document_vectors.loc[document_vectors.pmcid.notna(), ['pmcid', 'index']]
    .set_index('pmcid')['index']
    .to_dict()
)

In [5]:
# Spot-check both lookups with one sha and one pmcid; a KeyError here means the
# identifier is absent from the document-vector table.
(
    sha_lookup['eb0d926b1fa6dc7d9f86a001f8ccd69407c5ed86'],
    pmcid_lookup['PMC3783479'],
)

(8630, 11654)

## 2. Load Metadata

In [6]:
# Load the paper metadata through the project loader. Per the log output of
# this cell, the loader also cleans the metadata and applies tags to it.
metadata = ResearchPapers.load_metadata()
# Optional column subset, left disabled so that every metadata column remains
# available to the cells below:
#metadata = metadata[['cord_uid', 'sha', 'pmcid', 'full_text_file', 'covid_related', ]]

Loading metadata from data\CORD-19-research-challenge
Cleaning metadata
Applying tags to metadata


## 3. Match Metadata to Document Vectors

In [7]:
# Sentinel marking "caller did not pass not_found". It lets the function hand
# out a fresh empty list per call instead of sharing one mutable default.
_NOT_FOUND_DEFAULT = object()


def lookup_by_sha(shas, sha_map, not_found=_NOT_FOUND_DEFAULT):
    '''
        Lookup a value in the map by the sha, handling cases of multiple shas

        Parameters
        ----------
        shas : str or any
            One sha, or several shas separated by ';'. Anything that is not a
            string (e.g. NaN for a missing sha) is treated as "no sha".
        sha_map : mapping
            Maps a single sha to the value to return.
        not_found : any, optional
            Value returned when nothing matches. When omitted, a new empty
            list is returned on every call.

        Returns
        -------
        The value of the first sha (in order) that is present in ``sha_map``
        with a non-None value, otherwise ``not_found``.
    '''
    if not_found is _NOT_FOUND_DEFAULT:
        # A fresh list each time: mutating one result cannot leak into later calls.
        not_found = []
    if not isinstance(shas, str):
        return not_found
    for sha in shas.split(';'):
        # Multi-sha fields are written as "sha1; sha2", so strip the padding.
        sha_value = sha_map.get(sha.strip())
        # Compare against None explicitly so falsy hits such as 0 still count.
        if sha_value is not None:
            return sha_value
    return not_found

# Resolve every metadata row to a document-vector row: try the sha(s) first,
# then fall back to the pmcid. NaN marks "no match" at each stage.
index_from_sha = metadata.sha.apply(lambda sha: lookup_by_sha(sha, sha_lookup, np.nan))
index_from_pmcid = metadata.pmcid.apply(lambda pmcid: pmcid_lookup.get(pmcid, np.nan))

# Compute the sha-then-pmcid fallback once and reuse it for both the stored
# column and the coverage counts.
index_combined = index_from_sha.fillna(index_from_pmcid)

# -1 stands for "no vector found", so the column can be stored as plain ints.
metadata['index'] = index_combined.fillna(-1).astype(int)

# Last expression of the cell so it is actually displayed:
# (rows unmatched by sha alone, rows still unmatched after the pmcid fallback)
index_from_sha.isnull().sum(), index_combined.isnull().sum()

In [8]:
# Columns taken from the document-vector table.
VECTOR_COLS = [
    'document_vector',
    'document_vector_2d',
    'document_vector_1d',
    'cluster_id',
]

# Left-join on the resolved 'index' and keep only the vector columns; metadata
# rows without a matching vector row get NaN in all of them.
# NOTE(review): the next cell pastes this frame back onto `metadata` by row
# label, which assumes the merge returns exactly one row per metadata row
# (i.e. 'index' is unique in document_vectors) - confirm.
metadata_vectors = metadata.merge(document_vectors, how='left', on='index')[VECTOR_COLS]

In [9]:
# Put the vector columns next to the metadata, keep only papers that actually
# have a document vector, and derive plain x / y columns from the 2-D embedding.
# Everything is built in one chain with `.assign`, so no column is ever written
# onto a frame that came from a column-subset selection (which is what raises
# pandas' SettingWithCopyWarning).
# NOTE(review): pd.concat(axis=1) aligns on row labels; this assumes `metadata`
# carries the same default 0..n-1 labels as the merge output - confirm.
metadata_with_vectors = (
    pd.concat([metadata, metadata_vectors], axis=1)
    .drop(columns=['index'])
    .dropna(subset=['document_vector'])
    .loc[:, ['cord_uid'] + VECTOR_COLS]
    .assign(
        x=lambda frame: frame.document_vector_2d.apply(lambda v: v[0]),
        y=lambda frame: frame.document_vector_2d.apply(lambda v: v[1]),
        # Rows without a vector are gone by now, so the int cast is safe.
        cluster_id=lambda frame: frame.cluster_id.astype(int),
    )
)

# Slim table used for plotting: identifier, 1-D projection, cluster and x / y.
metadata_coords = metadata_with_vectors[['cord_uid', 'document_vector_1d','cluster_id','x','y']]
metadata_coords.to_parquet(PurePath(cord_support_dir() ) / 'MetadataCoords.pq')
metadata_coords

Unnamed: 0,cord_uid,document_vector_1d,cluster_id,x,y
3,zp9k1k3z,15.571824,3,5.154543,-60.681904
5,wwf90zxt,13.811303,3,18.092474,-30.416946
6,dlh93ax6,-18.650908,3,41.136578,-71.717049
7,i94lyfsh,30.520699,3,-6.935116,-89.103371
8,vs5yondw,31.640532,3,-14.407829,-88.229309
...,...,...,...,...,...
47293,3b8sz7t5,21.519590,5,-22.232004,36.251747
47294,0gqdq1ga,-60.511009,5,34.019108,87.516403
47295,si91es0i,54.562248,4,-48.364540,-8.895615
47296,m6j0us2e,27.212635,0,-7.816017,-39.591602


## 4. Sample the Coordinates for Plotting

In [10]:
# Draw a fixed-size random subset of the coordinates and persist it.
# NOTE(review): 4900 presumably keeps the chart below under Altair's default
# 5000-row limit - confirm.
SAMPLE_SIZE = 4900
# Fixed seed so the saved sample and the chart are identical on every run.
SAMPLE_SEED = 42

metadata_coord_sample = metadata_coords.sample(
    # Cap at the available row count; sampling more rows than exist raises.
    n=min(SAMPLE_SIZE, len(metadata_coords)),
    random_state=SAMPLE_SEED,
)
metadata_coord_sample.to_parquet(PurePath(cord_support_dir() ) / 'MetadataCoordsSample.pq')

In [11]:
# Preview the sampled coordinates as the cell output.
metadata_coord_sample

Unnamed: 0,cord_uid,document_vector_1d,cluster_id,x,y
42041,xqs4baou,84.697433,0,-87.176971,34.263596
9090,sb5kf2kw,55.232693,4,-37.105133,-0.623715
18674,jb4l74d3,41.263538,4,-27.198973,-37.683079
25560,s5k59eq8,19.845816,0,-29.425728,78.332466
46002,5ayjudl9,54.088665,4,-42.504688,-9.615630
...,...,...,...,...,...
6986,no2tm05i,36.551430,4,-37.001953,-70.206421
41743,rbnfh89u,46.768898,4,-32.850391,-30.315857
22115,tgagwgpj,37.682690,4,-47.229324,-62.261360
5249,nwdgphxa,-49.265484,2,60.730026,1.413522


In [12]:
import altair as alt
def chartEmbeddings2D(embeddings, width=500, height=350, color_column='color', title=''):
    '''
        Scatter-plot 2-D embeddings with Altair.

        embeddings   : data passed to alt.Chart; it must provide 'x' and 'y'
                       fields plus the field named by color_column
        width/height : size of the chart
        color_column : Altair encoding string used for the point colour
        title        : chart title

        Returns the configured Altair chart.
    '''
    points = alt.Chart(embeddings).mark_point()
    encoded = points.encode(x='x', y='y', color=color_column)
    return encoded.properties(title=title, width=width, height=height)

# Plot the sampled papers coloured by cluster. The ':N' suffix makes Altair
# treat cluster_id as a nominal (categorical) field rather than a number.
chartEmbeddings2D(
    metadata_coord_sample,
    color_column='cluster_id:N',
)

In [13]:
# Collect the ids of the sampled papers that fall in cluster 2, then show the
# full metadata rows for exactly those papers.
cord_ids = metadata_coord_sample.loc[
    metadata_coord_sample.cluster_id == 2, 'cord_uid'
].tolist()
metadata.loc[metadata.cord_uid.isin(cord_ids)]

Unnamed: 0,cord_uid,sha,source,title,doi,pmcid,pubmed_id,license,abstract,published,...,has_pdf_parse,has_pmc_xml_parse,full_text_file,url,when,covid_related,virus,coronavirus,sars,index
228,92lazxd7,444d604966843cb3414dea1b8f5e2a24043553f1,Elsevier,Synthesis and subcellular localization of the ...,10.1016/0042-6822(83)90106-x,,6196910,els-covid,The synthesis and processing of the nucleocap...,1983-10-30,...,True,False,custom_license,https://doi.org/10.1016/0042-6822(83)90106-x,36 years ago,False,True,True,False,30408
250,nx7odchj,a46e50560c14dd1cbab901be5d570ba90b636f36,Elsevier,Defective interfering particles of mouse hepat...,10.1016/0042-6822(84)90420-3,,6322437,els-covid,After six to eight serial undiluted passages ...,1984-02-29,...,True,False,custom_license,https://doi.org/10.1016/0042-6822(84)90420-3,36 years ago,False,True,False,False,39122
283,i6vuhaiv,02569bcf9a7e53d2d262ef58dbfb2f9d38e7c7b2,Elsevier,Sequence analysis of the 3′ end of the feline ...,10.1016/0042-6822(88)90097-9,,3201747,els-covid,"The genetic information, carried on mRNA 6 of...",1988-12-31,...,True,False,custom_license,https://doi.org/10.1016/0042-6822(88)90097-9,31 years ago,False,True,False,False,24389
286,pbn9n37n,7890cdb9c6a2dc34cccc3bf93c87033d2a81e88b,Elsevier,Defective-interfering particles of murine coro...,10.1016/0042-6822(88)90237-1,,2831651,els-covid,The mechanism of synthesis of the defective v...,1988-03-31,...,True,False,custom_license,https://doi.org/10.1016/0042-6822(88)90237-1,32 years ago,False,True,False,False,35123
294,i17gweqp,bf78f453e4f454f92454f388f5167542c4825f33,Elsevier,The amino-terminal signal peptide on the porci...,10.1016/0042-6822(88)90581-8,,2841792,els-covid,cDNA clones mapping within the first 2601 bas...,1988-08-31,...,True,False,custom_license,https://doi.org/10.1016/0042-6822(88)90581-8,32 years ago,False,True,True,False,41528
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47104,cqcfstst,b5a9cf5bcdde7381ddbbaf4795da4ec08b7ccabe,Elsevier,Growth kinetics of SARS-coronavirus in Vero E6...,10.1016/j.bbrc.2005.02.085,PMC7092881,15752773,els-covid,Vero E6 cells are commonly used for in vitro ...,2005-04-15,...,True,False,custom_license,https://doi.org/10.1016/j.bbrc.2005.02.085,15 years ago,False,True,True,True,40686
47158,ufu0kgw7,221e5c964d0af38a0a1744714dae695e3fdd75f5; 35b2...,Elsevier,Systems biology: A tool for charting the antiv...,10.1016/j.virusres.2016.01.005,PMC4902762,26795869,els-covid,The host antiviral programs that are initiate...,2016-06-15,...,True,False,custom_license,https://doi.org/10.1016/j.virusres.2016.01.005,4 years ago,False,True,False,False,27258
47166,ks3nryfh,f6e8b44f19fd28de061592f560371c1a8796dc13; d7e5...,Elsevier,Nucleotide-Resolution Profiling of RNA Recombi...,10.1016/j.jmb.2012.10.005,PMC3502730,23069247,els-covid,Next-generation sequencing has been used in n...,2012-12-14,...,True,False,custom_license,https://doi.org/10.1016/j.jmb.2012.10.005,7 years ago,False,True,False,False,46476
47232,gmzbinpi,6364a6acea5c75ec6f2e82ae0a032abe593f4e0f; 9342...,Elsevier,Molecular virology of hepatitis E virus,10.1016/j.virusres.2011.02.011,PMC3130092,21345356,els-covid,This review details the molecular virology of...,2011-10-31,...,True,False,custom_license,https://doi.org/10.1016/j.virusres.2011.02.011,8 years ago,False,True,False,False,33201


In [14]:
# Load the full ResearchPapers object. Per the log output of this cell, this
# reloads and cleans the metadata and builds a BM25 index over the abstracts.
papers = ResearchPapers.load()

Loading metadata from data\CORD-19-research-challenge
Cleaning metadata
Applying tags to metadata

Indexing research papers
Creating the BM25 index from the abstracts of the papers
Use index="text" if you want to index the texts of the paper instead
Finished Indexing in 41.0 seconds


In [15]:
# Query 1: physical properties and environmental survival of the virus.
# NOTE(review): search_2d is defined in the cord package; presumably it runs
# the search and shows the hits on the 2-D document map - confirm.
query = """
Physical science of the coronavirus (e.g., charge distribution, adhesion to hydrophilic/phobic surfaces,
environmental survival to inform decontamination efforts for affected areas and provide information about viral shedding
"""
papers.search_2d(query)

HTML(value='<h4>\nPhysical science of the coronavirus (e.g., charge distribution, adhesion to hydrophilic/phob…

In [16]:
# Query 2: social drivers of fear, stigma and misinformation.
# NOTE(review): search_2d is defined in the cord package; presumably it runs
# the search and shows the hits on the 2-D document map - confirm.
query ="""
Efforts to identify the underlying drivers of fear, anxiety and stigma that
fuel misinformation and rumor, particularly through social media.
"""
papers.search_2d(query)

HTML(value='<h4>\nEfforts to identify the underlying drivers of fear, anxiety and stigma that\nfuel misinforma…

In [17]:
# Query 3: an abstract-length passage (translational regulation at termination)
# used as the search text instead of a short question.
# NOTE(review): search_2d is defined in the cord package; presumably it runs
# the search and shows the hits on the 2-D document map - confirm.
query ="""
An increasing number of examples of translational regulation at the level of termination has been recently reported in eukaryotes. This paper reviews our present knowledge on this topic and proposes an understanding of these regulations by relating the study of viral gene expression to a comprehensive view of the mechanisms and components of the translational process.
"""
papers.search_2d(query)

HTML(value='<h4>\nAn increasing number of examples of translational regulation at the level of termination has…