# Nomic Exploration

I'm going to use the Nomic SDK to create embeddings for the bioRxiv abstracts. I'll then use the resulting Atlas dataset object to retrieve the 2D projection produced by Nomic's projection model, as well as the output of its topic modeling features.

In [2]:
import pandas as pd
from nomic import atlas
from nomic import AtlasDataset

In [3]:
# Load the complete bioRxiv metadata export (path is relative to this notebook).
biorxiv_csv_path = "../data/biorxiv_complete_dataset.csv"
df = pd.read_csv(biorxiv_csv_path)

In [4]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,doi,title,authors,author_corresponding,author_corresponding_institution,date,version,type,license,category,jatsxml,abstract,published,server
0,10.1101/000109,Speciation and introgression between Mimulus n...,Yaniv Brandvain;Amanda M Kenney;Lex Fagel;Grah...,Yaniv Brandvain,Department of Evolution and Ecology & Center f...,2013-11-07,1,New Results,cc_by,Evolutionary Biology,https://www.biorxiv.org/content/early/2013/11/...,Mimulus guttatus and M. nasutus are an evoluti...,10.1371/journal.pgen.1004410,biorxiv
1,10.1101/000075,A Scalable Formulation for Engineering Combina...,Vanessa Jonsson;Anders Rantzer;Richard M Murray;,Vanessa Jonsson,Caltech,2013-11-07,1,New Results,cc_by_nc,Evolutionary Biology,https://www.biorxiv.org/content/early/2013/11/...,It has been shown that optimal controller synt...,10.1109/ACC.2014.6859452,biorxiv
2,10.1101/000240,Genome-wide targets of selection: female respo...,Paolo Innocenti;Ilona Flis;Edward H Morrow;,Edward H Morrow,University of Sussex,2013-11-12,1,New Results,cc_by,Evolutionary Biology,https://www.biorxiv.org/content/early/2013/11/...,Despite the common assumption that promiscuity...,,biorxiv
3,10.1101/000208,Population genomics of parallel hybrid zones i...,Nicola Nadeau;Mayte Ruiz;Patricio Salazar;Bria...,Chri Jiggins,Cambridge,2013-11-12,1,New Results,cc_by_nc_nd,Evolutionary Biology,https://www.biorxiv.org/content/early/2013/11/...,Hybrid zones can be valuable tools for studyin...,10.1101/gr.169292.113,biorxiv
4,10.1101/000398,The Origin of Human-infecting Avian Influenza ...,Liangsheng Zhang;Zhenguo Zhang;,Zhenguo Zhang,"Department of Biology, The Pennsylvania State ...",2013-11-14,1,New Results,cc_by_nc_nd,Evolutionary Biology,https://www.biorxiv.org/content/early/2013/11/...,"In this study, we retraced the origin of the r...",,biorxiv


In [5]:
# (rows, columns) of the raw load, as a baseline for the cleaning steps below.
df.shape

(357046, 14)

## Clean data

In [6]:
# Keep only the first posted version (version == 1) of each paper.
is_first_version = df['version'].eq(1)
df = df.loc[is_first_version]

In [7]:
# (rows, columns) after keeping only version-1 rows.
df.shape

(258821, 14)

In [8]:
# Discard rows that have no abstract; the abstract is the text that gets indexed later.
has_abstract = df['abstract'].notna()
df = df[has_abstract]

In [9]:
# (rows, columns) after dropping rows with a missing abstract.
df.shape

(258818, 14)

In [10]:
# Spot-check the first five rows after the version filter and abstract drop.
df.head(n=5)

Unnamed: 0,doi,title,authors,author_corresponding,author_corresponding_institution,date,version,type,license,category,jatsxml,abstract,published,server
0,10.1101/000109,Speciation and introgression between Mimulus n...,Yaniv Brandvain;Amanda M Kenney;Lex Fagel;Grah...,Yaniv Brandvain,Department of Evolution and Ecology & Center f...,2013-11-07,1,New Results,cc_by,Evolutionary Biology,https://www.biorxiv.org/content/early/2013/11/...,Mimulus guttatus and M. nasutus are an evoluti...,10.1371/journal.pgen.1004410,biorxiv
1,10.1101/000075,A Scalable Formulation for Engineering Combina...,Vanessa Jonsson;Anders Rantzer;Richard M Murray;,Vanessa Jonsson,Caltech,2013-11-07,1,New Results,cc_by_nc,Evolutionary Biology,https://www.biorxiv.org/content/early/2013/11/...,It has been shown that optimal controller synt...,10.1109/ACC.2014.6859452,biorxiv
2,10.1101/000240,Genome-wide targets of selection: female respo...,Paolo Innocenti;Ilona Flis;Edward H Morrow;,Edward H Morrow,University of Sussex,2013-11-12,1,New Results,cc_by,Evolutionary Biology,https://www.biorxiv.org/content/early/2013/11/...,Despite the common assumption that promiscuity...,,biorxiv
3,10.1101/000208,Population genomics of parallel hybrid zones i...,Nicola Nadeau;Mayte Ruiz;Patricio Salazar;Bria...,Chri Jiggins,Cambridge,2013-11-12,1,New Results,cc_by_nc_nd,Evolutionary Biology,https://www.biorxiv.org/content/early/2013/11/...,Hybrid zones can be valuable tools for studyin...,10.1101/gr.169292.113,biorxiv
4,10.1101/000398,The Origin of Human-infecting Avian Influenza ...,Liangsheng Zhang;Zhenguo Zhang;,Zhenguo Zhang,"Department of Biology, The Pennsylvania State ...",2013-11-14,1,New Results,cc_by_nc_nd,Evolutionary Biology,https://www.biorxiv.org/content/early/2013/11/...,"In this study, we retraced the origin of the r...",,biorxiv


In [15]:
# Flag every row whose DOI occurs more than once (keep=False marks all copies,
# not just the second and later ones), then report how many rows are affected.
doi_is_repeated = df['doi'].duplicated(keep=False)
duplicate_dois = df.loc[doi_is_repeated]
print(f"Number of rows with duplicate DOIs: {len(duplicate_dois)}")

Number of rows with duplicate DOIs: 39


In [14]:
# Display the rows that share a DOI with at least one other row, to eyeball
# them before de-duplicating.
duplicate_dois

Unnamed: 0,doi,title,authors,author_corresponding,author_corresponding_institution,date,version,type,license,category,jatsxml,abstract,published,server
9948,10.1101/090746,Tumor Origin Detection with Tissue-Specific mi...,"Tang, W.; Wan, S.; Zou, Q.",-,"School of Computer Science and Technology, Tia...",2016-12-01,1,new results,cc_by_nc_nd,bioinformatics,https://www.biorxiv.org/content/early/2016/12/...,MotivationCancer of unknown primary origin con...,10.1093/bioinformatics/btx622,biorxiv
9949,10.1101/090746,Tumor Origin Detection with Tissue-Specific mi...,"Tang, W.; Wan, S.; Zou, Q.",-,"School of Computer Science and Technology, Tia...",2016-12-01,1,new results,cc_by_nc_nd,bioinformatics,https://www.biorxiv.org/content/early/2016/12/...,MotivationCancer of unknown primary origin con...,10.1093/bioinformatics/btx622,biorxiv
18620,10.1101/161612,shRNA mediated inhibition of Cdc42 gene expres...,"Ghambari, Z.;Nabiuni, M.;Jalali, H.;Karimzadeh...",Mohammad Nabiuni,Kharazmi University,2017-07-10,1,new results,cc_by_nd,genomics,https://www.biorxiv.org/content/early/2017/07/...,Background Information: RNAi technique as a ne...,,biorxiv
18621,10.1101/161612,shRNA mediated inhibition of Cdc42 gene expres...,"Ghambari, Z.;Nabiuni, M.;Jalali, H.;Karimzadeh...",Mohammad Nabiuni,Kharazmi University,2017-07-10,1,new results,cc_by_nd,genomics,https://www.biorxiv.org/content/early/2017/07/...,Background Information: RNAi technique as a ne...,,biorxiv
66705,10.1101/617381,LFMD: a new likelihood-based method to detect ...,"Ye, R.; Ruan, J.; Zhuang, X.; Qi, Y.; An, Y.; ...",Rui Ye,The University of Hong Kong,2019-04-24,1,new results,cc_no,bioinformatics,https://www.biorxiv.org/content/early/2019/04/...,As next-generation sequencing (NGS) and liquid...,,biorxiv
66706,10.1101/617381,LFMD: a new likelihood-based method to detect ...,"Ye, R.; Ruan, J.; Zhuang, X.; Qi, Y.; An, Y.; ...",Rui Ye,The University of Hong Kong,2019-04-24,1,new results,cc_no,bioinformatics,https://www.biorxiv.org/content/early/2019/04/...,As next-generation sequencing (NGS) and liquid...,,biorxiv
66707,10.1101/615294,HAMAP rules as SPARQL - A portable annotation ...,"Bolleman, J.; de Castro, E.; Baratin, D.; Geha...",Jerven Bolleman,SIB Swiss Institute of Bioinformatics,2019-04-24,1,new results,cc_by_nd,bioinformatics,https://www.biorxiv.org/content/early/2019/04/...,MotivationGenome and proteome annotation pipel...,10.1093/gigascience/giaa003,biorxiv
66708,10.1101/615294,HAMAP rules as SPARQL - A portable annotation ...,"Bolleman, J.; de Castro, E.; Baratin, D.; Geha...",Jerven Bolleman,SIB Swiss Institute of Bioinformatics,2019-04-24,1,new results,cc_by_nd,bioinformatics,https://www.biorxiv.org/content/early/2019/04/...,MotivationGenome and proteome annotation pipel...,10.1093/gigascience/giaa003,biorxiv
66737,10.1101/618124,Natural Amelioration of Mn-induced Chlorosis F...,"Li, X. F.; Ling, G. Z.; Wang, X. X.; Yang, S.;...",Xiao Feng Li,Guangxi University,2019-04-24,1,new results,cc_by,plant biology,https://www.biorxiv.org/content/early/2019/04/...,We had previously reported that manganese (Mn)...,,biorxiv
66738,10.1101/618124,Natural Amelioration of Mn-induced Chlorosis F...,"Li, X. F.; Ling, G. Z.; Wang, X. X.; Yang, S.;...",Xiao Feng Li,Guangxi University,2019-04-24,1,new results,cc_by,plant biology,https://www.biorxiv.org/content/early/2019/04/...,We had previously reported that manganese (Mn)...,,biorxiv


In [16]:
# De-duplicate on DOI: a row is dropped when an earlier row already carries
# the same DOI, so the first occurrence of each DOI survives.
is_repeat_of_earlier_doi = df['doi'].duplicated(keep='first')
df = df.loc[~is_repeat_of_earlier_doi]


In [18]:
# (rows, columns) after removing repeated DOIs.
df.shape

(258798, 14)

In [19]:
# Final look at the first five rows of the cleaned frame before upload.
df.head(n=5)

Unnamed: 0,doi,title,authors,author_corresponding,author_corresponding_institution,date,version,type,license,category,jatsxml,abstract,published,server
0,10.1101/000109,Speciation and introgression between Mimulus n...,Yaniv Brandvain;Amanda M Kenney;Lex Fagel;Grah...,Yaniv Brandvain,Department of Evolution and Ecology & Center f...,2013-11-07,1,New Results,cc_by,Evolutionary Biology,https://www.biorxiv.org/content/early/2013/11/...,Mimulus guttatus and M. nasutus are an evoluti...,10.1371/journal.pgen.1004410,biorxiv
1,10.1101/000075,A Scalable Formulation for Engineering Combina...,Vanessa Jonsson;Anders Rantzer;Richard M Murray;,Vanessa Jonsson,Caltech,2013-11-07,1,New Results,cc_by_nc,Evolutionary Biology,https://www.biorxiv.org/content/early/2013/11/...,It has been shown that optimal controller synt...,10.1109/ACC.2014.6859452,biorxiv
2,10.1101/000240,Genome-wide targets of selection: female respo...,Paolo Innocenti;Ilona Flis;Edward H Morrow;,Edward H Morrow,University of Sussex,2013-11-12,1,New Results,cc_by,Evolutionary Biology,https://www.biorxiv.org/content/early/2013/11/...,Despite the common assumption that promiscuity...,,biorxiv
3,10.1101/000208,Population genomics of parallel hybrid zones i...,Nicola Nadeau;Mayte Ruiz;Patricio Salazar;Bria...,Chri Jiggins,Cambridge,2013-11-12,1,New Results,cc_by_nc_nd,Evolutionary Biology,https://www.biorxiv.org/content/early/2013/11/...,Hybrid zones can be valuable tools for studyin...,10.1101/gr.169292.113,biorxiv
4,10.1101/000398,The Origin of Human-infecting Avian Influenza ...,Liangsheng Zhang;Zhenguo Zhang;,Zhenguo Zhang,"Department of Biology, The Pennsylvania State ...",2013-11-14,1,New Results,cc_by_nc_nd,Evolutionary Biology,https://www.biorxiv.org/content/early/2013/11/...,"In this study, we retraced the origin of the r...",,biorxiv


## Create and upload atlas

In [20]:
# One-time upload: sends the cleaned frame to Atlas as the `orchard` dataset and
# creates a map indexed on the `abstract` text, using `doi` as the row id.
# Intended to run only on the first upload; afterwards, load the existing
# dataset instead of re-running this cell.
#
# The SDK module is imported under a cell-local alias because the assignment
# below rebinds the name `atlas` to the returned dataset. Calling
# `atlas.map_data(...)` directly would therefore fail on any re-run of this
# cell, since `atlas` would no longer be the module.
from nomic import atlas as atlas_sdk

atlas = atlas_sdk.map_data(
    data=df,
    indexed_field="abstract",
    identifier="orchard",
    is_public=False,
    id_field="doi",
)

[32m2024-12-12 19:29:23.061[0m | [1mINFO    [0m | [36mnomic.dataset[0m:[36m_create_project[0m:[36m839[0m - [1mOrganization name: `cob`[0m
[32m2024-12-12 19:29:23.486[0m | [1mINFO    [0m | [36mnomic.dataset[0m:[36m_create_project[0m:[36m867[0m - [1mCreating dataset `orchard`[0m
[32m2024-12-12 19:29:23.651[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_data[0m:[36m140[0m - [1mUploading data to Atlas.[0m
52it [00:43,  1.19it/s]                        
[32m2024-12-12 19:30:12.005[0m | [1mINFO    [0m | [36mnomic.dataset[0m:[36m_add_data[0m:[36m1690[0m - [1mUpload succeeded.[0m
[32m2024-12-12 19:30:12.274[0m | [1mINFO    [0m | [36mnomic.atlas[0m:[36mmap_data[0m:[36m158[0m - [1m`cob/orchard`: Data upload succeeded to dataset`[0m
[32m2024-12-12 19:30:13.844[0m | [1mINFO    [0m | [36mnomic.dataset[0m:[36mcreate_index[0m:[36m1272[0m - [1mCreated map `orchard` in dataset `cob/orchard`: https://atlas.nomic.ai/data/cob/orchard

In [22]:
# Load the already-uploaded `orchard` dataset by its identifier.
# NOTE: this rebinds `atlas`, which the import cell bound to the `nomic.atlas`
# module; from here on `atlas` is the dataset object, not the module.
atlas = AtlasDataset("orchard")

[32m2024-12-12 19:47:48.088[0m | [1mINFO    [0m | [36mnomic.dataset[0m:[36m__init__[0m:[36m775[0m - [1mLoading existing dataset `cob/orchard`.[0m


In [23]:
# Take the first map of the dataset and fetch its projected embeddings
# (accessing `.projected` triggers a download, per the log output below).
# NOTE: the name `map` shadows Python's builtin `map` for the rest of the
# notebook; it is kept because a later cell reads `map.topics`.
map = atlas.maps[0]
projected_embeddings = map.embeddings.projected

[32m2024-12-12 19:47:55.327[0m | [1mINFO    [0m | [36mnomic.data_operations[0m:[36m_download_projected[0m:[36m538[0m - [1mDownloading projected embeddings[0m
100%|██████████| 21/21 [00:04<00:00,  4.88it/s]
100%|██████████| 21/21 [00:04<00:00,  4.46it/s]
[32m2024-12-12 19:48:05.230[0m | [1mINFO    [0m | [36mnomic.data_operations[0m:[36mtb[0m:[36m479[0m - [1mLoading projected embeddings[0m
100%|██████████| 21/21 [00:00<00:00, 136.40it/s]


In [26]:
# (rows, columns) of the projection table; the row count should match the
# cleaned frame that was uploaded.
projected_embeddings.shape

(258798, 3)

In [24]:
# Preview the first five projected points (one row per DOI with its coordinates).
projected_embeddings.head(n=5)

Unnamed: 0,doi,x,y
0,10.1101/2023.06.28.546819,24.525175,-10.402438
1,10.1101/2020.12.07.415232,14.892089,29.124025
2,10.1101/2022.12.05.519101,-2.816088,5.28118
3,10.1101/2020.08.11.246504,-23.76844,-23.230116
4,10.1101/656314,-7.747561,8.243316


In [25]:
# Look up the full uploaded record for a single DOI (the first id shown in the
# projected-embeddings preview) to confirm ids map back to the original metadata.
atlas.get_data(["10.1101/2023.06.28.546819"])

[{'doi': '10.1101/2023.06.28.546819',
  'title': 'Discovery of New Broad-Spectrum Anti-Infectives for Eukaryotic Pathogens Using Bioorganometallic Chemistry',
  'authors': "Lin, Y.; Jung, H.; Bulman, C. A.; Ng, J.; Vinck, R.; O'Beirne, C.; Moser, M. S.; Tricoche, N.; Peguero, R.; Li, R. W.; Urban, J. F.; Le Pape, P.; Pagniez, F.; Moretto, M.; Weil, T.; Lustigman, S.; Cariou, K.; Mitreva, M.; Sakanari, J.; Gasser, G.",
  'author_corresponding': 'Gilles Gasser',
  'author_corresponding_institution': 'Chimie ParisTech',
  'date': '2023-06-30',
  'version': 1,
  'type': 'new results',
  'license': 'cc_no',
  'category': 'pharmacology and toxicology',
  'jatsxml': 'https://www.biorxiv.org/content/early/2023/06/30/2023.06.28.546819.source.xml',
  'abstract': 'Drug resistance observed with many anti-infectives clearly highlights the need for new broad-spectrum agents to treat especially neglected tropical diseases (NTDs) caused by eukaryotic parasitic pathogens including fungal infections. Si

In [27]:
# Per-document topic assignments from the map (accessing this triggers a
# topics download, per the log output below). `map` here is the Atlas map
# object assigned earlier in the notebook, not the Python builtin.
topic_df = map.topics.df

[32m2024-12-12 22:20:05.636[0m | [1mINFO    [0m | [36mnomic.data_operations[0m:[36m_download_topics[0m:[36m189[0m - [1mDownloading topics[0m
100%|██████████| 21/21 [00:00<00:00, 88.74it/s]
100%|██████████| 21/21 [00:04<00:00,  4.49it/s]
[32m2024-12-12 22:20:11.565[0m | [1mINFO    [0m | [36mnomic.data_operations[0m:[36m_load_topics[0m:[36m151[0m - [1mLoading topics[0m
100%|██████████| 21/21 [00:01<00:00, 15.72it/s]


In [29]:
# Preview the first twenty topic assignments (one row per DOI, three topic depths).
topic_df.head(n=20)

Unnamed: 0,doi,topic_depth_1,topic_depth_2,topic_depth_3
0,10.1101/2023.06.28.546819,Viral Infections (2),Malaria,Antimalarial Treatment
1,10.1101/2020.12.07.415232,Evolutionary Biology (10),Animal Behavior (2),Animal Communication
2,10.1101/2022.12.05.519101,Microbial Ecology,Photosynthesis (4),Biological Optics
3,10.1101/2020.08.11.246504,Cancer Research,Genome Regulation,DNA Repair
4,10.1101/656314,Neurological Disorders (2),Parkinsons Disease (6),Eye Disease
5,10.1101/192823,Neural Science,Cardiovascular,Vascular System
6,10.1101/2020.04.27.064550,Cancer Research,Nematode Biology,Neural Behavior
7,10.1101/2024.05.03.592173,Evolutionary Biology (10),Animal Behavior (2),Behavioral Tracking
8,10.1101/2021.07.15.452453,Cancer Research,Liver Inflammation,Inflammation Response
9,10.1101/2021.10.02.461750,Microbial Ecology,Plant Stress (2),RNA Regulation


In [30]:
# Subset to documents whose top-level topic label is "Viral Infections (2)".
is_viral_topic = topic_df['topic_depth_1'].eq("Viral Infections (2)")
viral_infections_df = topic_df.loc[is_viral_topic]
viral_infections_df


Unnamed: 0,doi,topic_depth_1,topic_depth_2,topic_depth_3
0,10.1101/2023.06.28.546819,Viral Infections (2),Malaria,Antimalarial Treatment
43,10.1101/2022.09.20.508747,Viral Infections (2),COVID Vaccine (2),Vaccine Development (2)
44,10.1101/2021.05.13.444112,Viral Infections (2),Microbial Pathogens,Parasite Biology (2)
45,10.1101/677112,Viral Infections (2),Malaria,Parasite Biology
71,10.1101/554311,Viral Infections (2),Malaria,Malaria Immunity
...,...,...,...,...
258015,10.1101/2022.11.07.515479,Viral Infections (2),Zoonotic Diseases,Bats and Rabies
258205,10.1101/2022.12.13.520061,Viral Infections (2),Virology,Microbial Diversity
258269,10.1101/728824,Viral Infections (2),Zoonotic Diseases,Bats and Rabies
258397,10.1101/645309,Viral Infections (2),Zoonotic Diseases,Bats and Rabies
