# Create a Full Database

I've now got all of the bioRxiv abstracts, embeddings, projections, and topics. I'll merge all of these together into one database.

In [25]:
import pandas as pd
from nomic import AtlasDataset

In [26]:
# Load the complete bioRxiv metadata export (one row per preprint version).
raw_csv_path = "../data/biorxiv_complete_dataset.csv"
df = pd.read_csv(raw_csv_path)

I'll apply the same filtering as before: keep only the first version of each preprint, drop rows without an abstract, and de-duplicate on DOI.

In [27]:
# Keep first versions only, require an abstract, and keep the first row
# seen for each DOI.
df = (
    df.loc[df['version'] == 1]
    .dropna(subset=['abstract'])
    .drop_duplicates(subset=['doi'], keep='first')
)

In [28]:
df.shape

(258798, 14)

Next, I'll load the Atlas dataset to get the projections and topics.

In [29]:
# Open the existing Atlas dataset and take the projected embeddings
# from its first map.
atlas = AtlasDataset("orchard")
first_map = atlas.maps[0]
projections = first_map.embeddings.projected

2024-12-13 19:34:21.731 | INFO     | nomic.dataset:__init__:775 - Loading existing dataset `cob/orchard`.
2024-12-13 19:34:22.184 | INFO     | nomic.data_operations:_download_projected:538 - Downloading projected embeddings
100%|██████████| 21/21 [00:00<00:00, 1560.19it/s]
100%|██████████| 21/21 [00:00<00:00, 1354.40it/s]
2024-12-13 19:34:22.543 | INFO     | nomic.data_operations:tb:479 - Loading projected embeddings
100%|██████████| 21/21 [00:00<00:00, 138.25it/s]


In [30]:
projections.head()

Unnamed: 0,doi,x,y
0,10.1101/2023.06.28.546819,24.525175,-10.402438
1,10.1101/2020.12.07.415232,14.892089,29.124025
2,10.1101/2022.12.05.519101,-2.816088,5.28118
3,10.1101/2020.08.11.246504,-23.76844,-23.230116
4,10.1101/656314,-7.747561,8.243316


In [31]:
projections.shape

(258798, 3)

In [32]:
df.head()

Unnamed: 0,doi,title,authors,author_corresponding,author_corresponding_institution,date,version,type,license,category,jatsxml,abstract,published,server
0,10.1101/000109,Speciation and introgression between Mimulus n...,Yaniv Brandvain;Amanda M Kenney;Lex Fagel;Grah...,Yaniv Brandvain,Department of Evolution and Ecology & Center f...,2013-11-07,1,New Results,cc_by,Evolutionary Biology,https://www.biorxiv.org/content/early/2013/11/...,Mimulus guttatus and M. nasutus are an evoluti...,10.1371/journal.pgen.1004410,biorxiv
1,10.1101/000075,A Scalable Formulation for Engineering Combina...,Vanessa Jonsson;Anders Rantzer;Richard M Murray;,Vanessa Jonsson,Caltech,2013-11-07,1,New Results,cc_by_nc,Evolutionary Biology,https://www.biorxiv.org/content/early/2013/11/...,It has been shown that optimal controller synt...,10.1109/ACC.2014.6859452,biorxiv
2,10.1101/000240,Genome-wide targets of selection: female respo...,Paolo Innocenti;Ilona Flis;Edward H Morrow;,Edward H Morrow,University of Sussex,2013-11-12,1,New Results,cc_by,Evolutionary Biology,https://www.biorxiv.org/content/early/2013/11/...,Despite the common assumption that promiscuity...,,biorxiv
3,10.1101/000208,Population genomics of parallel hybrid zones i...,Nicola Nadeau;Mayte Ruiz;Patricio Salazar;Bria...,Chri Jiggins,Cambridge,2013-11-12,1,New Results,cc_by_nc_nd,Evolutionary Biology,https://www.biorxiv.org/content/early/2013/11/...,Hybrid zones can be valuable tools for studyin...,10.1101/gr.169292.113,biorxiv
4,10.1101/000398,The Origin of Human-infecting Avian Influenza ...,Liangsheng Zhang;Zhenguo Zhang;,Zhenguo Zhang,"Department of Biology, The Pennsylvania State ...",2013-11-14,1,New Results,cc_by_nc_nd,Evolutionary Biology,https://www.biorxiv.org/content/early/2013/11/...,"In this study, we retraced the origin of the r...",,biorxiv


In [33]:
# Attach the projection coordinates (x, y) to each preprint by DOI.
# This is an inner join, so a DOI missing from either side is dropped.
# `validate='one_to_one'` makes pandas raise MergeError if either frame
# contains a repeated DOI, so the join cannot silently duplicate rows.
merged_df = df.merge(projections, on='doi', how='inner', validate='one_to_one')
merged_df.head()

Unnamed: 0,doi,title,authors,author_corresponding,author_corresponding_institution,date,version,type,license,category,jatsxml,abstract,published,server,x,y
0,10.1101/000109,Speciation and introgression between Mimulus n...,Yaniv Brandvain;Amanda M Kenney;Lex Fagel;Grah...,Yaniv Brandvain,Department of Evolution and Ecology & Center f...,2013-11-07,1,New Results,cc_by,Evolutionary Biology,https://www.biorxiv.org/content/early/2013/11/...,Mimulus guttatus and M. nasutus are an evoluti...,10.1371/journal.pgen.1004410,biorxiv,17.530146,12.635113
1,10.1101/000075,A Scalable Formulation for Engineering Combina...,Vanessa Jonsson;Anders Rantzer;Richard M Murray;,Vanessa Jonsson,Caltech,2013-11-07,1,New Results,cc_by_nc,Evolutionary Biology,https://www.biorxiv.org/content/early/2013/11/...,It has been shown that optimal controller synt...,10.1109/ACC.2014.6859452,biorxiv,-20.932812,-2.397594
2,10.1101/000240,Genome-wide targets of selection: female respo...,Paolo Innocenti;Ilona Flis;Edward H Morrow;,Edward H Morrow,University of Sussex,2013-11-12,1,New Results,cc_by,Evolutionary Biology,https://www.biorxiv.org/content/early/2013/11/...,Despite the common assumption that promiscuity...,,biorxiv,14.181043,16.80138
3,10.1101/000208,Population genomics of parallel hybrid zones i...,Nicola Nadeau;Mayte Ruiz;Patricio Salazar;Bria...,Chri Jiggins,Cambridge,2013-11-12,1,New Results,cc_by_nc_nd,Evolutionary Biology,https://www.biorxiv.org/content/early/2013/11/...,Hybrid zones can be valuable tools for studyin...,10.1101/gr.169292.113,biorxiv,16.907476,12.933809
4,10.1101/000398,The Origin of Human-infecting Avian Influenza ...,Liangsheng Zhang;Zhenguo Zhang;,Zhenguo Zhang,"Department of Biology, The Pennsylvania State ...",2013-11-14,1,New Results,cc_by_nc_nd,Evolutionary Biology,https://www.biorxiv.org/content/early/2013/11/...,"In this study, we retraced the origin of the r...",,biorxiv,22.511051,-23.536976


In [34]:
# Spot-check the join: this DOI is the first row shown by `projections.head()`,
# so the x/y values in the merged row should match the ones displayed there.
merged_df[merged_df['doi'] == '10.1101/2023.06.28.546819']

Unnamed: 0,doi,title,authors,author_corresponding,author_corresponding_institution,date,version,type,license,category,jatsxml,abstract,published,server,x,y
198705,10.1101/2023.06.28.546819,Discovery of New Broad-Spectrum Anti-Infective...,"Lin, Y.; Jung, H.; Bulman, C. A.; Ng, J.; Vinc...",Gilles Gasser,Chimie ParisTech,2023-06-30,1,new results,cc_no,pharmacology and toxicology,https://www.biorxiv.org/content/early/2023/06/...,Drug resistance observed with many anti-infect...,10.1021/acs.jmedchem.3c01333,biorxiv,24.525175,-10.402438


In [35]:
# Fetch the per-document topic table from the first map of the dataset.
topic_map = atlas.maps[0]
topic_df = topic_map.topics.df

2024-12-13 19:34:23.841 | INFO     | nomic.data_operations:_download_topics:189 - Downloading topics
100%|██████████| 21/21 [00:00<00:00, 4353.09it/s]
100%|██████████| 21/21 [00:00<00:00, 1430.25it/s]
2024-12-13 19:34:24.248 | INFO     | nomic.data_operations:_load_topics:151 - Loading topics
100%|██████████| 21/21 [00:00<00:00, 50.33it/s]


In [36]:
topic_df.head()

Unnamed: 0,doi,topic_depth_1,topic_depth_2,topic_depth_3
0,10.1101/2023.06.28.546819,Viral Infections (2),Malaria,Antimalarial Treatment
1,10.1101/2020.12.07.415232,Evolutionary Biology (10),Animal Behavior (2),Animal Communication
2,10.1101/2022.12.05.519101,Microbial Ecology,Photosynthesis (4),Biological Optics
3,10.1101/2020.08.11.246504,Cancer Research,Genome Regulation,DNA Repair
4,10.1101/656314,Neurological Disorders (2),Parkinsons Disease (6),Eye Disease


In [37]:
# Attach the three topic-depth labels to each preprint by DOI.
# This is an inner join, so a DOI missing from `topic_df` is dropped.
# `validate='one_to_one'` makes pandas raise MergeError if either frame
# contains a repeated DOI, so the join cannot silently duplicate rows.
merged_df = merged_df.merge(topic_df, on='doi', how='inner', validate='one_to_one')
merged_df.head()

Unnamed: 0,doi,title,authors,author_corresponding,author_corresponding_institution,date,version,type,license,category,jatsxml,abstract,published,server,x,y,topic_depth_1,topic_depth_2,topic_depth_3
0,10.1101/000109,Speciation and introgression between Mimulus n...,Yaniv Brandvain;Amanda M Kenney;Lex Fagel;Grah...,Yaniv Brandvain,Department of Evolution and Ecology & Center f...,2013-11-07,1,New Results,cc_by,Evolutionary Biology,https://www.biorxiv.org/content/early/2013/11/...,Mimulus guttatus and M. nasutus are an evoluti...,10.1371/journal.pgen.1004410,biorxiv,17.530146,12.635113,Evolutionary Biology (10),Evolutionary Change,Evolutionary Genetics
1,10.1101/000075,A Scalable Formulation for Engineering Combina...,Vanessa Jonsson;Anders Rantzer;Richard M Murray;,Vanessa Jonsson,Caltech,2013-11-07,1,New Results,cc_by_nc,Evolutionary Biology,https://www.biorxiv.org/content/early/2013/11/...,It has been shown that optimal controller synt...,10.1109/ACC.2014.6859452,biorxiv,-20.932812,-2.397594,Genomics Analysis (2),Systems Biology (2),Cellular Networks
2,10.1101/000240,Genome-wide targets of selection: female respo...,Paolo Innocenti;Ilona Flis;Edward H Morrow;,Edward H Morrow,University of Sussex,2013-11-12,1,New Results,cc_by,Evolutionary Biology,https://www.biorxiv.org/content/early/2013/11/...,Despite the common assumption that promiscuity...,,biorxiv,14.181043,16.80138,Evolutionary Biology (10),Animal Behavior (2),Evolutionary Biology (9)
3,10.1101/000208,Population genomics of parallel hybrid zones i...,Nicola Nadeau;Mayte Ruiz;Patricio Salazar;Bria...,Chri Jiggins,Cambridge,2013-11-12,1,New Results,cc_by_nc_nd,Evolutionary Biology,https://www.biorxiv.org/content/early/2013/11/...,Hybrid zones can be valuable tools for studyin...,10.1101/gr.169292.113,biorxiv,16.907476,12.933809,Evolutionary Biology (10),Animal Behavior (2),Butterfly Colors
4,10.1101/000398,The Origin of Human-infecting Avian Influenza ...,Liangsheng Zhang;Zhenguo Zhang;,Zhenguo Zhang,"Department of Biology, The Pennsylvania State ...",2013-11-14,1,New Results,cc_by_nc_nd,Evolutionary Biology,https://www.biorxiv.org/content/early/2013/11/...,"In this study, we retraced the origin of the r...",,biorxiv,22.511051,-23.536976,Viral Infections (2),Zoonotic Diseases,Avian Influenza


In [38]:
unique_servers = merged_df['server'].unique()
unique_servers


array(['biorxiv'], dtype=object)

In [39]:
merged_df['type'].unique()

array(['New Results', 'Contradictory Results', 'Confirmatory Results',
       nan, 'new results', 'confirmatory results',
       'contradictory results'], dtype=object)

In [40]:
# Drop columns that are not carried into the exported table. `server` only
# ever contains 'biorxiv' and `version` was already filtered to 1 above.
# `errors='ignore'` keeps this cell re-runnable: because it reassigns
# `merged_df`, a second execution would otherwise raise KeyError for the
# columns that are already gone.
columns_to_drop = ['server', 'version', 'jatsxml', 'license', 'type']
merged_df = merged_df.drop(columns=columns_to_drop, errors='ignore')

In [41]:
merged_df.head()

Unnamed: 0,doi,title,authors,author_corresponding,author_corresponding_institution,date,category,abstract,published,x,y,topic_depth_1,topic_depth_2,topic_depth_3
0,10.1101/000109,Speciation and introgression between Mimulus n...,Yaniv Brandvain;Amanda M Kenney;Lex Fagel;Grah...,Yaniv Brandvain,Department of Evolution and Ecology & Center f...,2013-11-07,Evolutionary Biology,Mimulus guttatus and M. nasutus are an evoluti...,10.1371/journal.pgen.1004410,17.530146,12.635113,Evolutionary Biology (10),Evolutionary Change,Evolutionary Genetics
1,10.1101/000075,A Scalable Formulation for Engineering Combina...,Vanessa Jonsson;Anders Rantzer;Richard M Murray;,Vanessa Jonsson,Caltech,2013-11-07,Evolutionary Biology,It has been shown that optimal controller synt...,10.1109/ACC.2014.6859452,-20.932812,-2.397594,Genomics Analysis (2),Systems Biology (2),Cellular Networks
2,10.1101/000240,Genome-wide targets of selection: female respo...,Paolo Innocenti;Ilona Flis;Edward H Morrow;,Edward H Morrow,University of Sussex,2013-11-12,Evolutionary Biology,Despite the common assumption that promiscuity...,,14.181043,16.80138,Evolutionary Biology (10),Animal Behavior (2),Evolutionary Biology (9)
3,10.1101/000208,Population genomics of parallel hybrid zones i...,Nicola Nadeau;Mayte Ruiz;Patricio Salazar;Bria...,Chri Jiggins,Cambridge,2013-11-12,Evolutionary Biology,Hybrid zones can be valuable tools for studyin...,10.1101/gr.169292.113,16.907476,12.933809,Evolutionary Biology (10),Animal Behavior (2),Butterfly Colors
4,10.1101/000398,The Origin of Human-infecting Avian Influenza ...,Liangsheng Zhang;Zhenguo Zhang;,Zhenguo Zhang,"Department of Biology, The Pennsylvania State ...",2013-11-14,Evolutionary Biology,"In this study, we retraced the origin of the r...",,22.511051,-23.536976,Viral Infections (2),Zoonotic Diseases,Avian Influenza


In [42]:
# Checkpoint the merged table; topic labels still carry their " (N)" suffixes here.
# The same path is written again further down, after the labels are trimmed.
merged_df.to_csv('../data/biorxiv_nomic.csv', index=False)

In [43]:
unique_topic_depth_1 = merged_df['topic_depth_1'].unique()
unique_topic_depth_1

array(['Evolutionary Biology (10)', 'Genomics Analysis (2)',
       'Viral Infections (2)', 'Microbial Ecology', 'Neural Science',
       'Cancer Research', 'Cell Biology (4)',
       'Neurological Disorders (2)'], dtype=object)

In [44]:
unique_topic_depth_2 = merged_df['topic_depth_2'].unique()
unique_topic_depth_2

array(['Evolutionary Change', 'Systems Biology (2)',
       'Animal Behavior (2)', 'Zoonotic Diseases',
       'Evolutionary Dynamics', 'Genetic Engineering (2)', 'Genetics (5)',
       'Neural Networks', 'Genetic Traits (2)', 'Genetic Evolution (5)',
       'Noncoding RNAs', 'Ecosystem Management', 'Genomics (6)',
       'Protein Networks', 'Glioblastoma', 'Liver Inflammation',
       'HIV Research', 'Plant Stress (2)', 'Pharmaceuticals',
       'Nematode Biology', 'Protein Structure (3)', 'Aging',
       'Soil Microbiology', "Alzheimer's Disease (4)",
       'Parkinsons Disease (6)', 'Genetics (4)',
       'Psychological Disorders', 'Embryonic Development',
       'Mitochondria (2)', 'Microbial Pathogens', 'Neuroscience (9)',
       'Muscle Kinematics', 'Biomedical Imaging', 'Learning Theory (2)',
       'Memory and Fear', 'Virology', 'Single Cell', 'Malaria',
       'Influenza Virus', 'Gene Expression (3)', 'Microscopy (2)',
       'Microbial Ecology (2)', 'Immune Response (7)',
   

In [45]:
len(unique_topic_depth_2)

90

In [46]:
unique_topic_depth_3 = merged_df['topic_depth_3'].unique()
unique_topic_depth_3

array(['Evolutionary Genetics', 'Cellular Networks',
       'Evolutionary Biology (9)', 'Butterfly Colors', 'Avian Influenza',
       'Genetic Mutations (2)', 'Social Networks',
       'Genetic Evolution (2)', 'Sex Determination',
       'Biological Circuits', 'Biotech', 'Neural Plasticity',
       'Genetic Regulation (2)', 'Genetics (2)', 'MicroRNAs',
       'Genetic Traits', 'Ecological Diversity', 'Genetic Analysis (2)',
       'Gene Expression (2)', 'Systems Biology', 'Bioinformatics (3)',
       'Evolutionary Biology (2)', 'Optimal Foraging',
       'Inflammation Response', 'HIV Cure', 'Plant Development',
       'Plant Extracts (2)', 'Immune Response (3)', 'Neural Behavior',
       'Brain Connectivity', 'Biological Assemblies',
       'Telomere Maintenance', 'Plant Disease', 'Viral Infection',
       "Alzheimer's Disease (3)", 'DNA Origami', 'Genetic Disorders',
       'Genetic Variation (3)', 'Human Genetics', 'Genetic Analysis',
       'Photosynthesis (3)', 'Biodiversity Conser

In [47]:
len(unique_topic_depth_3)

512

In [48]:
len(unique_topic_depth_1)

8

In [50]:
# Depth-2 topic labels that mention "Aging" (case-sensitive substring match).
aging_topics_2 = list(filter(lambda label: "Aging" in label, unique_topic_depth_2))
aging_topics_2


['Aging', 'Epigenetic Aging']

In [51]:
# Depth-3 topic labels that mention "Aging" (case-sensitive substring match).
aging_topics_3 = list(filter(lambda label: "Aging" in label, unique_topic_depth_3))
aging_topics_3


['Aging (6)', 'Aging (3)', 'Aging (5)', 'Aging (2)', 'Aging (4)']

In [52]:
import re

def trim_topic(topic):
    """Remove a trailing " (N)" numeric suffix from a topic label.

    Args:
        topic: A topic label such as ``'Aging (6)'``. Non-string values
            (for example ``None`` or a float NaN from a missing topic)
            are accepted.

    Returns:
        The label without its final `` (<digits>)`` suffix, e.g. ``'Aging'``.
        Labels with no such suffix, and non-string values, are returned
        unchanged.

    Note:
        Labels that differ only by their suffix become identical after
        trimming (e.g. ``'Genetics (4)'`` and ``'Genetics (5)'`` both become
        ``'Genetics'``), so the number of unique labels can shrink.
    """
    # `Series.apply` hands missing values to this function as float NaN, which
    # `re.sub` rejects with a TypeError; pass anything that is not text through.
    if not isinstance(topic, str):
        return topic
    return re.sub(r' \(\d+\)$', '', topic)

# Strip the trailing " (N)" suffix from the labels at every topic depth.
for depth_col in ('topic_depth_1', 'topic_depth_2', 'topic_depth_3'):
    merged_df[depth_col] = merged_df[depth_col].apply(trim_topic)

In [53]:
merged_df['topic_depth_1'].unique()

array(['Evolutionary Biology', 'Genomics Analysis', 'Viral Infections',
       'Microbial Ecology', 'Neural Science', 'Cancer Research',
       'Cell Biology', 'Neurological Disorders'], dtype=object)

In [54]:
# Re-export now that the topic labels are trimmed; this overwrites the
# earlier export written to the same path.
merged_df.to_csv('../data/biorxiv_nomic.csv', index=False)

In [55]:
merged_df['topic_depth_2'].unique()

array(['Evolutionary Change', 'Systems Biology', 'Animal Behavior',
       'Zoonotic Diseases', 'Evolutionary Dynamics',
       'Genetic Engineering', 'Genetics', 'Neural Networks',
       'Genetic Traits', 'Genetic Evolution', 'Noncoding RNAs',
       'Ecosystem Management', 'Genomics', 'Protein Networks',
       'Glioblastoma', 'Liver Inflammation', 'HIV Research',
       'Plant Stress', 'Pharmaceuticals', 'Nematode Biology',
       'Protein Structure', 'Aging', 'Soil Microbiology',
       "Alzheimer's Disease", 'Parkinsons Disease',
       'Psychological Disorders', 'Embryonic Development', 'Mitochondria',
       'Microbial Pathogens', 'Neuroscience', 'Muscle Kinematics',
       'Biomedical Imaging', 'Learning Theory', 'Memory and Fear',
       'Virology', 'Single Cell', 'Malaria', 'Influenza Virus',
       'Gene Expression', 'Microscopy', 'Microbial Ecology',
       'Immune Response', 'Cell Division', 'Honeybees',
       'Genome Regulation', 'Neural Development', 'Bacterial Biofilm

In [56]:
len(merged_df['topic_depth_2'].unique())

88