In [76]:
import json
import time
from collections import deque
from urllib.request import urlopen

import requests
from bs4 import BeautifulSoup

In [71]:
# Base URL of the site being scraped; subject pages are fetched from
# <nature_root>/subjects/<slug>.
nature_root = 'https://www.nature.com'
# Pause inserted after every page fetch so the crawl does not hammer the server.
sleep_interval = 1 # seconds

In [64]:
def get_tax_node(subj_string, timeout=30):
    """Fetch one subject page and extract its taxonomy node.

    Downloads ``nature_root + '/subjects/' + subj_string``, parses the HTML
    with the lxml parser, and reads the subject name plus the subject slugs
    linked from the block that follows the "Related Subjects" heading.

    Args:
        subj_string: Subject slug, i.e. the part of the URL after
            ``/subjects/``. It is returned unchanged as ``topic_id``.
        timeout: Seconds to wait for the server before aborting the request.
            Without a timeout a single stalled connection would block a
            long crawl indefinitely.

    Returns:
        dict with keys ``'topic_id'`` (the slug), ``'name'`` (text of the
        second ``<h1>`` on the page) and ``'children'`` (list of slugs; empty
        when the page has no "Related Subjects" section).

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        requests.Timeout: The server did not answer within ``timeout``.
        IndexError: The page has fewer than two ``<h1>`` elements.
    """
    fetch_url = nature_root + '/subjects/' + subj_string
    response = requests.get(fetch_url, timeout=timeout)
    # Fail loudly on error pages instead of parsing them as if they were
    # subject pages.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")

    topic_id = subj_string

    # NOTE(review): the subject title is taken from the *second* <h1>;
    # presumably the first one is a site-level heading - confirm against the
    # current page layout.
    headings = soup.find_all('h1')
    if len(headings) < 2:
        raise IndexError(
            'Expected at least two <h1> elements at {}, found {}'.format(
                fetch_url, len(headings)))
    name = headings[1].get_text()

    # Compare against the heading's text rather than using `in` on the Tag:
    # Tag membership only inspects direct children, so any surrounding
    # whitespace or nested markup would make the heading go unnoticed.
    rel_subjs = [el for el in soup.find_all('h2')
                 if 'Related Subjects' in el.get_text()]

    children = []
    if rel_subjs:
        container = rel_subjs[0].find_next('div')
        if container is not None:
            # href=True skips anchors without an href attribute, which would
            # otherwise yield None and crash on .replace().
            children = [link['href'].replace('/subjects/', '')
                        for link in container.find_all('a', href=True)]

    return {'topic_id': topic_id,
            'name': name,
            'children': children}

In [72]:
def get_full_taxonomy_below(subj_string, fetch_node=None, delay=None):
    """Crawl the subject taxonomy breadth-first starting at ``subj_string``.

    Every topic is fetched at most once; a topic that is reached again through
    another parent is reported with a warning and skipped. Progress is printed
    to stdout.

    Args:
        subj_string: Slug of the subject to start from.
        fetch_node: Callable taking a slug and returning a dict with the keys
            ``'topic_id'`` and ``'children'`` (list of slugs). Defaults to
            ``get_tax_node``. Supplying a stub allows the traversal to be
            exercised without network access.
        delay: Seconds to sleep after each fetch. Defaults to the
            module-level ``sleep_interval``.

    Returns:
        List of the fetched node dicts, in the order they were fetched.
    """
    if fetch_node is None:
        fetch_node = get_tax_node
    if delay is None:
        delay = sleep_interval

    # deque gives O(1) removal from the front (list.pop(0) shifts the whole
    # list), and a set gives O(1) "already fetched?" checks.
    queue = deque([subj_string])
    topic_list = []
    topic_ids = set()

    while queue:

        # work in order
        current = queue.popleft()

        if current in topic_ids:
            print('WARNING! {} is already in the list of fetched topics. Skipping.'.format(current))
            continue

        print('Fetching {}'.format(current))
        curr_node = fetch_node(current)

        topic_list.append(curr_node)
        topic_ids.add(curr_node['topic_id'])

        queue.extend(curr_node['children'])

        # Slow ourselves down so the server doesn't get mad at us
        time.sleep(delay)
        print('queue length {}'.format(len(queue)))

    return topic_list

In [73]:
# Crawl everything reachable from the 'biological-sciences' subject page.
# One HTTP request is made per topic, with a sleep_interval pause after each.
biology_tax = get_full_taxonomy_below('biological-sciences')

Fetching biological-sciences
queue length 24
Fetching biochemistry
queue length 55
Fetching biological-techniques
queue length 86
Fetching biophysics
queue length 99
Fetching biotechnology
queue length 126
Fetching cancer
queue length 125
Fetching cell-biology
queue length 147
Fetching chemical-biology
queue length 191
Fetching computational-biology-and-bioinformatics
queue length 232
Fetching developmental-biology
queue length 260
Fetching drug-discovery
queue length 273
Fetching ecology
queue length 310
Fetching evolution
queue length 327
Fetching genetics
queue length 369
Fetching immunology
queue length 404
Fetching microbiology
queue length 426
Fetching molecular-biology
queue length 449
Fetching neuroscience
queue length 496
Fetching physiology
queue length 506
Fetching plant-sciences
queue length 528
Fetching psychology
queue length 528
Fetching stem-cells
queue length 551
Fetching structural-biology
queue length 556
Fetching systems-biology
queue length 600
Fetching zoology
que

queue length 971
Fetching membranes
queue length 970
Fetching metabolic-pathways
queue length 969
Fetching natural-products
queue length 967
Fetching networks-and-systems-biology
queue length 966
Fetching nucleic-acids
queue length 965
Fetching pharmacology
queue length 968
Fetching protein-design
queue length 965
Fetching screening
queue length 960
Fetching small-molecules
queue length 959
Fetching synthetic-biology
queue length 958
Fetching target-identification
queue length 957
Fetching target-validation
queue length 956
Fetching transporters
queue length 955
Fetching biochemical-reaction-networks
queue length 954
Fetching cellular-signalling-networks
queue length 953
Fetching classification-and-taxonomy
queue length 952
Fetching communication-and-replication
queue length 951
Fetching computational-models
queue length 950
Fetching computational-neuroscience
queue length 955
Fetching computational-platforms-and-environments
queue length 954
Fetching data-acquisition
queue length 953


queue length 895
Fetching cytogenetics
queue length 894
Fetching development
queue length 893
Fetching epigenetics
queue length 896
Fetching epigenomics
queue length 895
Fetching eukaryote
queue length 894
Fetching evolutionary-biology
queue length 893
Fetching gene-expression
queue length 891
Fetching gene-regulation
queue length 890
Fetching genetic-association-study
queue length 890
Fetching genetic-hybridization
queue length 889
Fetching genetic-interaction
queue length 889
Fetching genetic-linkage-study
queue length 888
Fetching genetic-markers
queue length 887
Fetching genome
queue length 891
Fetching genomic-instability
queue length 891
Fetching genotype
queue length 891
Fetching haplotypes
queue length 890
Fetching heritable-quantitative-trait
queue length 890
Fetching immunogenetics
queue length 889
Fetching inbreeding
queue length 888
Fetching medical-genetics
queue length 888
Fetching microbial-genetics
queue length 893
Fetching mutation
queue length 895
Fetching neurodevelo

queue length 1127
Fetching transporters-in-the-nervous-system
queue length 1126
Fetching visual-system
queue length 1133
Fetching bone
queue length 1131
Fetching bone-quality-and-biomechanics
queue length 1130
Fetching calcium-and-vitamin-d
queue length 1129
Fetching cardiovascular-biology
queue length 1136
Fetching circulation
queue length 1144
Fetching kidney
queue length 1144
Fetching metabolism
queue length 1151
Fetching neurophysiology
queue length 1150
Fetching reproductive-biology
queue length 1151
Fetching respiration
queue length 1150
Fetching biofuels
queue length 1158
Fetching light-responses
queue length 1157
Fetching natural-variation-in-plants
queue length 1156
Fetching photosynthesis
queue length 1162
Fetching plant-cell-biology
queue length 1169
Fetching plant-development
queue length 1178
Fetching plant-domestication
queue length 1177
Fetching plant-ecology
queue length 1176
Fetching plant-evolution
queue length 1175
Fetching plant-hormones
queue length 1179
Fetching p

queue length 1079
Fetching insect-hormones
queue length 1078
Fetching melatonin
queue length 1077
Fetching parathyroid-hormone
queue length 1076
Fetching peptide-hormones
queue length 1075
Fetching steroid-hormones
queue length 1074
Fetching thymus-hormones
queue length 1073
Fetching thyroid-hormones
queue length 1072
Fetching calcium-channels
queue length 1071
Fetching chloride-channels
queue length 1070
Fetching cyclic-nucleotide-gated-cation-channels
queue length 1069
Fetching ligand-gated-ion-channels
queue length 1068
Fetching porins
queue length 1067
Fetching potassium-channels
queue length 1066
Fetching sodium-channels
queue length 1065
Fetching transient-receptor-potential-channels
queue length 1064
Fetching ceroid
queue length 1063
Fetching fats
queue length 1062
Fetching fatty-acids
queue length 1061
Fetching fatty-alcohols
queue length 1060
Fetching glycerides
queue length 1059
Fetching glycolipids
queue length 1058
Fetching lipid-peroxides
queue length 1057
Fetching lipofus

queue length 943
Fetching endoscopy
queue length 946
Fetching fluorescence-imaging
queue length 945
Fetching functional-magnetic-resonance-imaging
queue length 944
Fetching magnetic-resonance-imaging
queue length 943
Fetching magnetoencephalography
queue length 942
Fetching molecular-imaging
queue length 941
Fetching optical-imaging
queue length 940
Fetching positron-emission-tomography
queue length 939
Fetching time-lapse-imaging
queue length 938
Fetching ultrasound
queue length 937
Fetching viral-tracing
queue length 936
Fetching x-ray-tomography
queue length 935
Fetching antibody-generation
queue length 934
Fetching antibody-isolation-and-purification
queue length 933
Fetching elisa
queue length 932
Fetching elispot
queue length 931
Fetching immunoblotting
queue length 930
Fetching immunoprecipitation
queue length 928
Fetching isolation-of-immune-cells
queue length 927
Fetching blotting
queue length 926
Fetching chromatography
queue length 925
Fetching electrophoresis
queue length 9

queue length 787
Fetching field-trials
queue length 784
Fetching molecular-engineering-in-plants
queue length 783
Fetching chaperone-mediated-autophagy
queue length 782
Fetching macroautophagy
queue length 781
Fetching mitophagy
queue length 780
Fetching pexophagy
queue length 779
Fetching ribophagy
queue length 778
Fetching adherens-junctions
queue length 777
Fetching cadherins
queue length 776
Fetching desmosomes
queue length 775
Fetching extracellular-matrix
queue length 774
Fetching focal-adhesion
queue length 773
Fetching mechanotransduction
queue length 771
Fetching tight-junctions
queue length 770
Fetching apoptosis
queue length 769
Fetching entosis
queue length 767
Fetching necroptosis
queue length 766
Fetching cell-cycle-exit
queue length 765
Fetching checkpoints
queue length 763
Fetching chromosome-condensation
queue length 762
Fetching chromosome-segregation
queue length 761
Fetching cohesion
queue length 760
Fetching cytokinesis
queue length 759
Fetching kinetochores
queue 

queue length 620
Fetching developmental-neurogenesis
queue length 619
Fetching limb-development
queue length 617
Fetching musculoskeletal-development
queue length 615
Fetching ectoderm
queue length 613
Fetching endoderm
queue length 611
Fetching mesoderm
queue length 610
Fetching diagnostic-markers
queue length 609
Fetching predictive-markers
queue length 608
Fetching prognostic-markers
queue length 607
Fetching intellectual-property
queue length 606
Fetching licensing
queue length 605
Fetching market-analysis
queue length 604
Fetching mergers-and-acquisitions
queue length 603
Fetching pharmacoeconomics
queue length 602
Fetching portfolio-management
queue length 601
Fetching public-private-partnerships
queue length 600
Fetching reimbursement
queue length 599
Fetching phenotypic-screening
queue length 597
Fetching virtual-screening
queue length 596
Fetching drug-discovery-and-development
queue length 591
Fetching lead-optimization
queue length 590
Fetching structure-based-drug-design
qu

queue length 525
Fetching archaea-genomics
queue length 524
Fetching archaea-physiology
queue length 523
Fetching bacterial-development
queue length 521
Fetching bacterial-evolution
queue length 520
Fetching bacterial-genomics
queue length 518
Fetching bacterial-host-response
queue length 517
Fetching bacterial-immune-evasion
queue length 516
Fetching bacterial-pathogenesis
queue length 515
Fetching bacterial-physiology
queue length 519
Fetching bacterial-secretion
queue length 518
Fetching bacterial-structural-biology
queue length 517
Fetching bacterial-synthetic-biology
queue length 516
Fetching bacterial-systems-biology
queue length 515
Fetching bacterial-techniques-and-applications
queue length 514
Fetching bacterial-toxins
queue length 513
Fetching bacterial-transcription
queue length 512
Fetching infectious-disease-epidemiology
queue length 510
Fetching marine-microbiology
queue length 509
Fetching symbiosis
queue length 507
Fetching microbiome
queue length 503
Fetching air-micro

queue length 355
Fetching depression
queue length 354
Fetching developmental-disorders
queue length 353
Fetching dystonia
queue length 352
Fetching encephalopathy
queue length 352
Fetching epilepsy
queue length 351
Fetching huntingtons-disease
queue length 350
Fetching lipid-storage-diseases
queue length 349
Fetching macular-degeneration
queue length 348
Fetching multiple-sclerosis
queue length 347
Fetching neurodegeneration
queue length 346
Fetching obsessive-compulsive-disorder
queue length 345
Fetching parkinsons-disease
queue length 344
Fetching post-traumatic-stress-disorder
queue length 343
Fetching psychosis
queue length 342
Fetching schizophrenia
queue length 341
Fetching spinocerebellar-ataxia
queue length 340
Fetching stroke
queue length 338
Fetching aggression
queue length 337
Fetching amygdala
queue length 336
Fetching insula
queue length 335
Fetching limbic-system
queue length 334
Fetching prefrontal-cortex
queue length 333
Fetching striatum
queue length 332
Fetching epige

queue length 201
Fetching root-apical-meristem
queue length 200
Fetching shoot-apical-meristem
queue length 199
Fetching auxin
queue length 198
Fetching brassinosteroid
queue length 197
Fetching cytokinin
queue length 196
Fetching gibberellins
queue length 195
Fetching jasmonic-acid
queue length 194
Fetching strigolactone
queue length 193
Fetching effectors-in-plant-pathology
queue length 192
Fetching microbe
queue length 191
Fetching pattern-recognition-receptors-in-plants
queue length 190
Fetching virulence
queue length 189
Fetching fertilization
queue length 188
Fetching flowering
queue length 187
Fetching fruiting
queue length 186
Fetching pollen
queue length 185
Fetching pollen-tube
queue length 184
Fetching pollination
queue length 183
Fetching seed-development
queue length 182
Fetching seed-distribution
queue length 181
Fetching self-incompatability
queue length 180
Fetching vernalization
queue length 179
Fetching abiotic
queue length 178
Fetching biotic
queue length 177
Fetchin

queue length 40
Fetching hypoxic-ischaemic-encephalopathy
queue length 38
Fetching enteric-nervous-system
queue length 37
Fetching acute-coronary-syndromes
queue length 37
Fetching arrhythmias
queue length 39
Fetching cardiomyopathies
queue length 39
Fetching congenital-heart-defects
queue length 38
Fetching dyslipidaemias
queue length 37
Fetching heart-failure
queue length 36
Fetching valvular-disease
queue length 34
Fetching vascular-diseases
queue length 47
Fetching pre-eclampsia
queue length 46
Fetching renovascular-hypertension
queue length 45
Fetching glomerulus
queue length 45
Fetching diabetes
queue length 49
Fetching metabolic-bone-disease
queue length 52
Fetching metabolic-syndrome
queue length 51
Fetching multihormonal-system-disorders
queue length 50
Fetching pre-diabetes
queue length 48
Fetching pituitary-gland
queue length 46
Fetching endocrine-reproductive-disorders
queue length 45
Fetching infertility
queue length 44
Fetching sexual-dysfunction
queue length 43
Fetching 

In [142]:
def find_parents(subj, tax):
    """Return the topic ids of all nodes in ``tax`` that list ``subj`` as a child.

    Args:
        subj: Topic id to look up.
        tax: Iterable of node dicts with ``'topic_id'`` and ``'children'`` keys.

    Returns:
        List of parent topic ids, in the order the parents appear in ``tax``.
    """
    parent_ids = []
    for node in tax:
        if subj in node['children']:
            parent_ids.append(node['topic_id'])
    return parent_ids

In [100]:
def max_depth(subj, tax):
    """Return the length of the longest parent chain above ``subj`` in ``tax``.

    A topic that no node lists as a child has depth 0; otherwise its depth is
    one more than the largest depth among its parents.

    The child -> parents index is built once and intermediate depths are
    memoized, so a topic with many shared ancestors is evaluated in time
    linear in the size of the taxonomy. The naive recursion re-walks every
    ancestor path and rescans ``tax`` at each step.

    Args:
        subj: Topic id whose depth is wanted (must be hashable).
        tax: Iterable of node dicts with ``'topic_id'`` and ``'children'`` keys.

    Returns:
        Non-negative int depth.

    Raises:
        RecursionError: The parent relation above ``subj`` contains a cycle.
            This is the exception type unbounded recursion would end in,
            raised immediately and with the offending topic named.
    """
    parents_of = {}
    for node in tax:
        for child in node['children']:
            parents_of.setdefault(child, []).append(node['topic_id'])

    known_depths = {}
    in_progress = set()

    def depth_of(topic_id):
        if topic_id in known_depths:
            return known_depths[topic_id]
        if topic_id in in_progress:
            raise RecursionError(
                'Cycle in taxonomy parent links involving {!r}'.format(topic_id))

        in_progress.add(topic_id)
        parents = parents_of.get(topic_id)
        if parents:
            depth = 1 + max(depth_of(parent) for parent in parents)
        else:
            depth = 0
        in_progress.discard(topic_id)

        known_depths[topic_id] = depth
        return depth

    return depth_of(subj)

In [143]:
def _depths_for_taxonomy(tax):
    """Map every topic id in ``tax`` to the length of its longest parent chain."""
    parents_of = {}
    for node in tax:
        for child in node['children']:
            parents_of.setdefault(child, []).append(node['topic_id'])

    known_depths = {}
    in_progress = set()

    def depth_of(topic_id):
        if topic_id in known_depths:
            return known_depths[topic_id]
        if topic_id in in_progress:
            raise RecursionError(
                'Cycle in taxonomy parent links involving {!r}'.format(topic_id))

        in_progress.add(topic_id)
        parents = parents_of.get(topic_id)
        if parents:
            depth = 1 + max(depth_of(parent) for parent in parents)
        else:
            depth = 0
        in_progress.discard(topic_id)

        known_depths[topic_id] = depth
        return depth

    return {node['topic_id']: depth_of(node['topic_id']) for node in tax}


def filter_to_depth(max_depth, tax, depths=None):
    """Keep only the nodes of ``tax`` whose depth does not exceed ``max_depth``.

    Args:
        max_depth: Largest depth (inclusive) a node may have to be kept.
        tax: List of node dicts with ``'topic_id'`` and ``'children'`` keys.
        depths: Optional precomputed ``{topic_id: depth}`` mapping. When
            omitted, depths are derived from ``tax``. Topics missing from a
            supplied mapping are dropped.

    Returns:
        List of the retained node dicts, in their original order.
    """
    if depths is None:
        # The parameter named `max_depth` hides the module-level function of
        # the same name inside this body, so calling `max_depth(...)` here
        # would call an int. Depths are computed by a private helper instead.
        depths = _depths_for_taxonomy(tax)
    # A set keeps the membership test below O(1) per node.
    valid_topics = {topic_id for (topic_id, depth) in depths.items() if depth <= max_depth}
    return [subj for subj in tax if subj['topic_id'] in valid_topics]

In [132]:
# Compute the depth of every crawled topic once, so the depth filters below
# can reuse the mapping instead of recomputing it.
biology_depths = dict(
    (topic['topic_id'], max_depth(topic['topic_id'], biology_tax))
    for topic in biology_tax
)

In [138]:
# Depth-limited views of the crawl: topics at depth <= 1 and depth <= 2,
# using the precomputed biology_depths mapping.
biology_tax_1 = filter_to_depth(1, biology_tax, biology_depths)
biology_tax_2 = filter_to_depth(2, biology_tax, biology_depths)

In [145]:
# Node counts, one per line: depth <= 1, depth <= 2, full crawl.
print(len(biology_tax_1), len(biology_tax_2), len(biology_tax), sep='\n')

23
449
1675


In [144]:
# Persist the full crawl and both depth-limited subsets as JSON files in the
# current working directory.
with open("biological-sciences-full.json", 'w') as file:
    file.write(json.dumps(biology_tax))
with open("biological-sciences-1.json", 'w') as file:
    file.write(json.dumps(biology_tax_1))
with open("biological-sciences-2.json", 'w') as file:
    file.write(json.dumps(biology_tax_2))

In [156]:
# One abbreviation record per depth-limited topic, excluding the
# 'biological-sciences' root itself; abbreviations are the slug prefixed
# with 'bio_'.
abbrev_list_1 = [
    {
        'abbreviation': 'bio_' + topic['topic_id'],
        'name': topic['name'],
        'subject': 'bio',
    }
    for topic in biology_tax_1
    if not topic['topic_id'] == 'biological-sciences'
]

In [157]:
# Save the abbreviation records as JSON in the current working directory.
with open("abbrevs-1.json", 'w') as file:
    file.write(json.dumps(abbrev_list_1))