### UNIPROT

In [2]:
import requests, sys

requestURL = "https://www.ebi.ac.uk/proteins/api/proteins?offset=0&size=100&accession=P21802"

r = requests.get(requestURL, headers={ "Accept" : "application/xml"})

if not r.ok:
    r.raise_for_status()
    sys.exit()

responseBody = r.text
print(responseBody)

<?xml version='1.0' encoding='UTF-8'?><uniprot xmlns="http://uniprot.org/uniprot" xsi:schemaLocation="http://uniprot.org/uniprot http://www.uniprot.org/support/docs/uniprot.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><entry xmlns="http://uniprot.org/uniprot" dataset="Swiss-Prot" created="1991-05-01" modified="2023-05-03" version="268"><accession>P21802</accession><accession>B4DFC2</accession><accession>E7EVR6</accession><accession>E9PCR0</accession><accession>P18443</accession><accession>Q01742</accession><accession>Q12922</accession><accession>Q14300</accession><accession>Q14301</accession><accession>Q14302</accession><accession>Q14303</accession><accession>Q14304</accession><accession>Q14305</accession><accession>Q14672</accession><accession>Q14718</accession><accession>Q14719</accession><accession>Q1KHY5</accession><accession>Q86YI4</accession><accession>Q8IXC7</accession><accession>Q96KL9</accession><accession>Q96KM0</accession><accession>Q96KM1</accession><accession

In [3]:
import requests, sys

requestURL = "https://www.ebi.ac.uk/proteins/api/coordinates?offset=0&size=100&gene=FGFR2"

r = requests.get(requestURL, headers={ "Accept" : "application/json"})

if not r.ok:
    r.raise_for_status()
    sys.exit()

responseBody = r.text
print(responseBody)

[{"accession":"A0A087WY21","name":"A0A087WY21_HUMAN","taxid":9606,"sequence":"XSLARPSFSLVEDTTLEPEEPPTKYQISQPEVYVAAPGESLEVRCLLKDAAVISWTKDGVHLGPNNRTVLIGEYLQIKGATPRDSGLYACTASRTVDSETWYFMVNVTDAISSGDDEDDTDGAEDFVSENSNNKSK","protein":{"recommendedName":{"fullName":"Ig-like domain-containing protein"},"submittedName":[{"fullName":"Fibroblast growth factor receptor 2"}]},"gene":[{"value":"FGFR2","type":"primary"}],"gnCoordinate":[{"genomicLocation":{"exon":[{"proteinLocation":{"begin":{"position":1,"status":"certain"},"end":{"position":19,"status":"certain"}},"genomeLocation":{"begin":{"position":121593764,"status":"certain"},"end":{"position":121593709,"status":"certain"}},"id":"ENSE00003724297"},{"proteinLocation":{"begin":{"position":19,"status":"certain"},"end":{"position":108,"status":"certain"}},"genomeLocation":{"begin":{"position":121565704,"status":"certain"},"end":{"position":121565438,"status":"certain"}},"id":"ENSE00003488412"},{"proteinLocation":{"begin":{"position":108,"status":"ce

### BioRxiv api 

https://api.biorxiv.org/

The format of the endpoint is https://api.biorxiv.org/details/[server]/[interval]/[cursor]/[format] or https://api.biorxiv.org/details/[server]/[DOI]/na/[format]

In [34]:
import requests
import json
from IPython.display import display, JSON
import pandas as pd

In [129]:
base_url = "https://api.biorxiv.org"

def fetch_details(server, interval, cursor=0, format='json'):
    url = f"{base_url}/details/{server}/{interval}/{cursor}/{format}"
    response = requests.get(url)
    if response.status_code == 200:
        return json.loads(response.text)
    else:
        return None

def fetch_preprint_publications(server, interval, cursor=0, format='json'):
    url = f"{base_url}/pubs/{server}/{interval}/{cursor}/{format}"
    response = requests.get(url)
    if response.status_code == 200:
        return json.loads(response.text)
    else:
        return None

def fetch_published_articles(interval, cursor=0, format='json'):
    url = f"{base_url}/pub/{interval}/{cursor}/{format}"
    response = requests.get(url)
    if response.status_code == 200:
        return json.loads(response.text)
    else:
        return None

def fetch_summary_statistics(interval, format='json'):
    url = f"{base_url}/sum/{interval}"
    response = requests.get(url)
    if response.status_code == 200:
        return json.loads(response.text)
    else:
        return None

def fetch_usage_statistics(interval, format='json'):
    url = f"{base_url}/usage/{interval}/{format}"
    response = requests.get(url)
    if response.status_code == 200:
        return json.loads(response.text)
    else:
        return None
    
def fetch_data(fetch_func, uses_cursor=True, *args, **kwargs):
    cursor = 0
    all_data = []

    while True:
        fetch_args = (*args, cursor) if uses_cursor else args
        response = fetch_func(*fetch_args, **kwargs)
        
        key = 'collection' if 'collection' in response else 'bioRxiv content statistics' if 'bioRxiv content statistics' in response else None
        
        if key is None or not response[key]:
            break

        all_data.extend(response[key])
        cursor += 100

    return all_data

def print_json_full_window(data):
    display(JSON(data))



#### API SEGMENTATION: CONTENT DETAIL

In [109]:
# API SEGMENTATION: CONTENT DETAIL
interval = '2023-06-01/2023-06-05'

content_details = fetch_data(fetch_details, True, 'biorxiv', interval)

filtered_content_details = []
for pub in content_details:
    filtered_content_pub = {
        'doi': pub['doi'],
        'title': pub['title'],
        'authors': pub['authors'],
        'date': pub['date'],
        'abstract': pub['abstract'],
        'category': pub['category'], 
        'published': pub['published'],
        'author_institution': pub['author_corresponding_institution'], 
        'author_corresponding': pub['author_corresponding'],
        'version': pub['version'],
    }
    filtered_content_details.append(filtered_content_pub)

In [110]:
filtered_content_details_df = pd.DataFrame(filtered_content_details)
filtered_content_details_df.tail()

Unnamed: 0,doi,title,authors,date,abstract,category,published,author_institution,author_corresponding,version
723,10.1101/2023.06.05.542485,DNA methylation signatures of early life adver...,"Anderson, J. A.; Lin, D.; Lea, A. J.; Johnston...",2023-06-05,The early life environment can profoundly shap...,genomics,,Max Planck Institute for Evolutionary Anthropo...,Jenny Tung,1
724,10.1101/2023.06.02.543369,SNVstory: A dockerized algorithm for rapid and...,"Bollas, A. E.; Rajkovic, A.; Ceyhan, D.; Gaith...",2023-06-05,Knowing a patients genetic ancestry is crucial...,genomics,,Abigail Wexner Research Institute at Nationwid...,Peter White,1
725,10.1101/2023.06.04.543628,Ryegrass mottle virus complete genome determin...,"Balke, I.; Silamikelis, I.; Radovica-Spalvina,...",2023-06-05,Sobemovirus ryegrass mottle virus (RGMoV) is a...,molecular biology,,Latvian Biomedical Research and Study Centre,Ina Balke,1
726,10.1101/2023.06.02.543351,Haplotype-resolve genome assembly and resequen...,"Zhang, Z.; Liu, Y.; Yang, T.; Wu, S.; Sun, H.;...",2023-06-05,Modern rose (Rosa hybrida) is a recently forme...,genomics,,Beijing Key Laboratory of Development and Qual...,Junping Gao,1
727,10.1101/2023.06.03.543569,Early Alzheimer's disease pathology in human c...,"Gazestani, V. H.; Kamath, T.; Nadaf, N. M.; Bu...",2023-06-05,Cellular perturbations underlying Alzheimers d...,genomics,,"Broad Institute of MIT and Harvard, Cambridge,...",Evan Z Macosko,1


#### API SEGMENTATION: PREPRINT PUBLISHED ARTICLE DETAIL FOR SPECIFIED SERVER (BIORXIV OR MEDRXIV) -- BIORXIV USED 

In [125]:
interval = '2023-06-01/2023-06-05'
preprint_publications = fetch_data(fetch_preprint_publications, True, 'biorxiv', interval)

filtered_preprint_details = []
for pub in preprint_publications:
    filtered__preprint_pub = {
        'biorxiv_doi': pub['preprint_doi'],
        'published_doi' : pub['published_doi'],
        'preprint_title' : pub['preprint_title'],
        'preprint_authors' : pub['preprint_authors'],
        'preprint_category' : pub['preprint_category'],
        'preprint_date' : pub['preprint_date'],
        'published_date' : pub['published_date'],
        'preprint_abstract' : pub['preprint_abstract'],
        'published_journal' : pub['published_journal'],
        'preprint_platform' : pub['preprint_platform'],
        'preprint_author_corresponding' : pub['preprint_author_corresponding'], 
        'preprint_author_corresponding_institution' : pub['preprint_author_corresponding_institution'],
    }
    filtered_preprint_details.append(filtered__preprint_pub)

In [126]:
filtered_preprint_details_df = pd.DataFrame(filtered_preprint_details)
filtered_preprint_details_df.tail()

Unnamed: 0,biorxiv_doi,published_doi,preprint_title,preprint_authors,preprint_category,preprint_date,published_date,preprint_abstract,published_journal,preprint_platform,preprint_author_corresponding,preprint_author_corresponding_institution
1,10.1101/2022.09.03.506470,10.1002/advs.202205445,Optimization and deoptimization of codons in S...,"Wu, X.; Shan, K.; Zan, F.; Tang, X.; Qian, Z.;...",evolutionary biology,2022-09-05,2023-06-02,The spread of Coronavirus Disease 2019 (COVID-...,Advanced Science,bioRxiv,Jian Lu,Peking University
2,10.1101/2023.03.29.534823,10.1007/s00604-023-05827-7,Hydrogel Microdroplet-Based Digital Quantitati...,"Tan, Z. L.; Yasuura, M.; Horiguchi, Y.; Ashiba...",molecular biology,2023-03-29,2023-06-01,Droplet digital PCR (ddPCR) is accurate in nuc...,Microchimica Acta,bioRxiv,Takashi Fukuda,"Sensing System Research Center, National Insti..."
3,10.1101/2023.05.17.541111,10.1002/adma.202301673,Shaping Synthetic Multicellular and Complex Mu...,"Ribezzi, D.; Gueye, M.; Florczak, S.; Dusi, F....",bioengineering,2023-05-19,2023-06-03,"In living tissues, cells express their functio...",Advanced Materials,bioRxiv,Riccardo Levato,Utrecht University
4,10.1101/2023.03.31.535181,10.1016/j.yexcr.2023.113671,Increase in primary cilia number and length up...,"Dutta, A.; Halder, P.; Gayen, A.; Mukherjee, A...",cell biology,2023-04-02,2023-06-03,Primary cilia (PCs) that are present in most h...,Experimental Cell Research,bioRxiv,Shubhra Majumder,Presidency University
5,10.1101/2022.11.01.514735,10.1371/journal.pgen.1010770,The Circadian Clock is Disrupted in Pancreatic...,"Schwartz, P. B.; Nukaya, M.; Berres, M.; Rubin...",cancer biology,2022-11-02,2023-06-01,Disruption of the circadian clock is linked to...,PLOS Genetics,bioRxiv,Sean Ronnekleiv-Kelly,"University of Wisconsin, Madison"


#### API SEGMENTATION: Published article detail (bioRxiv only)

In [113]:
# API SEGMENTATION: Published article detail (bioRxiv only)

interval = '2022-06-01/2022-06-25'
published_articles = fetch_data(fetch_published_articles, True, interval)

filtered_published_details = []
for pub in published_articles:
    filtered_published_pubs = {
        'biorxiv_doi' : pub['biorxiv_doi'],
        'published_doi' : pub['published_doi'],
        'preprint_title' : pub['preprint_title'],
        'preprint_category' : pub['preprint_category'],
        'preprint_date' : pub['preprint_date'],
        'published_date': pub['published_date'],
    }
    filtered_published_details.append(filtered_published_pubs)


In [114]:
filtered_published_details_df = pd.DataFrame(filtered_published_details)
len(filtered_published_details_df)

1459

### DOES NOT WORK

In [None]:
# # API SEGMENTATION: Publisher article detail
# interval = 'y'
# summary_stats = fetch_data(fetch_summary_statistics, False, interval)

# filtered_summary_stats = []
# for pub in published_articles:
#     filtered_content_stats = {
#         'month': pub['month'],
#         'new_papers': pub['new_papers'],
#         'new_papers_cumulative': pub['new_papers_cumulative'],
#         'revised_papers': pub['revised_papers'],
#         'preprint_date' : pub['preprint_date'],
#         'revised_papers_cumulative' : pub['revised_papers_cumulative']
#     }
#     filtered_summary_stats.append(filtered_content_stats)


In [None]:
# # API SEGMENTATION: Content Summary Statistics
# interval = 'm'
# usage_stats = fetch_data(fetch_usage_statistics, False, interval)


# filtered_summary_usage_stats = []
# for pub in published_articles:
#     filtered_usage_stats = {
#         'doi': pub['doi'],
#         'title': pub['title'],
#         'authors': pub['authors'],
#         'date': pub['date'],
#         'abstract': pub['abstract'],
#         'category': pub['category'], 
#         'published': pub['published'],
#         'author_institution': pub['author_corresponding_institution'], 
#         'author_corresponding': pub['author_corresponding'],
#         'version': pub['version'],
#     }
#     filtered_summary_usage_stats.append(filtered_usage_stats)

### GENE ONTOLOGY 

http://geneontology.org/docs/tools-guide/

https://github.com/tanghaibao/goatools/tree/main

https://raw.githubusercontent.com/tanghaibao/goatools/main/data/association

<!-- https://api.geneontology.org/api -->

In [None]:
# gene_ontology_url = 'http://api.geneontology.org/api/bioentity/function/'

# def fetch_gene_ontology(server, interval, cursor=0, format='json'):
#     url = f"{base_url}/details/{server}/{interval}/{cursor}/{format}"
#     response = requests.get(url)
#     if response.status_code == 200:
#         return json.loads(response.text)
#     else:
#         return None

In [132]:
!pip install goatools

Collecting goatools
  Downloading goatools-1.3.1-py3-none-any.whl (15.8 MB)
                                              0.0/15.8 MB ? eta -:--:--
     -                                        0.5/15.8 MB 10.2 MB/s eta 0:00:02
     --                                       1.0/15.8 MB 16.4 MB/s eta 0:00:01
     ----                                     1.7/15.8 MB 15.6 MB/s eta 0:00:01
     ----                                     1.9/15.8 MB 11.1 MB/s eta 0:00:02
     ----                                     2.0/15.8 MB 8.9 MB/s eta 0:00:02
     ----------                               4.2/15.8 MB 15.7 MB/s eta 0:00:01
     ----------                               4.2/15.8 MB 15.7 MB/s eta 0:00:01
     -------------                            5.2/15.8 MB 15.2 MB/s eta 0:00:01
     -------------                            5.2/15.8 MB 15.2 MB/s eta 0:00:01
     ---------------                          6.1/15.8 MB 14.0 MB/s eta 0:00:01
     ------------------                       7.3/15.

In [135]:
!pip install wget

Collecting wget
  Using cached wget-3.2.zip (10 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: wget
  Building wheel for wget (setup.py): started
  Building wheel for wget (setup.py): finished with status 'done'
  Created wheel for wget: filename=wget-3.2-py3-none-any.whl size=9681 sha256=c2121bef2007e1629f115a4ec8c2abb7ebba725f86baf43f0e0f20dfd5188f89
  Stored in directory: c:\users\derek\appdata\local\pip\cache\wheels\8b\f1\7f\5c94f0a7a505ca1c81cd1d9208ae2064675d97582078e6c769
Successfully built wget
Installing collected packages: wget
Successfully installed wget-3.2


In [136]:
from goatools import obo_parser
import wget
import os 


In [139]:
go_obo_url = 'http://purl.obolibrary.org/obo/go/go-basic.obo'
data_folder = os.getcwd() + '/data_go'

if(not os.path.isfile(data_folder)):
    try:
        os.mkdir(data_folder)
    except OSError as e:
        if(e.errno != 17):
            raise e
else:
    raise Exception('Data path (' + data_folder + ') exists as a file. '
                   'Please rename, remove or change the desired location of the data path.')

if(not os.path.isfile(data_folder+'/go-basic.obo')):
    go_obo = wget.download(go_obo_url, data_folder+'/go-basic.obo')
else:
    go_obo = data_folder+'/go-basic.obo'

In [138]:
go_obo

'c:\\Users\\derek\\cs_projects\\bioML\\azureDevOps\\bioDevTool\\api_database_connection/data/go-basic.obo'

In [140]:
go = obo_parser.GODag(go_obo)

c:\Users\derek\cs_projects\bioML\azureDevOps\bioDevTool\api_database_connection/data_go/go-basic.obo: fmt(1.2) rel(2023-06-11) 46,420 Terms


In [145]:
go_id = 'GO:0006915'
go_term = go[go_id]
print(go_term)

GO:0006915	level-04	depth-04	apoptotic process [biological_process]


In [146]:
print('GO term name: {}'.format(go_term.name))
print('GO term namespace: {}'.format(go_term.namespace))

GO term name: apoptotic process
GO term namespace: biological_process


In [149]:
def transitive_closure(go_term, go):
    go_term_set = set()
    find_parents(go_term, go, go_term_set)
    find_children(go_term, go, go_term_set)
    return go_term_set
    
def find_parents(term1, go, go_term_set={}, ret=False):
    for term2 in term1.parents:
        go_term_set.update({term2})
        
        # Recurse on term to find all parents
        find_parents(term2, go, go_term_set)          
    if(ret):
        return go_term_set

def find_children(term1, go, go_term_set={}, ret=False):
    for term2 in term1.children:
        go_term_set.update({term2})
        
        # Recurse on term to find all children
        find_children(term2, go, go_term_set)
    if(ret):
        return go_term_set

In [150]:
go_term_set = transitive_closure(go_term, go)

In [151]:
for term in go_term_set:
    print(term)

GO:1905398	level-08	depth-08	activated CD4-positive, alpha-beta T cell apoptotic process [biological_process]
GO:0071948	level-08	depth-08	activation-induced B cell apoptotic process [biological_process]
GO:0034349	level-05	depth-05	glial cell apoptotic process [biological_process]
GO:0110088	level-05	depth-06	hippocampal neuron apoptotic process [biological_process]
GO:0072577	level-06	depth-06	endothelial cell apoptotic process [biological_process]
GO:0002516	level-06	depth-08	B cell deletion [biological_process]
GO:0003275	level-08	depth-08	apoptotic process involved in outflow tract morphogenesis [biological_process]
GO:0006924	level-08	depth-08	activation-induced cell death of T cells [biological_process]
GO:0003276	level-08	depth-08	apoptotic process involved in heart valve morphogenesis [biological_process]
GO:0003277	level-08	depth-08	apoptotic process involved in endocardial cushion morphogenesis [biological_process]
GO:1902217	level-06	depth-06	erythrocyte apoptotic process [

### Ensembl

In [155]:
class EnsemblWrapper:
    def __init__(self):
        self.server = "https://rest.ensembl.org"

    def get_sequence_by_id(self, id):
        ext = f"/sequence/id/{id}?content-type=application/json"
        response = requests.get(self.server + ext, headers={"Content-Type": "application/json"})

        if not response.ok:
            response.raise_for_status()

        return response.json()

    def get_gene_by_id(self, id):
        ext = f"/lookup/id/{id}?content-type=application/json"
        response = requests.get(self.server + ext, headers={"Content-Type": "application/json"})

        if not response.ok:
            response.raise_for_status()

        return response.json()

# Usage:
ensembl = EnsemblWrapper()
sequence = ensembl.get_sequence_by_id("ENSG00000157764")
gene = ensembl.get_gene_by_id("ENSG00000157764")

print(sequence)
print(gene)


{'molecule': 'dna', 'query': 'ENSG00000157764', 'desc': 'chromosome:GRCh38:7:140719327:140924929:-1', 'version': 14, 'seq': 'CTTCCCCCAATCCCCTCAGGCTCGGCTGCGCCCGGGGCCGCGGGCCGGTACCTGAGGTGGCCCAGGCGCCCTCCGCCCGCGGCGCCGCCCGGGCCGCTCCTCCCCGCGCCCCCCGCGCCCCCCGCTCCTCCGCCTCCGCCTCCGCCTCCGCCTCCCCCAGCTCTCCGCCTCCCTTCCCCCTCCCCGCCCGACAGCGGCCGCTCGGGCCCCGGCTCTCGGTTATAAGATGGCGGCGCTGAGCGGTGGCGGTGGTGGCGGCGCGGAGCCGGGCCAGGCTCTGTTCAACGGGGACATGGAGCCCGAGGCCGGCGCCGGCGCCGGCGCCGCGGCCTCTTCGGCTGCGGACCCTGCCATTCCGGAGGAGGTGAGTGCTGGCGCCACCCTGCCGCCCTCCCGACTCCGGGCTCGGCGGCTGGCTGGTGTTTATTTTGGAAAGAGGCGGCGGTGGGGGCTTGATGCCCTCAGCCACCTTCTCGGGCCAGCTCCGCGGGCTGGGAGGTGGGCATCGCCCCCGTGTCCCTCTCCGTCATGCAGCGCCTTCCTACGTAAACACACACAATGGCCCGGGGGGTTTCCCTGGCCCCCACCCCAGATGTGGGGATTGGGGCAGCGGTGGTTGAGCGGGAGGCTATCAATAGGGGGCGAAACTCAGGGTTGGTCCGAGAAGGTCACGATTGGCTGAAGTATCCAGCTCTGCATCTCTGTGGGGTGGGGGCGGCGGCGGCCTCGACGTGGAGGATATAGGTTAGTTGCTGGGGCTGAGACAACAGCCCGAGTTACTGTCGCGTGTAATTCTTACATGGTCGTGGGGATGATGGGGCTCATCATTTCCTCTCTCCTCTCCCGGACTGCCCCCCTTCTCAGTCCGCTGCCCT

In [156]:
class EnsemblAPI:

    def __init__(self):
        self.server = "https://rest.ensembl.org"

    def get_sequence(self, id):
        ext = f"/sequence/id/{id}?content-type=application/json"
        r = requests.get(self.server+ext, headers={"Content-Type": "application/json"})

        if not r.ok:
            r.raise_for_status()
        
        return r.json()

    def get_gene(self, id):
        ext = f"/lookup/id/{id}?content-type=application/json"
        r = requests.get(self.server+ext, headers={"Content-Type": "application/json"})
        
        if not r.ok:
            r.raise_for_status()
        
        return r.json()

In [158]:
ensembl = EnsemblAPI()
sequence = ensembl.get_sequence("ENSG00000157764")
gene = ensembl.get_gene("ENSG00000157764")
print(sequence)
print(gene)

{'query': 'ENSG00000157764', 'seq': 'CTTCCCCCAATCCCCTCAGGCTCGGCTGCGCCCGGGGCCGCGGGCCGGTACCTGAGGTGGCCCAGGCGCCCTCCGCCCGCGGCGCCGCCCGGGCCGCTCCTCCCCGCGCCCCCCGCGCCCCCCGCTCCTCCGCCTCCGCCTCCGCCTCCGCCTCCCCCAGCTCTCCGCCTCCCTTCCCCCTCCCCGCCCGACAGCGGCCGCTCGGGCCCCGGCTCTCGGTTATAAGATGGCGGCGCTGAGCGGTGGCGGTGGTGGCGGCGCGGAGCCGGGCCAGGCTCTGTTCAACGGGGACATGGAGCCCGAGGCCGGCGCCGGCGCCGGCGCCGCGGCCTCTTCGGCTGCGGACCCTGCCATTCCGGAGGAGGTGAGTGCTGGCGCCACCCTGCCGCCCTCCCGACTCCGGGCTCGGCGGCTGGCTGGTGTTTATTTTGGAAAGAGGCGGCGGTGGGGGCTTGATGCCCTCAGCCACCTTCTCGGGCCAGCTCCGCGGGCTGGGAGGTGGGCATCGCCCCCGTGTCCCTCTCCGTCATGCAGCGCCTTCCTACGTAAACACACACAATGGCCCGGGGGGTTTCCCTGGCCCCCACCCCAGATGTGGGGATTGGGGCAGCGGTGGTTGAGCGGGAGGCTATCAATAGGGGGCGAAACTCAGGGTTGGTCCGAGAAGGTCACGATTGGCTGAAGTATCCAGCTCTGCATCTCTGTGGGGTGGGGGCGGCGGCGGCCTCGACGTGGAGGATATAGGTTAGTTGCTGGGGCTGAGACAACAGCCCGAGTTACTGTCGCGTGTAATTCTTACATGGTCGTGGGGATGATGGGGCTCATCATTTCCTCTCTCCTCTCCCGGACTGCCCCCCTTCTCAGTCCGCTGCCCTTTTTCACTTTTCTATTTGGGGATTTCTCTTCACCTGTTTTACCCAGCAAATTATTTTGATTTAGTCTTTACTTTTTCAATCCTAAATC

### Human Cell Atlas

In [163]:
def get_data_from_hca(base_url, endpoint):  
    response = requests.get(base_url + endpoint)
    if response.status_code == 200:
        data = response.json()
        return data
    else:
        print(f"Request failed with status code {response.status_code}")
        return None

def print_data(data):
    if data:
        print(json.dumps(data, indent=4))

def print_keys_values(data):
    # Print all keys
    print("Keys:")
    for key in data.keys():
        print(key)
    
    # Print all values
    print("\nValues:")
    for value in data.values():
        print(value)

def main():
    base_url = "https://service.azul.data.humancellatlas.org"
    endpoint = "/index/projects"

    # get data from the API
    data = get_data_from_hca(base_url, endpoint)
    # print_data(data)
    
    # print keys and values
    print_keys_values(data)

def print_json_full_window(data):
    display(JSON(data))


In [164]:
if __name__ == "__main__":
    main()

Keys:
pagination
termFacets
hits

Values:
{'count': 10, 'total': 364, 'size': 10, 'next': 'https://service.azul.data.humancellatlas.org/index/projects?catalog=dcp28&filters=%7B%7D&search_after=%5B%22A+Single-Cell+Transcriptomic+Atlas+of+Human+Skin+Aging.%22%2C+%22923d3231-7295-4184-b3f6-c3082766a8c7%22%5D&sort=projectTitle&order=asc&size=10', 'previous': None, 'pages': 37, 'sort': 'projectTitle', 'order': 'asc'}
{'organ': {'terms': [{'term': 'blood', 'count': 65}, {'term': None, 'count': 58}, {'term': 'brain', 'count': 41}, {'term': 'lung', 'count': 35}, {'term': 'kidney', 'count': 33}, {'term': 'liver', 'count': 27}, {'term': 'eye', 'count': 20}, {'term': 'pancreas', 'count': 20}, {'term': 'skin of body', 'count': 19}, {'term': 'colon', 'count': 18}, {'term': 'heart', 'count': 15}, {'term': 'small intestine', 'count': 15}, {'term': 'spleen', 'count': 12}, {'term': 'breast', 'count': 11}, {'term': 'thymus', 'count': 11}, {'term': 'uterus', 'count': 11}, {'term': 'hematopoietic system',

In [183]:
class AzulClient:
    def __init__(self, base_url='https://service.azul.data.humancellatlas.org/'):
        self.base_url = base_url

    def get_catalogs(self):
        response = requests.get(f'{self.base_url}index/catalogs')
        return response.json()

    def get_entity(self, entity_type, entity_id):
        response = requests.get(f'{self.base_url}index/{entity_type}/{entity_id}')
        return response.json()

    def search_index(self, entity_type):
        response = requests.get(f'{self.base_url}index/{entity_type}')
        return response.json()

    def get_summary(self):
        response = requests.get(f'{self.base_url}index/summary')
        return response.json()

    def get_manifest_files(self):
        response = requests.get(f'{self.base_url}manifest/files')
        return response.json()

    def fetch_manifest_files(self):
        response = requests.get(f'{self.base_url}fetch/manifest/files')
        return response.json()

    def get_repository_file(self, file_uuid):
        response = requests.get(f'{self.base_url}repository/files/{file_uuid}')
        return response.json()

    def fetch_repository_file(self, action, file_uuid):
        if action == 'download data':
            response = requests.get(f'{self.base_url}/repository/files/{file_uuid}')
            return response.json()
        elif action == 'repo data':
            response = requests.get(f'{self.base_url}/fetch/repository/metadata/{file_uuid}')
            return response.json()
        elif action == 'data sources':
            response = requests.get(f'{self.base_url}/repository/sources')
            return response.json()
        else:
            return None

    def list_data_sources(self):
        response = requests.get(f'{self.base_url}repository/sources')
        return response.json()

    def get_health(self):
        response = requests.get(f'{self.base_url}health')
        return response.json()

    def get_version(self):
        response = requests.get(f'{self.base_url}version')
        return response.json()


In [189]:
hca_client = AzulClient()

# data_sources = hca_client.list_data_sources()
# get_summary = hca_client.get_summary()
# get_repository_files = hca_client.get_repository_file('237538e6-7f05-5e56-a47d-01cdfd136a7e')
fetch_repository_files = hca_client.fetch_repository_file('repo data', '237538e6-7f05-5e56-a47d-01cdfd136a7e')
print(fetch_repository_files)




{'message': 'Missing Authentication Token'}


In [169]:
catalogs = client.get_catalogs()
print(catalogs)


{'default_catalog': 'dcp28', 'catalogs': {'dcp28': {'internal': False, 'atlas': 'hca', 'plugins': {'metadata': {'name': 'hca', 'indices': {'bundles': {'default_sort': 'bundleVersion', 'default_order': 'desc'}, 'files': {'default_sort': 'fileName', 'default_order': 'asc'}, 'projects': {'default_sort': 'projectTitle', 'default_order': 'asc'}, 'samples': {'default_sort': 'sampleId', 'default_order': 'asc'}}}, 'repository': {'name': 'tdr_hca', 'sources': ['tdr:datarepo-1c07a6e4:snapshot/hca_prod_ea9eec5a4fc24c5894d02fcb598732bc__20221208_dcp2_20230314_dcp25:/0', 'tdr:datarepo-2285af8d:snapshot/hca_prod_b9484e4edc404e389b854cecf5b8c068__20220118_dcp2_20230314_dcp25:/0', 'tdr:datarepo-60acbcdf:snapshot/hca_prod_111d272bc25a49ac9b25e062b70d66e0__20230530_dcp2_20230530_dcp28:/0', 'tdr:datarepo-385ef7e4:snapshot/hca_prod_8a40ff19e6144c50b23b5c9e1d546bab__20220118_dcp2_20220607_dcp17:/0', 'tdr:datarepo-ffeff93b:snapshot/hca_prod_bd40033154b94fccbff66bb8b079ee1f__20220118_dcp2_20230314_dcp25:/0',

In [170]:
entity_info = client.get_entity(entity_type='samples', entity_id='your_entity_id')
print(entity_info)


{'Code': 'BadRequestError', 'Message': "'your_entity_id' is not a valid UUID."}


In [171]:
import os
from tqdm import tqdm

In [172]:
def download_file(url, output_path):
    url = url.replace('/fetch', '')  # Work around https://github.com/DataBiosphere/azul/issues/2908
    
    response = requests.get(url, stream=True)
    response.raise_for_status()
    
    total = int(response.headers.get('content-length', 0))
    print(f'Downloading to: {output_path}', flush=True)
    
    with open(output_path, 'wb') as f:
        with tqdm(total=total, unit='B', unit_scale=True, unit_divisor=1024) as bar:
            for chunk in response.iter_content(chunk_size=1024):
                size = f.write(chunk)
                bar.update(size)

In [173]:
def iterate_matrices_tree(tree, keys=()):
    if isinstance(tree, dict):
        for k, v in tree.items():
            yield from iterate_matrices_tree(v, keys=(*keys, k))
    elif isinstance(tree, list):
        for file in tree:
            yield keys, file
    else:
        assert False

In [176]:
project_uuid = '4a95101c-9ffc-4f30-a809-f04518a23803'
catalog = 'dcp1'
endpoint_url = f'https://service.azul.data.humancellatlas.org/index/projects/{project_uuid}'

save_location = '/tmp'

In [178]:
response = requests.get(endpoint_url, params={'catalog': catalog})
response.raise_for_status()

response_json = response.json()
project = response_json['projects'][0]

file_urls = set()
for key in ('matrices', 'contributedAnalyses'):
    tree = project[key]
    for path, file_info in iterate_matrices_tree(tree):
        url = file_info['url']
        if url not in file_urls:
            dest_path = os.path.join(save_location, file_info['name'])
            download_file(url, dest_path)
            file_urls.add(url)
print('Downloads Complete.')

Downloading to: /tmp\4a95101c-9ffc-4f30-a809-f04518a23803.homo_sapiens.csv.zip


FileNotFoundError: [Errno 2] No such file or directory: '/tmp\\4a95101c-9ffc-4f30-a809-f04518a23803.homo_sapiens.csv.zip'

In [14]:
def get_geo_data(accession):
    base_url = "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi"
    params = {"acc": accession, "targ": "self", "form": "text", "view": "quick"}
    response = requests.get(base_url, params=params)

    if response.status_code == 200:
        return response.text
    else:
        print(f"Request failed with status code {response.status_code}")
        return None

def main():
    data = get_geo_data("GSE4049")  # using accession number from GEO
    if data:
        print(data)

if __name__ == "__main__":
    main()


^SERIES = GSE4049
!Series_title = Time-course studies of three cell wall interfering drugs on gene expression in yeast
!Series_geo_accession = GSE4049
!Series_status = Public on Nov 03 2006
!Series_submission_date = Jan 17 2006
!Series_last_update_date = Mar 16 2012
!Series_pubmed_id = 16925551
!Series_summary = Caffeine is a natural purine analog that elicits pleiotropic effects, which ultimately lead to cell death by a mechanism that is still largely uncharacterized. This drug can activate the PKC1-MAPK cell integrity pathway, as shown by phosphorylation of Mpk1 kinase. However, and contrary to expectation, the caffeine-induced hyperphosphorylation of Mpk1 was accompanied by a negligible activation of its downstream targets Rlm1 and SBF transcription factors, which suggested that the fortification of the cell wall induced by caffeine was independent on the MAP kinase activation. This result was consistent with the finding that the loss of RLM1 had no consequence on the increased resi