In [1]:
import pandas
from pathlib import Path
from pprint import pprint
from io import BytesIO
import re
import sys
import tarfile
from tqdm import tqdm
from collections import Counter, namedtuple

In [2]:
# Make the local checkout of encoded_client importable. The membership guard
# keeps re-running this cell from appending the same directory repeatedly.
EC = str(Path("~/proj/encoded_client").expanduser())
if sys.path.count(EC) == 0:
    sys.path.append(EC)
from encoded_client.encoded import ENCODED, HTTPError

In [3]:
# Library summary tables published with the paper_rnawg repository; both
# species share the same directory layout under figures/ref.
_rnawg_ref_url = "https://raw.githubusercontent.com/fairliereese/paper_rnawg/master/figures/ref"
human_table_url = _rnawg_ref_url + "/human/lr_human_library_data_summary.tsv"
mouse_table_url = _rnawg_ref_url + "/mouse/lr_mouse_library_data_summary.tsv"


In [4]:
# Load the human library summary (tab-separated) and preview the first rows.
human_table = pandas.read_table(human_table_url)
human_table.head()

Unnamed: 0,ENCODE_experiment_id,dataset,sample,sample_display,general_tissue_cell_type,fig1_tissue_label,health_status,tissue_or_cell_line,sample_color_hex_code,matching_mouse_samples,reads_post_talon,ENCODE_alignments_id,ENCODE_reads_id,ENCODE_unfiltered_alignments_id
0,ENCSR989ZYL,a673_1_1,a673,A673,a673,bone,,cell_line,#de3700,,1330194,ENCFF045ZQI,ENCFF168MIB,ENCFF440LXJ
1,ENCSR989ZYL,a673_1_2,a673,A673,a673,bone,,cell_line,#de3700,,1707207,ENCFF320GCF,ENCFF861BKY,ENCFF240FZT
2,ENCSR081NRO,adrenal_gland_1_1,adrenal gland,Adrenal gland,adrenal gland,adrenal gland,,tissue,#e69b00,"adrenal,adrenal_10d,adrenal_14d,adrenal_18-20m...",765655,ENCFF147OYL,ENCFF211SQY,ENCFF967OHL
3,ENCSR563RLX,adrenal_gland_2_1,adrenal gland,Adrenal gland,adrenal gland,adrenal gland,,tissue,#e69b00,"adrenal,adrenal_10d,adrenal_14d,adrenal_18-20m...",1579294,ENCFF791WUV,ENCFF417ALN,ENCFF900XHI
4,ENCSR995WKW,adrenal_gland_3_1,adrenal gland,Adrenal gland,adrenal gland,adrenal gland,,tissue,#e69b00,"adrenal,adrenal_10d,adrenal_14d,adrenal_18-20m...",577077,ENCFF243PFI,ENCFF912HPY,ENCFF020MWV


In [5]:
# Human-readable titles for ENCODE document @ids that have been examined by hand.
# find_documents_for_experiments uses this table as the fallback label source
# when a document has no entry in document_to_protocol; documents found in
# neither table are reported as "lookup <url>" lines.
#
# Every key must appear exactly once: Python silently keeps only the last value
# of a repeated key in a dict literal.
known_document_types = {
    "/documents/ecb1c489-905b-4afb-8389-ed7fb2e36ff2/": "Chromium v2",
    "/documents/515a990c-50c4-459a-8036-502feaf5a18c/": "Chromium v2 technical note",
    "/documents/3408b7a1-6b1b-45d1-a73e-db0556e6abdc/": "Split-seq analysis protocol",
    "/documents/3cdf327b-df85-4149-9fca-a7541e6f1cb1/": "Split-seq overview",
    "/documents/c155e58a-13ab-4af9-8591-cee13d639a40/": "Nuclei Isolation protocol from tissue for 10x multiome",
    "/documents/53fb4382-c54f-4761-89bb-6a01a0dca97d/": "Chromium GEM v3.1",
    "/documents/4d42b1ef-c9e6-4b31-a443-1266079545d0/": "Chromium Multiome ATAC + Gene Expression",
    "/documents/d18bad0e-9a86-40f0-8ded-2c74072910e5/": "Nuclei Isolation protocol from tissue for 10x multiome",  # probably same as c155e58a-13ab-4af9-8591-cee13d639a40
    "/documents/e3110414-4013-425e-94f6-72c549702bde/": "Pool split protocol",
    "/documents/1bb75b62-ac29-4368-9855-68d410e1963a/": "GM12878 Cell Culture protocol",
    "/documents/28fef8e0-e171-450d-b66d-e3d199531cf2/": "Ambion spike in mixture",
    "/documents/a6e06058-5019-488f-9195-8bcd2e9229c1/": "Laser capture microdisection",
    # This document used to be listed twice; the later entry ("Human Flash Frozen")
    # was the one that took effect, so it is the value kept here. The earlier,
    # discarded label was "Human tissue preservation protocol flash-freezing (version 1)".
    "/documents/c4b8952c-d9f1-42e7-8ea5-54659d14f46b/": "Human Flash Frozen",
    "/documents/c5c4174c-0f33-4793-be9d-f2b6a9357dae/": "Nuclei Isolation protocol from tissue for 10x multiome",  # probably same as c155e58a-13ab-4af9-8591-cee13d639a40
    "/documents/84a66218-cd6a-4959-adc0-91ecc513117f/": "Chromium Multiome ATAC + Gene Expression",
    "/documents/c6235c97-a978-41b6-8a7e-1e16b983f4ba/": "inDrop Sample Collection Protocol",
    "/documents/8179222b-9ad7-4274-a07e-820dbcac6aca/": "inDrop Library Prep",
    "/documents/80953249-5d87-4f2c-84e0-0212c404c8d6/": "Nuclei Isolation protocol from tissue 10x snRNA",  # probably same as c155e58a-13ab-4af9-8591-cee13d639a40
    "/documents/57177fd0-da98-4e8f-95a4-459fbaeea146/": "Chromium GEM v3.1",  # same as 53fb4382-c54f-4761-89bb-6a01a0dca97d
    "/documents/5f5193b0-0c61-40c6-a286-b1f6d6f0c0da/": "SmartSeq",
    "/documents/9a7e281f-b901-422b-9f01-378ba4b9e97f/": "Mouse adrenal gland nuclei isolation protocol for single nucleus RNA-seq",
    "/documents/f1b60b4e-b5d2-4b8d-af0b-06b767fa07c3/": "Biosample purification protocol",
    "/documents/49fd5b3b-9878-4ec8-92bf-ef6dce9116e3/": "SMARTer Ultra Low RNA kit",
    "/documents/f16e44c9-ee9d-4ede-9bb5-886ea451d7a0/": "Cortex nuclei isolation protocol",
    "/documents/eb624ae4-53ea-4de0-ba9e-11b40eb20e2c/": "cell isolation protocol",
    "/documents/77db752f-abf7-4c93-a460-510464134f52/": "PacBio LongRead 2.0",
    "/documents/303304f2-2bb7-4ca9-9a6e-3a41989d7ed6/": "High resolution pathology image",
    "/documents/dcd089f6-fc01-4afa-8769-ed5e0a1b0db3/": "High resolution pathology image",
    "/documents/ecd67559-64eb-4f1e-99ea-48309e99c3da/": "High resolution pathology image",
    "/documents/22a7ff37-9393-4019-a4b7-a356bc8036a4/": "High resolution pathology image",
    "/documents/b556e2c0-6d3e-40d0-8d7f-37bdc59a3134/": "High resolution pathology image",
    "/documents/e7fcb862-1047-4990-a975-fcb3d71528f2/": "High resolution pathology image",
    "/documents/d3cf9d5c-c299-465d-a159-d5ab91467f7b/": "High resolution pathology image",
    "/documents/e90eb56a-f890-4795-a0cc-7c14080ce557/": "High resolution pathology image",
    "/documents/d4da5f7c-d3f4-4e28-97b0-d901a4562ee2/": "High resolution pathology image",
    "/documents/ae021ff9-4099-4701-957c-7effb7341962/": "HL-60 M0/M1/M2 differentiation protocol",
}


In [6]:
# Preferred protocol label for each ENCODE document @id.
# find_documents_for_experiments consults this table first and only falls back
# to known_document_types when a document is not listed here.
# Entries that are commented out are documents deliberately left unmapped.
document_to_protocol = {
    "/documents/ecb1c489-905b-4afb-8389-ed7fb2e36ff2/": "Chromium v2",
    # "/documents/515a990c-50c4-459a-8036-502feaf5a18c/": "Chromium v2",
    "/documents/3408b7a1-6b1b-45d1-a73e-db0556e6abdc/": "Split-seq",
    "/documents/3cdf327b-df85-4149-9fca-a7541e6f1cb1/": "Split-seq",
    # "/documents/c155e58a-13ab-4af9-8591-cee13d639a40/": "Nuclei Isolation protocol from tissue for 10x multiome",
    "/documents/53fb4382-c54f-4761-89bb-6a01a0dca97d/": "Chromium GEM v3.1",
    "/documents/4d42b1ef-c9e6-4b31-a443-1266079545d0/": "Chromium Multiome ATAC + Gene Expression",  # same as 53fb4382-c54f-4761-89bb-6a01a0dca97d
    # "/documents/d18bad0e-9a86-40f0-8ded-2c74072910e5/": "Nuclei Isolation protocol from tissue for 10x multiome",  # probably same as c155e58a-13ab-4af9-8591-cee13d639a40
    "/documents/d18bad0e-9a86-40f0-8ded-2c74072910e5/": "Chromium Multiome ATAC + Gene Expression",
    "/documents/e3110414-4013-425e-94f6-72c549702bde/": "SmartSeq Pool",
    "/documents/a6e06058-5019-488f-9195-8bcd2e9229c1/": "SmartSeq",
    # "/documents/c5c4174c-0f33-4793-be9d-f2b6a9357dae/": "Nuclei Isolation protocol from tissue for 10x multiome",  # probably same as c155e58a-13ab-4af9-8591-cee13d639a40
    "/documents/84a66218-cd6a-4959-adc0-91ecc513117f/": "Chromium Multiome ATAC + Gene Expression",
    "/documents/c6235c97-a978-41b6-8a7e-1e16b983f4ba/": "inDrop",
    "/documents/8179222b-9ad7-4274-a07e-820dbcac6aca/": "inDrop",
    # "/documents/80953249-5d87-4f2c-84e0-0212c404c8d6/": "Nuclei Isolation protocol from tissue 10x snRNA",  # probably same as c155e58a-13ab-4af9-8591-cee13d639a40
    "/documents/57177fd0-da98-4e8f-95a4-459fbaeea146/": "Chromium GEM v3.1",  # same as 53fb4382-c54f-4761-89bb-6a01a0dca97d
    "/documents/5f5193b0-0c61-40c6-a286-b1f6d6f0c0da/": "SmartSeq",
    "/documents/49fd5b3b-9878-4ec8-92bf-ef6dce9116e3/": "SmartSeq",
    "/documents/77db752f-abf7-4c93-a460-510464134f52/": "PacBio LongRead 2.0",
    "/documents/2297237f-dfb0-424d-a386-c350ffe8dbe3/": "cDNA libraries from brain for use in Illumina NextSeq 500 sequencing v1",
    "/documents/54012fc3-644e-49af-97e4-888226753fd9/": "non-size selected cDNA libraries for use in PacBio sequencing v1",
    "/documents/0a54a6da-ddb4-44bd-af95-2cd02754aa93/": "ENCODE Long Read RNA-Seq Analysis Protocol for Human Samples (v3.1)",
    "/documents/7ec9d66a-3b7e-4183-8677-e1df14770b44/": "ENCODE Long Read RNA-Seq Analysis Protocol for Human Samples (v.1.0)",
    "/documents/53196858-d826-458a-8232-22e4b7c2d653/": "Long read cDNA prep with Maxima H",
    "/documents/e909542d-44c0-4bee-9aac-4d41a0b768db/": "Protocol to add 5’ cap structures to exogenous synthetic RNA references (spike-ins) 1.0",
    "/documents/6d583a1d-d692-4511-b13b-c051822d861c/": "ENCODE Long Read RNA-Seq Analysis Protocol for Human Samples (v3.2)",
    "/documents/fc272a30-b9a5-4652-b255-424b61d4587b/": "ENCODE PacBio Iso-seq Analysis Protocol (v.1.0)",
    "/documents/81af563b-5134-4f78-9bc4-41cb42cc6a48/": "ENCODE Long Read RNA-Seq Analysis Protocol for HumanSamples (v3.0)",
    "/documents/9d64c094-3f6b-49ae-b344-ccdf6ae3eb0c/": "non-size selected cDNA libraries for use in PacBio sequencing",
    "/documents/3baa46d2-cb88-4608-8877-70596d200489/": "PacBio libraries v3 (October, 2020) Protocol to build non-size selected cDNA libraries for Pacific Biosciences long-read sequencing Version 3.0 (October, 2020)",

    # Mouse analysis protocols
    "/documents/bf543ffc-23d0-4f6f-b6a8-d131859259e4/": "ENCODE Long Read RNA-Seq Analysis Protocol for MouseSamples (v.3.1)",
    "/documents/5a9cb0de-a425-449d-bc65-6316f79e71ce/": "ENCODE Long Read RNA-Seq Analysis Protocolfor Mouse Samples (v.1.1)",
    "/documents/7ca4f144-8428-4d20-95b5-f6dd74c0781c/": "ENCODE Long Read RNA-Seq Analysis Protocol for MouseSamples (v.3.0)",
}


In [7]:
# Client object used by find_documents_for_experiments (via server.get_json)
# to fetch experiment records from www.encodeproject.org.
server = ENCODED("www.encodeproject.org")
# Alternative host kept for reference (the name suggests a test instance):
#server = ENCODED("test.encodedcc.org")

In [8]:
def _lab_short_name(title):
    """Return the second comma-separated field of a lab title (e.g. "UCI").

    Titles without a comma are returned whole (stripped) instead of raising
    IndexError.
    """
    parts = [part.strip() for part in title.split(",")]
    return parts[1] if len(parts) > 1 else parts[0]


def find_documents_for_experiments(experiment_ids):
    """Summarize protocol documents and library metadata for ENCODE experiments.

    For every accession the experiment JSON is fetched through the module-level
    ``server``. Document @ids attached to the experiment, to each replicate's
    library and to each library's biosample are collected and labelled with
    ``document_to_protocol`` (preferred) or ``known_document_types``
    (fallback). Documents found in neither table are printed at the end as
    ``lookup <url>`` lines so they can be examined by hand.

    Parameters
    ----------
    experiment_ids : iterable of str
        Experiment accessions. Repeated accessions produce one row each but
        are only fetched from the server once.

    Returns
    -------
    list of dict
        One record per input accession, in input order, with the keys
        experiment, library, status, lab, description, document_labels,
        document_urls, platform, RIN, spikeins, biosample_ontology,
        biosample_term, organism and subcellular_fraction.

        Library and biosample fields describe the last replicate of the
        experiment. They are None (or "" for spikeins) when the experiment has
        no replicates or the field is absent. ``platform`` is the platform
        term name of the last file whose output_type is "reads", or None when
        there is no such file. Documents are processed in sorted order so the
        label and URL columns are reproducible between runs.
    """
    unknown_documents = set()
    results = []
    experiment_cache = {}
    # Never incremented here; the summary line is still printed so the cell
    # output keeps its previous shape.
    matching_strand = 0
    for accession in tqdm(experiment_ids):
        # The same accession can appear on several input rows; reuse the
        # record instead of asking the server again.
        if accession not in experiment_cache:
            experiment_cache[accession] = server.get_json(accession)
        experiment = experiment_cache[accession]

        # Reset per-experiment state so nothing leaks from the previous
        # iteration when an experiment lacks replicates or reads files.
        documents = set(experiment.get("documents", []))
        library = {}
        biosample = {}
        subcellular_fraction = None
        for replicate in experiment.get("replicates", []):
            library = replicate["library"]
            documents.update(library.get("documents", []))
            biosample = library["biosample"]
            documents.update(biosample.get("documents", []))
            subcellular_fraction = biosample.get("subcellular_fraction_term_name", "whole cell")

        platform = None
        for f in experiment.get("files", []):
            if f.get("output_type") == "reads":
                platform = f.get("platform", {}).get("term_name")

        # dicts are used as insertion-ordered sets to de-duplicate labels.
        protocol_label = {}
        protocol_url = {}
        for d in sorted(documents):
            protocol_url["https://www.encodeproject.org{}".format(d)] = None
            if d in document_to_protocol:
                protocol_label[document_to_protocol[d]] = None
            elif d in known_document_types:
                # something we've examined
                protocol_label[known_document_types[d]] = None
            else:
                # unexamined protocols
                unknown_documents.add(d)

        ontology = biosample.get("biosample_ontology", {})
        results.append({
            "experiment": experiment["accession"],
            "library": library.get("accession"),
            "status": experiment["status"],
            "lab": _lab_short_name(experiment["lab"]["title"]),
            "description": experiment.get("description", ""),
            "document_labels": ", ".join("'{}'".format(x) for x in protocol_label),
            "document_urls": " , ".join(protocol_url),
            "platform": platform,
            "RIN": library.get("rna_integrity_number"),
            # spikeins_used may be missing; treat that the same as an empty list.
            "spikeins": " , ".join(library.get("spikeins_used") or []),
            "biosample_ontology": ontology.get("@id"),
            "biosample_term": ontology.get("term_name"),
            "organism": biosample.get("organism", {}).get("name"),
            "subcellular_fraction": subcellular_fraction,
        })

    print("matching_strand", matching_strand)

    for d in sorted(unknown_documents):
        print("lookup https://www.encodeproject.org{}".format(d))
    return results

# Look up every experiment listed in the human summary table, then tabulate
# the returned records and preview the first rows.
human_experiment_ids = human_table["ENCODE_experiment_id"]
results = find_documents_for_experiments(human_experiment_ids)
df = pandas.DataFrame(data=results)
df.head()

100%|██████████| 138/138 [00:50<00:00,  2.72it/s]

matching_strand 0





Unnamed: 0,experiment,library,status,lab,description,document_labels,document_urls,platform,RIN,spikeins,biosample_ontology,biosample_term,organism,subcellular_fraction
0,ENCSR989ZYL,ENCLB767XEK,released,UCI,"""ENC4_509 , ENC4_510 A673 rep1 human Ewing sar...","'PacBio libraries v3 (October, 2020) Protocol ...",https://www.encodeproject.org/documents/3baa46...,Pacific Biosciences Sequel II,9.8,"/references/ENCSR156CIL/ , /references/ENCSR75...",/biosample-types/cell_line_EFO_0002106/,A673,human,whole cell
1,ENCSR989ZYL,ENCLB767XEK,released,UCI,"""ENC4_509 , ENC4_510 A673 rep1 human Ewing sar...","'PacBio libraries v3 (October, 2020) Protocol ...",https://www.encodeproject.org/documents/3baa46...,Pacific Biosciences Sequel II,9.8,"/references/ENCSR156CIL/ , /references/ENCSR75...",/biosample-types/cell_line_EFO_0002106/,A673,human,whole cell
2,ENCSR081NRO,ENCLB945ZXA,released,UCI,Adrenal Gland 1-2 ENTEX50,"'PacBio LongRead 2.0', 'non-size selected cDNA...",https://www.encodeproject.org/documents/77db75...,Pacific Biosciences Sequel,9.8,,/biosample-types/tissue_UBERON_0002369/,adrenal gland,human,whole cell
3,ENCSR563RLX,ENCLB748SEP,released,UCI,ENC4_29 W61 adrenal gland,"'PacBio libraries v3 (October, 2020) Protocol ...",https://www.encodeproject.org/documents/3baa46...,Pacific Biosciences Sequel II,,"/references/ENCSR759PLA/ , /references/ENCSR15...",/biosample-types/tissue_UBERON_0002369/,adrenal gland,human,whole cell
4,ENCSR995WKW,ENCLB699OTN,released,UCI,Adrenal Gland 2-1 ENTEX51,"'PacBio LongRead 2.0', 'non-size selected cDNA...",https://www.encodeproject.org/documents/77db75...,Pacific Biosciences Sequel,9.4,,/biosample-types/tissue_UBERON_0002369/,adrenal gland,human,whole cell


In [9]:
# Dump the human summary as tab-separated text without the index column.
human_tsv = df.to_csv(sep="\t", index=False)
print(human_tsv)

experiment	library	status	lab	description	document_labels	document_urls	platform	RIN	spikeins	biosample_ontology	biosample_term	organism	subcellular_fraction
ENCSR989ZYL	ENCLB767XEK	released	UCI	"""ENC4_509 , ENC4_510 A673 rep1 human Ewing sarcoma (bone or soft tissue sarcoma)"""	'PacBio libraries v3 (October, 2020) Protocol to build non-size selected cDNA libraries for Pacific Biosciences long-read sequencing Version 3.0 (October, 2020)', 'ENCODE Long Read RNA-Seq Analysis Protocol for Human Samples (v3.2)'	https://www.encodeproject.org/documents/3baa46d2-cb88-4608-8877-70596d200489/ , https://www.encodeproject.org/documents/6d583a1d-d692-4511-b13b-c051822d861c/	Pacific Biosciences Sequel II	9.8	/references/ENCSR156CIL/ , /references/ENCSR759PLA/	/biosample-types/cell_line_EFO_0002106/	A673	human	whole cell
ENCSR989ZYL	ENCLB767XEK	released	UCI	"""ENC4_509 , ENC4_510 A673 rep1 human Ewing sarcoma (bone or soft tissue sarcoma)"""	'PacBio libraries v3 (October, 2020) Protocol to build no

In [10]:
# Sanity check: find_documents_for_experiments appends one record per
# accession it is given, so the row count should match the input table.
df.shape

(138, 14)

In [11]:
# Repeat the lookup for the mouse summary table and dump the result as
# tab-separated text without the index column.
mouse_table = pandas.read_table(mouse_table_url)
mouse_experiment_ids = mouse_table["ENCODE_experiment_id"]
mouse_results = find_documents_for_experiments(mouse_experiment_ids)
mouse_df = pandas.DataFrame(data=mouse_results)
print(mouse_df.to_csv(sep="\t", index=False))

100%|██████████| 126/126 [00:33<00:00,  3.71it/s]

matching_strand 0
experiment	library	status	lab	description	document_labels	document_urls	platform	RIN	spikeins	biosample_ontology	biosample_term	organism	subcellular_fraction
ENCSR885NRP	ENCLB090NTH	released	UCI	ENC4_239 B6Cast F1 PND 18-20 months_Male 03_Adrenals	'PacBio libraries v3 (October, 2020) Protocol to build non-size selected cDNA libraries for Pacific Biosciences long-read sequencing Version 3.0 (October, 2020)', 'ENCODE Long Read RNA-Seq Analysis Protocol for MouseSamples (v.3.0)'	https://www.encodeproject.org/documents/3baa46d2-cb88-4608-8877-70596d200489/ , https://www.encodeproject.org/documents/7ca4f144-8428-4d20-95b5-f6dd74c0781c/	Pacific Biosciences Sequel II	8.1	/references/ENCSR759PLA/ , /references/ENCSR156CIL/	/biosample-types/tissue_UBERON_0002369/	adrenal gland	mouse	whole cell
ENCSR964OKW	ENCLB676FHV	released	UCI	ENC4_381a, ENC4_382a B6Cast F1 PND4_Male Gastrocnemius	'PacBio libraries v3 (October, 2020) Protocol to build non-size selected cDNA libraries for Pa


