# Introduction

Brian thinks the protocol being used for Ali's group's long-read RNA-seq experiments could be improved.

Jennifer would like everything defined so they can just run the command if this change request comes in after the end of their ENCODE funding.

In [1]:
import pandas
from pathlib import Path
import os
import sys

# Add the ~/proj/encoded_client checkout to the import search path so the
# encoded_client import below can be resolved from it. The path is appended,
# so any copy found earlier on sys.path still takes precedence.
EC = str(Path("~/proj/encoded_client").expanduser())
if EC not in sys.path:
    sys.path.append(EC)
from encoded_client.encoded import ENCODED, HTTPError

In [2]:
# Hand-curated lookup from ENCODE document @id paths to a short human-readable
# label for the kind of protocol/document each one is. Several distinct
# document ids share a label; the trailing comments record which entries are
# believed to be duplicates of one another.
# NOTE(review): nothing in the cells visible here reads this table -- confirm
# whether it is still needed for this analysis.
known_document_types = {
    '/documents/ecb1c489-905b-4afb-8389-ed7fb2e36ff2/': "Chromium v2",
    '/documents/515a990c-50c4-459a-8036-502feaf5a18c/': "Chromium v2 technical note",
    '/documents/3408b7a1-6b1b-45d1-a73e-db0556e6abdc/': "Split-seq analysis protocol",
    '/documents/3cdf327b-df85-4149-9fca-a7541e6f1cb1/': "Split-seq overview",
    '/documents/c155e58a-13ab-4af9-8591-cee13d639a40/': "Nuclei Isolation protocol from tissue for 10x multiome",
    '/documents/53fb4382-c54f-4761-89bb-6a01a0dca97d/': "Chromium GEM v3.1",
    '/documents/4d42b1ef-c9e6-4b31-a443-1266079545d0/': "Chromium Multiome ATAC + Gene Expression",
    '/documents/d18bad0e-9a86-40f0-8ded-2c74072910e5/': "Nuclei Isolation protocol from tissue for 10x multiome",  # probably same as c155e58a-13ab-4af9-8591-cee13d639a40
    '/documents/e3110414-4013-425e-94f6-72c549702bde/': "Pool split protocol",
    '/documents/1bb75b62-ac29-4368-9855-68d410e1963a/': "GM12878 Cell Culture protocol",
    '/documents/28fef8e0-e171-450d-b66d-e3d199531cf2/': "Ambion spike in mixture",
    '/documents/a6e06058-5019-488f-9195-8bcd2e9229c1/': "Laser capture microdisection",
    '/documents/c4b8952c-d9f1-42e7-8ea5-54659d14f46b/': "Human tissue preservation protocol flash-freezing (version 1)",
    '/documents/c5c4174c-0f33-4793-be9d-f2b6a9357dae/': "Nuclei Isolation protocol from tissue for 10x multiome", # probably same as c155e58a-13ab-4af9-8591-cee13d639a40
    '/documents/84a66218-cd6a-4959-adc0-91ecc513117f/': "Chromium Multiome ATAC + Gene Expression",
    '/documents/c6235c97-a978-41b6-8a7e-1e16b983f4ba/': "inDrop Sample Collection Protocol",
    '/documents/8179222b-9ad7-4274-a07e-820dbcac6aca/': "inDrop Library Prep",
    '/documents/80953249-5d87-4f2c-84e0-0212c404c8d6/': "Nuclei Isolation protocol from tissue 10x snRNA", # probably same as c155e58a-13ab-4af9-8591-cee13d639a40
    '/documents/57177fd0-da98-4e8f-95a4-459fbaeea146/': "Chromium GEM v3.1", # probably same as 53fb4382-c54f-4761-89bb-6a01a0dca97d
    '/documents/5f5193b0-0c61-40c6-a286-b1f6d6f0c0da/': "SmartSeq",
    '/documents/9a7e281f-b901-422b-9f01-378ba4b9e97f/': "Mouse adrenal gland nuclei isolation protocol for single nucleus RNA-seq",
    '/documents/f1b60b4e-b5d2-4b8d-af0b-06b767fa07c3/': "Biosample purification protocol",
    '/documents/49fd5b3b-9878-4ec8-92bf-ef6dce9116e3/': "SMARTer Ultra Low RNA kit",
    "/documents/f16e44c9-ee9d-4ede-9bb5-886ea451d7a0/": "Cortex nuclei isolation protocol",
    "/documents/eb624ae4-53ea-4de0-ba9e-11b40eb20e2c/": "cell isolation protocol",
}

In [3]:
# Client for www.encodeproject.org; the later cells use it for the experiment
# search and for fetching individual objects by @id.
server = ENCODED("www.encodeproject.org")

In [6]:
# Search filters: experiments whose lab title and assay title match these
# values. The dotted key cannot be written as a keyword argument, hence the
# dict that is unpacked into the call below.
where = {
    "lab.title": "Ali Mortazavi, UCI",
    "assay_title": "long read RNA-seq",
}
# NOTE(review): limit="500" is a hard cap on the number of results requested;
# if more experiments match, the remainder would presumably be left out
# without warning -- confirm the result count stays below the cap.
query = server.search_jsonld(type="Experiment", limit="500", **where)

def update_documents(seen_documents, experiment_id, obj):
    """Record that ``experiment_id`` references every document listed on ``obj``.

    Parameters
    ----------
    seen_documents : dict
        Mapping of document identifier -> list of experiment ids; updated in
        place. A new list is created the first time a document is seen.
    experiment_id
        Value appended to the list of each document found on ``obj``.
    obj : dict
        Object whose optional ``"documents"`` entry holds the document
        identifiers. A missing entry is treated as "no documents".

    Returns
    -------
    None

    Notes
    -----
    No de-duplication is done: calling this repeatedly with the same
    ``experiment_id`` and document appends the id each time.
    """
    document_ids = obj.get("documents", [])
    for document_id in document_ids:
        if document_id not in seen_documents:
            seen_documents[document_id] = []
        seen_documents[document_id].append(experiment_id)

# Build a mapping of document id -> experiment ids that reference it. Documents
# are collected from three places: the experiment itself, the library of each
# replicate, and that library's biosample when the library has one.
# An experiment id is appended once per object that lists the document, so the
# same experiment can occur several times in one document's list.
seen_documents = {}
for row in query["@graph"]:
    experiment_id = row["@id"]
    # Fetch the experiment object by its @id and record its own documents.
    experiment = server.get_json(experiment_id)
    update_documents(seen_documents, experiment_id, experiment)

    for rep in experiment["replicates"]:
        library = rep["library"]
        update_documents(seen_documents, experiment_id, library)

        # Libraries without a biosample entry contribute nothing further.
        if "biosample" not in library:
            continue
        biosample = library["biosample"]
        update_documents(seen_documents, experiment_id, biosample)
    

In [14]:
# Build one summary row per referenced document: its portal URL, how often it
# was referenced, and the attachment filename and description fetched from the
# portal.
records = []
for d, experiment_ids in seen_documents.items():
    document = server.get_json(d)
    records.append({
        "id": "https://www.encodeproject.org{}".format(d),
        # Length of the reference list built earlier; this counts references,
        # so an experiment that reached the document more than once is counted
        # more than once.
        "used": len(experiment_ids),
        # A document may come without an attachment or a description; use None
        # for the missing value instead of aborting the whole table on a
        # KeyError.
        "filename": document.get("attachment", {}).get("download"),
        "description": document.get("description"),
    })

# Fix the column list explicitly so the table (and any CSV written from it)
# keeps its header even when no documents were found.
document_table = pandas.DataFrame(
    records, columns=["id", "used", "filename", "description"]
)
document_table

Unnamed: 0,id,used,filename,description
0,https://www.encodeproject.org/documents/6d583a...,39,ENCODE Long Read RNA-Seq Analysis Pipeline v3....,"""This document describes 1) the steps used to ..."
1,https://www.encodeproject.org/documents/3baa46...,180,ENCODE_longread_wetlab_protocolv3.pdf,This protocol describes an optimized method fo...
2,https://www.encodeproject.org/documents/b556e2...,1,3348003_5X_DLPFC.tif,"fresh frozen section, 30 microns, post fixed i..."
3,https://www.encodeproject.org/documents/d3cf9d...,1,7461192_5X_DLPFC.tif,"fresh frozen section, 30 microns, post fixed i..."
4,https://www.encodeproject.org/documents/e90eb5...,1,4368365_5X_DLPFC.tif,"fresh frozen section, 30 microns, post fixed i..."
5,https://www.encodeproject.org/documents/22a7ff...,1,6341028_5X_DLPFC.tif,"fresh frozen section, 30 microns, post fixed i..."
6,https://www.encodeproject.org/documents/e7fcb8...,1,6207143_5X_DLPFC.tif,"fresh frozen section, 30 microns, post fixed i..."
7,https://www.encodeproject.org/documents/303304...,1,7948794_5X_DLPFC.tif,"fresh frozen section, 30 microns, post fixed i..."
8,https://www.encodeproject.org/documents/dcd089...,1,4163791_5X_DLPFC.tif,"fresh frozen section, 30 microns, post fixed i..."
9,https://www.encodeproject.org/documents/d4da5f...,1,5194210_5X_DLPFC.tif,"fresh frozen section, 30 microns, post fixed i..."


In [15]:
# Write the document summary to long-read-documents.csv in the current working
# directory; index=False leaves the DataFrame's row index out of the file.
document_table.to_csv("long-read-documents.csv", index=False)

In [None]:
# Inspection cell: shows the raw document -> experiment ids mapping in full.
# It has no execution count in the saved notebook, i.e. it was not run there.
seen_documents

In [12]:
ls *.csv

 biosample_nic.csv         purkinje-cross-reference.csv
 long-read-documents.csv  'Snyder tissue correspondence.csv'
 merge_stam_wold.csv       who_made_what.csv


In [13]:
# Print the name of the machine the notebook is running on.
!hostname

amarana
