# Introduction

Trying to submit some test data to the DCC.

This attempt is using some example files given to me by Henry.

I have a cell by gene quantification matrix and a cell location matrix.

In my first attempt I also posted a processed image that Henry generated, which was constructed from several different channels of the TIFF.


In [1]:
import sys

# Install encoded_client with pip, invoked through the interpreter running this
# notebook (sys.executable) so the package lands in the same environment.
!{sys.executable} -m pip install encoded_client



In [2]:
import hashlib

In [5]:
from encoded_client.encoded import ENCODED, DCCValidator, Document
from encoded_client.submission import run_aws_cp
from pathlib import Path
import pandas

In [4]:
# Connect to the test.encodedcc.org host rather than a production server.
server = ENCODED("test.encodedcc.org")
# Presumably reads credentials from the user's netrc file -- confirm against
# the encoded_client documentation.
server.load_netrc()
# Stop immediately if no credentials were loaded, instead of failing later
# in one of the post_sheet calls.
assert server.auth is not None
# Validator bound to this server; it is passed to every post_sheet call and to
# Document.create_if_needed in the cells below.
validator = DCCValidator(server)

In [None]:
# Directory holding the local files referenced by the submission cells below
# (the composite image and the two TSV files are opened relative to it).
alzheimers_dir = Path("/woldlab/castor/proj/alzheimers/work/2022080500")
# Display the directory's entries as the cell output.
list(alzheimers_dir.glob("*"))

In [None]:
# Shell listing of the same directory with human-readable sizes; IPython
# substitutes the Python variable for $alzheimers_dir.
ls -lh $alzheimers_dir

In [None]:
# Identifiers shared by the objects built in the following cells.
donor = "/human-donors/ENCDO609ZOG/"
biosample = "/biosample/ENCBS411JUI/"
lab = "/labs/barbara-wold/"
award = "UM1HG009443"

# NOTE(review): this flag is not read by any cell visible in this notebook;
# every post_sheet call below passes the literal dry_run=True instead.
# Confirm whether the calls were meant to use this variable.
dry_run=False

In [None]:
# Alias used to refer to this experiment from the replicate and file sheets.
experiment_id = "barbara-wold:seqfish_5194210_experiment"

# One-row sheet describing the experiment; the single-element alias list
# gives the frame its one row and the scalar values fill that row.
experiments = pandas.DataFrame(
    {
        "uuid": "3c2db1d7-2d74-4162-a99f-a7630440223e",
        "accession": "TSTSR166085",
        "aliases:array": [experiment_id],
        "description": "human_brain_Rush ID_5194210",
        "biosample_ontology": "/biosample-types/tissue_UBERON_0006483/",
        "assay_term_name": "seqFISH",
        "lab": lab,
        "award": award,
    }
)

# Dry-run post of the sheet; print whatever post_sheet returns.
print(
    server.post_sheet(
        "/experiments/",
        experiments,
        dry_run=True,
        verbose=True,
        validator=validator,
    )
)

In [None]:
# Alias and uuid for the composite slide image document; the uuid is also
# referenced from the library sheet's "documents:array" column.
composite_slide_id = "barbara-wold:seqfish_5194210_library_composite1"
composite_slide_uuid = '1a2cc7b0-879b-47da-b539-083a1d391077'
# Describe the composite PNG in alzheimers_dir as a document attachment.
composite_slide_doc = Document(
    alzheimers_dir / "rush_middle_frontal_ba46_5194210_composite.png",
    document_type="high resolution slide image",
    aliases=[composite_slide_id],
    description="composite of DAPI, Poly-T, a stain for the Tau protein, and a stain for Amyloid Beta",
    server=server,
)
# Presumably posts the document only when it is not already on the server
# (going by the method name) -- confirm against encoded_client; the return
# value is printed.
print(composite_slide_doc.create_if_needed(server, composite_slide_uuid, validator))


In [None]:
# Alias used to refer to this library from the replicate sheet.
library_id = "barbara-wold:seqfish_5194210_library"

# One-row sheet describing the library. "documents:array" was previously
# listed twice with the same value; a dict keeps only one entry per key, so
# the redundant second occurrence is dropped (value and column order are
# unchanged).
libraries = {
    "uuid": '14b1421e-d976-4bc1-9e0a-3354d281a1af',
    "accession": "TSTLB109656",
    "aliases:array": [library_id],
    "biosample": biosample,
    "documents:array": [composite_slide_uuid],
    "nucleic_acid_term_name": "RNA",
    "strand_specificity": "unstranded",
    "lab": lab,
    "award": award,
}


libraries = pandas.DataFrame(libraries)
# Dry-run post of the sheet; the return value is displayed as the cell output.
server.post_sheet("/libraries/", libraries, dry_run=True, verbose=True, validator=validator)


In [None]:
# Alias used to refer to this replicate from the file sheet.
replicate_id = "barbara-wold:seqfish_5194210_library_b1_t1"

# One-row sheet tying the experiment and library together (by alias) as
# biological replicate 1, technical replicate 1.
replicates = pandas.DataFrame(
    {
        "uuid": "1fd40e74-1ec3-4005-851a-c70f1ccc389a",
        "experiment": experiment_id,
        "biological_replicate_number:integer": 1,
        "technical_replicate_number:integer": 1,
        "library": library_id,
        "aliases:array": [replicate_id],
    }
)

# Dry-run post of the sheet; the return value is displayed as the cell output.
server.post_sheet(
    "/replicates/",
    replicates,
    dry_run=True,
    verbose=True,
    validator=validator,
)

In [None]:
# cell coordinates, cell type annotations, and raw imaging signal

In [None]:
# Counts matrix should be by id.

# One record per file to register; both files live in alzheimers_dir.
files = [
    {
        "uuid": 'b6b240d6-6a57-4d8c-809c-a2893fd076b3',
        "accession": "TSTFF842936",
        "dataset": experiment_id,
        "submitted_file_name": "rush_middle_frontal_ba46_5194210_counts.tsv",
        "file_format": "tsv",
        "output_type": "gene quantifications",
        "replicate": replicate_id,
        "lab": lab,
        "award": award,
    },
    {
        "uuid": 'b6402ac1-c2ab-4c96-a1dc-ec9aadb063c0',
        "accession": "TSTFF241595",
        "dataset": experiment_id,
        "submitted_file_name": "rush_middle_frontal_ba46_5194210_obs.tsv",
        "file_format": "tsv",
        "output_type": "cell coordinates",
        "replicate": replicate_id,
        "lab": lab,
        "award": award,
    },
    #{"dataset": experiment_id, "submitted_file_name": "rush_middle_frontal_ba46_5194210_composite.png", "file_format": "png", "output_type": "raw imaging signal", "platform": None, "replicate": replicate_id, "lab": lab, "award": award},
]

# Fill in the md5sum of every record that does not already carry one.
for f in files:
    if "md5sum" not in f:
        md5 = hashlib.md5()
        with open(alzheimers_dir / f["submitted_file_name"], "rb") as instream:
            # Hash in 1 MiB chunks so a large matrix is never read into
            # memory in one piece; the resulting digest is the same as
            # hashing the whole file at once.
            for block in iter(lambda: instream.read(1 << 20), b""):
                md5.update(block)
        f["md5sum"] = md5.hexdigest()

files = pandas.DataFrame(files)
# Dry-run post of the sheet; the result is kept for the upload cells below.
created = server.post_sheet("/files/", files, dry_run=True, verbose=True, validator=validator)


In [None]:
# Rebind `created` to `_`, IPython's most recently displayed cell output.
# NOTE(review): the preceding cell already assigns post_sheet's result to
# `created` and therefore displays nothing, so `_` here may hold an older,
# unrelated output -- confirm this reassignment is still wanted.
created = _

In [None]:
# Upload step, switched off by the constant-false guard. When enabled it
# calls run_aws_cp for each created row with the local file path and that
# row's upload credentials, printing each result.
if False:
    for row in created:
        local_path = alzheimers_dir / row["submitted_file_name"]
        print(run_aws_cp(local_path, row["upload_credentials"]))

In [None]:
# NOTE(review): this cell is an exact duplicate of the preceding one.
# Disabled by the `if 0:` guard; when enabled it calls run_aws_cp for each
# row in `created` with the local file path and the row's upload credentials.
if 0:
    for row in created:
        print(run_aws_cp(alzheimers_dir / row["submitted_file_name"], row["upload_credentials"]))