# Introduction

Trying to submit the rush images to the DCC

This was copied from the earlier http://localhost:8888/notebooks/submitting-encode-rush-alzheimers-seqfish-test-one-processed-file.ipynb which submitted parts of one experiment.

This version works toward submitting everything, with the submission metadata stored permanently in a spreadsheet.

Notes about which sections of the notebook still need to be updated are left as comments in the cells that handle each data type.



In [1]:
import hashlib
import gzip
import os
import pandas
from pathlib import Path
import re
import json
import requests
import subprocess
import sys
import tarfile
from tqdm import tqdm
import xattr
from zipfile import ZipFile

# Turns out we need a new version of encoded_client.
# The pip install below is left commented out; a local checkout is put on
# sys.path instead.
#!{sys.executable} -m pip install encoded_client

# Append ~/proj/encoded_client to sys.path only if it is not already there,
# so re-running this cell does not add duplicate entries.
EC = str(Path("~/proj/encoded_client").expanduser())
if EC not in sys.path:
    sys.path.append(EC)


In [2]:
from encoded_client.encoded import ENCODED, DCCValidator, Document
from encoded_client.submission import run_aws_cp, post_file_metadata, make_upload_filename

In [3]:
# Talk to the production ENCODE portal; the commented-out line selects the
# test server instead.
server = ENCODED("www.encodeproject.org")
#server = ENCODED("test.encodedcc.org")
# Load credentials (presumably from the user's netrc file -- confirm) and stop
# right away if none were found.
server.load_netrc()
assert server.auth is not None
# Validator used by the later cells to check payloads before they are posted.
validator = DCCValidator(server)

In [4]:
# Location of the spreadsheet that holds the submission metadata.
#test spreadsheet = "rush-ad-encode-seqfish.ods"
spreadsheet_name = "https://woldlab.caltech.edu/nextcloud/index.php/s/GcrgWt2krAAmqbf/download"

# test sheet rush-ad-encode-seqfish-test.ods
#spreadsheet_name = "https://woldlab.caltech.edu/nextcloud/index.php/s/3w6qmz4WPmzDaN2/download"

# pandas reader engine for the .ods spreadsheet.
# NOTE(review): the cells visible in this notebook pass engine="odf" literally
# instead of using this variable.
engine = "odf"
# maybe also pull it direct from nextcloud?

# Lab and award written into the file rows built further down.
lab = "/labs/barbara-wold/"
award = "UM1HG009443"
# Directory with the processed tables and images.zip, and the directory whose
# sub-directories hold the raw images for each donor.
processed_data_dir = Path("~/proj/encode4-curation/rush-ad-processed-data/").expanduser()
raw_data_dir = Path("~alinares/encode/").expanduser()
# NOTE(review): the cells visible in this notebook hard-code dry_run=True
# rather than reading this flag -- confirm whether it is meant to drive them.
dry_run = True

In [5]:
# Path of the per-cell metadata CSV; peek at its first lines with the shell's
# head to see the raw column layout before parsing it.
metadata_filename = processed_data_dir / "5samples_metadata_2022_1220.csv"
!head $metadata_filename

"","cell_ID","orig.ident","x","y","layer","major_clusters"
"1","c7469794_Cell16_Pos_0","c7469794",1657.65536533512,1430.50005594719,"layer_wm","O"
"2","c7469794_Cell18_Pos_0","c7469794",932.254005655042,1321.40904806786,"layer_wm","O"
"3","c7469794_Cell20_Pos_0","c7469794",1425.79448036797,1205.44417038864,"layer_wm","A"
"4","c7469794_Cell21_Pos_0","c7469794",753.763954909684,1168.33206573408,"layer_wm","O"
"5","c7469794_Cell22_Pos_0","c7469794",1772.5161559656,1121.03790598647,"layer_wm","M"
"6","c7469794_Cell23_Pos_0","c7469794",1618.08926603483,1102.82256998016,"layer_wm","O"
"7","c7469794_Cell25_Pos_0","c7469794",1410.69010889292,1078.07350272232,"layer_wm","O"
"8","c7469794_Cell31_Pos_0","c7469794",945.290132547865,892.119694738252,"layer_wm","O"
"9","c7469794_Cell32_Pos_0","c7469794",1448.11346960168,855.845911949686,"layer_wm","O"


In [6]:
# Load the per-cell metadata table.
# usecols leaves out the unnamed row-number column at the start of each CSV
# line, so index_col=0 picks the first *selected* column: cell_ID becomes the
# index (see the displayed head below).
metadata = pandas.read_csv(
    processed_data_dir / "5samples_metadata_2022_1220.csv", 
    index_col=0,
    usecols=["cell_ID", "orig.ident", "x", "y", "layer", "major_clusters"],
)
metadata.head()

Unnamed: 0_level_0,orig.ident,x,y,layer,major_clusters
cell_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
c7469794_Cell16_Pos_0,c7469794,1657.655365,1430.500056,layer_wm,O
c7469794_Cell18_Pos_0,c7469794,932.254006,1321.409048,layer_wm,O
c7469794_Cell20_Pos_0,c7469794,1425.79448,1205.44417,layer_wm,A
c7469794_Cell21_Pos_0,c7469794,753.763955,1168.332066,layer_wm,O
c7469794_Cell22_Pos_0,c7469794,1772.516156,1121.037906,layer_wm,M


In [7]:
# Map the bare numeric donor id to the sample label used in the tables
# (e.g. "7469794" -> "c7469794").
# NOTE: str.replace removes every "c" and "ad" in the label, not just a
# leading prefix; that is harmless as long as the rest of the label is digits.
donors = {
    x.replace("c", "").replace("ad", ""): x for x in metadata["orig.ident"].unique()
}
donors

{'7469794': 'c7469794',
 '5194210': 'c5194210',
 '4933693': 'ad4933693',
 '6341028': 'ad6341028',
 '7948794': 'ad7948794'}

In [8]:
# Classify each sample label by its prefix: labels starting with "c" are
# controls, every other label is treated as alzheimers.
disease_state = {
    x: "control" if x.startswith("c") else "alzheimers" for x in metadata["orig.ident"].unique()
}
disease_state

{'c7469794': 'control',
 'c5194210': 'control',
 'ad4933693': 'alzheimers',
 'ad6341028': 'alzheimers',
 'ad7948794': 'alzheimers'}

In [9]:
def get_md5sum(filename, chunk_size=1 << 20):
    """Return the hex MD5 digest of a file, cached in an extended attribute.

    The digest is stored in the ``user.md5sum`` extended attribute of the
    file. When that attribute is already listed for the file it is returned
    without re-reading the file.

    :Parameters:
      - filename: path of the file to hash
      - chunk_size: number of bytes read per step while hashing

    :Returns: the MD5 digest as a hexadecimal string
    """
    if "user.md5sum" not in xattr.listxattr(filename):
        md5 = hashlib.md5()
        with open(filename, "rb") as instream:
            # Read fixed-size blocks: iterating a binary file "by line" can
            # load an arbitrarily large span when the data has no newlines.
            for block in iter(lambda: instream.read(chunk_size), b""):
                md5.update(block)
        # Store the digest as ASCII bytes, the same way tar_and_hash does.
        xattr.setxattr(filename, "user.md5sum", md5.hexdigest().encode("ascii"))

    return xattr.getxattr(filename, "user.md5sum").decode("ascii")


# Build experiment specific count matrix

In [10]:
# Load the combined count matrix; the first CSV column becomes the row index
# (gene names in the displayed head) and each remaining column is one cell.
count_matrix = pandas.read_csv(processed_data_dir / "5samples_countmatrix_diff1_p20_fdr20_mincount25_2022_0915.csv", index_col=0)
print(count_matrix.shape)
count_matrix.head()

(1373, 62379)


Unnamed: 0,c7469794_Cell16_Pos_0,c7469794_Cell18_Pos_0,c7469794_Cell20_Pos_0,c7469794_Cell21_Pos_0,c7469794_Cell22_Pos_0,c7469794_Cell23_Pos_0,c7469794_Cell25_Pos_0,c7469794_Cell31_Pos_0,c7469794_Cell32_Pos_0,c7469794_Cell35_Pos_0,...,ad7948794_Cell51_Pos_122,ad7948794_Cell65_Pos_122,ad7948794_Cell41_Pos_130,ad7948794_Cell67_Pos_130,ad7948794_Cell94_Pos_130,ad7948794_Cell117_Pos_130,ad7948794_Cell129_Pos_130,ad7948794_Cell135_Pos_133,ad7948794_Cell4_Pos_149,ad7948794_Cell30_Pos_151
FAM107B,0,1,0,2,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HK2,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
LTBP3,0,0,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
CD74,0,0,0,0,17,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
DYNC1LI2,1,0,0,0,0,0,2,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Map donor id -> raw image directory. The donor id is taken to be the text
# after the last "_" in each entry name under raw_data_dir
# (e.g. rush_dlpfc_ad_6341028 -> "6341028").
raw_image_roots = {}
for path in raw_data_dir.iterdir():
    donor_id = path.name.split("_")[-1]
    raw_image_roots[donor_id] = path
raw_image_roots

{'6341028': PosixPath('/woldlab/loxcyc/home/alinares/encode/rush_dlpfc_ad_6341028'),
 '4933693': PosixPath('/woldlab/loxcyc/home/alinares/encode/rush_dlpfc_ad_4933693'),
 '7469794': PosixPath('/woldlab/loxcyc/home/alinares/encode/rush_dlpfc_control_7469794'),
 '7948794': PosixPath('/woldlab/loxcyc/home/alinares/encode/rush_dlpfc_ad_7948794'),
 '5194210': PosixPath('/woldlab/loxcyc/home/alinares/encode/rush_dlpfc_control_5194210')}

In [12]:
# Donor ids that have both metadata rows and a raw image directory.
set(donors).intersection(raw_image_roots)

{'4933693', '5194210', '6341028', '7469794', '7948794'}

In [13]:
# The archive is left open on purpose: submit_stitched_image extracts
# individual members from it later in the notebook.
processed_images = ZipFile(processed_data_dir / "images.zip")

# images[donor_id][channel] -> ZipInfo of that donor's stitched image.
# Member names look like
#   <year>_<day>_rush_dlpfc_<state>_<donor id>_<channel>_stiched.tif
# so the donor id and channel are the 6th and 7th "_"-separated fields.
images = {}
for info in processed_images.infolist():
    if info.is_dir():
        # Directory entries have no image name to parse.
        continue
    parts = Path(info.filename).name.split("_")
    # Unpacking two names still raises ValueError for names with fewer than
    # seven fields, as the original seven-way unpack did.
    donor_id, channel = parts[5:7]
    images.setdefault(donor_id, {})[channel] = info
    
pandas.DataFrame(images).T

Unnamed: 0,abeta,dapi,polyT,tau
7948794,<ZipInfo filename='images/abeta/2021_1031_rush...,<ZipInfo filename='images/dapi/2021_1031_rush_...,<ZipInfo filename='images/polyT_histology/2021...,<ZipInfo filename='images/tau/2021_1031_rush_d...
5194210,<ZipInfo filename='images/abeta/2022_0107_rush...,<ZipInfo filename='images/dapi/2022_0107_rush_...,<ZipInfo filename='images/polyT_histology/2022...,<ZipInfo filename='images/tau/2022_0107_rush_d...
6341028,<ZipInfo filename='images/abeta/2021_1111_rush...,<ZipInfo filename='images/dapi/2021_1111_rush_...,<ZipInfo filename='images/polyT_histology/2021...,<ZipInfo filename='images/tau/2021_1111_rush_d...
4933693,<ZipInfo filename='images/abeta/2021_0910_rush...,<ZipInfo filename='images/dapi/2021_0910_rush_...,<ZipInfo filename='images/polyT_histology/2021...,<ZipInfo filename='images/tau/2021_0910_rush_d...
7469794,<ZipInfo filename='images/abeta/2021_1116_rush...,<ZipInfo filename='images/dapi/2021_1116_rush_...,<ZipInfo filename='images/polyT_histology/2021...,<ZipInfo filename='images/tau/2021_1116_rush_d...
7461192,,<ZipInfo filename='images/dapi/2022_0105_rush_...,<ZipInfo filename='images/polyT_histology/2022...,


In [14]:
# List every stitched image found in the archive as
# donor id <TAB> channel <TAB> file name, ordered by donor id.
for donor_id in sorted(images):
    for channel in images[donor_id]:
        print("{}\t{}\t{}".format(donor_id, channel, Path(images[donor_id][channel].filename).name))

4933693	abeta	2021_0910_rush_dlpfc_ad_4933693_abeta_stiched.tif
4933693	dapi	2021_0910_rush_dlpfc_ad_4933693_dapi_stiched.tif
4933693	polyT	2021_0910_rush_dlpfc_ad_4933693_polyT_stiched.tif
4933693	tau	2021_0910_rush_dlpfc_ad_4933693_tau_stiched.tif
5194210	abeta	2022_0107_rush_dlpfc_control_5194210_abeta_stiched.tif
5194210	dapi	2022_0107_rush_dlpfc_control_5194210_dapi_stiched.tif
5194210	polyT	2022_0107_rush_dlpfc_control_5194210_polyT_stiched.tif
5194210	tau	2022_0107_rush_dlpfc_control_5194210_tau_stiched.tif
6341028	abeta	2021_1111_rush_dlpfc_ad_6341028_abeta_stiched.tif
6341028	dapi	2021_1111_rush_dlpfc_ad_6341028_dapi_stiched.tif
6341028	polyT	2021_1111_rush_dlpfc_ad_6341028_polyT_stiched.tif
6341028	tau	2021_1111_rush_dlpfc_ad_6341028_tau_stiched.tif
7461192	dapi	2022_0105_rush_dlpfc_control_7461192_dapi_stiched.tif
7461192	polyT	2022_0105_rush_dlpfc_control_7461192_polyT_stiched.tif
7469794	abeta	2021_1116_rush_dlpfc_control_7469794_abeta_stiched.tif
7469794	dapi	2021_1116_ru

In [15]:
# Donor ids present in only one of the metadata table and the image archive.
set(donors).symmetric_difference(images)

{'7461192'}

In [16]:
# read from experiments in "rush-ad-encode-seqfish.ods"
# Open the workbook once just to list which sheets it contains.
book = pandas.ExcelFile(spreadsheet_name, engine="odf")
book.sheet_names

['Experiments',
 'Documents',
 'Library',
 'Replicate',
 'File',
 "'file:///home/diane/woldlab/ENCODE/rush-ad-encode-seqfish-test.ods'#Documents"]

In [17]:
# Load the Experiments sheet and pass it to post_sheet as a dry run.
# NOTE(review): post_sheet is presumably expected to validate each row and
# only report what it would create while dry_run=True -- confirm against
# encoded_client. The number of entries it returns is printed below.
experiments = pandas.read_excel(spreadsheet_name, sheet_name="Experiments", engine="odf")

created = server.post_sheet("/experiments/", experiments, 
                            dry_run=True, 
                            verbose=True, 
                            validator=validator)
print(len(created))

0


In [18]:
# Save the sheet only when post_sheet reported created objects.
# NOTE(review): presumably the frame then carries the new identifiers to copy
# back into the master spreadsheet -- confirm.
if len(created) > 0:
    experiments.to_excel("/dev/shm/experiments.xlsx", index=False)

In [19]:
# Display the Experiments sheet as loaded above.
experiments

Unnamed: 0,uuid,accession,aliases:array,biosample_ontology,assay_term_name,rush_id:skip,lab,award
0,5e7397dd-1d3a-4d3f-91ec-d346399459c3,ENCSR876VXR,barbara-wold:seqfish_E4933693,/biosample-types/tissue_UBERON_0006483/,seqFISH,4933693,/labs/barbara-wold/,UM1HG009443
1,8a02d2c1-f18a-4f48-8a03-4b436b451c21,ENCSR970EPL,barbara-wold:seqfish_E5194210,/biosample-types/tissue_UBERON_0006483/,seqFISH,5194210,/labs/barbara-wold/,UM1HG009443
2,20f81829-b726-4e23-bf87-58b8f8e71eff,ENCSR051YOP,barbara-wold:seqfish_E6341028,/biosample-types/tissue_UBERON_0006483/,seqFISH,6341028,/labs/barbara-wold/,UM1HG009443
3,2a47a59d-08e8-4844-b6c2-592760316b12,ENCSR309QSV,barbara-wold:seqfish_E7469794,/biosample-types/tissue_UBERON_0006483/,seqFISH,7469794,/labs/barbara-wold/,UM1HG009443
4,95388e6e-d759-4122-9662-bfe542acbea3,ENCSR398OAO,barbara-wold:seqfish_E7948794,/biosample-types/tissue_UBERON_0006483/,seqFISH,7948794,/labs/barbara-wold/,UM1HG009443


In [20]:
# Load the Documents sheet and run it through post_sheet as a dry run.
documents = pandas.read_excel(spreadsheet_name, sheet_name="Documents", engine="odf")

#this is only going to work for validation, as post_sheets doesn't know how to handle
# (the note above is cut off in the original; presumably it refers to the
# image file that each document has to carry -- confirm)
created = server.post_sheet("/documents/", documents, 
                            dry_run=True, 
                            verbose=True, 
                            validator=validator)


In [21]:
# Read the Documents sheet again (the previous cell loaded it as well).
documents = pandas.read_excel(spreadsheet_name, sheet_name="Documents", engine="odf")

def submit_stitched_image(documents, row, dry_run, verbose):
    """Submit one stitched single-channel image as a document.

    The image is looked up in the module-level ``images`` index by the row's
    ``rush_id:skip`` and ``channel_short_name:skip`` values, extracted from
    ``processed_images`` into the working directory, and renamed so that
    "stiched" in its path reads "stitched". The extracted file is removed
    again whether or not the submission succeeds.

    :Parameters:
      - documents: the Documents sheet (not used by this function)
      - row: the sheet row describing the document
      - dry_run: when true, validate the payload but do not post it
      - verbose: when true, print the server response after posting

    :Returns: "would create" on a dry run, the uuid reported by the server
      after a real post, or None when the row already has a uuid.
    """
    rush_id = str(row["rush_id:skip"])
    stain = row["channel_short_name:skip"]

    zip_entry = images[rush_id][stain]
    extracted = Path(zip_entry.filename)
    corrected_name = zip_entry.filename.replace("stiched", "stitched")

    # Rows that already carry a uuid are left alone.
    if not pandas.isnull(row["uuid"]):
        return None

    try:
        processed_images.extract(zip_entry)
        extracted = extracted.rename(corrected_name)

        payload = Document(
            extracted,
            aliases=row["aliases:array"].split(","),
            document_type=row["document_type"],
            description=row["description"],
            server=server
        ).create_payload()
        validator.validate(payload, "document")

        if not dry_run:
            response = server.post_json("/documents/", payload)
            if verbose:
                print(response)
            uuid = response["@graph"][0].get("uuid")
        else:
            uuid = "would create"
    finally:
        # Remove whichever file exists under the current name: the renamed
        # copy, or the freshly extracted one if the rename never happened.
        if extracted.exists():
            extracted.unlink()
    return uuid
    
def submit_composite_image(documents, row, dry_run, verbose):
    """Submit one composite overview image as a document.

    The image file is taken from a fixed directory of composites, using the
    file name in the row's ``filename:skip`` column.

    :Parameters:
      - documents: the Documents sheet (not used by this function)
      - row: the sheet row describing the document
      - dry_run: when true, validate the payload but do not post it
      - verbose: when true, print the server response after posting

    :Returns: the row's existing uuid when it already has one, "would create"
      on a dry run, otherwise the uuid reported by the server.
    """
    henry_composites = Path("/woldlab/castor/proj/alzheimers/work/2023010900/submission/")
    source = henry_composites / row["filename:skip"]

    if not pandas.isnull(row["uuid"]):
        # Already submitted. Previously this path reached `return uuid` with
        # the name unbound and raised UnboundLocalError; returning the
        # existing uuid keeps the sheet unchanged if a caller stores it.
        return row["uuid"]

    document = Document(
        source,
        aliases=row["aliases:array"].split(","),
        document_type=row["document_type"],
        description=row["description"],
        server=server
    )
    payload = document.create_payload()
    validator.validate(payload, "document")
    if dry_run:
        uuid = "would create"
    else:
        result = server.post_json("/documents/", payload)
        if verbose:
            print(result)
        uuid = result["@graph"][0].get("uuid")
    return uuid

def submit_documents(documents, dry_run=True, verbose=True):
    """Submit every row of the Documents sheet that has no uuid yet.

    Rows whose ``channel_short_name:skip`` is "composite" go through
    submit_composite_image; all other rows go through submit_stitched_image.
    The returned uuid is written back into the row's ``uuid`` column, so the
    frame passed in is modified in place.

    :Parameters:
      - documents: DataFrame loaded from the Documents sheet
      - dry_run: when true, nothing is posted to the server
      - verbose: when true, server responses are printed

    :Returns: the same DataFrame, with the ``uuid`` column filled in
    """
    # NOTE(review): presumably drops aliases the server already knows about --
    # confirm against encoded_client.
    server.remove_sheet_aliases(validator, documents)
    # Passing total lets tqdm show a percentage instead of a bare counter.
    for i, row in tqdm(documents.iterrows(), total=documents.shape[0]):
        if pandas.isnull(row["uuid"]):
            if row["channel_short_name:skip"] == "composite":
                uuid = submit_composite_image(documents, row, dry_run, verbose)
            else:
                uuid = submit_stitched_image(documents, row, dry_run, verbose)
            documents.loc[i, "uuid"] = uuid

    return documents

# Dry-run the document submission, save the resulting sheet to shared memory,
# and display it.
documents = submit_documents(documents, dry_run=True)
documents.to_excel("/dev/shm/documents.xlsx", index=False)
documents

25it [00:00, 8129.13it/s]


Unnamed: 0,uuid,aliases:array,rush_id:skip,disease_state:skip,channel_long_name:skip,channel_short_name:skip,filename:skip,description,document_type,lab,award
0,29613b20-a9a9-467a-ae61-af3e99476509,barbara-wold:seqfish_4933693_alzheimer_abeta_s...,4933693,alzheimer,Amyloid Beta,abeta,2021_0910_rush_dlpfc_ad_4933693_abeta_stitched...,stitched Amyloid Beta stain,high resolution slide image,/labs/barbara-wold/,UM1HG009443
1,87eaa63a-f4aa-4ce3-96fc-19813b452fc6,barbara-wold:seqfish_4933693_alzheimer_dapi_st...,4933693,alzheimer,DAPI,dapi,2021_0910_rush_dlpfc_ad_4933693_dapi_stitched.tif,stitched DAPI stain,high resolution slide image,/labs/barbara-wold/,UM1HG009443
2,90f6989b-b7e5-45af-8223-0ef7a63c2b96,barbara-wold:seqfish_4933693_alzheimer_polyT_s...,4933693,alzheimer,Poly-T,polyT,2021_0910_rush_dlpfc_ad_4933693_polyT_stitched...,stitched Poly-T stain,high resolution slide image,/labs/barbara-wold/,UM1HG009443
3,c0ac8185-e286-466c-a8b3-a8074197e62b,barbara-wold:seqfish_4933693_alzheimer_tau_sti...,4933693,alzheimer,Tau protein,tau,2021_0910_rush_dlpfc_ad_4933693_tau_stitched.tif,stitched Tau protein stain,high resolution slide image,/labs/barbara-wold/,UM1HG009443
4,6f82a662-9ac9-4bad-9302-73336394028b,barbara-wold:seqfish_4933693_alzheimer_composite,4933693,alzheimer,Composite,composite,2021_0910_rush_dlpfc_ad_4933693_stitched_overv...,"Composite of Amyloid Beta, DAPI, Poly-T and Ta...",high resolution slide image,/labs/barbara-wold/,UM1HG009443
5,76327f5b-69ff-4d7c-956f-09431a10dc24,barbara-wold:seqfish_5194210_control_abeta_sti...,5194210,control,Amyloid Beta,abeta,2022_0107_rush_dlpfc_control_5194210_abeta_sti...,stitched Amyloid Beta stain,high resolution slide image,/labs/barbara-wold/,UM1HG009443
6,7fd4fa03-d457-4cca-b411-bb0ad57b6722,barbara-wold:seqfish_5194210_control_dapi_stit...,5194210,control,DAPI,dapi,2022_0107_rush_dlpfc_control_5194210_dapi_stic...,stitched DAPI stain,high resolution slide image,/labs/barbara-wold/,UM1HG009443
7,281ff017-a706-4ebc-808f-537217e4b32e,barbara-wold:seqfish_5194210_control_polyT_sti...,5194210,control,Poly-T,polyT,2022_0107_rush_dlpfc_control_5194210_polyT_sti...,stitched Poly-T stain,high resolution slide image,/labs/barbara-wold/,UM1HG009443
8,421cf6ca-5d43-4cb9-a639-c36e4be96bcc,barbara-wold:seqfish_5194210_control_tau_stitched,5194210,control,Tau protein,tau,2022_0107_rush_dlpfc_control_5194210_tau_stitc...,stitched Tau protein stain,high resolution slide image,/labs/barbara-wold/,UM1HG009443
9,d6e03fff-5f9a-40f2-8434-4ddebb34339c,barbara-wold:seqfish_5194210_control_composite,5194210,control,Composite,composite,2022_0107_rush_dlpfc_control_5194210_stitched_...,"Composite of Amyloid Beta, DAPI, Poly-T and Ta...",high resolution slide image,/labs/barbara-wold/,UM1HG009443


# Libraries

In [22]:
libraries = pandas.read_excel(spreadsheet_name, sheet_name="Library", engine="odf")

# Sanity check: every document attached to a library must belong to the same
# donor as the library itself. The donor id is read from the ":E<digits>" part
# of the donor alias; each document alias is reduced to the seven characters
# that follow the "barbara-wold:seqfish_" prefix.
for i, row in libraries.iterrows():
    match = re.search(r":E(?P<rush_id>[\d]+)", row["donor:skip"])
    assert match is not None, "row {}: no rush id found in donor {!r}".format(i, row["donor:skip"])
    rush_id_set = set([match.group("rush_id")])
    document_set = set([x.replace("barbara-wold:seqfish_", "")[:7] for x in row["documents:array"].split(",")])
    assert rush_id_set == document_set, "row {}: donor {} but documents reference {}".format(
        i, sorted(rush_id_set), sorted(document_set))


In [23]:
# Load the Library sheet and pass it to post_sheet as a dry run; the number of
# entries post_sheet returns is printed.
libraries = pandas.read_excel(spreadsheet_name, sheet_name="Library", engine="odf")

created = server.post_sheet(
    "/libraries/", 
    libraries, 
    dry_run=True, 
    verbose=True, 
    validator=validator)
print(len(created))

# Save the sheet only when something was reported as created.
if len(created) > 0:
    libraries.to_excel("/dev/shm/libraries.xlsx", index=False)
    
libraries    

0


Unnamed: 0,uuid,accession,aliases:array,donor:skip,biosample,nucleic_acid_term_name,strand_specificity,documents:array,lab,award
0,102222e9-f962-47d4-927c-06cb65b89ce6,ENCLB072LGY,barbara-wold:seqfish_E4933693_library,john-stamatoyannopoulos:E4933693,ENCBS436WHA,RNA,forward,barbara-wold:seqfish_4933693_alzheimer_abeta_s...,/labs/barbara-wold/,UM1HG009443
1,589f8365-bf0d-4a26-9c69-25a443f044ad,ENCLB397BAO,barbara-wold:seqfish_E5194210_library,john-stamatoyannopoulos:E5194210,ENCBS411JUI,RNA,forward,barbara-wold:seqfish_5194210_control_abeta_sti...,/labs/barbara-wold/,UM1HG009443
2,a1f122ec-953f-470e-8445-340b94536dd0,ENCLB346UCC,barbara-wold:seqfish_E6341028_library,john-stamatoyannopoulos:E6341028,ENCBS660AUE,RNA,forward,barbara-wold:seqfish_6341028_alzheimer_abeta_s...,/labs/barbara-wold/,UM1HG009443
3,e85e0572-e58a-4c65-898c-c33c413d0782,ENCLB238CHY,barbara-wold:seqfish_E7469794_library,john-stamatoyannopoulos:E7469794,ENCBS904CGT,RNA,forward,barbara-wold:seqfish_7469794_control_abeta_sti...,/labs/barbara-wold/,UM1HG009443
4,b3b51e87-440b-4061-8a87-67b11203935e,ENCLB440QUF,barbara-wold:seqfish_E7948794_library,john-stamatoyannopoulos:E7948794,ENCBS422TMB,RNA,forward,barbara-wold:seqfish_7948794_alzheimer_abeta_s...,/labs/barbara-wold/,UM1HG009443


In [24]:
# Load the Replicate sheet and pass it to post_sheet as a dry run; the number
# of entries post_sheet returns is printed.
replicates = pandas.read_excel(spreadsheet_name, sheet_name="Replicate", engine="odf")

created = server.post_sheet(
    "/replicates/",
    replicates,
    dry_run=True,
    verbose=True,
    validator=validator)
print(len(created))

# Save the sheet only when something was reported as created.
if len(created) > 0:
    replicates.to_excel("/dev/shm/replicates.xlsx", index=False)

0


In [25]:
# Display the Replicate sheet as loaded above.
replicates

Unnamed: 0,uuid,experiment,biological_replicate_number:integer,technical_replicate_number:integer,library,rush_id:skip,aliases:array
0,a83178a1-19ba-47b8-9c36-4707c895d6db,barbara-wold:seqfish_E4933693,1,1,barbara-wold:seqfish_E4933693_library,4933693,barbara-wold:seqfish_E4933693_replicate_b1_t1
1,4534e7fe-7f6e-4764-9cdd-87c28dbed5b9,barbara-wold:seqfish_E5194210,1,1,barbara-wold:seqfish_E5194210_library,5194210,barbara-wold:seqfish_E5194210_replicate_b1_t1
2,79f7010d-56b9-4935-815f-3fd64096a40c,barbara-wold:seqfish_E6341028,1,1,barbara-wold:seqfish_E6341028_library,6341028,barbara-wold:seqfish_E6341028_replicate_b1_t1
3,a9d7a890-e521-4c51-8287-8f0ba0fe7620,barbara-wold:seqfish_E7469794,1,1,barbara-wold:seqfish_E7469794_library,7469794,barbara-wold:seqfish_E7469794_replicate_b1_t1
4,a53697fb-fd8a-4bd7-8781-2e8229284d63,barbara-wold:seqfish_E7948794,1,1,barbara-wold:seqfish_E7948794_library,7948794,barbara-wold:seqfish_E7948794_replicate_b1_t1


# Prepping and submitting the raw data

In [26]:
def tar_and_hash(directory, target):
    """Create a gzip-compressed tar of ``directory`` and return its MD5 digest.

    The archive is produced by piping ``tar c`` into ``gzip -n`` and is hashed
    while it is written. The digest is stored in the ``user.md5sum`` extended
    attribute of ``target``; when ``target`` already exists that stored value
    is returned and nothing is rebuilt.

    :Parameters:
      - directory: directory to archive, resolved against the current
        working directory by tar
      - target: pathlib.Path of the archive to create

    :Returns: the MD5 digest of the archive as a hexadecimal string

    :Raises:
      - subprocess.CalledProcessError: if tar or gzip exits with a non-zero
        status; the partially written archive is removed first.
    """
    if target.exists():
        return xattr.getxattr(target, "user.md5sum").decode('ascii')

    md5 = hashlib.md5()
    tar = subprocess.Popen(["tar", "c", directory], stdout=subprocess.PIPE)
    compressor = subprocess.Popen(["gzip", "-n"], stdin=tar.stdout, stdout=subprocess.PIPE)
    # Drop our copy of tar's output so tar receives SIGPIPE if gzip dies.
    tar.stdout.close()

    try:
        with open(target, "wb") as outstream:
            # Read until end-of-file. Stopping as soon as gzip has exited can
            # leave the final buffered blocks unread and truncate the archive.
            for block in iter(lambda: compressor.stdout.read(10240), b""):
                outstream.write(block)
                md5.update(block)
        compressor.stdout.close()

        gzip_status = compressor.wait()
        tar_status = tar.wait()
        if tar_status != 0:
            raise subprocess.CalledProcessError(tar_status, tar.args)
        if gzip_status != 0:
            raise subprocess.CalledProcessError(gzip_status, compressor.args)
    except BaseException:
        # Stop anything still running and remove the incomplete archive, so a
        # later call does not mistake it for a finished one.
        for process in (tar, compressor):
            if process.poll() is None:
                process.kill()
                process.wait()
        if target.exists():
            target.unlink()
        raise

    digest = md5.hexdigest()
    xattr.setxattr(target, "user.md5sum", digest.encode("ascii"))
    return digest

def upload_file(encode, validator, metadata, dry_run=True, retry=False):
    """Validate one file's metadata and register the file with the DCC.

    :Parameters:
      - encode: ENCODED instance pointing to the server to upload to
      - validator: DCCValidator instance used to check the metadata
      - metadata: dictionary describing the file
      - dry_run: when true, only report what would be uploaded
      - retry: when true, post again even if the upload marker file exists

    :Returns: the posted item after a real submission; the metadata dictionary
      (with "accession" set to "would create") on a dry run; None when no
      existing file is named in the metadata or the file counts as already
      uploaded.
    """
    if not isinstance(validator, DCCValidator):
        raise RuntimeError("arguments to upload_file changed")

    validator.validate(metadata, "file")

    # Keep the candidate keys that name a path existing on disk. When several
    # qualify, the last one in this order is used.
    usable_fields = [
        key
        for key in ("submitted_file_name", "pathname:skip", "pathname")
        if key in metadata and os.path.exists(metadata[key])
    ]
    if not usable_fields:
        print("Couldn't find file name to upload in metadata")
        print(json.dumps(metadata, indent=4, sort_keys=True))
        return
    file_name_field = usable_fields[-1]

    # The marker file named by make_upload_filename records an earlier upload.
    upload = make_upload_filename(metadata, encode)
    if not retry and os.path.exists(upload):
        print("{} already uploaded".format(metadata[file_name_field]))
        return

    if dry_run:
        print("Would upload {}".format(metadata[file_name_field]))
        metadata["accession"] = "would create"
        return metadata

    item = post_file_metadata(encode, metadata, upload, retry)
    creds = item["upload_credentials"]
    submitted_file_name = item["submitted_file_name"]
    # The copy is not run from here (run_aws_cp is not called); the command is
    # printed so it can be run by hand.
    # NOTE(review): this writes the temporary AWS credentials into the cell
    # output -- clear the output before sharing the notebook.
    print(f"AWS_ACCESS_KEY_ID={creds['access_key']} AWS_SECRET_ACCESS_KEY={creds['secret_key']} AWS_SECURITY_TOKEN={creds['session_token']} aws s3 cp {submitted_file_name} {creds['upload_url']}")

    return item


def submit_image_tar(server, dataset, replicate, current_dir, target, dry_run=True, retry=False):
    """Archive a raw image directory and submit the tarball's file record.

    :Parameters:
      - server: ENCODED instance to submit to
      - dataset: alias or accession of the experiment the file belongs to
      - replicate: alias of the replicate the file belongs to
      - current_dir: directory to archive, relative to the working directory
      - target: pathlib.Path of the tar.gz file to create (or reuse)
      - dry_run: when true, nothing is posted to the server
      - retry: passed on to upload_file

    :Returns: whatever upload_file returns (may be None)
    """
    md5 = tar_and_hash(current_dir, target)

    file_row = {
        #"uuid": None,
        #"accession": None,
        "dataset": dataset,
        "submitted_file_name": target.name,
        "file_format": "tar",
        "output_type": "raw imaging signal",
        "replicate": replicate,
        "md5sum": md5,
        "lab": lab,
        "award": award
    }

    # Record the working directory before the try block so the finally clause
    # always has a directory to return to.
    curdir = os.getcwd()
    try:
        # submitted_file_name is the bare file name, and upload_file checks
        # that it exists relative to the working directory, so run from the
        # directory that holds the archive. target.parent replaces the former
        # reliance on a global archive_dir set by the calling cell.
        os.chdir(target.parent)
        result = upload_file(server, validator, file_row, dry_run=dry_run, retry=retry)
    finally:
        os.chdir(curdir)
    return result


# Pick ONE donor to archive and submit by uncommenting its line; with rush_id
# left as None this cell does nothing.
rush_id = None
#rush_id = "4933693" #"rush_dlpfc_ad_4933693.tar.gz"
#rush_id = "5194210" #"rush_dlpfc_control_5194210.tar.gz"
#rush_id = "6341028" #"rush_dlpfc_ad_6341028.tar.gz"
#rush_id = "7469794" #"rush_dlpfc_control_7469794.tar.gz"
#rush_id = "7948794" #"rush_dlpfc_ad_7948794.tar.gz"

if rush_id is not None:
    # Directory name relative to raw_data_dir, so the paths stored in the tar
    # start at the donor directory.
    current_dir = raw_image_roots[rush_id].relative_to(raw_data_dir)    
    archive_dir = Path("/tmp")
    target = archive_dir / "{}.tar.gz".format(current_dir.name)
    print(target)

    curation_dir = Path("~/proj/encode4-curation").expanduser()
    os.chdir(curation_dir)
    try:
        # tar is run with the relative current_dir, so it must start from
        # raw_data_dir.
        os.chdir(raw_data_dir)
        result = submit_image_tar(
            server, 
            "barbara-wold:seqfish_E{}".format(rush_id), 
            "barbara-wold:seqfish_E{}_replicate_b1_t1".format(rush_id), 
            current_dir, 
            target,
            dry_run=True,
            retry=False,
        )
        print(result)
        # upload_file returns None when the file counts as already uploaded
        # or when it cannot find the file, so guard the lookups.
        if result is not None:
            print(result.get("uuid"), result.get("accession"))
    finally:
        os.chdir(curation_dir)


# Prepping processed data

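This section requires that all of the raw data has been submitted before it is run: each processed file's `derived_from` is filled in from the accession of the replicate's raw tar file in the File sheet.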

In [27]:
def build_metadata_matrix(full_matrix, library_id):
    """Write the rows of one library to a TSV file and return (path, md5sum).

    Rows are chosen by index label: every label that starts with
    ``library_id`` is kept. The file ``rush_dlpfc_<library_id>_obs.tsv`` is
    created in the working directory only when it does not exist yet; an
    existing file is reused without being rewritten.

    :Parameters:
      - full_matrix: DataFrame with one row per cell, indexed by cell id
      - library_id: sample label the wanted cell ids begin with

    :Returns: tuple of (Path of the TSV file, its MD5 digest as hex string)
    """
    row_mask = [label.startswith(library_id) for label in full_matrix.index]
    output_path = Path("rush_dlpfc_{}_obs.tsv".format(library_id))

    if not output_path.exists():
        full_matrix.loc[row_mask, :].to_csv(output_path, sep="\t", index=True)

    return (output_path, get_md5sum(output_path))

def build_library_count_matrix(full_matrix, library_id):
    """Write the count columns of one library to a TSV file and return (path, md5sum).

    Columns are chosen by name: every column whose name starts with
    ``library_id`` is kept, and the row index is labelled "gene_name" in the
    output. The file ``rush_dlpfc_<library_id>_counts.tsv`` is created in the
    working directory only when it does not exist yet; an existing file is
    reused without being rewritten.

    :Parameters:
      - full_matrix: DataFrame of counts with one column per cell
      - library_id: sample label the wanted column names begin with

    :Returns: tuple of (Path of the TSV file, its MD5 digest as hex string)
    """
    columns_filter = [x.startswith(library_id) for x in full_matrix.columns]
    # rename_axis returns a new frame with its own index labels. Assigning to
    # subset.index.name could also rename full_matrix's index, because a
    # column selection may share the row Index object with its source.
    subset_matrix = full_matrix.loc[:, columns_filter].rename_axis(index="gene_name")
    filename = Path("rush_dlpfc_{}_counts.tsv".format(library_id))
    if not filename.exists():
        subset_matrix.to_csv(filename, sep="\t", index=True)

    md5sum = get_md5sum(filename)
    return (filename, md5sum)

In [28]:
# Build the File-sheet rows for the processed tables: one count matrix and one
# cell-coordinate table per replicate.
files = pandas.read_excel(spreadsheet_name, sheet_name="File", engine="odf")

replicates = pandas.read_excel(spreadsheet_name, sheet_name="Replicate", engine="odf")
processed_files = []
for i, row in tqdm(replicates.iterrows(), total=replicates.shape[0]):
    experiment_id = row["experiment"]
    replicate_id = row["aliases:array"].split(",")[0]
    library_id = donors[str(row["rush_id:skip"])]
    # The processed files derive from this replicate's raw tar file; use its
    # accession when the File sheet has one.
    raw_data = files[(files["file_format"] == "tar") & (files["replicate"] == replicate_id)]
    first_raw = raw_data.first_valid_index()
    # first_valid_index() is None when the replicate has no raw tar row at
    # all; treat that like a row whose accession is still empty.
    derived_from = "" if first_raw is None else raw_data.loc[first_raw].accession
    if pandas.isnull(derived_from):
        derived_from = ""
    print(experiment_id, replicate_id, library_id, derived_from)
    
    filename, md5sum = build_library_count_matrix(count_matrix, library_id)
    print(filename, md5sum)
    count = {
        "uuid": None,
        "accession": None,
        "dataset": experiment_id, 
        # Store the name as plain text; a Path object is not JSON serializable.
        "submitted_file_name": str(filename), 
        "md5sum": md5sum,
        "derived_from:array": derived_from,
        "file_format": "tsv", 
        "output_type": "gene quantifications", 
        "replicate": replicate_id, 
        "lab": lab, 
        "award": award
    }
    processed_files.append(count)

    filename, md5sum = build_metadata_matrix(metadata, library_id)
    print(filename, md5sum)
    obs = {
        "uuid": None,
        "accession": None,
        "dataset": experiment_id, 
        "submitted_file_name": str(filename), 
        "md5sum": md5sum,
        "derived_from:array": derived_from,
        "file_format": "tsv", 
        "output_type": "cell coordinates", 
        "replicate": replicate_id, 
        "lab": lab, 
        "award": award
    }
    processed_files.append(obs)

processed_files = pandas.DataFrame(processed_files)
#processed_files.to_excel("rush-ad-seqfish-processed-tables.xlsx", sheet_name="File", index=False)
#created = server.post_sheet("/files/", processed_files, dry_run=True, verbose=True, validator=validator)
#print(created)
processed_files    

 20%|██        | 1/5 [00:00<00:00,  5.58it/s]

barbara-wold:seqfish_E4933693 barbara-wold:seqfish_E4933693_replicate_b1_t1 ad4933693 ENCFF451MKW
rush_dlpfc_ad4933693_counts.tsv 589a6baaf33c2534c3d87ce7a4664703
rush_dlpfc_ad4933693_obs.tsv e05a8bef900b9a7c2d8f61a46ee92220
barbara-wold:seqfish_E5194210 barbara-wold:seqfish_E5194210_replicate_b1_t1 c5194210 ENCFF682UIQ


 40%|████      | 2/5 [00:00<00:00,  4.98it/s]

rush_dlpfc_c5194210_counts.tsv 362208987e3cc406e3a2198b4f8705a5
rush_dlpfc_c5194210_obs.tsv 7956178c1528de02070fb8c7c79fed28
barbara-wold:seqfish_E6341028 barbara-wold:seqfish_E6341028_replicate_b1_t1 ad6341028 ENCFF662GQT


 80%|████████  | 4/5 [00:00<00:00,  5.51it/s]

rush_dlpfc_ad6341028_counts.tsv 22aee79aea4100dcba6b72401a0583d4
rush_dlpfc_ad6341028_obs.tsv e1bcb3294a71e97fae0fe4c2d59ebe38
barbara-wold:seqfish_E7469794 barbara-wold:seqfish_E7469794_replicate_b1_t1 c7469794 ENCFF619GCD
rush_dlpfc_c7469794_counts.tsv 85dccc7a2202be1a09724b3b12ee1d3a
rush_dlpfc_c7469794_obs.tsv 7c348c5047e29d3589e752adf4cd26d3
barbara-wold:seqfish_E7948794 barbara-wold:seqfish_E7948794_replicate_b1_t1 ad7948794 ENCFF852RMT


100%|██████████| 5/5 [00:00<00:00,  5.59it/s]

rush_dlpfc_ad7948794_counts.tsv 434c3aca9d7d04782f1ff21718790bac
rush_dlpfc_ad7948794_obs.tsv 9a8ceb8cb06e39a655e0dad3f5928c46





Unnamed: 0,uuid,accession,dataset,submitted_file_name,md5sum,derived_from:array,file_format,output_type,replicate,lab,award
0,,,barbara-wold:seqfish_E4933693,rush_dlpfc_ad4933693_counts.tsv,589a6baaf33c2534c3d87ce7a4664703,ENCFF451MKW,tsv,gene quantifications,barbara-wold:seqfish_E4933693_replicate_b1_t1,/labs/barbara-wold/,UM1HG009443
1,,,barbara-wold:seqfish_E4933693,rush_dlpfc_ad4933693_obs.tsv,e05a8bef900b9a7c2d8f61a46ee92220,ENCFF451MKW,tsv,cell coordinates,barbara-wold:seqfish_E4933693_replicate_b1_t1,/labs/barbara-wold/,UM1HG009443
2,,,barbara-wold:seqfish_E5194210,rush_dlpfc_c5194210_counts.tsv,362208987e3cc406e3a2198b4f8705a5,ENCFF682UIQ,tsv,gene quantifications,barbara-wold:seqfish_E5194210_replicate_b1_t1,/labs/barbara-wold/,UM1HG009443
3,,,barbara-wold:seqfish_E5194210,rush_dlpfc_c5194210_obs.tsv,7956178c1528de02070fb8c7c79fed28,ENCFF682UIQ,tsv,cell coordinates,barbara-wold:seqfish_E5194210_replicate_b1_t1,/labs/barbara-wold/,UM1HG009443
4,,,barbara-wold:seqfish_E6341028,rush_dlpfc_ad6341028_counts.tsv,22aee79aea4100dcba6b72401a0583d4,ENCFF662GQT,tsv,gene quantifications,barbara-wold:seqfish_E6341028_replicate_b1_t1,/labs/barbara-wold/,UM1HG009443
5,,,barbara-wold:seqfish_E6341028,rush_dlpfc_ad6341028_obs.tsv,e1bcb3294a71e97fae0fe4c2d59ebe38,ENCFF662GQT,tsv,cell coordinates,barbara-wold:seqfish_E6341028_replicate_b1_t1,/labs/barbara-wold/,UM1HG009443
6,,,barbara-wold:seqfish_E7469794,rush_dlpfc_c7469794_counts.tsv,85dccc7a2202be1a09724b3b12ee1d3a,ENCFF619GCD,tsv,gene quantifications,barbara-wold:seqfish_E7469794_replicate_b1_t1,/labs/barbara-wold/,UM1HG009443
7,,,barbara-wold:seqfish_E7469794,rush_dlpfc_c7469794_obs.tsv,7c348c5047e29d3589e752adf4cd26d3,ENCFF619GCD,tsv,cell coordinates,barbara-wold:seqfish_E7469794_replicate_b1_t1,/labs/barbara-wold/,UM1HG009443
8,,,barbara-wold:seqfish_E7948794,rush_dlpfc_ad7948794_counts.tsv,434c3aca9d7d04782f1ff21718790bac,ENCFF852RMT,tsv,gene quantifications,barbara-wold:seqfish_E7948794_replicate_b1_t1,/labs/barbara-wold/,UM1HG009443
9,,,barbara-wold:seqfish_E7948794,rush_dlpfc_ad7948794_obs.tsv,9a8ceb8cb06e39a655e0dad3f5928c46,ENCFF852RMT,tsv,cell coordinates,barbara-wold:seqfish_E7948794_replicate_b1_t1,/labs/barbara-wold/,UM1HG009443


# Finalize pipeline

Hi Diane, I’ve set up the simple pipeline for the seqFISH data at https://www.encodeproject.org/pipelines/ENCPL990KNL/. The processed data can be patched with this step_run: /analysis-step-runs/f3d3bf42-6267-4de6-874b-68d32b90faf4/ In addition to that, we can add https://www.encodeproject.org/files/GRCh38_no_alt_analysis_set_GCA_000001405.15/ in the derived_from for those files, and add assembly for the files as well. I’ll add in the analysis objects for the 5 datasets; I think there are also Disease Series where I can add the seqFISH datasets. Let me know if that makes sense, also happy to help patching.

In [33]:
# Re-read the File sheet so the table shown below reflects the spreadsheet's
# current contents.
files = pandas.read_excel(
    spreadsheet_name,
    engine="odf",
    sheet_name="File",
)
files

Unnamed: 0,uuid,accession,dataset,submitted_file_name,md5sum,derived_from,flowcell_details:json,file_format,output_type,replicate,lab,award
0,ee17742b-9930-488b-aa8c-52b0e7e76e74,ENCFF451MKW,ENCSR876VXR,rush_dlpfc_ad_4933693.tar.gz,126dc6f14483ae042a4d681ff1976e75,,[],tar,raw imaging signal,barbara-wold:seqfish_E4933693_replicate_b1_t1,/labs/barbara-wold/,UM1HG009443
1,7d7b3739-1179-4411-a956-eec4e88e65bc,ENCFF682UIQ,ENCSR970EPL,rush_dlpfc_control_5194210.tar.gz,65e5ad10c801d8f88e2e3111edfa0096,,[],tar,raw imaging signal,barbara-wold:seqfish_E5194210_replicate_b1_t1,/labs/barbara-wold/,UM1HG009443
2,3469e092-b56a-4e51-986e-5acefbdc0c0c,ENCFF662GQT,ENCSR051YOP,rush_dlpfc_ad_6341028.tar.gz,16a39a266ae0e5f44db98b61fdd606f0,,[],tar,raw imaging signal,barbara-wold:seqfish_E6341028_replicate_b1_t1,/labs/barbara-wold/,UM1HG009443
3,5d10cd61-235b-410d-8ecf-00657922575d,ENCFF619GCD,ENCSR309QSV,rush_dlpfc_control_7469794.tar.gz,257f9698df9b50241bfe8cb90410bbd5,,[],tar,raw imaging signal,barbara-wold:seqfish_E7469794_replicate_b1_t1,/labs/barbara-wold/,UM1HG009443
4,840f844e-b7ae-4d6a-809f-76905ba6de47,ENCFF852RMT,ENCSR398OAO,rush_dlpfc_ad_7948794.tar.gz,e2c60c75f645e06c3d4101e72be7dda5,,[],tar,raw imaging signal,barbara-wold:seqfish_E7948794_replicate_b1_t1,/labs/barbara-wold/,UM1HG009443
5,7b92efec-6922-48b3-9397-efec7ccc80b5,ENCFF880DOR,barbara-wold:seqfish_E4933693,rush_dlpfc_ad4933693_counts.tsv,589a6baaf33c2534c3d87ce7a4664703,ENCFF451MKW,tsv,gene quantifications,barbara-wold:seqfish_E4933693_replicate_b1_t1,/labs/barbara-wold/,UM1HG009443,
6,219d01ab-9195-4767-9675-84a3ce82edcd,ENCFF698MRP,barbara-wold:seqfish_E4933693,rush_dlpfc_ad4933693_obs.tsv,e05a8bef900b9a7c2d8f61a46ee92220,ENCFF451MKW,tsv,cell coordinates,barbara-wold:seqfish_E4933693_replicate_b1_t1,/labs/barbara-wold/,UM1HG009443,
7,ebd452b7-8049-4b53-aca3-8b65810608cb,ENCFF608CVW,barbara-wold:seqfish_E5194210,rush_dlpfc_c5194210_counts.tsv,362208987e3cc406e3a2198b4f8705a5,ENCFF682UIQ,tsv,gene quantifications,barbara-wold:seqfish_E5194210_replicate_b1_t1,/labs/barbara-wold/,UM1HG009443,
8,2f3d1f4e-168b-4959-a93c-c9d52c483566,ENCFF161MPN,barbara-wold:seqfish_E5194210,rush_dlpfc_c5194210_obs.tsv,7956178c1528de02070fb8c7c79fed28,ENCFF682UIQ,tsv,cell coordinates,barbara-wold:seqfish_E5194210_replicate_b1_t1,/labs/barbara-wold/,UM1HG009443,
9,4c14cd42-0882-4a24-aadb-8ca9d0c9c202,ENCFF697TRE,barbara-wold:seqfish_E6341028,rush_dlpfc_ad6341028_counts.tsv,22aee79aea4100dcba6b72401a0583d4,ENCFF662GQT,tsv,gene quantifications,barbara-wold:seqfish_E6341028_replicate_b1_t1,/labs/barbara-wold/,UM1HG009443,


In [51]:
# Compare each File-sheet row that lists derived_from values against the
# object currently on the server and report which of derived_from, step_run
# and assembly would need patching.  The patch call itself stays commented
# out, so this cell only prints the pending changes.
files = pandas.read_excel(spreadsheet_name, sheet_name="File", engine="odf")

for i, row in files.iterrows():
    # Rows without derived_from are not checked; rows without an accession
    # have nothing on the server to compare against.
    if pandas.isnull(row["derived_from:array"]) or pandas.isnull(row["accession"]):
        continue

    # Use a server-relative path so the lookup follows whichever host
    # `server` was configured with instead of always hitting production.
    obj = server.get_json("/files/{}/".format(row["accession"]))

    # Tolerate "a, b" style cells by trimming whitespace and dropping empties.
    desired = [item.strip() for item in row["derived_from:array"].split(",") if item.strip()]
    updates = {}

    # The key may be absent on the server object; treat that as an empty list.
    if set(obj.get("derived_from") or []) != set(desired):
        updates["derived_from"] = desired

    # step_run may come back embedded (a dict with "@id") or as a plain id string.
    current_step_run = obj.get("step_run")
    if isinstance(current_step_run, dict):
        current_step_run = current_step_run.get("@id")
    # Blank cells are NaN, and NaN never compares equal, so without the null
    # checks an empty cell would always be queued as an update.
    if not pandas.isnull(row["step_run"]) and current_step_run != row["step_run"]:
        updates["step_run"] = row["step_run"]
    if not pandas.isnull(row["assembly"]) and obj.get("assembly") != row["assembly"]:
        updates["assembly"] = row["assembly"]

    if len(updates) > 0:
        print(obj["accession"], updates)
        #print(server.patch_json(obj["@id"], updates))

ENCFF880DOR {'assembly': 'GRCh38'}
ENCFF608CVW {'assembly': 'GRCh38'}
ENCFF161MPN {'assembly': 'GRCh38'}
ENCFF697TRE {'assembly': 'GRCh38'}
ENCFF489ZNK {'assembly': 'GRCh38'}
ENCFF345NPX {'assembly': 'GRCh38'}
ENCFF845CUG {'assembly': 'GRCh38'}
ENCFF381DOD {'assembly': 'GRCh38'}
ENCFF286PNZ {'assembly': 'GRCh38'}


# Scratchspace

In [None]:
# Display the processed-file table assembled by the cell that builds the
# count-matrix and obs rows.
processed_files

In [None]:
# Display a locally saved workbook; the same file is iterated by the last
# scratch cell.  NOTE(review): the relative path means the result depends on
# the notebook's working directory.
pandas.read_excel("rush-ad-seqfish-processed-tables-posted-test.encodedcc.org.xlsx")

In [None]:
# Display the donors lookup, which the processed-file cell indexes by the
# replicate's rush id (as a string) to obtain a library id.
donors

In [None]:
# NOTE(review): within this section current_dir is only assigned by the
# replicate submission loop further down, so on a fresh kernel this cell
# raises NameError unless an earlier cell outside this section defines it.
current_dir

In [None]:
# Recovery aid for an interrupted upload: flip the guard to rebuild the
# `aws s3 cp` command line from a saved .upload JSON file.
# NOTE(review): the printed command embeds the AWS credentials read from that
# file, so clear this cell's output before sharing the notebook.
if False:
    upload_record = Path("/tmp/rush_dlpfc_ad_7948794.tar.gz.test.encodedcc.org.upload")
    with upload_record.open("rt") as instream:
        data = json.load(instream)["@graph"][0]
    creds = data['upload_credentials']
    submitted_file_name = data["submitted_file_name"]
    print(f"AWS_ACCESS_KEY_ID={creds['access_key']} AWS_SECRET_ACCESS_KEY={creds['secret_key']} AWS_SECURITY_TOKEN={creds['session_token']} aws s3 cp {submitted_file_name} {creds['upload_url']}")
    
    

In [None]:
# NOTE(review): within this section data is only assigned inside the disabled
# `if 0:` recovery block above, so this cell raises NameError unless that
# block was enabled and run (or data was defined elsewhere).
data

In [None]:
# Call submit_image_tar once per row of the Replicate sheet and collect the
# returned rows.  Image directories are expressed relative to raw_data_dir,
# so the working directory is moved there for the duration of the loop and
# restored afterwards even if a submission fails.
replicates = pandas.read_excel(spreadsheet_name, engine="odf", sheet_name="Replicate")
files = []
curdir = os.getcwd()
os.chdir(raw_data_dir)
try:
    for i, row in replicates.iterrows():
        rush_id = str(row["rush_id:skip"])
        current_dir = raw_image_roots[rush_id].relative_to(raw_data_dir)
        file_row = submit_image_tar(
            server,
            row["experiment"],
            row["aliases:array"],
            current_dir,
        )
        files.append(file_row)
finally:
    os.chdir(curdir)

pandas.DataFrame(files)

In [None]:
#xattr.getxattr("/tmp/rush_dlpfc_control_5194210.tar.gz", "user.md5sum")

In [None]:
#xattr.setxattr("/tmp/rush_dlpfc_ad_7948794.tar.gz", "user.md5sum", "e2c60c75f645e06c3d4101e72be7dda5")

In [None]:
# Always raises AssertionError; presumably a deliberate stop so that Run All
# does not continue into the cells below.
assert False

In [None]:
# Disabled by default: flip the guard to pass each created file and its
# upload credentials to run_aws_cp.
if False:
    for created_row in created:
        upload_source = alzheimers_dir / created_row["submitted_file_name"]
        print(run_aws_cp(upload_source, created_row["upload_credentials"]))

In [None]:
# Disabled by default; this cell repeats the upload loop of the cell before it.
if False:
    for created_row in created:
        local_file = alzheimers_dir / created_row["submitted_file_name"]
        print(run_aws_cp(local_file, created_row["upload_credentials"]))

In [None]:
# Fetch every accession listed in the saved workbook from the server and
# print the accession together with the dataset the server reports for it.
posted = pandas.read_excel("rush-ad-seqfish-processed-tables-posted-test.encodedcc.org.xlsx")
for accession in posted["accession"]:
    obj = server.get_json(accession)
    print(obj["accession"], obj["dataset"])