# Introduction

Using the models we have, try to fill in the metadata spreadsheet & submit test records to the IGVF test portal.

This attempt had all the files from each platform type attached to one measurement set. But, together with Jennifer, we decided this is probably a poor idea.

In [1]:
import bz2
from collections import Counter, namedtuple
import datetime
import numpy
import os
import pandas
from pathlib import Path
import re
from subprocess import run, PIPE
import sys
import time
from tqdm import tqdm
import zoneinfo

# Use the mousedemo settings module unless DJANGO_SETTINGS_MODULE is already set.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mousedemo.settings')
# Django setting that turns off its async-safety check, so the synchronous ORM
# calls in this notebook are allowed even when an event loop is running.
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

import django
from django.contrib.auth import get_user_model
from django.db import DEFAULT_DB_ALIAS

# Put the local "mousedemo" directory (resolved against the current working
# directory) on sys.path so the project imports below can be found.
MOUSEDEMO = str(Path("mousedemo").absolute())
if MOUSEDEMO not in sys.path:
    sys.path.append(MOUSEDEMO)

# Configure Django before importing the project's settings and models.
django.setup()

from mousedemo import settings
from igvf_mice import models

AttributeError: module 'django.contrib.admin' has no attribute 'display'

In [None]:
# Location of a local encoded_client checkout; added to sys.path (once) so the
# import below resolves against it.
EC = str(Path("~/proj/encoded_client").expanduser())
if EC not in sys.path:
    sys.path.append(EC)
    
from encoded_client import encoded

In [None]:
# Client object for the IGVF sandbox API host; used below for post_sheet(),
# get_json() and patch_json().
server = encoded.ENCODED("api.sandbox.igvf.org")

# Validator built from that server; handed to every post_sheet() call below.
validator = encoded.DCCValidator(server)

In [None]:
# Field values shared by the rows built in the cells below.
award = "/awards/HG012077/"
labs = ["/labs/lior-pachter/", "/labs/grant-macgregor/", "/labs/barbara-wold/", "/labs/ali-mortazavi/"]
# Submit under the last lab in the list ("/labs/ali-mortazavi/").
lab = labs[-1]
# Used as the "source" field of the rodent_donor and tissue rows.
jax = "/sources/jackson-labs/"
# Used as the "taxa" field of the rodent_donor rows.
species = 'Mus musculus'

In [None]:
# Maps a strain name (looked up with mouse.strain.name) to the value written to
# the "strain_background" column of the rodent_donor sheet.
strain_background = {
    "A/J": "A/J (AJ)",
    "C57BL/6J": "C57BL/6J (B6)",
    "129S1/SvImJ": "129S1/SvImJ (129)",
    "NOD/ShiLtJ": "NOD/ShiLtJ (NOD)",
    # NOTE(review): the key spells "HlLtJ" with a lowercase L while the value
    # spells "H1LtJ" with the digit one -- confirm the value matches the
    # spelling the portal schema expects.
    "NZO/HlLtJ": "NZO/H1LtJ (NZO)",
    "CAST/EiJ": "CAST/EiJ (CAST)",
    "PWK/PhJ": "PWK/PhJ (PWK)",
    "WSB/EiJ": "WSB/EiJ (WSB)",
    #"CAST (M. m. castaneus)",
    #"WSB (M. m. domesticus)",
    #"PWK (M. m. musculus)",
}

In [None]:
# Name of the Split-seq plate this notebook builds a submission for; change it
# here to process a different plate.
plate_name = "IGVF_B01"
# Database record for that plate; .get() raises if no plate has this name.
plate = models.SplitSeqPlate.objects.get(name=plate_name)

In [None]:
# Index every tissue on the plate by tissue name, and every mouse those
# tissues came from by mouse name. Later wells overwrite earlier entries that
# share a name, so each name appears once.
tissues = {}
mice = {}
plate_tissues = (
    found_tissue
    for well in plate.splitseqwell_set.all()
    for biosample in well.biosample.all()
    for found_tissue in biosample.tissue.all()
)
for plate_tissue in plate_tissues:
    tissues[plate_tissue.name] = plate_tissue
    mice[plate_tissue.mouse.name] = plate_tissue.mouse

# Number of distinct tissue names found on the plate.
print(len(tissues))

# rodent_donor

In [None]:
def format_sex(value):
    """Translate a models.SexEnum value into the string used on the sheet.

    Returns "male" or "female"; any other value yields None.
    """
    labels = (
        (models.SexEnum.MALE, "male"),
        (models.SexEnum.FEMALE, "female"),
    )
    for member, label in labels:
        if member == value:
            return label
    return None

def get_accession_string_or_none(record):
    """Return the record's accession names joined with commas.

    ``record.accession.all()`` supplies the accession objects; their ``name``
    attributes are joined in the order returned. Returns None when the record
    has no accessions.
    """
    names = [entry.name for entry in record.accession.all()]
    return ",".join(names) if names else None
    
def _rodent_donor_row(mouse):
    """Build the rodent_donor sheet row for one mouse record."""
    strain = mouse.strain
    return {
        "accession": get_accession_string_or_none(mouse),
        "uuid": None, 
        "aliases:array": f"ali-mortazavi:{mouse.name}",
        "award": award,
        "lab": lab,
        "taxa": species,
        "sex": format_sex(mouse.sex),
        "strain": strain.name,
        "references": None,
        "url": strain.url,
        "source": jax,
        "lot_id": None,
        "product_id": strain.jax_catalog_number,
        "documents": None,
        "alternate_accessions": None,
        "submitter_comment": None,
        "description": None,
        "parents": None,
        "traits": None,
        "phenotypic_features": None,
        "external_resources": None,
        # Raises KeyError for a strain missing from the strain_background table.
        "strain_background": strain_background[strain.name],
        "genotype": None,
        "individual_rodent:boolean": True,
        "rodent_identifier": mouse.name,
    }


# One row per mouse on the plate, ordered by mouse name.
rodent_donor = pandas.DataFrame(
    [_rodent_donor_row(mice[mouse_name]) for mouse_name in sorted(mice)]
)

# With dry_run left True nothing is written to disk by this cell; the sheet is
# only saved when dry_run is False and post_sheet() reports created records.
dry_run = True
created = server.post_sheet(
    "rodent_donor",
    rodent_donor,
    dry_run=dry_run,
    verbose=True,
    validator=validator,
)
if len(created) > 0 and not dry_run:
    rodent_donor.to_excel("{}_rodent_donor.xlsx".format(plate_name))
rodent_donor

# tissue

In [None]:
def get_accession_list_or_none(record, length):
    """Return one accession name per expected entry of a record.

    Parameters:
        record: object whose ``accession.all()`` yields accession objects.
        length: number of entries the caller expects.

    Returns:
        ``[None] * length`` when the record has no accessions, otherwise the
        accession names in the order returned.

    Raises:
        ValueError: the record has accessions, but not exactly ``length``.
    """
    accessions = record.accession.all()
    found = len(accessions)
    if found == 0:
        return [None] * length
    if found != length:
        raise ValueError("Unexpected number of accessions {} {}".format(accessions, length))
    return [x.name for x in accessions]

def _tissue_rows(tissue):
    """Yield one tissue sheet row per ontology term attached to a tissue."""
    donor_alias = f"ali-mortazavi:{tissue.mouse.name}"
    ontology_terms = tissue.ontology_term.all()
    # Either one accession per term, or a list of None placeholders.
    accessions = get_accession_list_or_none(tissue, len(ontology_terms))
    for term, accession_id in zip(ontology_terms, accessions):
        curie = term.curie.replace(":", "_")
        # Any null-like accession value is written as None.
        if numpy.all(pandas.isnull(accession_id)):
            accession_id = None
        yield {
            "accession": accession_id,
            "uuid": None,
            "aliases:array": f"ali-mortazavi:{tissue.name}_{curie}",
            "award": award,
            "lab": lab,
            "source": jax,
            "donors:array": donor_alias,
            "biosample_term": "/sample-terms/{}/".format(curie),
            "term_names:skip": term.name,
        }


# Rows follow the insertion order of the tissues dictionary.
tissue_sheet = pandas.DataFrame(
    [row for tissue in tissues.values() for row in _tissue_rows(tissue)]
)

# The sheet is only saved when dry_run is False and post_sheet() reports
# created records.
dry_run = True
created = server.post_sheet(
    "tissue",
    tissue_sheet,
    dry_run=dry_run,
    verbose=True,
    validator=validator,
)
if len(created) > 0 and not dry_run:
    tissue_sheet.to_excel("{}_tissue_sheet.xlsx".format(plate_name))
tissue_sheet

# measurement_set

In [None]:
def get_samples_donors(plate):
    """Collect the sample and donor aliases used on a plate.

    Walks plate -> wells -> biosamples -> tissues.

    Returns a dictionary with:
        "samples": set of "ali-mortazavi:<tissue name>_<curie>" aliases, one
            per (tissue, ontology term) pair, with ":" in the curie replaced
            by "_".
        "donor": list of "ali-mortazavi:<mouse name>" aliases without
            duplicates, in the order the mice were first seen.
    """
    sample_aliases = set()
    # A dict serves as an insertion-ordered set of donor aliases.
    donor_aliases = {}
    plate_tissues = (
        found
        for well in plate.splitseqwell_set.all()
        for biosample in well.biosample.all()
        for found in biosample.tissue.all()
    )
    for tissue in plate_tissues:
        donor_aliases.setdefault(f"ali-mortazavi:{tissue.mouse.name}")
        for term in tissue.ontology_term.all():
            curie = term.curie.replace(":", "_")
            sample_aliases.add(f"ali-mortazavi:{tissue.name}_{curie}")

    return {"samples": sample_aliases, "donor": list(donor_aliases)}

In [None]:

# Build one measurement_set row for every (subpool, platform family) pair on
# the selected plate that has at least one sequencing run, then validate (and,
# when dry_run is False, submit) the sheet.
measurement_set = []

plate_details = get_samples_donors(models.SplitSeqPlate.objects.get(name=plate_name))
# Every row lists the same plate-wide samples, so join them once. The aliases
# come from a set; sorting makes the column content reproducible between runs.
plate_samples = ",".join(sorted(plate_details["samples"]))
# order_by("family") keeps DISTINCT limited to the selected column (a default
# model ordering, if Platform declares one, would otherwise be added to the
# query and could let duplicate families through) and fixes the row order.
families = models.Platform.objects.values("family").order_by("family").distinct()

for subpool in models.Subpool.objects.filter(plate__name=plate_name).order_by("name"):
    for platform in families:
        subpoolinrun = subpool.subpoolinrun_set.filter(
            sequencing_run__platform__family=platform["family"]
        ).first()
        if subpoolinrun is None:
            # No run of this subpool on this platform family.
            continue

        # Reuse the accessions recorded on the linked measurement set, if any.
        # The value is reset for every row: previously a misspelled assignment
        # in the "no measurement set" branch left it undefined or stale.
        accession = None
        if subpoolinrun.measurement_set is not None:
            names = [x.name for x in subpoolinrun.measurement_set.accession.all()]
            if names:
                accession = ",".join(names)

        alias = "ali-mortazavi:{}_{}".format(subpool.name, platform["family"])
        dcc_row = {
            "accession": accession,
            "uuid": None,
            "aliases:array": alias,
            "award": award,
            "lab": lab,
            "assay_term": "/assay-terms/OBI_0003109/", # single-nucleus RNA sequencing assay
            "documents": None,
            "alternate_accessions": None,
            "submitter_comment": None,
            "description": None,
            "samples:array": plate_samples,
            #"samples_len:skip": len(plate_details["samples"]),
            #"donors:array": ",".join(plate_details["donor"]),
            #"donors_len:skip": len(plate_details["donor"]),
            "protocol": None,
        }
        measurement_set.append(dcc_row)

measurement_set = pandas.DataFrame(measurement_set)
# The sheet is only saved when dry_run is False and post_sheet() reports
# created records.
dry_run = True
created = server.post_sheet("measurement_set", measurement_set, dry_run=dry_run, verbose=True, validator=validator)
if len(created) > 0 and not dry_run:
    measurement_set.to_excel("{}_measurement_set.xlsx".format(plate_name))

measurement_set

In [None]:
# For each accessioned measurement set, compare the samples on the server with
# the samples in the local sheet and patch the server copy when the local list
# is longer.
# NOTE(review): this talks to the server and patches records regardless of the
# dry_run flag used by the other cells.
for index, row in measurement_set.iterrows():
    accession = row["accession"]
    if pandas.isnull(accession) or not accession:
        # Row has no accession yet, so there is no server record to look up.
        continue
    measurement = server.get_json("/measurement_set/{}/".format(accession))
    samples = row["samples:array"].split(",")
    # Treat a record without a "samples" property as having no samples.
    current_samples = measurement.get("samples", [])
    print(accession, len(current_samples), len(samples))
    if len(samples) > len(current_samples):
        print(server.patch_json(measurement["@id"], {"samples": samples}))


In [None]:
# Build one sequence_file row per fastq file recorded for the plate's subpools
# (nanopore runs are skipped), then validate (and, when dry_run is False,
# submit) the sheet.

# Not used below; kept so the name stays defined exactly as before.
families = models.Platform.objects.values("family").distinct()

t0 = time.monotonic()
sequence_file = []
for subpool in models.Subpool.objects.filter(plate__name=plate_name).order_by("name"):
    # sequencing_run_count numbers the runs of this subpool within one
    # platform family, starting at 1. It restarts whenever the family differs
    # from the previous non-nanopore run and advances whenever a file belongs
    # to a different subpool run than the previous file.
    sequencing_run_break = None
    sequencing_run_count = 0
    last_family = None
    # The loop variable is deliberately not called `run`: that name would
    # rebind subprocess.run, which is imported at the top of the file.
    for subpool_in_run in subpool.subpoolinrun_set.all().order_by("id"):
        platform = subpool_in_run.sequencing_run.platform
        platform_term_id = platform.igvf_id
        family = platform.family
        if family == "nanopore":
            continue

        if family != last_family:
            sequencing_run_count = 0
            last_family = family
        # Must match the alias format used for the measurement_set rows.
        measurement_set_alias = "ali-mortazavi:{}_{}".format(subpool.name, family)
        run_files = subpool_in_run.subpoolinrunfile_set.all().order_by(
            "subpool_run", "lane", "fragment", "read"
        )
        for subpool_in_run_file in run_files:
            current_sequencing_run = subpool_in_run_file.subpool_run
            if sequencing_run_break != current_sequencing_run:
                sequencing_run_break = current_sequencing_run
                sequencing_run_count += 1

            # Only illumina files carry a read type.
            illumina_read = subpool_in_run_file.read if family == "illumina" else None

            # One query for the accessions instead of exists() followed by all().
            accession_names = [entry.name for entry in subpool_in_run_file.accession.all()]
            accession = ",".join(accession_names) if accession_names else None

            dcc_row = {
                "accession": accession,
                "uuid": None,
                #"aliases": None,
                "award": award,
                "lab": lab,
                "md5sum": subpool_in_run_file.md5sum,
                "file_format": "fastq",
                "file_set": measurement_set_alias,
                "content_type": "reads",
                "flowcell_id": subpool_in_run_file.flowcell_id,
                "lane:integer": subpool_in_run_file.lane,
                "sequencing_run:integer": sequencing_run_count,
                #"documents:array": None,
                #"submitter_comment": None,
                #"description": None,
                #"dbxrefs": None,
                #"derived_from": None,
                #"file_format_specifications": None,
                "submitted_file_name": subpool_in_run_file.filename,
                "illumina_read_type": illumina_read,
                "sequencing_platform": platform_term_id,
            }
            sequence_file.append(dcc_row)

# Seconds spent building the rows.
print(time.monotonic() - t0)
sequence_file = pandas.DataFrame(sequence_file)

# The sheet is only saved when dry_run is False and post_sheet() reports
# created records.
dry_run = True
created = server.post_sheet("sequence_file", sequence_file, dry_run=dry_run, verbose=True, validator=validator)
if len(created) > 0 and not dry_run:
    sequence_file.to_excel("{}_sequence_file.xlsx".format(plate_name))

sequence_file


In [None]:
# Write all four sheets into a single workbook named after the plate, one
# worksheet per object type and without the DataFrame index column.
submission_book_name = "{}_submission.xlsx".format(plate_name)
print(submission_book_name)
with pandas.ExcelWriter(submission_book_name) as book:
    sheets = (
        ("rodent_donor", rodent_donor),
        ("tissue", tissue_sheet),
        ("measurement_set", measurement_set),
        ("sequence_file", sequence_file),
    )
    for sheet_name, frame in sheets:
        frame.to_excel(book, sheet_name=sheet_name, index=False)
