# Introduction

UCI has a complex google doc they've been using to track their samples so far.

https://docs.google.com/spreadsheets/d/13M6-Ry6oXgkx94BHZOGioYPI6F_hWDqjGgcaNu2JNYs/edit#gid=1838362486

Also there's a metadata spreadsheet for IGVF at
https://docs.google.com/spreadsheets/d/1BLMledzmqOqXnJHzpijgw91IOs-9tSlVeZDG_MtXddk/edit#gid=1284120531

In [1]:
import bz2
from collections import Counter, namedtuple
import datetime
import numpy
import os
import pandas
from pathlib import Path
import re
from subprocess import run, PIPE
import sys
import zoneinfo

os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mousedemo.settings')
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

import django
from django.contrib.auth import get_user_model
from django.db import DEFAULT_DB_ALIAS

MOUSEDEMO = str(Path("mousedemo").absolute())
if MOUSEDEMO not in sys.path:
    sys.path.append(MOUSEDEMO)
    

# Pre-initialization setup

Backup the old ones and make a new database file before we initialize the database.

In [2]:
# Create a new database
result = run(["python3", "manage.py", "check"], capture_output=True)

if len(result.stderr) > 0:
    print(result.stderr.decode("utf-8"))
    
result.check_returncode()

In [3]:
initial = Path("igvf_mice/migrations/0001_initial.py")
if initial.exists():
    initial.unlink()
    
result = run(["python3", "manage.py", "makemigrations", "igvf_mice"], capture_output=True)

if len(result.stderr) > 0:
    print(result.stderr.decode("utf-8"))
    
result.check_returncode()

In [4]:
# Backup several old sqlite database files

db_name = Path("db.sqlite3")

backup_names = []
for i in range(0, 4):
    backup_names.append(Path("db{}.sqlite3".format(i+1)))

if backup_names[-1].exists():
    backup_names[-1].unlink()

if db_name.exists():    
    for i in reversed(range(0, 3)):
        if backup_names[i].exists():
            print("renaming {} to {}".format(backup_names[i], backup_names[i+1]))
            backup_names[i].rename(backup_names[i+1])

    db_name.rename(backup_names[0])
    print("renaming {} to {}".format(db_name, backup_names[0]))

renaming db3.sqlite3 to db4.sqlite3
renaming db2.sqlite3 to db3.sqlite3
renaming db1.sqlite3 to db2.sqlite3
renaming db.sqlite3 to db1.sqlite3


In [5]:
# Create a new database
result = run(["python3", "manage.py", "migrate"], capture_output=True)

if len(result.stderr) > 0:
    print(result.stderr.decode("utf-8"))
    
result.check_returncode()
    
assert Path("db.sqlite3").exists()

# Setup users

Now that we have a fresh clean database, lets create user accounts

In [6]:
django.setup()

from mousedemo import settings
from igvf_mice import models

In [7]:
# Create accounts.
with open("pw", "rt") as instream:
    pw = instream.read().strip()

user_model = get_user_model()
user_model._default_manager.db_manager(DEFAULT_DB_ALIAS).create_superuser(
    username="diane",
    password=pw,   
)

with open("guest.pw", "rt") as instream:
    pw = instream.read().strip()

user_model = get_user_model()
user_model._default_manager.db_manager(DEFAULT_DB_ALIAS).create_user(
    username="guest",
    password=pw,   
)

pw = None


# Load data

In [8]:
#spreadsheet_name = "IGVF_Split-seq_20230118.xlsx"
lizs_sheet_name = "https://woldlab.caltech.edu/nextcloud/index.php/s/eEtjBfDqQFnLpSS/download"

book = pandas.ExcelFile(lizs_sheet_name)
for name in book.sheet_names:
    print(name)

Sample Tissue IDs
Planning
Counting
DissectionSorting
schedule - all
Line information
mice
Samples - 8 founders
Samples - F1s
Samples - CClines
Samples - Bridge
Founder Samples into experiment
F1 Samples into experiment
Bridge samples into experiment
PBMCs
Founder Nuclei isolation Sample
F1 Nuclei isolation SampleID
Plate setups
Experiment
ONT Sequencing
DCC
Tissue_body_weight_metadata
Failed_samples
Sheet40
tissue weights
practice mice
CellDrop_Demo
Gastroc_numbers
Gastroc_all_reps
Recount for gene capture
Gastroc_testing
Cerebellum testing
IGVF002 Pilot Samples
IGVF002 Pulverized Samples
IGVF002 Plate Setup
IGVF002 Metadata


In [9]:
submitted_book_names = {
    "igvftst": {
        "IGVF_003": "https://woldlab.caltech.edu/nextcloud/index.php/s/nBnE6j2RBMkpM9i/download",
        "IGVF_b01": "https://woldlab.caltech.edu/nextcloud/index.php/s/5cJteSWgitN5BDM/download",
    },
    "igvf": {
        "IGVF_b01": "https://woldlab.caltech.edu/nextcloud/index.php/s/HTbfN6btm3kqJXb/download",        
    }
}
submitted_sheets = {}
for accession_prefix in submitted_book_names:
    for book_name in submitted_book_names[accession_prefix]:
        book = pandas.ExcelFile(submitted_book_names[accession_prefix][book_name])

        for sheet_name in book.sheet_names:
            if sheet_name not in submitted_sheets:
                submitted_sheets.setdefault(accession_prefix, {})[sheet_name] = book.parse(sheet_name)
            else:
                submitted_sheets.setdefault(accession_prefix, {})[sheet_name] = pandas.concat([submitted_sheets[sheet_name], book.parse(sheet_name)])
    


Converters

In [10]:
def truncate(model):
    table_name = model._meta.db_table
    assert "\\" not in table_name
    with django.db.connection.cursor() as cursor:
        # I don't know why the sql params didn't work. this is a sql vulnerability waiting to happen
        cursor.execute("delete from \"{}\"".format(table_name))
        cursor.execute("DELETE FROM SQLITE_SEQUENCE WHERE name=\"{}\"".format(table_name))
        cursor.fetchone()

In [11]:
def int_or_none(x):
    if x in ("N/A", '#DIV/0!', '-'):
        return None
    elif pandas.isnull(x):
        return None
    else:
        return int(x)
    
def int_or_0(x):
    if pandas.isnull(x):
        return 0
    else:
        return int(x)

def float_or_none(x):
    if pandas.isnull(x):
        return None
    else:
        return float(x)
    
def str_or_empty(x):
    if pandas.isnull(x):
        return ""
    else:
        return x

def str_or_none(x):
    if pandas.isnull(x):
        return None
    else:
        return x
    
def date_or_none(x):
    if pandas.isnull(x):
        return None
    elif isinstance(x, datetime.datetime):
        return x.date()
    elif x in ("-"):
        return None
    else:
        return x

def datetime_or_none(x):
    if pandas.isnull(x):
        return None
    else:
        return x


# initialize general use tables

In [12]:
# Delete later: switch to just using an enum for the accessions.
#
#accession_namespaces = [
#    {"name": "ENCODE", "homepage": "https://www.encodeproject.org", "accession_prefix": "encode"},
#    {"name": "ENCODE test", "homepage": "https://test.encodedcc.org", "accession_prefix": "encodetst"},
#    {"name": "IGVF", "homepage": "https://data.igvf.org", "accession_prefix": "igvf"},
#    {"name": "IGVF test", "homepage": "https://sandbox.igvf.org", "accession_prefix": "igvftst"},
#]
#
#if models.AccessionNamespace.objects.count() > 0:
#    truncate(models.AccessionNamespace)
#
#for row in accession_namespaces:
#    record = models.AccessionNamespace(
#        name=row["name"],
#        homepage=row["homepage"],
#        accession_prefix=row["accession_prefix"],
#    )
#    record.save()
#
#igvf_namespace = models.AccessionNamespace.objects.get(name="IGVF")
#igvf_test_namespace = models.AccessionNamespace.objects.get(name="IGVF test")

In [13]:
source = [
    {
        "name": "jackson-labs",
        "display_name": "The Jackson Laboratory", 
        "homepage": "http://www.jax.org/index.html", 
        "igvf_id": "/sources/jackson-labs/",
    },
    {
        "name": "parse-biosciences",
        "display_name": "Parse Bioscience", 
        "homepage": "https://www.parsebiosciences.com/",
        "igvf_id": "/sources/parse-biosciences/",
    },
    {
        "name": "illumina",
        "display_name": "Illumina", 
        "homepage": "https://www.illumina.com", 
        "igvf_id": "/sources/illumina/",
    },
]

if models.Source.objects.count() > 0:
    truncate(models.Source)

for row in source:
    record = models.Source(
        name=row["name"],
        homepage=row["homepage"],
        igvf_id=row["igvf_id"])
    record.save()

jax_source = models.Source.objects.get(name="jackson-labs")

In [14]:
# Should this include machine model?
illumina_source = models.Source.objects.get(name="illumina")
platforms = [
    {"display_name": "Nextseq 2000", "name": "nextseq2000", "family": "illumina", "igvf_id": "/platform-terms/EFO_0010963/", "source": illumina_source},
    {"display_name": "Novaseq 6000", "name": "novaseq6000", "family": "illumina", "igvf_id": "/platform-terms/EFO_0008637/", "source": illumina_source},
    {"display_name": "Oxford Nanopore", "name": "nanopore", "family": "nanopore",},
    {"display_name": "Pac Bio", "name": "pacbio", "family": "pacbio",},
]

for row in platforms:
    record = models.Platform(
        name=row["name"],
        display_name=row["display_name"],
        family=row["family"],
        igvf_id=row.get("igvf_id"),
        source=row.get("source"),
    )
    record.save()

# Library kit type & version

In [15]:
library_construction_kit = [
    {"name": "wt-mega", "display_name": "Parse WT Mega", "version": "v2", "source": "parse-biosciences"},
    {"name": "wt", "display_name": "Parse WT", "version": "v2", "source": "parse-biosciences"},
]

if models.LibraryConstructionKit.objects.count() > 0:
    truncate(models.LibraryConstructionKit)

for row in library_construction_kit:
    try:
        source = models.Source.objects.get(name=row["source"])
        
        record = models.LibraryConstructionKit(
            name=row["name"],
            display_name=row["display_name"],
            version=row["version"],
            source=source,
        )
        record.save()        
    except models.Source.DoesNotExist:
        print(f"Couldn't find {row['source']}")
        


# Load LibraryBarcodes

In [16]:
models.LibraryConstructionKit.objects.get(name="wt-mega")

<LibraryConstructionKit: Parse WT Mega v2>

In [17]:
if models.LibraryBarcode.objects.count() > 0:
    truncate(models.LibraryBarcode)


# Load WT Mega v2 first barcode
bc1 = pandas.read_csv("bc_data_n192_v4.csv")

mega_kit = models.LibraryConstructionKit.objects.get(name="wt-mega", version="v2")

for i, row in bc1.iterrows():
    record = models.LibraryBarcode(
        kit=mega_kit,
        name=row["uid"],
        code=row["well"],
        sequence=row["sequence"],
        barcode_type=row["type"], # TODO what do the codes mean again T,R?
    )
    record.save()

mega_subpool = [
    ("1", "CAGATC"),
    ("2", "ACTTGA"),
    ("3", "GATCAG"),
    ("4", "TAGCTT"),
    ("5", "ATGTCA"),
    ("6", "CTTGTA"),
    ("7", "AGTCAA"),
    ("8", "AGTTCC"),
    ("9", "GAGTGG"),
    ("10", "CCGTCC"),
    ("11", "GTAGAG"),
    ("12", "GTCCGC"),
    ("13", "GTGAAA"),
    ("14", "GTGGCC"),
    ("15", "GTTTCG"),
    ("16", "CGTACG"),
]
subpool = pandas.DataFrame(mega_subpool, columns=["code","sequence"])

for i, row in subpool.iterrows():
    record = models.LibraryBarcode(
        kit=mega_kit,
        code=row["code"],
        sequence=row["sequence"],
    )
    record.save()

regular_kit = models.LibraryConstructionKit.objects.get(name="wt", version="v2")    
    
regular_subpool = [
    ("1", "CAGATC"),
    ("2", "ACTTGA"),
    ("3", "GATCAG"),
    ("4", "TAGCTT"),
    ("5", "ATGTCA"),
    ("6", "CTTGTA"),
    ("7", "AGTCAA"),
    ("8", "AGTTCC"),
]
subpool = pandas.DataFrame(regular_subpool, columns=["code","sequence"])

for i, row in subpool.iterrows():
    record = models.LibraryBarcode(
        kit=regular_kit,
        code=row["code"],
        sequence=row["sequence"],
    )
    record.save()
    

# Initialize TissueOntology

In [18]:
seen_tissues = Counter()
for sample_sheet_name in ["Samples - 8 founders", "Samples - Bridge"]:
    tissue_sheet = pandas.read_excel(
        lizs_sheet_name, 
        sample_sheet_name,
        usecols=["Tissue"]).dropna(axis=0, how='all')

    for i, row in tissue_sheet.iterrows():
        if not pandas.isnull(row["Tissue"]):
            seen_tissues[row["Tissue"]] += 1
    print(sample_sheet_name, tissue_sheet.shape, len(seen_tissues))

seen_tissues

Samples - 8 founders (1621, 1) 22
Samples - Bridge (416, 1) 44


Counter({'Hypothalamus/Pituitary': 97,
         'Cerebellum': 81,
         'Cortex/Hippocampus left': 85,
         'Cortex/Hippocampus right': 85,
         'Liver': 97,
         'Heart': 97,
         'Lung': 81,
         'Adrenal': 97,
         'Kidney': 81,
         'Gonads -1 Ovary': 41,
         'Gonads -2 Oviduct': 41,
         'Perigonadal fat': 81,
         'Brown fat': 81,
         'Soleus': 81,
         'Plantaris': 81,
         'Gastrocnemius': 82,
         'TA': 81,
         'EDL': 81,
         'Tail': 121,
         'PBMC - WBC': 121,
         'Gonads -1 Testis': 40,
         'Gonads -2 Epididymis': 40,
         'Left cortex': 24,
         'Right cortex': 24,
         'Left hippocampus': 24,
         'Right hippocampus': 24,
         'Testis left': 8,
         'Testis right': 8,
         'Epididymis left': 8,
         'Epididymis right': 8,
         'Kidney left': 16,
         'Kidney right': 16,
         'Gastrocnemius left': 16,
         'Gastrocnemius right': 16,
         

In [19]:
tissue_dissection_to_ontology_map = {
    'Hypothalamus/Pituitary': [("UBERON:0001898","hypothalamus"), ("UBERON:0000007","pituitary gland")],
    'Cerebellum': [("UBERON:0002037","cerebellum")],
    # Hippocampus might be:
    #   Hippocampal formation UBERON:0002421 https://www.ebi.ac.uk/ols/ontologies/uberon/terms?iri=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FUBERON_0002421
    #   Layer of hippocampus UBERON:0002305 https://www.ebi.ac.uk/ols/ontologies/uberon/terms?iri=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FUBERON_0002305
    'Cortex/Hippocampus left': [("NTR:0000646","left cerebral cortex"), ("NTR:0000750", "Hippocampal formation left")],
    'Cortex/Hippocampus right': [("NTR:0000647","right cerebral cortex"), ("NTR:0000751", "Hippocampal formation right")],
    'Liver': [("UBERON:0002107", "liver")],
    'Heart': [("UBERON:0000948","heart")],
    'Lung': [("UBERON:0002048","lung")],
    'Adrenal': [("UBERON:0002369","adrenal gland")],
    'Kidney': [("UBERON:0002113","kidney")],
    'Kidney left': [("UBERON:0002113","kidney")],
    'Kidney right': [("UBERON:0002113","kidney")],
    "Ovary": [("UBERON:0000992", "ovary")],    
    'Gonads -1 Ovary': [("UBERON:0000992", "ovary")],
    'Gonads -1 (Ovary)': [("UBERON:0000992", "ovary")],
    'Oviduct': [("UBERON:0000993","oviduct")],
    'Gonads -2 Oviduct': [("UBERON:0000993","oviduct")],
    'Gonads -2 (Oviduct)': [("UBERON:0000993","oviduct")],
    'Gonads - 2 (Oviduct)': [("UBERON:0000993","oviduct")],
    'Perigonadal fat': [("UBERON:0003428", "gonadal fat pad")], #"is closer" to dissection
    'Brown fat': [("UBERON:0001348","brown adipose tissue")], 
    'Soleus': [("UBERON:0001389","soleus muscle")],
    'Plantaris': [("UBERON:0011905","plantaris")],
    'Gastrocnemius': [("UBERON:0001388","gastrocnemius")],
    'Gastrocnemius left': [("UBERON:0001388","gastrocnemius")],
    'Gastrocnemius right': [("UBERON:0001388","gastrocnemius")],
    'TA': [("UBERON:0001385","tibialis anterior")],
    'TA left': [("UBERON:0001385","tibialis anterior")],
    'TA right': [("UBERON:0001385","tibialis anterior")],
    'EDL': [("UBERON:0001386","extensor digitorum longus")],
    'Tail': [("UBERON:0002415","tail")],
    'PBMC - WBC': [("CL:2000001", "peripheral blood mononuclear cell")],
    'Gonads -1 Testis': [("UBERON:0000473","testis")],
    'Gonads -2 Epididymis': [("UBERON:0001301","epididymis")],
    "Left cortex": [("NTR:0000646","left cerebral cortex"),],
    "Cortex left": [("NTR:0000646","left cerebral cortex"),],
    "Right cortex": [("NTR:0000647","right cerebral cortex"),],
    "Cortex right": [("NTR:0000647","right cerebral cortex"),],
    "Left hippocampus": [("NTR:0000750", "Hippocampal formation left")],
    "Hippocampus left": [("NTR:0000750", "Hippocampal formation left")],
    "Right hippocampus": [("NTR:0000751", "Hippocampal formation right")],
    "Hippocampus right": [("NTR:0000751", "Hippocampal formation right")],
    # does left & right matter?
    "Testis left": [("UBERON:0000473", "Testis")],
    "Testis right": [("UBERON:0000473", "Testis")],
    "Epididymis left": [("UBERON:0001301", "Epididymis")],
    "Epididymis right": [("UBERON:0001301", "Epididymis")],
}

print(set(seen_tissues).difference(tissue_dissection_to_ontology_map.keys()))

assert len(set(seen_tissues).difference(tissue_dissection_to_ontology_map.keys())) == 0, "Add in more names to term ids"

set()


In [20]:
term_details = pandas.read_csv("obo.tsv.bz2", compression="bz2", sep="\t", index_col="term_id")

if models.OntologyTerm.objects.count() > 0:
    truncate(models.OntologyTerm)
    
for key in tissue_dissection_to_ontology_map:
    for term_curie, term_name in tissue_dissection_to_ontology_map[key]:
        if term_curie.startswith("NTR:"):
            description=None
        else:
            details = term_details.loc[term_curie]
            description = details.description

        record = models.OntologyTerm(
            curie=term_curie,
            name=term_name,
            description=description
        )
        record.save()
    

In [21]:
strain_urls = {
    "AJ": "https://www.jax.org/strain/000646",
    "B6J": "https://www.jax.org/strain/000664",
    "129S1J": "https://www.jax.org/strain/002448",
    "NODJ": "https://www.jax.org/strain/001976",
    "NZOJ": "https://www.jax.org/strain/002105",
    "CASTJ": "https://www.jax.org/strain/000928",
    "PWKJ": "https://www.jax.org/strain/003715",
    "WSBJ": "https://www.jax.org/strain/001145",
    'B6129SF1J': "https://www.jax.org/strain/101043",
    'B6AF1J': "https://www.jax.org/strain/100002",
    'B6CASTF1J': None,
    'B6NODF1J': None,
    'B6NZOF1J': None,
    'B6PWKF1J': None,
    'B6WSBF1J': "https://www.jax.org/strain/019019",
    'TREM2R47HNSS_HO': 'https://www.jax.org/strain/033781',
    'CC001': "https://www.jax.org/strain/021238",
    'CC002': "https://www.jax.org/strain/021236",
    'CC003': "https://www.jax.org/strain/021237",
    'CC004': "https://www.jax.org/strain/020944",
    'CC005': "https://www.jax.org/strain/020945",
    'CC006': "https://www.jax.org/strain/022869",
    'CC007': "https://www.jax.org/strain/029625",
    'CC008': "https://www.jax.org/strain/026971",
    'CC009': "https://www.jax.org/strain/018856",
    'CC010': "https://www.jax.org/strain/021889",
    'CC011': "https://www.jax.org/strain/018854",
    'CC012': "https://www.jax.org/strain/028409",
    'CC013': "https://www.jax.org/strain/021892",
    'CC015': "https://www.jax.org/strain/018859",
    'CC017': "https://www.jax.org/strain/022870",
    'CC018': "https://www.jax.org/strain/021890",
    'CC024': "https://www.jax.org/strain/021891",
    'CC025': "https://www.jax.org/strain/018857",
    'CC028': "https://www.jax.org/strain/025126",
    'CC029': "https://www.jax.org/strain/026972",
    'CC030': "https://www.jax.org/strain/025426",
    'CC032': "https://www.jax.org/strain/020946",
    'CC036': "https://www.jax.org/strain/025127",
    'CC037': "https://www.jax.org/strain/025423",
    'CC038': None,
    'CC041': "https://www.jax.org/strain/021893",
    'CC055': None,
    'CC057': "https://www.jax.org/strain/024683",
    'CC060': "https://www.jax.org/strain/026427",
    'CC062': None,
    'CC065': None,
    'CC071': None,
    'CC074': "https://www.jax.org/strain/018855",
}

# used as name. Should this be 
strain_igvf_id = {
    "AJ": "A/J (AJ)",
    "B6J": "C57BL/6J (B6)",
    "129S1J": "129S1/SvImJ (129)",
    "NODJ": "NOD/ShiLtJ (NOD)",
    "NZOJ": "NZO/HlLtJ (NZO)",
    "CASTJ": "CAST/EiJ (CAST)",
    "PWKJ": "PWK/PhJ (PWK)",
    "WSBJ": "WSB/EiJ (WSB)",
    'B6129SF1J': "B6129SF1/J",
    'B6AF1J': "B6AF1/J",
    'B6CASTF1J': "B6CASTF1/J",
    'B6NODF1J': "B6NODF1/J",
    'B6NZOF1J': "B6NZOF1/J",
    'B6PWKF1J': "B6PWKF1/J",
    'B6WSBF1J': "B6WSBF1/J",
    "TREM2R47HNSS_HO": "TREM2R47HNSS_HO",
    'CC001': "CC001",
    'CC002': "CC002",
    'CC003': "CC003",
    'CC004': "CC004",
    'CC005': "CC005",
    'CC006': "CC006",
    'CC007': "CC007",
    'CC008': "CC008",
    'CC009': "CC009",
    'CC010': "CC010",
    'CC011': "CC011",
    'CC012': "CC012",
    'CC013': "CC013",
    'CC015': "CC015",
    'CC017': "CC017",
    'CC018': "CC018",
    'CC024': "CC024",
    'CC025': "CC025",
    'CC028': "CC028",
    'CC029': "CC029",
    'CC030': "CC030",
    'CC032': "CC032",
    'CC036': "CC036",
    'CC037': "CC037",
    'CC038': "CC038",
    'CC041': "CC041",
    'CC055': "CC055",
    'CC057': "CC057",
    'CC060': "CC060",
    'CC062': "CC062",
    'CC065': "CC065",
    'CC071': "CC071",
    'CC074': "CC074",
}
strain_name = {k: strain_igvf_id[k].split(" ")[0] for k in strain_igvf_id}
strain_name_to_code = {strain_name[k]: k for k in strain_name}

In [22]:
def int_csv_to_hex(x):
    r, g, b = x.split(",")
    
    return "#{:02x}{:02x}{:02x}".format(int(r), int(g), int(b))

def remove_citation(x):
    return x.split(" ")[0]

strains = pandas.read_excel(
    lizs_sheet_name, 
    sheet_name="Line information", 
    usecols=[
        "Designation",
        "Strain",
        "Note",
        "Jax Catalog No",
        "Sample CODE",
        "Strain notes",
    ],
    converters={
        "Strain": remove_citation,
        "Jax Catalog No": str,
        "Strain notes": str_or_empty,
    }
).dropna(how="all")

# Remove the extra lines with the bridge samples for now. I don't want to track thta level of detail
#bridge = strains[strains["Designation"] == "Bridge sample"].first_valid_index() - 1
#strains = strains.iloc[0:bridge]

# Add in Strain URL
strains["see_also"] = strains["Sample CODE"].apply(lambda x: strain_urls.get(x, x))
strains

Unnamed: 0,Designation,Strain,Note,Jax Catalog No,Sample CODE,Strain notes,see_also
0,A,A/J,CC founder,646.0,AJ,Yellower adrenal gland.,https://www.jax.org/strain/000646
1,B,C57BL/6J,CC founder,664.0,B6J,,https://www.jax.org/strain/000664
2,C,129S1/SvImJ,CC founder,2448.0,129S1J,Male skin is tougher than female skin. Gallbla...,https://www.jax.org/strain/002448
3,D,NOD/ShiLtJ,CC founder,1976.0,NODJ,,https://www.jax.org/strain/001976
4,E,NZO/HlLtJ,CC founder,2105.0,NZOJ,,https://www.jax.org/strain/002105
5,F,CAST/EiJ,CC founder,928.0,CASTJ,,https://www.jax.org/strain/000928
6,G,PWK/PhJ,CC founder,3715.0,PWKJ,,https://www.jax.org/strain/003715
7,H,WSB/EiJ,CC founder,1145.0,WSBJ,,https://www.jax.org/strain/001145
8,,B6129SF1/J,CC F1,,B6129SF1J,,https://www.jax.org/strain/101043
9,,B6AF1/J,CC F1,,B6AF1J,,https://www.jax.org/strain/100002


In [23]:
strain_type_lookup = {
    "CC founder": models.StrainType.FOUNDER,
    "CC F1": models.StrainType.F1,
    "CC Cross": models.StrainType.CROSS,
    "CC Mutant": models.StrainType.MUTANT,
}

if models.MouseStrain.objects.count() > 0:
    truncate(models.MouseStrain)

for i, row in strains.iterrows():
    record = models.MouseStrain(
        name=row["Sample CODE"],
        display_name=strain_name[row["Sample CODE"]],
        igvf_id=strain_igvf_id[row["Sample CODE"]],
        strain_type=strain_type_lookup[row["Note"]],
        jax_catalog_number=row["Jax Catalog No"],
        notes=str_or_empty(row["Strain notes"]),
        see_also=row["see_also"],
        source=jax_source,
    )
    record.save()

This is closest to the Mouse table

In [24]:
submitted_mice = {
}

for accession_prefix in submitted_sheets:
    for i, row in submitted_sheets[accession_prefix]["rodent_donor"].iterrows():
        if not pandas.isnull(row["accession"]):
            rodent_identifier = row["rodent_identifier"]
            if accession_prefix == "igvftst":
                see_also_template = "https://api.sandbox.igvf.org/rodent-donors/{}/"
            elif accession_prefix == "igvf":
                see_also_template = "https://api.data.igvf.org/rodent-donors/{}/"
            submitted_mice.setdefault(rodent_identifier, []).append({
                "accession_prefix": accession_prefix,
                "name": row["accession"],
                "uuid": row["uuid"],
                "see_also": see_also_template.format(row["accession"]),
            })
        
print(len(submitted_mice))

8


In [25]:
def estrus_cycle(x):
    if pandas.isnull(x):
        return "NA"
    else:
        return x

sex_lookup = {
    numpy.nan: models.SexEnum.UNKNOWN,
    "Female": models.SexEnum.FEMALE,
    "Male": models.SexEnum.MALE,
}

estrus_stage = {
    "Unknown": models.EstrusCycle.UNKNOWN,
    "Anestrus": models.EstrusCycle.ANESTRUS,
    "Anestrus>Proestrus": models.EstrusCycle.ANESTRUS_PROESTRUS,
    "Proestrus": models.EstrusCycle.PROESTRUS,
    "Proestrus>Estrus": models.EstrusCycle.PROESTRUS_ESTRUS,
    "Estrus": models.EstrusCycle.ESTRUS,
    "Estrus>Metestrus": models.EstrusCycle.ESTRUS_METESTRUS,
    "Metestrus": models.EstrusCycle.METESTRUS,
    "Metestrus>Diestrus": models.EstrusCycle.METESTRUS_DIESTRUS,
    "Diestrus": models.EstrusCycle.DIESTRUS,
    "Diestrus>Proestrus": models.EstrusCycle.DIESTRUS_PROESTRUS,
}

mice = pandas.read_excel(
    lizs_sheet_name, 
    sheet_name="mice", 
    usecols=range(0,16),
    converters={
        "Estres cycle stage": estrus_cycle,
        "Dissection ID": int,
        "RMS number": int_or_none,
        "Housing number": int_or_none,
    }
)

if models.Mouse.objects.count() > 0:
    truncate(models.Mouse)

los_angeles_tz = zoneinfo.ZoneInfo("America/Los_Angeles")



failed = False
for i, row in mice.iterrows():
    if not (pandas.isnull(row["Mouse Name"]) or pandas.isnull(row["Dissection ID"])):
        name = row["Mouse Name"].replace("F1/J", "F1J")
        strain_code = strain_name_to_code[row["Strain"]]
        
        if name == "056_WSBJ_10F->PWKJ_9F":
            name = "056_PWKJ_9F"

        try:
            strain = models.MouseStrain.objects.get(name=strain_code)
        except models.MouseStrain.DoesNotExist as e:
            print(f"Unable to find {row['Strain']} for row {i+2} with code {strain_code}")
            failed = True
            continue

        if not name[-1] in ("M", "F"):
            raise ValueError(f"Unrecognized sex field {mouse_name}")
            
        name_fields = name.split("_")
        
        # deal with TERM2R47HNSS_HO
        if len(name_fields) > 3:
            name_fields = [name_fields[0], "_".join(name_fields[1:-1]), name_fields[-1]]
        #if len(name_fields) != 3:
        #    raise ValueError(f"Wrong number of attributes in {name} line {i+1}")
            
        if not name_fields[1] in strain_igvf_id:
            raise ValueError(f"strain background in {name} not recognized line {i+1}")
        
        sample_box = "" if pandas.isnull(row["Sample box"]) else row["Sample box"]
        
        record = models.Mouse(
            # should i use liz's disection id?
            name=name,
            strain=strain,
            sex=sex_lookup[row["Sex"]],
            weight_g=row["Weight (g)"],
            date_of_birth=row["DOB"].date() if not pandas.isnull(row["DOB"]) else None,
            harvest_date=row["Harvest Date"].date() if not pandas.isnull(row["Harvest Date"]) else None,
            operator=row["Operator"],
            notes=str_or_empty(row["Comments"]),
            sample_box=sample_box,
        )
        
        if not pandas.isnull(row["Estres cycle stage"]):
            # Ignore those transitions.
            stage = row["Estres cycle stage"]
            if stage.endswith("?"):
                stage = stage[:-1]
            record.estrus_cycle=estrus_stage[stage]
       
        record.save()
        
        if name in submitted_mice:
            accessions = []
            for accession_record in submitted_mice[name]:
                accession = models.Accession(
                    accession_prefix=accession_record["accession_prefix"],
                    name=accession_record["name"],
                    uuid=accession_record["uuid"],
                    see_also=accession_record["see_also"],
                )
                accession.save()
                accessions.append(accession)
            record.accession.set(accessions)
            record.save()
        
        
        
assert not failed, "Check warning messages"

In [26]:
name_fields

['627', 'B6J', '10M']

# Tissues

In [27]:
#if models.Tissue.objects.count() > 0:
#    truncate(models.Tissue)

tissue_sheets = ["Samples - 8 founders", "Samples - Bridge"]
tissues = []
for tissue_sheet in tissue_sheets:
    tissue = pandas.read_excel(
        lizs_sheet_name, 
        sheet_name=tissue_sheet,
        header=0,
    ).dropna(axis=0, how="all")
    print(tissue_sheet, tissue.shape)
    tissues.append(tissue)

tissues = pandas.concat(tissues)

Samples - 8 founders (1621, 17)
Samples - Bridge (416, 24)


In [28]:
def parse_timepoint(value):
    value, units = value.split(" ")
    value = float(value)
    
    unit_map = {
        "day": models.AgeUnitsEnum.DAY,
        "days": models.AgeUnitsEnum.DAY,
        "week": models.AgeUnitsEnum.WEEK,
        "weeks": models.AgeUnitsEnum.WEEK,
        "month": models.AgeUnitsEnum.MONTH,
        "months": models.AgeUnitsEnum.MONTH,
    }
    return (value, unit_map[units])


In [29]:
ontology_map = {}
for record in models.OntologyTerm.objects.all():
    ontology_map[record.curie] = record

In [30]:
submitted_tissues = {}

for accession_prefix in submitted_sheets:
    for i, row in submitted_sheets[accession_prefix]["tissue"].iterrows():
        if not pandas.isnull(row["accession"]):
            if accession_prefix == "igvftst":
                see_also_template = "https://api.sandbox.igvf.org/tissues/{}/"
            elif accession_prefix == "igvf":
                see_also_template = "https://api.data.igvf.org/tissues/{}/"
            
            aliases = row["aliases:array"].split(",")
            assert len(aliases) == 1
            alias_fields = aliases[0][len("ali-mortazavi:"):].split("_")
            tissue_id = "_".join(alias_fields[0:4])
            submitted_tissues.setdefault(tissue_id, []).append({
                "accession_prefix": accession_prefix,
                "name": row["accession"],
                "uuid": row["uuid"],
                "see_also": see_also_template.format(row["accession"]),
            })

print(len(submitted_tissues))

8


In [31]:
if models.Tissue.objects.count() > 0:
    truncate(models.Tissue)


failed = False
for i, row in tissues.iterrows():
    if not pandas.isnull(row["Tissue"]) and row["IGVF Sample BOX"] != "DNA Extraction Optimization":
        mouse_tissue_id_label = "Mouse_Tissue ID"
        mouse_tissue_name_fields = row[mouse_tissue_id_label].split("_")
        
        # some validation on tissue name
        if len(mouse_tissue_name_fields) != 4:
            raise ValueError(f"Not enough elements in mouse tissue id {row[mouse_tissue_id_label]}")
            
        if not mouse_tissue_name_fields[1] in strain_name:
            raise ValueError(f"tissue strain field in {row[mouse_tissue_id_label]} not recognized")
        
        mouse_name = "_".join(mouse_tissue_name_fields[0:3])
        try:
            mouse = models.Mouse.objects.get(name=mouse_name)
        except models.Mouse.DoesNotExist:
            print("row {}, {} was not found".format(i+2, mouse_name))
            failed = True
            continue

        genotype = row["Genotype"]
        
        # this is the "label swap" on spreadsheet rows 602-605.
        if mouse_name == "092_CASTJ_10F":
            genotype = "CASTJ"
        # this is the other half the swap on spreadsheet rows 1522-1525
        elif mouse_name == "046_NZOJ_10F":
            genotype = 'NZOJ'

        assert mouse.strain.name == genotype, f"{mouse.strain.name} != {genotype}"
        tissue_terms = []
        for term_curie, term_name in tissue_dissection_to_ontology_map[row["Tissue"]]:
            tissue_terms.append(ontology_map[term_curie])
        
        #age, age_units = parse_timepoint(row["Timepoint"])
        
        if pandas.isnull(row["Approx. sac time"]):
            sac_time = datetime.time(0,0,0)
        else:
            sac_time = datetime.time(
                row["Approx. sac time"].hour,
                row["Approx. sac time"].minute,
                row["Approx. sac time"].second,    
            )

        
        record = models.Tissue(
            mouse=mouse,
            name = row[mouse_tissue_id_label],
            description = row["Tissue"],
            #dissection_time=dissection,
            #age=age,
            #age_units=age_units,
            tube_label=row["Tube label"],
            timepoint_description=row["Timepoint"],
            life_stage=models.LifeStageEnum.ADULT,
            dissector=str_or_empty(row["Dissector"]),
            dissection_notes=str_or_empty(row["Comment"]),
        )

        if not pandas.isnull(row["Dissection date"]):
            record.dissection = datetime.datetime(
                row["Dissection date"].year,
                row["Dissection date"].month,
                row["Dissection date"].day,
                sac_time.hour,
                sac_time.minute,
                sac_time.second,
                tzinfo=los_angeles_tz,
            )
        
        tube_weight_label = "tube weight (g)"
        if not pandas.isnull(row[tube_weight_label]):
            record.tube_weight_g = float(row[tube_weight_label])

        total_weight_label = "tube+tissue wight (g)"
        if not pandas.isnull(row[total_weight_label]):
            record.total_weight_g = float(row[total_weight_label])
        
        record.save()
        record.ontology_term.set(tissue_terms)
        
        if row[mouse_tissue_id_label] in submitted_tissues:
            accessions = []
            for accession_row in submitted_tissues[row[mouse_tissue_id_label]]:
                accession = models.Accession(
                    accession_prefix=accession_row["accession_prefix"],
                    name=accession_row["name"],
                    uuid=accession_row["uuid"],
                    see_also=accession_row["see_also"],
                )
                accession.save()
                accessions.append(accession)
            record.accession.set(accessions)
        record.save()

assert not failed, "Check warning messages."

In [32]:
row

IGVF Sample BOX                            IGVF_BridgeSample_5
Mouse_Tissue ID                               266_CASTJ_10F_34
Unnamed: 2                                                 NaN
Tissue                                     Gastrocnemius right
Sex                                                          F
Timepoint                                             10 weeks
Genotype                                                 CASTJ
Tube label                                              266-34
tube weight (g)                                          1.034
tube+tissue wight (g)                                      NaN
tissue weight (mg) or blood volume (ul)                 -1.034
cryopreserved blood location                               NaN
Dissection date                                            NaN
Approx. sac time                                           NaN
Body weight (g)                                            NaN
Dissector                                              

In [33]:
#submitted_tissues[row[mouse_tissue_id_label]], type(submitted_tissues[row[mouse_tissue_id_label]])

# Information from Samples into experiment

- Total barcoded nuclei (Samples into experiment)

Is this FixedTissue, FixedSample, FixedBiosample?



In [34]:
if models.FixedSample.objects.count() > 0:
    truncate(models.FixedSample)


In [35]:
def load_samples_into_experiment(book_name, sheet_name, header=0):

    samples_into_experiment = pandas.read_excel(
        book_name, 
        sheet_name=sheet_name,
        header=header,
        index_col=None,
    ).dropna(how="all")

    # was trying to normalize two row hierarchical headers, 
    #level0 = []
    #level1 = []
    #for header in samples_into_experiment.columns:
    #    level0.append("" if header[0].startswith("Unnamed: ") else header[0])
    #    level1.append(header[1])
    #samples_into_experiment.columns = pandas.MultiIndex.from_arrays([level0, level1], names=["phase", "name"])

    one_based = 1
    header_lines = 2
    failed = False
    for i, row in samples_into_experiment.iterrows():
        line_no = i + one_based + header_lines
        box_name = row["IGVF Fixation BOX"]
        if isinstance(box_name, str):
            box_name = box_name.strip()

        if box_name in ["IGVF_FIX_001", "IGVF_FIX_002"]:
            continue
            
        if pandas.isnull(row["Mouse_Tissue ID"]):
            continue

        tissue_id = row["Mouse_Tissue ID"].replace("F1/J", "F1J")
        pooled_from = row.get("pooled_from")

        if pandas.isnull(tissue_id):
            continue

        #if (tissue_id.endswith("_25") or tissue_id.endswith("_26")) and pandas.isnull(pooled_from):
        #    print(f"How are we going to merge these samples? {tissue_id}")
        #    continue

        if pandas.isnull(pooled_from):
            pooled_from = [tissue_id]
        else:
            pooled_from = pooled_from.split(",")

        tissues = []
        for pooled_id in pooled_from:
            try:
                tissues.append(models.Tissue.objects.get(name=pooled_id))
            except models.Tissue.DoesNotExist:
                print(f"Tissue {pooled_id} not found in tissue table on {line_no}")
                failed = True

        weight = row["weight (mg)"]
        fixation_date = row["Fixation date"]
        notes = row["Notes"]

        if weight == "#VALUE!":
            pass
        elif weight < 0:
            pass
        elif pandas.isnull(weight):
            pass
        else:
            total_weight = 0
            for tissue in tissues:
                if not pandas.isnull(tissue.weight_mg):
                    total_weight += tissue.weight_mg

            if not numpy.isclose(total_weight, weight):
                print(f"Sum of tissue weights from {pooled_from} {total_weight} doesn't match {weight}. {line_no}")

        sample_id = row["Sample ID"].replace("F1/J", "F1J") if not pandas.isnull(row["Sample ID"]) else None
        if not (pandas.isnull(sample_id) or pandas.isnull(tissue_id)):
            if tissue_id != sample_id:
                print(f"If defined, {tissue_id} should equal {sample_id} line {line_no}")
                failed = True

        cap_label = row["Cap label"]
        if not (pandas.isnull(cap_label) or pandas.isnull(sample_id)):
            sample_fields = sample_id.split("_")
            predicted_label = "_".join([sample_fields[0], sample_fields[-1]])
            if predicted_label != cap_label:
                print(f"{cap_label} should equal {predicted_label} line {line_no}")
                failed = True

        well_id = row["wells in Barcoding plate"]

        record = models.FixedSample(
            name=tissue_id,
            tube_label=cap_label,
            fixation_name=box_name,
            fixation_date=date_or_none(row["Fixation date"]),
            starting_nuclei=int_or_none(row["Total nuclei (x10^6)"]),
            nuclei_into_fixation=int_or_none(row["Nuclei into fixation (x10^6)"]),
            fixed_nuclei=int_or_none(row["Total fixed nuclei (x10^6)"]),
            aliquots_made=int_or_none(row["# aliquots"]),
            aliquot_volume_ul=float_or_none(row["uL per aliquot"]),
        )
        record.save()
        record.tissue.set(tissues)
        record.save()

    assert not failed, "Check warning messages"

    

In [36]:
load_samples_into_experiment(lizs_sheet_name, "Founder Samples into experiment", header=1)
    

Sum of tissue weights from ['046_NZOJ_10F_17'] 116.0 doesn't match 190.0. 557
Sum of tissue weights from ['095_CASTJ_10M_08'] 11.0 doesn't match 15.0. 748


In [37]:
load_samples_into_experiment(lizs_sheet_name, "Bridge samples into experiment", header=0)

In [38]:
#load_samples_into_experiment(lizs_sheet_name, "F1 Samples into experiment", header=1)

# Plate layout

(the hard thing)

In [39]:
WellContent = namedtuple("well_content", ["genotype", "tissue_id"])

def validate_tissue_ids(contents, expected_genotypes):
    for expected, (row_index, row) in zip(expected_genotypes, contents.iterrows()):
        for cell in row:
            fields = cell.split("_")
            assert len(fields) == 4, f"Field label validation fail {cell}"
            # there's a few sample swaps, don't validate them.
            if fields[1] not in ("092_CASTJ_10F_03"):
                assert fields[1] == expected, f"{fields[1]} failed to match {expected}"

def is_plate_name(name):
    return not pandas.isnull(name) and name.startswith("IGVF_")

def find_plate_start(sheet, offsets):
    for plate_id_row in sheet[sheet[offsets["plate_label"]].apply(is_plate_name)].index:
        plate_name = sheet.loc[plate_id_row, offsets["plate_label"]]
        for i in range(plate_id_row, plate_id_row + 4):
            cell = sheet.loc[i, offsets["well_start"]]
            if isinstance(cell, str) and (cell.startswith("Tissue") or cell in ("M", "F")):
                yield (plate_name, i)
                
def get_plate_genotype(sheet, start, offsets):
    plate_tissues = []
    for cell in sheet.loc[start+2:start+9, offsets["tissue_label"]].tolist():
        if not pandas.isnull(cell):
            match = re.match("(B6J|NODJ|AJ|PWKJ|129S1J|CASTJ|WSBJ|NZOJ)", cell)
            if match is not None:
                plate_tissues.append(match.group(0))
            else:
                print("Diffculty matching {} in {} {}".format(cell, start+2, start+9))
    
    return plate_tissues

def parse_single_well_block(sheet, plate_start, offsets):
    # column ids
    simple_plate_column_ids = sheet.loc[plate_start+1, offsets["well_range"]].tolist()

    plate_row_labels = sheet.loc[plate_start+2:plate_start+9, offsets["well_row_label"]].tolist()

    contents = sheet.loc[plate_start+2:plate_start+9, offsets["well_range"]].copy()
    contents.index = plate_row_labels
    contents.columns = simple_plate_column_ids

    genotypes = get_plate_genotype(sheet, plate_start, offsets)
    validate_tissue_ids(contents, genotypes)

    well_contents = {}
    for genotype, (well_row, row) in zip(genotypes, contents.iterrows()):
        for well_column, cell in zip(simple_plate_column_ids, row):
            well_contents[(well_row, str(well_column))] = [WellContent(genotype, cell)]
    return well_contents

def parse_multiplexed_well_block(sheet, start, expected_genotypes, offsets):
    well_re = re.compile("^[A-H]1?[\d]$")
    wells = sheet.loc[start+3, offsets["well_range"]].tolist()
    for cell in wells:
        if pandas.isnull(cell) or well_re.match(cell) is None:
            raise ValueError(f"well value {cell} in row {start+3} does not look correct")

    contents = sheet.loc[start+0:start+1, offsets["well_range"]]
    contents.columns = [(x[0], x[1:]) for x in wells]
    validate_tissue_ids(contents, expected_genotypes)
    contents.index = expected_genotypes

    well_contents = {}
    for genotype, row in contents.iterrows():
        for well, tissue_id in row.iteritems():
            well_contents.setdefault(well, []).append(WellContent(genotype, tissue_id))

    return well_contents

def parse_plate(sheet):
    max_rows_from_plate_name_to_well_names = 4

    offsets = {
        "plate_label": 1,
        "tissue_label": 1,
        "well_row_label": 2,
        "well_start": 3,
        "well_range": slice(3, 10),
    }
    offsets_simple = offsets.copy()
    offsets_simple["well_range"] = slice(3, 14)

    for plate_name, plate_start in find_plate_start(sheet, offsets):
        if plate_name.endswith("XX"):
            return

        well_contents = {}
        genotypes = get_plate_genotype(sheet, plate_start, offsets)        
        
        if plate_name in ("IGVF_B01", "IGVF_012"):
            well_contents.update(parse_single_well_block(sheet, plate_start, offsets_simple))
            complex_blocks = []        
        else:
            well_contents.update(parse_single_well_block(sheet, plate_start, offsets))

            complex_blocks = [
                (plate_start + 12, slice(0, 2)),
                (plate_start + 12 + 5, slice(2, 4)),
                (plate_start + 12 + 10, slice(4, 6)),
                (plate_start + 12 + 15, slice(6, 8)),
            ]

            for complex_start, genotype_slice in complex_blocks:
                expected_genotypes = genotypes[genotype_slice]
                well_contents.update(parse_multiplexed_well_block(sheet, complex_start, expected_genotypes, offsets))
        
        yield (plate_name, well_contents)
        
plate_layout = pandas.read_excel(
    lizs_sheet_name, 
    sheet_name="Plate setups",
    header=None,
)
    

In [40]:
# Validate tissue references:
failed = False
for plate_name, plate_contents in parse_plate(plate_layout):
    for well_id in plate_contents:
        well_contents = plate_contents[well_id]
        
        for well_fraction in well_contents:
            try:
                biosample = models.FixedSample.objects.get(name=well_fraction.tissue_id)
            except models.FixedSample.DoesNotExist:
                print(f"Unable to find {well_fraction.tissue_id} on plate {plate_name}")
                failed = True
                
#assert not failed, "Resolve tissues"

# populate database
if models.SplitSeqWell.biosample.through.objects.count() > 0:
    truncate(models.SplitSeqWell.biosample.through)

if models.SplitSeqWell.objects.count() > 0:
    truncate(models.SplitSeqWell)
    
if models.SplitSeqPlate.objects.count() > 0:
    truncate(models.SplitSeqPlate)

wt_mega_2_kit = models.LibraryConstructionKit.objects.get(name="wt-mega", version="v2")
wt_regular_2_kit = models.LibraryConstructionKit.objects.get(name="wt", version="v2")

errors = 0
for plate_name, plate_contents in parse_plate(plate_layout):
    plate_record = models.SplitSeqPlate(
        name=plate_name,
        size=models.PlateSizeEnum.size_96,
        pool_location=None,
        date_performed=None,
    )
    plate_record.save()

    for well_id in plate_contents:
        well_contents = plate_contents[well_id]
        
        biosamples = []
        for well_content in well_contents:
            try:
                biosamples.append(models.FixedSample.objects.get(name=well_content.tissue_id))
            except models.FixedSample.DoesNotExist:
                print(f"unable to find tissue {well_content.tissue_id} for {plate_name} {well_id}")
                errors += 1
        
        if plate_name in ("IGVF_B01", "IGVF_012"):
            kit = wt_regular_2_kit
        else:
            kit = wt_mega_2_kit
        
        barcodes = models.LibraryBarcode.objects.filter(
            kit=kit,
            code="{}{}".format(well_id[0], well_id[1]),
        )
        
        well_record = models.SplitSeqWell(
            plate=plate_record,
            row=well_id[0],
            column=well_id[1],
        )
        well_record.save()
        well_record.biosample.set(biosamples)
        well_record.barcode.set(barcodes)
        well_record.save()
        
assert not errors, "Check error messages"

# Submitted measurement sets

In [41]:
measurement_sets = {}

for accession_prefix in submitted_sheets:
    for i, row in submitted_sheets[accession_prefix]["measurement_set"].iterrows():
        if not pandas.isnull(row["accession"]):
            aliases = row["aliases:array"].split(',')
            name = aliases[0].replace("ali-mortazavi:", "")

            try:
                record = models.MeasurementSet.objects.get(name=name)
            except models.MeasurementSet.DoesNotExist:
                record = models.MeasurementSet(
                    name=name,
                )
                record.save()

                if accession_prefix == "igvftst":
                    see_also_template = "https://api.sandbox.igvf.org/measurment-sets/{}/"
                elif accession_prefix == "igvf":
                    see_also_template = "https://api.data.igvf.org/measurement-sets/{}/"
                
                accession = models.Accession(
                    accession_prefix=accession_prefix,
                    name=row["accession"],
                    uuid=row["uuid"],
                    see_also=see_also_template.format(row["accession"]),
                )
                accession.save()
                record.accession.add(accession)
                record.save()
            measurement_sets[name] = record
print(len(measurement_sets))

8


# Subpool

## Extract meaning from the Experiments tab

- number of nuclei per subpool (Experiment) 
- cDNA amp # PCR cycles (Experiment)
- cDNA ng/ul in 25ul (experiment)
- total cDNA ng (experiment)
- Bioanalyzer cDNA ave bp length (expierment)
- Sub library Index PCR # (experiment)
- Sublibrary Index (experiment)
- Sequence for SampleSheet (experiment)
- library ng/ul (experiment)
- Bioanalyzer library ave bp length (experiment)
- Nextseq run number (experiment)

- number of 67k aliquots  (experiment)
- number of 8k aliquots (experiment)

- QC # raw reads (1 mismatch) (experiment)

In [42]:
experiment = pandas.read_excel(
    lizs_sheet_name, 
    sheet_name="Experiment",
    header=0,
    index_col=None,
    #usecols=range(0, 49)
)
experiment

Unnamed: 0,Experiment,mice,Sample Box,Fixation Box,Split-seq prep start date,Sublibrary,[Barcoded Nuclei] (nuclei/uL),Barcoded Nuclei Vol. (uL),Total barcoded nuclei (1st 3 rounds),# of backup aliquots,...,Unnamed: 47,Unnamed: 48,Unnamed: 49,Unnamed: 50,Unnamed: 51,Unnamed: 52,Unnamed: 53,Unnamed: 54,Unnamed: 55,Unnamed: 56
0,IGVF_Splitseq_002,B6 (2F+2M),IGVF_Sample_3,IGVF_FIX_001,2022-08-24,002_4A,4540.0,240.0,1089600.0,3.0,...,,,,,,,,,,
1,,CAST (2F+2M),IGVF_Sample_4,IGVF_FIX_002,NaT,002_4B,,,,,...,,,,,,,,,,
2,Fixation V1,,,,NaT,002_64A,,,,,...,,,,,,,,,,
3,1M V1,,,,NaT,002_64B,,,,,...,,,,,,,,,,
4,,,,,NaT,002_64C,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1032,,,,,NaT,,,,,,...,,,,,,,,,,
1033,,,,,NaT,,,,,,...,,,,,,,,,,
1034,,,,,NaT,,,,,,...,,,,,,,,,,
1035,,,,,NaT,,,,,,...,,,,,,,,,,


In [43]:
experiment = pandas.read_excel(
    lizs_sheet_name, 
    sheet_name="Experiment",
    header=0,
    index_col=None,
    #usecols=range(0, 49)
).rename(columns={
    # Unnamed: 42 seems like an index or reference to the sample sheet?
    "Unnamed: 43": "Novaseq raw reads",
    "Unnamed: 44": "Novaseq1 L001",
    "Unnamed: 45": "Novaseq1 L002",
    "Unnamed: 46": "Novaseq1 L003",
    "Unnamed: 47": "Novaseq1 L004",
    "Unnamed: 48": "Novaseq2 L001",
    "Unnamed: 49": "Novaseq2 L002",
    "Unnamed: 50": "Novaseq2 L003",
    "Unnamed: 51": "Novaseq2 L004",
})

experiment

Unnamed: 0,Experiment,mice,Sample Box,Fixation Box,Split-seq prep start date,Sublibrary,[Barcoded Nuclei] (nuclei/uL),Barcoded Nuclei Vol. (uL),Total barcoded nuclei (1st 3 rounds),# of backup aliquots,...,Novaseq1 L004,Novaseq2 L001,Novaseq2 L002,Novaseq2 L003,Novaseq2 L004,Unnamed: 52,Unnamed: 53,Unnamed: 54,Unnamed: 55,Unnamed: 56
0,IGVF_Splitseq_002,B6 (2F+2M),IGVF_Sample_3,IGVF_FIX_001,2022-08-24,002_4A,4540.0,240.0,1089600.0,3.0,...,,,,,,,,,,
1,,CAST (2F+2M),IGVF_Sample_4,IGVF_FIX_002,NaT,002_4B,,,,,...,,,,,,,,,,
2,Fixation V1,,,,NaT,002_64A,,,,,...,,,,,,,,,,
3,1M V1,,,,NaT,002_64B,,,,,...,,,,,,,,,,
4,,,,,NaT,002_64C,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1032,,,,,NaT,,,,,,...,,,,,,,,,,
1033,,,,,NaT,,,,,,...,,,,,,,,,,
1034,,,,,NaT,,,,,,...,,,,,,,,,,
1035,,,,,NaT,,,,,,...,,,,,,,,,,


In [44]:
def is_experiment_name(name):
    not_submittable = ["IGVF_Splitseq_EX"]
    return not pandas.isnull(name) and name.startswith("IGVF_Splitseq")

experiment[experiment["Experiment"].apply(is_experiment_name) & (pandas.notnull(experiment["mice"]))]

Unnamed: 0,Experiment,mice,Sample Box,Fixation Box,Split-seq prep start date,Sublibrary,[Barcoded Nuclei] (nuclei/uL),Barcoded Nuclei Vol. (uL),Total barcoded nuclei (1st 3 rounds),# of backup aliquots,...,Novaseq1 L004,Novaseq2 L001,Novaseq2 L002,Novaseq2 L003,Novaseq2 L004,Unnamed: 52,Unnamed: 53,Unnamed: 54,Unnamed: 55,Unnamed: 56
0,IGVF_Splitseq_002,B6 (2F+2M),IGVF_Sample_3,IGVF_FIX_001,2022-08-24,002_4A,4540.0,240.0,1089600.0,3.0,...,,,,,,,,,,
17,IGVF_Splitseq_003,8 founders (4M+4F),,IGVF_FIX_003,2022-12-01,003_8A,6860.0,250.0,1715000.0,7.0,...,,Total,L001,L002,L003,L004,L001,L002,L003,L004
36,IGVF_Splitseq_004,8 founders (4M+4F),,IGVF_FIX_005,2022-12-15,004_8A,2460.0,640.0,1574400.0,5.0,...,,Total,L001,L002,L003,L004,L001,L002,L003,L004
55,IGVF_Splitseq_005,8 founders (4M+4F),,IGVF_FIX_010,2023-01-23,005_8A,4675.0,390.0,1823250.0,6.0,...,,Total,L001,L002,L003,L004,L001,L002,L003,L004
74,IGVF_Splitseq_006,8 founders (4M+4F),,IGVF_FIX_006,2023-02-10,006_8A,3160.0,780.0,2464800.0,8.0,...,,Total,L001,L002,L003,L004,L001,L002,L003,L004
93,IGVF_Splitseq_007,8 founders (4M+4F),,IGVF_FIX_006,2023-03-07,007_8A,2810.0,580.0,1629800.0,8.0,...,,Total,L001,L002,L003,L004,L001,L002,L003,L004
112,IGVF_Splitseq_008,8 founders (4M+4F),,IGVF_FIX_008,2023-03-24,008_8A,2600.0,250.0,650000.0,0.0,...,,Total,L001,L002,L003,L004,,,,
124,IGVF_Splitseq_B01,CASTJ + B6J (2M+2F),,,2023-04-03,B01_13A,4920.0,83.6,411312.0,10.0,...,,,,,,,,,,
133,IGVF_Splitseq_008B,8 founders (4M+4F),,IGVF_FIX_008,2023-04-19,008B_13A,3330.0,250.0,832500.0,0.0,...,,Total,L001,L002,L003,L004,,,,
146,IGVF_Splitseq_009,8 founders (4M+4F),,IGVF_FIX_004,2023-05-02,009_13A,3370.0,285.0,960450.0,0.0,...,,Total,L001,L002,L003,L004,L001,L002,L003,L004


In [45]:
def find_fixation_start(sheet):
    def is_experiment_name(name):
        not_submittable = ["IGVF_Splitseq_EX"]
        return not pandas.isnull(name) and name.startswith("IGVF_Splitseq") # and not name in not_submittable

    for fixation_id_row in sheet[sheet["Experiment"].apply(is_experiment_name) & (pandas.notnull(experiment["mice"]))].index:
        name = sheet.loc[fixation_id_row, "Experiment"]
        start = fixation_id_row

        stop = sheet.shape[0]
        for i, value in sheet.iloc[start:]["Sublibrary"].iteritems():
            if pandas.isnull(value):
                stop = i - 1
                break
        
        
        yield (name, start, stop)

for name, start, stop in find_fixation_start(experiment):
    print(name, start, stop)

IGVF_Splitseq_002 0 15
IGVF_Splitseq_003 17 33
IGVF_Splitseq_004 36 52
IGVF_Splitseq_005 55 71
IGVF_Splitseq_006 74 91
IGVF_Splitseq_007 93 109
IGVF_Splitseq_008 112 121
IGVF_Splitseq_B01 124 131
IGVF_Splitseq_008B 133 143
IGVF_Splitseq_009 146 161
IGVF_Splitseq_010 163 178
IGVF_Splitseq_011 180 195
IGVF_Splitseq_EX 207 214
IGVF_Splitseq_012 218 225


In [46]:
def parse_selection_type(value):
    if value is None:
        return None
    elif value.lower() in ("normal", "no"):
        return models.SublibrarySelectionType.no_selection
    elif value in ("capture", "EX"):
        return models.SublibrarySelectionType.exome_capture
    else:
        raise ValueError(f"Unrecognized capture type {value}")


def parse_fixation(sheet, experiment_name, start, stop):
    block = sheet.loc[start:stop]

    experiment_name_fields = experiment_name.split("_")
    plate_name = f"{experiment_name_fields[0]}_{experiment_name_fields[-1]}"
    
    aliquots = {}
    for i, row in block.iterrows():
        count = row["# of backup aliquots"]
        size = row["Backup Aliquot Size"]
        if not (pandas.isnull(count) or pandas.isnull(size)):
            if isinstance(size, str) and size.endswith("k"):
                size = size[:-1]
            size = int(size) * 1000
            aliquots[size] = int(count)
    
    experiment = {
        "experiment_name": experiment_name,
        "plate_name": plate_name,
        "prep_date": block.loc[start, "Split-seq prep start date"],
        "unused_aliquots": aliquots,
        "subpool": [],
    }

    for i, row in block.iterrows():
        if not pandas.isnull(row["Sublibrary"]):
            if pandas.isnull(row["NovaSeq Sequencing date"]):
                novaseq_run_id = None
            else:
                novaseq_run_id = row["NovaSeq Sequencing date"].isoformat().split("T")[0]

            if pandas.isnull(row.get("Novaseq raw reads")):
                novaseq_raw_reads = None
                novaseq1_raw_reads = None
                novaseq2_raw_reads = None
            elif row["Novaseq raw reads"] == "Total":
                novaseq_raw_reads = None
                novaseq1_raw_reads = None
                novaseq2_raw_reads = None
            else:
                novaseq_raw_reads = int_or_none(row.get("Novaseq raw reads"))
                novaseq1_raw_reads = sum([int_or_none(row.get(x)) for x in["Novaseq1 L001", "Novaseq1 L002", "Novaseq1 L003", "Novaseq1 L004"]])
                novaseq2_raw_reads = sum([int_or_none(row.get(x)) for x in["Novaseq2 L001", "Novaseq2 L002", "Novaseq2 L003", "Novaseq2 L004"]])

            print(row["Sublibrary Type"], row["Sublibrary"])
            subpool = {
                "name": row["Sublibrary"],
                "subpool_subname": row["Sublibrary"].split("_")[1],
                "nuclei": int_or_none(row["Nuclei / Sublibrary"]),
                "selection_type": parse_selection_type(row["Sublibrary Type"]),
                "cdna_pcr_rounds": row["cDNA Amp # PCR Cycles"],
                "cdna_ng_per_ul": row["[cDNA] (ng/uL)"],
                "cdna_volume": row["cDNA Vol. (uL)"],
                #total_cdna = cdna_ng_per_ul_in_25ul * 25
                "bioanalyzer_date": row["BA date"],
                "cdna_average_bp_length": int_or_none(row["cDNA Ave. Length (bp)"]),
                "index_pcr_no": int_or_none(row["Sub library Index PCR #"]),
                "index": int_or_none(row["Sublibrary Index"]),
                "barcode": row["Sequence for SampleSheet"],
                "library_ng_per_ul": float_or_none(row["[Library] (ng/ul)"]),
                "library_average_bp_length": float_or_none(row["Library Ave. Length (bp)"]),
                "nextseq_run_date": date_or_none(row["QC Sequencing date"]),
                "nextseq_run_id": str_or_none(int_or_none(row["Nextseq run #"])),
                "nextseq_run_raw_reads": int_or_none(row["QC # raw reads  (1 mismatch)"]),
                "novaseq_run_date": date_or_none(row["NovaSeq Sequencing date"]),
                "novaseq_run_id": novaseq_run_id,
                "novaseq_run_raw_reads": novaseq_raw_reads,
                "novaseq1_raw_reads": novaseq1_raw_reads,
                "novaseq2_raw_reads": novaseq2_raw_reads,
            }
            experiment["subpool"].append(subpool)
            #if not pandas.isnull(subpool["nuclei"]):
            #    nuclei = subpool["nuclei"]
            #    aliquots[nuclei] = aliquots.get(nuclei) + 1

    return experiment

#parse_fixation(experiment, "IGVF_Splitseq_003", 17, 32)
#parse_fixation(experiment, "IGVF_Splitseq_008", 108, 117)
parse_fixation(experiment, "IGVF_Splitseq_B01", 124, 131)
#parse_fixation(experiment, "IGVF_Splitseq_008B", 133, 143)


NO B01_13A
NO B01_13B
NO B01_13C
NO B01_13D
NO B01_13E
NO B01_13F
EX B01_13G
NO B01_13H


{'experiment_name': 'IGVF_Splitseq_B01',
 'plate_name': 'IGVF_B01',
 'prep_date': Timestamp('2023-04-03 00:00:00'),
 'unused_aliquots': {13000: 10},
 'subpool': [{'name': 'B01_13A',
   'subpool_subname': '13A',
   'nuclei': 13000,
   'selection_type': <SublibrarySelectionType.no_selection: 'NO'>,
   'cdna_pcr_rounds': '5 + 7',
   'cdna_ng_per_ul': 40,
   'cdna_volume': 25.0,
   'bioanalyzer_date': datetime.datetime(2023, 4, 5, 0, 0),
   'cdna_average_bp_length': 1413,
   'index_pcr_no': 11,
   'index': 1,
   'barcode': 'CAGATC',
   'library_ng_per_ul': 25.2,
   'library_average_bp_length': 437.0,
   'nextseq_run_date': None,
   'nextseq_run_id': None,
   'nextseq_run_raw_reads': None,
   'novaseq_run_date': None,
   'novaseq_run_id': None,
   'novaseq_run_raw_reads': None,
   'novaseq1_raw_reads': None,
   'novaseq2_raw_reads': None},
  {'name': 'B01_13B',
   'subpool_subname': '13B',
   'nuclei': 13000,
   'selection_type': <SublibrarySelectionType.no_selection: 'NO'>,
   'cdna_pcr_ro

In [47]:
platforms = {x.name: x for x in models.Platform.objects.all()}
platforms

{'nextseq2000': <Platform: Nextseq 2000>,
 'novaseq6000': <Platform: Novaseq 6000>,
 'nanopore': <Platform: Oxford Nanopore>,
 'pacbio': <Platform: Pac Bio>}

In [48]:
class SubpoolName:
    def __init__(self, name=None):
        if isinstance(name, str):
            self._normalized = SubpoolName.subpool_name_to_numbers(name)
        elif isinstance(name, list):
            self._normalized = name
        elif name is None:
            self._normalized = None
        else:
            raise ValueError("Unrecognized type {} for {}".format(type(name), name))

    def __str__(self):
        if self._normalized is None:
            return "None"
        else:
            return SubpoolName.numbers_to_subpool_name(self._normalized)
        
    def __repr__(self):
        if self._normalized is None:
            return str("{}()".format(self.__class__.__name__))
        else:
            return str("{}('{}')".format(self.__class__.__name__, str(self)))
        
    def __eq__(self, cmp):
        if self._normalized is None and cmp is None:
            return True
        elif self._normalized is None and cmp._normalized is None:
            return True
        elif self._normalized is None or cmp._normalized is None:
            return False
        elif len(self._normalized) != len(cmp._normalized):
            return False
        else:
            for left, right in zip(self._normalized, cmp._normalized):
                if left != right:
                    return False
                
        return True
    
    def __lt__(self, cmp):
        if self._normalized is None and cmp is None:
            return False
        elif self._normalized is None and cmp._normalized is None:
            return False
        elif self._normalized is None:
            return True
        elif cmp is None or cmp._normalized is None:
            return False
        elif self._normalized[0] < cmp._normalized[0]:
            return True
        elif len(self._normalized) < len(cmp._normalized):
            return True
        elif len(self._normalized) > len(cmp._normalized):
            return False
        else:
            for left, right in zip(self._normalized, cmp._normalized):
                if left == right:
                    continue
                elif left < right:
                    return True
                else:
                    return False
                
        return False
                
    def __gt__(self, cmp):
        if self._normalized is None and cmp is None:
            return False
        elif self._normalized is None and cmp._normalized is None:
            return False
        elif cmp is None or cmp._normalized is None:
            return True
        elif self._normalized[0] > cmp._normalized[0]:
            return True
        elif len(self._normalized) < len(cmp._normalized):
            return False
        elif len(self._normalized) > len(cmp._normalized):
            return True
        else:
            for left, right in zip(self._normalized, cmp._normalized):
                if left == right:
                    continue
                elif left < right:
                    return False
                else:
                    return True
            
        return False
        
    @classmethod
    def subpool_name_to_numbers(cls, name):
        STATE_DIGITS = 1
        STATE_LETTERS = 2
        STATE_ERROR = -1

        digits = []
        letters = []
        state = STATE_DIGITS
        for char in name:
            if state == STATE_DIGITS:
                if char.isdigit():
                    digits.append(char)
                elif char.isalpha():
                    state = STATE_LETTERS
                    letters.append(char)
                else:
                    raise ValueError("Unrecognized symbol {} in {}".format(char, name))
            elif state == STATE_LETTERS:
                if char.isdigit():
                    raise ValueError("Unexpected digit {} in {}".format(char, name))
                elif char.isalpha():
                    letters.append(char)
                else:
                    raise ValueError("Unrecognized symbol {} in {}".format(char, name))

        result = [int("".join(digits))]
        for number in letters:
            if number < "A" or number > "Z":
                raise ValueError("out of bounds letter code in {}".format(name))
            else:
                result.append(ord(number) - ord("A"))

        return result

    @classmethod
    def numbers_to_subpool_name(cls, subpool_list):

        result = [str(subpool_list[0])]
        for char in subpool_list[1:]:
            result.append(chr(char + ord("A")))

        return "".join(result)

    def next_code(self):
        if self._normalized is None:
            return None

        carry = 0
        advancable = []
        for i, place in enumerate(reversed(self._normalized[1:])):
            if i == 0:
                place += 1
            else:
                place += carry
                carry = 0

            if place > 25:
                carry = 1
                place = 0

            advancable.append(place)

        if carry > 0:
            advancable.append(0)

        result = [self._normalized[0]]
        result.extend(reversed(advancable))
        
        return SubpoolName(result)

s = SubpoolName("8ACZ")
print(s)

decoded = SubpoolName.subpool_name_to_numbers("8ACZ") 
print("decoded", decoded)
encoded = SubpoolName.numbers_to_subpool_name(decoded)
print("encoded", encoded)

print(SubpoolName("8ACZ").next_code())



for pairs in [("8A", "8B"), ("8AA", "8Z"), ("8B", "8E"), ("8B", "8A"), ("8Z", "8AA"), ("67A", "67A"), ("8AA", "8AB")]:
    left = SubpoolName(pairs[0])
    right = SubpoolName(pairs[1])
    print(left, right, left < right, left == right, left > right)
    
print(max(SubpoolName("8A"), SubpoolName("8B")))
print(max(None, SubpoolName("8B")))
print(max(SubpoolName("8A"), None))


8ACZ
decoded [8, 0, 2, 25]
encoded 8ACZ
8ADA
8A 8B True False False
8AA 8Z False False True
8B 8E True False False
8B 8A False False True
8Z 8AA True False False
67A 67A False True False
8AA 8AB True False False
8B
8B
8A


# load subpools, sequencing runs and subpools in sequencing run

In [49]:
multiplexed_samples = {}

for accession_prefix in submitted_sheets:
    for i, row in submitted_sheets[accession_prefix]["multiplexed_sample"].iterrows():
        if not pandas.isnull(row["accession"]):
            aliases = row["aliases:array"].split(',')
            name = aliases[0].replace("ali-mortazavi:", "")
            if accession_prefix == "igvftst":
                see_also_template = "https://api.sandbox.igvf.org/multiplexed-samples/{}/"
            elif accession_prefix == "igvf":
                see_also_template = "https://api.data.igvf.org/multiplexed-samples/{}/"

            multiplexed_samples.setdefault(name, []).append({
                "accession_prefix": accession_prefix,
                "name": row["accession"],
                "uuid": row["uuid"],
                "see_also": see_also_template.format(row["accession"]),
            })
print(len(multiplexed_samples))

8


In [50]:
compression=r"(?<compression>gz|bz2|xz|zstd)"
nanopore_library_name_re_template = r"igvf(?P<run>[\d]+)_(?P<library_id>{library_name})_.*_(?P<fragment>[\d]+)\.fastq\.{compression}"
nextseq_library_name_re_template = r"(?P<run>[\d]+)_(?P<library_id>{library_name})_(?P<read_id>[RI][\d])\.fastq\.{compression}"
nextseq_library_index_re_template = r"Sublibrary_(?P<library_id>{library_index})_S(?P<sample_id>[\d]+)_(?P<lane_id>L[\d]+)_(?P<read_id>[RI][\d])_(?P<fragment>[\d]+)\.fastq\.{compression}"
novaseq_library_index_re_template = r"Sublibrary_(?P<library_id>{library_index})_S(?P<sample_id>[\d]+)_(?P<lane_id>L[\d]+)_(?P<read_id>[RI][\d])_(?P<fragment>[\d]+)\.fastq\.{compression}"

expected_names = {
    "IGVF_Splitseq_003": {
        "nanopore": nanopore_library_name_re_template,
        "nextseq": nextseq_library_name_re_template,
        "novaseq": novaseq_library_index_re_template,
    },
    "IGVF_Splitseq_004": {
        "nanopore": nanopore_library_name_re_template,
        "nextseq": nextseq_library_name_re_template,
        "novaseq": novaseq_library_index_re_template,
    },
    "IGVF_Splitseq_005": {
        "nanopore": nanopore_library_name_re_template,
        "nextseq": nextseq_library_index_re_template,
        "novaseq": novaseq_library_index_re_template,
    },
    "IGVF_Splitseq_006": {
        "nanopore": nanopore_library_name_re_template,
        "nextseq": nextseq_library_index_re_template,
        "novaseq": novaseq_library_index_re_template,
    },
    
}

nanopore_runs = {
    "IGVF_Splitseq_003": {
        "name": "2023-01-03-8A",
        "run_date": datetime.datetime(2023,1,3),
        "raw_reads": 87784162,
    }
}



failed_plates = {"IGVF_006",}
failed = False
for experiment_name, experiment_start, experiment_stop in find_fixation_start(experiment):
    fixation = parse_fixation(experiment, experiment_name, experiment_start, experiment_stop)
    
    try:
        plate = models.SplitSeqPlate.objects.get(name=fixation["plate_name"])
    except models.SplitSeqPlate.DoesNotExist:
        print(f"Unable to find plate {fixation['plate_name']}")
        continue
        
    if fixation["plate_name"] in failed_plates:
        print(f"Skipping failed plate {fixation['plate_name']}")
        continue
        
    last_subpools = {}
    
    wt_mega_2_kit = models.LibraryConstructionKit.objects.get(name="wt-mega", version="v2")
    for line_offset, subpool in enumerate(fixation["subpool"]):
        try:
            barcodes = models.LibraryBarcode.objects.filter(
                kit=wt_mega_2_kit, 
                barcode_type=None, 
                code=subpool["index"])
        except models.LibraryBarcode.DoesNotExist:
            print(f"Unable to find barcode {subpool['index']}")

        # validate barcodes
        library_barcodes_sequence = {b.sequence for b in barcodes}
        if not pandas.isnull(subpool["barcode"]):
            expected_sequence = set(subpool["barcode"].split(","))
        else:
            expected_sequence = set()
        if library_barcodes_sequence != expected_sequence:
            print(f"{subpool['name']} Database lookup of barcodes {library_barcodes_sequence} doesn't match human entry {expected_sequence}")
            failed = True
            continue
        
        subpool_prefix = subpool["name"].split("_")[0]
        last_subpools[subpool["nuclei"]] = max(
            last_subpools.get(subpool["nuclei"]), 
            SubpoolName(subpool["subpool_subname"]))

        record = models.Subpool(
            name=subpool["name"],
            plate=plate,
            nuclei=subpool["nuclei"],
            selection_type=subpool["selection_type"],
            cdna_pcr_rounds=subpool["cdna_pcr_rounds"],
            cdna_ng_per_ul=float_or_none(subpool["cdna_ng_per_ul"]),
            cdna_volume=float_or_none(subpool["cdna_volume"]),
            bioanalyzer_date=date_or_none(subpool["bioanalyzer_date"]),
            cdna_average_bp_length=int_or_none(subpool["cdna_average_bp_length"]),
            index_pcr_number=int_or_none(subpool["index_pcr_no"]),
            index=int_or_none(subpool["index"]),
            library_ng_per_ul=float_or_none(subpool["library_ng_per_ul"]),
            library_average_bp_length=int_or_none(subpool["library_average_bp_length"]),
        )
        record.save()
        record.barcode.set(barcodes)
        record.save()
        
        name = "subpool_{}".format(subpool["name"].lower())
        if name in multiplexed_samples:
            for accession_row in multiplexed_samples[name]:
                print(name, record.name, multiplexed_samples[name])
                accession = models.Accession(**accession_row)
                accession.save()
                record.accession.add(accession)
                record.save()            
        
    for nuclei in fixation["unused_aliquots"]:
        subpool_name = last_subpools[nuclei]
        for i in range(fixation["unused_aliquots"][nuclei]):
            subpool_name = subpool_name.next_code()

            record = models.Subpool(
                name="{}_{}".format(subpool_prefix, subpool_name),
                plate=plate,
                nuclei=nuclei,
            )
            print(record.name)
            record.save()

assert not failed

NO 002_4A
NO 002_4B
NO 002_64A
NO 002_64B
NO 002_64C
NO 002_64D
NO 002_64E
NO 002_64F
NO 002_64G
NO 002_64H
NO 002_64I
NO 002_64J
NO 002_64K
NO 002_64L
NO 002_64M
NO 002_64N
Unable to find plate IGVF_002
NO 003_8A
NO 003_13A
NO 003_67A
NO 003_67B
NO 003_67C
NO 003_67D
NO 003_67E
NO 003_67F
NO 003_67G
NO 003_67H
NO 003_67I
NO 003_67J
NO 003_67K
NO 003_67L
NO 003_67M
NO 003_67N
NO 003_67O
003_67P
003_67Q
003_67R
003_67S
003_67T
003_67U
003_67V
003_13B
003_8B
003_8C
NO 004_8A
NO 004_13A
NO 004_67A
NO 004_67B
NO 004_67C
NO 004_67D
NO 004_67E
NO 004_67F
NO 004_67G
NO 004_67H
NO 004_67I
NO 004_67J
NO 004_67K
NO 004_67L
NO 004_67M
NO 004_67N
NO 004_67O
004_67P
004_67Q
004_67R
004_67S
004_67T
004_13B
004_13C
004_8B
004_8C
NO 005_8A
NO 005_13A
NO 005_67A
NO 005_67B
NO 005_67C
NO 005_67D
NO 005_67E
NO 005_67F
NO 005_67G
NO 005_67H
NO 005_67I
NO 005_67J
NO 005_67K
NO 005_67L
NO 005_67M
NO 005_67N
NO 005_67O
005_67P
005_67Q
005_67R
005_67S
005_67T
005_67U
005_13B
005_13C
005_8B
005_8C
005_8D
005_8

NO 008B_67G
NO 008B_67H
NO 008B_67I
NO 008B_67J
Unable to find plate IGVF_008B
EX 009_13A
NO 009_67A
NO 009_67B
NO 009_67C
NO 009_67D
NO 009_67E
NO 009_67F
NO 009_67G
NO 009_67H
NO 009_67I
NO 009_67J
NO 009_67K
NO 009_67L
NO 009_67M
NO 009_67N
NO 009_67O
EX 010_13A
NO 010_67A
NO 010_67B
NO 010_67C
NO 010_67D
NO 010_67E
NO 010_67F
NO 010_67G
NO 010_67H
NO 010_67I
NO 010_67J
NO 010_67K
NO 010_67L
NO 010_67M
NO 010_67N
NO 010_67O
010_67P
010_67Q
010_67R
010_13B
010_13C
010_13D
010_13E
EX 011_13A
NO 011_67A
NO 011_67B
NO 011_67C
NO 011_67D
NO 011_67E
NO 011_67F
NO 011_67G
NO 011_67H
NO 011_67I
NO 011_67J
NO 011_67K
NO 011_67L
NO 011_67M
NO 011_67N
NO 011_67O
011_67P
011_67Q
011_67R
011_67S
011_67T
011_67U
011_67V
011_13B
011_13C
011_13D
011_13E
011_13F
011_13G
011_13H
EX 003_13A
EX 004_13A
EX 005_13A
EX 007_13A
EX 008B_13A
EX 009_13A
EX 010_13A
EX 011_13A
Unable to find plate IGVF_EX
NO 012_13A
NO 012_13B
NO 012_13C
NO 012_13D
NO 012_13E
NO 012_13F
NO 012_13G
NO 012_13H
Unable to find plat

In [51]:
# Old creating runs from the sample sheet

#        if experiment_name in nanopore_runs and subpool["nuclei"] == 8000:
#            measurement = get_or_create_sequencing_run(
#                # nanopore_runs[experiment_name]["name"]
#                name="{}/{}".format(plate.name.lower(), "nanopore"),
#                run_date=nanopore_runs[experiment_name]["run_date"],
#                platform=platforms["nanopore"],
#                plate=plate,
#            )
#            
#            pattern = expected_names[experiment_name]["nanopore"].format(
#                library_name=subpool["subpool_subname"],
#                library_index=line_offset+1,
#                compression=compression,
#            )
#            
#            run = models.SubpoolInRun(
#                subpool=record,
#                sequencing_run=measurement,
#                raw_reads=nanopore_runs[experiment_name]["raw_reads"],
#                status=models.RunStatus.PASS,
#            )
#            run.save()            
#
#        if subpool["nextseq_run_date"] is not None:
#            measurement = get_or_create_sequencing_run(
#                #sublibrary["nextseq_run_id"]
#                name="{}/{}".format(plate.name.lower(), "nextseq"),
#                run_date=subpool["nextseq_run_date"],
#                platform=platforms["nextseq"],
#                plate=plate,
#            )
#
#            pattern = expected_names[experiment_name]["nextseq"].format(
#                library_name=subpool["subpool_subname"],
#                library_index=line_offset+1,
#                compression=compression,
#            )            
#            
#            run = models.SubpoolInRun(
#                subpool=record,
#                sequencing_run=measurement,
#                raw_reads=subpool["nextseq_run_raw_reads"],
#                status=models.RunStatus.PASS,
#            )
#            run.save()
#            
#        if subpool["novaseq_run_date"] is not None:
#            for i, nova_raw_reads in enumerate([subpool["novaseq1_raw_reads"], subpool["novaseq2_raw_reads"]]):
#                #nova_run_name = "{}-{}".format(sublibrary["novaseq_run_id"], i+1)
#                nova_run_name = "{}/{}".format(plate.name.lower(), "nova{}".format(i+1))
#                measurement = get_or_create_sequencing_run(
#                    name=nova_run_name,
#                    run_date=subpool["novaseq_run_date"],
#                    platform=platforms["novaseq"],
#                    plate=plate,
#                )
#
#                pattern = expected_names[experiment_name]["novaseq"].format(
#                    library_name=subpool["subpool_subname"],
#                    library_index=line_offset+1,
#                    compression=compression,
#                )
#
#                run = models.SubpoolInRun(
#                    subpool=record,
#                    sequencing_run=measurement,
#                    raw_reads=nova_raw_reads,
#                    status=models.RunStatus.PASS,                    
#                )
#                run.save()



In [52]:
fastqs = pandas.read_csv(
    "fastq_metadata.tsv", sep="\t",
    dtype={
        "sample_id": str,
        "read_id": str,
        "fragment": str,
        "sequencer_run": str,
        "lane": str,
    }).sort_values(["experiment", "library_id", "lane_id", "read_id"])
fastqs.head()

Unnamed: 0,experiment,filename,md5sum,size,ctime,library_id,sample_id,lane_id,read_id,fragment,compression,sequencer,sequencer_run,flowcell_id,lane,barcode,plate_id
1587,igvf_003,igvf_003/nextseq/003_8A_R1.fastq.gz,bb696e63b6a8710ec8435b60ef6c81f8,8886302014,2023-02-07,1,,,R1,,gz,VH00582,1,AAATMGFHV,1.0,CAGATC,3
1592,igvf_003,igvf_003/nextseq/003_8A_R2.fastq.gz,21967ba1a5a3218508961fac482a2952,5866415324,2023-02-07,1,,,R2,,gz,VH00582,1,AAATMGFHV,1.0,CAGATC,3
1576,igvf_003,igvf_003/nanopore/igvf003_8A_lig-ss_11.fastq.gz,494bd18e8eb9bcef5b79f18f8f21d569,7241241549,2022-12-13,1,,,,11.0,gz,nanopore,e0fe4672609d60d67277a22973f5c8534df5fc4c,FAU06484,,,3
1577,igvf_003,igvf_003/nanopore/igvf003_8A_lig-ss_4.fastq.gz,af524063452a305fa81585f760ad0cd4,1241893857,2022-12-07,1,,,,4.0,gz,nanopore,1ab5f6b09adc4af2c1c9a1f93c7a752037dbdfce,FAU11624,,,3
1578,igvf_003,igvf_003/nanopore/igvf003_8A_lig-ss_7.fastq.gz,0dbdc8a7fe445f8a9e6bbbd7a7d42cfd,7602537389,2022-12-08,1,,,,7.0,gz,nanopore,c3ebdb16d712e04e2ff2f2ea4e7b83be4de41c77,FAU06496,,,3


In [53]:
models.SplitSeqPlate.objects.get(name="IGVF_003")

<SplitSeqPlate: IGVF_003>

In [54]:
models.Subpool.objects.get(plate__name="IGVF_003", index=10).barcode.get().sequence

'CCGTCC'

In [55]:
models.Subpool.objects.filter(plate__name="IGVF_B01")

<QuerySet [<Subpool: B01_13A>, <Subpool: B01_13B>, <Subpool: B01_13C>, <Subpool: B01_13D>, <Subpool: B01_13E>, <Subpool: B01_13F>, <Subpool: B01_13G>, <Subpool: B01_13H>, <Subpool: B01_13I>, <Subpool: B01_13J>, <Subpool: B01_13K>, <Subpool: B01_13L>, <Subpool: B01_13M>, <Subpool: B01_13N>, <Subpool: B01_13O>, <Subpool: B01_13P>, <Subpool: B01_13Q>, <Subpool: B01_13R>]>

In [56]:
for i, row in fastqs.iterrows():
    plate_name = row.experiment.upper()
    library_id = int(row.library_id)

    subpool = models.Subpool.objects.get(plate__name=plate_name, index=library_id)
    assert subpool.barcode.count() == 1, "{} {}".format(subpool, library_id)

    barcode = subpool.barcode.get()

    if barcode.sequence != row.barcode:
        print(f"Barcode mismatch {plate_name} {library_id} {barcode.sequence} {row.barcode} {row.filename}")

Barcode mismatch IGVF_003 1 CAGATC nan igvf_003/nanopore/igvf003_8A_lig-ss_11.fastq.gz
Barcode mismatch IGVF_003 1 CAGATC nan igvf_003/nanopore/igvf003_8A_lig-ss_4.fastq.gz
Barcode mismatch IGVF_003 1 CAGATC nan igvf_003/nanopore/igvf003_8A_lig-ss_7.fastq.gz
Barcode mismatch IGVF_003 1 CAGATC nan igvf_003/nanopore/igvf003_8A_lig-ss_6.fastq.gz
Barcode mismatch IGVF_003 1 CAGATC nan igvf_003/nanopore/igvf003_8A_lig-ss_8.fastq.gz
Barcode mismatch IGVF_003 1 CAGATC nan igvf_003/nanopore/igvf003_8A_lig-ss_5.fastq.gz
Barcode mismatch IGVF_003 1 CAGATC nan igvf_003/nanopore/igvf003_8A_lig-ss_1.fastq.gz
Barcode mismatch IGVF_003 1 CAGATC nan igvf_003/nanopore/igvf003_8A_lig-ss_2.fastq.gz
Barcode mismatch IGVF_003 1 CAGATC nan igvf_003/nanopore/igvf003_8A_lig-ss_10.fastq.gz
Barcode mismatch IGVF_003 1 CAGATC nan igvf_003/nanopore/igvf003_8A_lig-ss_9.fastq.gz
Barcode mismatch IGVF_003 1 CAGATC nan igvf_003/nanopore/igvf003_8A_lig-ss_3.fastq.gz


DoesNotExist: Subpool matching query does not exist.

In [None]:
%debug

> /usr/lib/python3/dist-packages/django/db/models/query.py(435)get()
    433             return clone._result_cache[0]
    434         if not num:
--> 435             raise self.model.DoesNotExist(
    436                 "%s matching query does not exist." %
    437                 self.model._meta.object_name

ipdb> up
> /usr/lib/python3/dist-packages/django/db/models/manager.py(85)manager_method()
     83         def create_method(name, method):
     84             def manager_method(self, *args, **kwargs):
---> 85                 return getattr(self.get_queryset(), name)(*args, **kwargs)
     86             manager_method.__name__ = method.__name__
     87             manager_method.__doc__ = method.__doc__

ipdb> up
> <ipython-input-56-1b004ef68a89>(5)<module>()
      3     library_id = int(row.library_id)
      4 
----> 5     subpool = models.Subpool.objects.get(plate__name=plate_name, index=library_id)
      6     assert subpool.barcode.count() == 1, "{} {}".format(subpool, libr

In [None]:
plate_name, library_id

Load information about previously submitted records

In [None]:
submitted_sequence_file = {}

for accession_prefix in submitted_sheets:
    for i, row in submitted_sheets["sequence_file"].iterrows():
        if not pandas.isnull(row["accession"]):
            if access_prefix == "igvftst":
                see_also_template = "https://api.sandbox.igvf.org/sequence-files/{}/"
            elif access_prefix == "igvf":
                see_also_template = "https://api.data.igvf.org/sequence-files/{}/"
            
            submitted_sequence_file.setdefault(row["submitted_file_name"], []).append({
                "accession_prefix": accession_prefix,
                "name": row["accession"],
                "uuid": row["uuid"],
                "see_also": see_also_template.format(row["accession"]),
            })
        
print("loaded {} submitted sequence_file records".format(len(submitted_sequence_file)))


In [None]:
submitted_sheets.keys()

In [None]:
def built_in_cache(a, b, cache={}):
    cache.setdefault(a, b)
    print(cache)
    
built_in_cache('a', 1)
built_in_cache('b', 3)
built_in_cache('c', 4)
built_in_cache('a', 2)

In [None]:
sequencing_experiments = {}
sequencing_subpools = {}

In [None]:
def get_or_create_sequencing_run(name, run_date, platform, plate, accessions=None):
    if name in sequencing_experiments:
        return sequencing_experiments[name]
    
    try:
        sequencing_run = models.SequencingRun.objects.get(name=name)
    except models.SequencingRun.DoesNotExist:
        sequencing_run = models.SequencingRun(
            name=name,
            run_date=run_date,
            platform=platform,
            plate=plate,
            stranded=models.Stranded.REVERSE,
        )
        sequencing_run.save()
        if accessions is not None:
            sequencing_run.accession.set(accessions)
            sequencing_run.save
        
    sequencing_experiments[name] = sequencing_run
    return sequencing_run

def get_or_create_subpool_in_run(subpool, sequencing_run, raw_reads=None, measurement_set=None):
    try:
        subpool_in_run = models.SubpoolInRun.objects.get(subpool=subpool, sequencing_run=sequencing_run)
    except models.SubpoolInRun.DoesNotExist:
        subpool_in_run = models.SubpoolInRun(
            subpool=subpool, 
            sequencing_run=sequencing_run,
            raw_reads=None,
            status=models.RunStatus.PASS,
            measurement_set=measurement_set,
        )
        subpool_in_run.save()
        
    sequencing_subpools[name] = subpool_in_run
    return subpool_in_run


def lane_or_none(value):
    if pandas.isnull(value):
        return None
    elif isinstance(value, float):
        return int(value)
    elif value.startswith("L"):
        return int(value[1:])
    else:
        return int(value)


fastqs = pandas.read_csv(
    "fastq_metadata.tsv", sep="\t",
    dtype={
        "sample_id": str,
        "read_id": str,
        "fragment": str,
        "sequencer_run": str,
        "lane": str,
    }).sort_values(["experiment", "library_id", "lane_id", "read_id"])
fastqs.head()

for i, row in fastqs.iterrows():
    plate_name = row.experiment.upper()
    library_id = int(row.library_id)
    fastq_relative = Path(row.filename)
    sequencing_run_name = str(Path(fastq_relative.parts[0])/fastq_relative.parts[1])
    sequencer = row.get("sequencer")
    run_date = row["ctime"]
    md5sum = row["md5sum"]

    if row["read_id"] in ('I1',):
        continue

    subpool = models.Subpool.objects.get(plate__name=plate_name, index=library_id)
    plate = subpool.plate

    platform = {
        "A00850": models.Platform.objects.get(display_name="Novaseq 6000"),
        "VH00582": models.Platform.objects.get(display_name="Nextseq 2000"),
        "nanopore": models.Platform.objects.get(display_name="Oxford Nanopore"),  # TODO: figure out a machine id
    }[sequencer]

    sequencing_run = get_or_create_sequencing_run(sequencing_run_name, run_date, platform, plate, accessions=None)
    subpool_run = get_or_create_subpool_in_run(subpool=subpool, sequencing_run=sequencing_run)
    
    file_set_name = None
    try:
        file_set_alias = submitted_sheets["sequence_file"].set_index("md5sum").loc[row.md5sum, "file_set"]
        file_set_name = file_set_alias.split(":")[1]
    except KeyError:
        
        print(f"Unable to find {row.md5sum}")

    if file_set_name is None:
        pass
    elif subpool_run.measurement_set is None:
        subpool_run.measurement_set = measurement_sets[file_set_name]
        subpool_run.save()
        print(f"updating {subpool_run} measurement set to {measurement_sets[file_set_name]}")
    else:
        assert subpool_run.measurement_set.name == file_set_name, f"SubpoolInRun measurement_set mismatch {subpool_run.measurement_set.name} {file_set_name}"
        

    accession_info = submitted_sequence_file.get(str(fastq_relative))
    subpool_file = models.SubpoolInRunFile(
        sequencing_run=sequencing_run,
        subpool_run=subpool_run,
        md5sum=md5sum,
        filename=str(fastq_relative),
        flowcell_id=row["flowcell_id"],
        lane=lane_or_none(row["lane"]),
    )
    if not pandas.isnull(row["read_id"]):
        subpool_file.read = row["read_id"]
    subpool_file.save()

    if accession_info is not None:
        accession = models.Accession(
            namespace=accession_info["namespace"],
            name=accession_info["name"],
            uuid=accession_info["uuid"],
            see_also=accession_info["see_also"],
        )
        accession.save()                
        subpool_file.accession.set([accession])
        subpool_file.save()


        
print("total plates", models.SplitSeqPlate.objects.count())
print("total sequencing runs", models.SequencingRun.objects.count())
print("total subpools", models.Subpool.objects.count())
print("total subpool in run", models.SubpoolInRun.objects.count())
print("total subpool in run files", models.SubpoolInRunFile.objects.count())


In [None]:
models.Mouse.objects.count()