# Introduction

UCI has a complex google doc they've been using to track their samples so far.

https://docs.google.com/spreadsheets/d/13M6-Ry6oXgkx94BHZOGioYPI6F_hWDqjGgcaNu2JNYs/edit#gid=1838362486

Also there's a metadata spreadsheet for IGVF at
https://docs.google.com/spreadsheets/d/1BLMledzmqOqXnJHzpijgw91IOs-9tSlVeZDG_MtXddk/edit#gid=1284120531

In [1]:
import bz2
from collections import Counter, namedtuple
import datetime
from io import StringIO
import numpy
import os
import pandas
from pathlib import Path
import re
from subprocess import run, PIPE
import sys
import zoneinfo

os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'mousedemo.settings')
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

import django
from django.contrib.auth import get_user_model
from django.db import DEFAULT_DB_ALIAS

MOUSEDEMO = str(Path("mousedemo").absolute())
if MOUSEDEMO not in sys.path:
    sys.path.append(MOUSEDEMO)
    

# Pre-initialization setup

Backup the old ones and make a new database file before we initialize the database.

In [2]:
# Create a new database
result = run(["python3", "manage.py", "check"], capture_output=True)

if len(result.stderr) > 0:
    print(result.stderr.decode("utf-8"))
    
result.check_returncode()

In [3]:
initial = Path("igvf_mice/migrations/0001_initial.py")
if initial.exists():
    initial.unlink()
    
result = run(["python3", "manage.py", "makemigrations", "igvf_mice"], capture_output=True)

if len(result.stderr) > 0:
    print(result.stderr.decode("utf-8"))
    
result.check_returncode()

In [4]:
# Backup several old sqlite database files

db_name = Path("db.sqlite3")

backup_names = []
for i in range(0, 4):
    backup_names.append(Path("db{}.sqlite3".format(i+1)))

if backup_names[-1].exists():
    backup_names[-1].unlink()

if db_name.exists():    
    for i in reversed(range(0, 3)):
        if backup_names[i].exists():
            print("renaming {} to {}".format(backup_names[i], backup_names[i+1]))
            backup_names[i].rename(backup_names[i+1])

    db_name.rename(backup_names[0])
    print("renaming {} to {}".format(db_name, backup_names[0]))

renaming db3.sqlite3 to db4.sqlite3
renaming db2.sqlite3 to db3.sqlite3
renaming db1.sqlite3 to db2.sqlite3
renaming db.sqlite3 to db1.sqlite3


In [5]:
# Create a new database
result = run(["python3", "manage.py", "migrate"], capture_output=True)

if len(result.stderr) > 0:
    print(result.stderr.decode("utf-8"))
    
result.check_returncode()
    
assert Path("db.sqlite3").exists()

# Setup users

Now that we have a fresh clean database, lets create user accounts

In [6]:
django.setup()

from django.contrib.auth.models import ContentType
from mousedemo import settings
from igvf_mice import models

In [7]:
# Create accounts.
with open("pw", "rt") as instream:
    pw = instream.read().strip()

user_model = get_user_model()
user_model._default_manager.db_manager(DEFAULT_DB_ALIAS).create_superuser(
    username="diane",
    password=pw,   
)

# load igvf_mice permissions    
permissions = set()
for ct in ContentType.objects.filter(app_label="igvf_mice"):
    permissions.update(ct.permission_set.all())

with open("guest.pw", "rt") as instream:
    pw = instream.read().strip()

user_model = get_user_model()
guest = user_model._default_manager.db_manager(DEFAULT_DB_ALIAS).create_user(
    username="guest",
    password=pw,
    is_staff=True,
)
guest.user_permissions.set(permissions)
print(guest)



guest


In [8]:
guest = user_model._default_manager.db_manager(DEFAULT_DB_ALIAS).get(username="guest")
diane = guest = user_model._default_manager.db_manager(DEFAULT_DB_ALIAS).get(username="diane")

In [9]:
assert guest.is_staff

In [10]:
guest.user_permissions.all()

<QuerySet []>

In [11]:
def validate_alias(alias):
    alias_pattern = re.compile("^(?:j-michael-cherry|ali-mortazavi|barbara-wold|lior-pachter|grant-macgregor|kim-green|mark-craven|qiongshi-lu|audrey-gasch|robert-steiner|jesse-engreitz|thomas-quertermous|anshul-kundaje|michael-bassik|will-greenleaf|marlene-rabinovitch|lars-steinmetz|jay-shendure|nadav-ahituv|martin-kircher|danwei-huangfu|michael-beer|anna-katerina-hadjantonakis|christina-leslie|alexander-rudensky|laura-donlin|hannah-carter|bing-ren|kyle-gaulton|maike-sander|charles-gersbach|gregory-crawford|tim-reddy|ansuman-satpathy|andrew-allen|gary-hon|nikhil-munshi|w-lee-kraus|lea-starita|doug-fowler|luca-pinello|guillaume-lettre|benhur-lee|daniel-bauer|richard-sherwood|benjamin-kleinstiver|marc-vidal|david-hill|frederick-roth|mikko-taipale|anne-carpenter|hyejung-won|karen-mohlke|michael-love|jason-buenrostro|bradley-bernstein|hilary-finucane|chongyuan-luo|noah-zaitlen|kathrin-plath|roy-wollman|jason-ernst|zhiping-weng|manuel-garber|xihong-lin|alan-boyle|ryan-mills|jie-liu|maureen-sartor|joshua-welch|stephen-montgomery|alexis-battle|livnat-jerby|jonathan-pritchard|predrag-radivojac|sean-mooney|harinder-singh|nidhi-sahni|jishnu-das|hao-wu|sreeram-kannan|hongjun-song|alkes-price|soumya-raychaudhuri|shamil-sunyaev|len-pennacchio|axel-visel|jill-moore|ting-wang|feng-yue|igvf|igvf-dacc):[a-zA-Z\\d_$.+!*,()'-]+(?:\\s[a-zA-Z\\d_$.+!*,()'-]+)*$")
    
    if alias_pattern.match(alias) is None:
        raise ValueError("Invalid alias")
    else:
        return True


try:
    validate_alias("ali-mortazavi:194_B6CASTF1/J_10F_20")
    assert False, "This should fail"
except ValueError:
    pass

# Load data

In [12]:
#spreadsheet_name = "IGVF_Split-seq_20230118.xlsx"
lizs_sheet_name = "https://woldlab.caltech.edu/nextcloud/index.php/s/eEtjBfDqQFnLpSS/download"

book = pandas.ExcelFile(lizs_sheet_name)
for name in book.sheet_names:
    print(name)

schedule - all
Line information
mice
Planning
DissectionSorting
Counting
Sample Tissue IDs
Samples - 8 founders
Samples - F1s
Samples - F1s_6mo
Samples - CClines
Samples - Bridge
Founder Samples into experiment
F1 Samples into experiment
CC Samples into experiment
Bridge samples into experiment
PBMCs
Founder Nuclei isolation Sample
F1 Nuclei isolation SampleID
Plate setups
Experiment
nreads_per_sublibrary_novaseq
nreads_per_sublibrary_nextseq
ONT Sequencing
DCC
Tissue_body_weight_metadata
Failed_samples
Sheet40
tissue weights
practice mice
CellDrop_Demo
Gastroc_numbers
Gastroc_all_reps
Recount for gene capture
Gastroc_testing
Cerebellum testing
IGVF002 Pilot Samples
IGVF002 Pulverized Samples
IGVF002 Plate Setup
IGVF002 Metadata


In [13]:
def load_submitted_sheets():
    submitted_book_names = {
        # Sheets on sandbox
        "igvftst": {
            "IGVF_003": "https://woldlab.caltech.edu/nextcloud/index.php/s/nBnE6j2RBMkpM9i/download",
            "IGVF_b01": "https://woldlab.caltech.edu/nextcloud/index.php/s/5cJteSWgitN5BDM/download",
        },
        # Sheets on data. (production)
        "igvf": {
            "IGVF_b01": "https://woldlab.caltech.edu/nextcloud/index.php/s/HTbfN6btm3kqJXb/download",
            "IGVF_003": "https://woldlab.caltech.edu/nextcloud/index.php/s/wRFxdcknisFfM2t/download",
            "IGVF_004": "https://woldlab.caltech.edu/nextcloud/index.php/s/8ygLzeyjAQ9CxXn/download",
            "IGVF_005": "https://woldlab.caltech.edu/nextcloud/index.php/s/2Pba74eLFwmFfgS/download",
            # IGVF_006 failed.
            "IGVF_007": "https://woldlab.caltech.edu/nextcloud/index.php/s/7rQKGSTS7937kTW/download",
            "IGVF_008": "https://woldlab.caltech.edu/nextcloud/index.php/s/iECMTJF2iPMy3kT/download",
            "IGVF_008B": "https://woldlab.caltech.edu/nextcloud/index.php/s/YF46eNqSASJC8fr/download",
            "IGVF_009": "https://woldlab.caltech.edu/nextcloud/index.php/s/PtNn96zR9XbybFZ/download",
            "IGVF_010": "https://woldlab.caltech.edu/nextcloud/index.php/s/CocTP9pePJobyPs/download",
            "IGVF_011": "https://woldlab.caltech.edu/nextcloud/index.php/s/Sm72GtmdN4QsXJr/download",
            "IGVF_012": "https://woldlab.caltech.edu/nextcloud/index.php/s/MmatjmzQgofeM2Y/download",
            "IGVF_013": "https://woldlab.caltech.edu/nextcloud/index.php/s/EHret5a6Sjiro4q/download",
            "IGVF_014": "https://woldlab.caltech.edu/nextcloud/index.php/s/GRQCaSZ3WgeonzK/download",
        }
    }
    submitted_sheets = {}
    for accession_prefix in submitted_book_names:
        for book_name in submitted_book_names[accession_prefix]:
            print("reading {} {} {}".format(accession_prefix, book_name, submitted_book_names[accession_prefix][book_name]))
            book = pandas.ExcelFile(submitted_book_names[accession_prefix][book_name])

            for sheet_name in book.sheet_names:
                if sheet_name not in submitted_sheets.get(accession_prefix, {}):
                    submitted_sheets.setdefault(accession_prefix, {})[sheet_name] = book.parse(sheet_name)
                else:
                    previous_sheet = submitted_sheets[accession_prefix][sheet_name]
                    next_sheet = book.parse(sheet_name)
                    print("next_sheet {} {}".format(sheet_name, next_sheet.shape))
                    
                    already_seen = set(previous_sheet["accession"])
                    need_to_add = set(next_sheet["accession"]).difference(already_seen)
                    next_sheet = next_sheet[next_sheet["accession"].isin(need_to_add)]
                    print("next_sheet after duplicate filter {}".format(next_sheet.shape))
                    
                    submitted_sheets.setdefault(accession_prefix, {})[sheet_name] = pandas.concat([previous_sheet, next_sheet])
                print(accession_prefix, sheet_name, submitted_sheets[accession_prefix][sheet_name].shape)

    return submitted_sheets

submitted_sheets = load_submitted_sheets()

reading igvftst IGVF_003 https://woldlab.caltech.edu/nextcloud/index.php/s/nBnE6j2RBMkpM9i/download
igvftst rodent_donor (64, 25)
igvftst tissue (192, 9)
igvftst measurement_set (17, 12)
igvftst sequence_file (246, 14)
reading igvftst IGVF_b01 https://woldlab.caltech.edu/nextcloud/index.php/s/5cJteSWgitN5BDM/download
next_sheet rodent_donor (8, 25)
next_sheet after duplicate filter (8, 25)
igvftst rodent_donor (72, 25)
next_sheet tissue (8, 9)
next_sheet after duplicate filter (8, 9)
igvftst tissue (200, 9)
igvftst multiplexed_sample (8, 8)
next_sheet measurement_set (8, 13)
next_sheet after duplicate filter (8, 13)
igvftst measurement_set (25, 13)
next_sheet sequence_file (32, 14)
next_sheet after duplicate filter (32, 14)
igvftst sequence_file (278, 14)
igvftst configuration_file (16, 8)
reading igvf IGVF_b01 https://woldlab.caltech.edu/nextcloud/index.php/s/HTbfN6btm3kqJXb/download
igvf rodent_donor (8, 25)
igvf tissue (8, 9)
igvf multiplexed_sample (8, 7)
igvf measurement_set (8, 1

next_sheet after duplicate filter (8, 7)
igvf multiplexed_sample (171, 7)
next_sheet measurement_set (8, 14)
next_sheet after duplicate filter (8, 14)
igvf measurement_set (171, 15)
next_sheet sequence_file (16, 20)
next_sheet after duplicate filter (16, 20)
igvf sequence_file (2130, 20)


Converters

In [14]:
def truncate(model):
    table_name = model._meta.db_table
    assert "\\" not in table_name
    with django.db.connection.cursor() as cursor:
        # I don't know why the sql params didn't work. this is a sql vulnerability waiting to happen
        cursor.execute("delete from \"{}\"".format(table_name))
        cursor.execute("DELETE FROM SQLITE_SEQUENCE WHERE name=\"{}\"".format(table_name))
        cursor.fetchone()

In [15]:
def int_or_none(x):
    if x in ("N/A", '#DIV/0!', '-'):
        return None
    elif pandas.isnull(x):
        return None
    else:
        return int(x)
    
def int_or_0(x):
    if pandas.isnull(x):
        return 0
    else:
        return int(x)

def float_or_none(x):
    if pandas.isnull(x):
        return None
    else:
        return float(x)
    
def str_or_empty(x):
    if pandas.isnull(x):
        return ""
    else:
        return x

def str_or_none(x):
    if pandas.isnull(x):
        return None
    else:
        return x
    
def date_or_none(x):
    if pandas.isnull(x):
        return None
    elif isinstance(x, datetime.datetime):
        return x.date()
    elif x in ("-"):
        return None
    else:
        return x

def datetime_or_none(x):
    if pandas.isnull(x):
        return None
    else:
        return x


In [16]:
def normalize_strain(strain):
    strain = strain.replace("F1/J", "F1J")
    return strain
    
assert normalize_strain("B6WSBF1/J") == "B6WSBF1J"
assert normalize_strain("195_B6CASTF1/J_10M_20") == "195_B6CASTF1J_10M_20"

def validate_mouse_age_sex(value):
    sex = value[-1]
    age = value[0:-1]
    
    assert sex in ("M", "F")
    assert isinstance(int(age), int)
    
validate_mouse_age_sex("10F")

mouse_name_tuple = namedtuple("mouse_name_tuple", ["mouse_id", "mouse_strain", "mouse_age_sex"])
def parse_mouse_name(mouse_name):    
    mouse_id_end = mouse_name.find("_")
    mouse_age_sex_start = mouse_name.rfind("_") + 1
    
    mouse_id = mouse_name[0:mouse_id_end]
    mouse_strain = normalize_strain(mouse_name[mouse_id_end+1:mouse_age_sex_start-1])
    mouse_age_sex = mouse_name[mouse_age_sex_start:]
    validate_mouse_age_sex(mouse_age_sex)
    return mouse_name_tuple(mouse_id, mouse_strain, mouse_age_sex)

assert parse_mouse_name("477_CC030_10M") == ("477", "CC030", "10M")
assert parse_mouse_name("239_TREM2R47HNSS_HO_10M") == ("239", "TREM2R47HNSS_HO", "10M")
assert parse_mouse_name("238_B6WSBF1/J_10F") == ("238", "B6WSBF1J", "10F")

mouse_tissue_tuple = namedtuple("mouse_tissue_tuple", ["mouse_id", "mouse_strain", "mouse_age_sex", "tissue_id"])
def parse_mouse_tissue(mouse_tissue):
    fields = mouse_tissue.split("_")
    
    mouse_id = fields.pop(0)
    mouse_strain = fields.pop(0)
    if mouse_strain in ("TREM2R47HNSS",):
        mouse_strain = "_".join([mouse_strain, fields.pop(0)])
    mouse_strain = normalize_strain(mouse_strain)
    mouse_age_sex = fields.pop(0)
    validate_mouse_age_sex(mouse_age_sex)
    tissue_id = fields.pop(0)
    assert len(fields) == 0
    
    return mouse_tissue_tuple(mouse_id, mouse_strain, mouse_age_sex, tissue_id)
    
assert parse_mouse_tissue("096_WSBJ_10F_15") == ("096", "WSBJ", "10F", "15")
assert parse_mouse_tissue("239_TREM2R47HNSS_HO_10M_01") == ("239", "TREM2R47HNSS_HO", "10M", "01")

def normalize_plate_name(name):
    name = name.upper()

    return name

assert normalize_plate_name("igvf_003") == "IGVF_003"
assert normalize_plate_name("IGVF_008B") == "IGVF_008B"

In [17]:
normalize_strain("B6WSBF1/J")

'B6WSBF1J'

# initialize general use tables

In [18]:
# Delete later: switch to just using an enum for the accessions.
#
#accession_namespaces = [
#    {"name": "ENCODE", "homepage": "https://www.encodeproject.org", "accession_prefix": "encode"},
#    {"name": "ENCODE test", "homepage": "https://test.encodedcc.org", "accession_prefix": "encodetst"},
#    {"name": "IGVF", "homepage": "https://data.igvf.org", "accession_prefix": "igvf"},
#    {"name": "IGVF test", "homepage": "https://sandbox.igvf.org", "accession_prefix": "igvftst"},
#]
#
#if models.AccessionNamespace.objects.count() > 0:
#    truncate(models.AccessionNamespace)
#
#for row in accession_namespaces:
#    record = models.AccessionNamespace(
#        name=row["name"],
#        homepage=row["homepage"],
#        accession_prefix=row["accession_prefix"],
#    )
#    record.save()
#
#igvf_namespace = models.AccessionNamespace.objects.get(name="IGVF")
#igvf_test_namespace = models.AccessionNamespace.objects.get(name="IGVF test")

In [19]:
def load_sources():
    source = [
        {
            "name": "jackson-labs",
            "display_name": "The Jackson Laboratory", 
            "homepage": "http://www.jax.org/index.html", 
            "igvf_id": "/sources/jackson-labs/",
        },
        {
            "name": "parse-biosciences",
            "display_name": "Parse Bioscience", 
            "homepage": "https://www.parsebiosciences.com/",
            "igvf_id": "/sources/parse-biosciences/",
        },
        {
            "name": "illumina",
            "display_name": "Illumina", 
            "homepage": "https://www.illumina.com", 
            "igvf_id": "/sources/illumina/",
        },
        {
            "name": "unc-csbio",
            "display_name": "UNC Systems Genetics",
            "homepage": "https://csbio.unc.edu",
            "igvf_id": "/sources/unc-csbio/",
        }
    ]

    if models.Source.objects.count() > 0:
        truncate(models.Source)

    for row in source:
        record = models.Source(
            name=row["name"],
            homepage=row["homepage"],
            igvf_id=row["igvf_id"])
        record.save()

load_sources()
jax_source = models.Source.objects.get(name="jackson-labs")
unc_csbio_source = models.Source.objects.get(name="unc-csbio")

In [20]:
# Should this include machine model?

def load_platforms():
    platforms = [
        {"display_name": "Nextseq 2000", "name": "nextseq2000", "family": "illumina", "igvf_id": "/platform-terms/EFO_0010963/"},
        {"display_name": "Novaseq 6000", "name": "novaseq6000", "family": "illumina", "igvf_id": "/platform-terms/EFO_0008637/"},
        {"display_name": "Oxford Nanopore MinION", "name": "minion", "family": "nanopore", "igvf_id": "/platform-terms/EFO_0008632/"},
        {"display_name": "Oxford Nanopore PromethION", "name": "promethion", "family": "nanopore", "igvf_id": "/platform-terms/EFO_0008634/"},
        {"display_name": "Pac Bio", "name": "pacbio", "family": "pacbio",},
    ]

    for row in platforms:
        record = models.Platform(
            name=row["name"],
            display_name=row["display_name"],
            family=row["family"],
            igvf_id=row.get("igvf_id"),
            #source=row.get("source"),
        )
        record.save()
        
load_platforms()


# Library kit type & version

In [21]:
def load_library_reagents():
    library_construction_reagent = [
        {"name": "wt-mega", "display_name": "Parse WT Mega", "version": "v2", "source": "parse-biosciences"},
        {"name": "wt", "display_name": "Parse WT", "version": "v2", "source": "parse-biosciences"},
    ]

    if models.LibraryConstructionReagent.objects.count() > 0:
        truncate(models.LibraryConstructionReagent)

    for row in library_construction_reagent:
        try:
            source = models.Source.objects.get(name=row["source"])

            record = models.LibraryConstructionReagent(
                name=row["name"],
                display_name=row["display_name"],
                version=row["version"],
                source=source,
            )
            record.save()        
        except models.Source.DoesNotExist:
            print(f"Couldn't find {row['source']}")

load_library_reagents()

# Load LibraryBarcodes

In [22]:
models.LibraryConstructionReagent.objects.get(name="wt-mega")

<LibraryConstructionReagent: Parse WT Mega v2>

In [23]:
def normalize_udi_to_code(value):
    prefix = "UDI_Plate_WT_"
    assert value.startswith(prefix)
    barcode_number = int(value[len(prefix):])
    return "UDI{:02}".format(barcode_number)

assert normalize_udi_to_code("UDI_Plate_WT_1") == "UDI01"
assert normalize_udi_to_code("UDI_Plate_WT_45") == "UDI45"

In [24]:
def load_barcodes():
    if models.LibraryBarcode.objects.count() > 0:
        truncate(models.LibraryBarcode)

    # Load WT Mega v2 first barcode
    bc1 = pandas.read_csv("bc_data_n192_v4.csv")

    mega_reagent = models.LibraryConstructionReagent.objects.get(name="wt-mega", version="v2")

    for i, row in bc1.iterrows():
        record = models.LibraryBarcode(
            reagent=mega_reagent,
            name=row["uid"],
            code=row["well"],
            i7_sequence=row["sequence"],
            barcode_type=row["type"], # TODO what do the codes mean again T,R?
            well_position=row["well"],
        )
        record.save()

    mega_subpool = [
        ("1", "CAGATC"),
        ("2", "ACTTGA"),
        ("3", "GATCAG"),
        ("4", "TAGCTT"),
        ("5", "ATGTCA"),
        ("6", "CTTGTA"),
        ("7", "AGTCAA"),
        ("8", "AGTTCC"),
        ("9", "GAGTGG"),
        ("10", "CCGTCC"),
        ("11", "GTAGAG"),
        ("12", "GTCCGC"),
        ("13", "GTGAAA"),
        ("14", "GTGGCC"),
        ("15", "GTTTCG"),
        ("16", "CGTACG"),
    ]
    subpool = pandas.DataFrame(mega_subpool, columns=["code", "i7_sequence"])

    for i, row in subpool.iterrows():
        record = models.LibraryBarcode(
            reagent=mega_reagent,
            name=row["code"],
            code=row["code"],
            i7_sequence=row["i7_sequence"],
        )
        record.save()

    regular_reagent = models.LibraryConstructionReagent.objects.get(name="wt", version="v2")    
    regular_subpool = [
        ("1", "CAGATC"),
        ("2", "ACTTGA"),
        ("3", "GATCAG"),
        ("4", "TAGCTT"),
        ("5", "ATGTCA"),
        ("6", "CTTGTA"),
        ("7", "AGTCAA"),
        ("8", "AGTTCC"),
    ]
    subpool = pandas.DataFrame(regular_subpool, columns=["code","i7_sequence"])
    for i, row in subpool.iterrows():
        record = models.LibraryBarcode(
            reagent=regular_reagent,
            name=row["code"],
            code=row["code"],
            i7_sequence=row["i7_sequence"],
        )
        record.save()


    udi_wt_reagent = models.LibraryConstructionReagent.objects.get(name="wt-mega", version="v2")
    udi_wt_barcodes = pandas.read_excel("https://support.parsebiosciences.com/hc/en-us/article_attachments/12988277964436")
    for i, row in udi_wt_barcodes.iterrows():
        record = models.LibraryBarcode(
            reagent=udi_wt_reagent,
            name=row["Index name"],
            code=normalize_udi_to_code(row["Index name"]),
            i7_sequence=row["i7_index"],
            i5_sequence=row["i5_index"],
            well_position=row["well_position"],
        )
        record.save()
        
load_barcodes()

# Initialize TissueOntology

In [25]:
def count_seen_tissues():
    seen_tissues = Counter()
    for sample_sheet_name in ["Samples - 8 founders", "Samples - Bridge"]:
        tissue_sheet = pandas.read_excel(
            lizs_sheet_name, 
            sample_sheet_name,
            usecols=["Tissue"]).dropna(axis=0, how='all')

        for i, row in tissue_sheet.iterrows():
            if not pandas.isnull(row["Tissue"]):
                seen_tissues[row["Tissue"]] += 1
        print(sample_sheet_name, tissue_sheet.shape, len(seen_tissues))
        
    return seen_tissues

seen_tissues = count_seen_tissues()
seen_tissues

Samples - 8 founders (1621, 1) 22
Samples - Bridge (416, 1) 44


Counter({'Hypothalamus/Pituitary': 97,
         'Cerebellum': 81,
         'Cortex/Hippocampus left': 85,
         'Cortex/Hippocampus right': 85,
         'Liver': 97,
         'Heart': 97,
         'Lung': 81,
         'Adrenal': 97,
         'Kidney': 81,
         'Gonads -1 Ovary': 41,
         'Gonads -2 Oviduct': 41,
         'Perigonadal fat': 81,
         'Brown fat': 81,
         'Soleus': 81,
         'Plantaris': 81,
         'Gastrocnemius': 82,
         'TA': 81,
         'EDL': 81,
         'Tail': 121,
         'PBMC - WBC': 121,
         'Gonads -1 Testis': 40,
         'Gonads -2 Epididymis': 40,
         'Left cortex': 24,
         'Right cortex': 24,
         'Left hippocampus': 24,
         'Right hippocampus': 24,
         'Testis left': 8,
         'Testis right': 8,
         'Epididymis left': 8,
         'Epididymis right': 8,
         'Kidney left': 16,
         'Kidney right': 16,
         'Gastrocnemius left': 16,
         'Gastrocnemius right': 16,
         

In [26]:
tissue_dissection_to_ontology_map = {
    'Hypothalamus/Pituitary': [("UBERON:0001898","hypothalamus"), ("UBERON:0000007","pituitary gland")],
    'Cerebellum': [("UBERON:0002037","cerebellum")],
    # Hippocampus might be:
    #   Hippocampal formation UBERON:0002421 https://www.ebi.ac.uk/ols/ontologies/uberon/terms?iri=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FUBERON_0002421
    #   Layer of hippocampus UBERON:0002305 https://www.ebi.ac.uk/ols/ontologies/uberon/terms?iri=http%3A%2F%2Fpurl.obolibrary.org%2Fobo%2FUBERON_0002305
    'Cortex/Hippocampus left': [("NTR:0000646","left cerebral cortex"), ("NTR:0000750", "Hippocampal formation left")],
    'Cortex/Hippocampus right': [("NTR:0000647","right cerebral cortex"), ("NTR:0000751", "Hippocampal formation right")],
    'Liver': [("UBERON:0002107", "liver")],
    'Heart': [("UBERON:0000948","heart")],
    'Lung': [("UBERON:0002048","lung")],
    'Adrenal': [("UBERON:0002369","adrenal gland")],
    'Kidney': [("UBERON:0002113","kidney")],
    'Kidney left': [("UBERON:0002113","kidney")],
    'Kidney right': [("UBERON:0002113","kidney")],
    "Ovary": [("UBERON:0000992", "ovary")],    
    'Gonads -1 Ovary': [("UBERON:0000992", "ovary")],
    'Gonads -1 (Ovary)': [("UBERON:0000992", "ovary")],
    'Oviduct': [("UBERON:0000993","oviduct")],
    'Gonads -2 Oviduct': [("UBERON:0000993","oviduct")],
    'Gonads -2 (Oviduct)': [("UBERON:0000993","oviduct")],
    'Gonads - 2 (Oviduct)': [("UBERON:0000993","oviduct")],
    'Perigonadal fat': [("UBERON:0003428", "gonadal fat pad")], #"is closer" to dissection
    'Brown fat': [("UBERON:0001348","brown adipose tissue")], 
    'Soleus': [("UBERON:0001389","soleus muscle")],
    'Plantaris': [("UBERON:0011905","plantaris")],
    'Gastrocnemius': [("UBERON:0001388","gastrocnemius")],
    'Gastrocnemius left': [("UBERON:0001388","gastrocnemius")],
    'Gastrocnemius right': [("UBERON:0001388","gastrocnemius")],
    'TA': [("UBERON:0001385","tibialis anterior")],
    'TA left': [("UBERON:0001385","tibialis anterior")],
    'TA right': [("UBERON:0001385","tibialis anterior")],
    'EDL': [("UBERON:0001386","extensor digitorum longus")],
    'Tail': [("UBERON:0002415","tail")],
    'PBMC - WBC': [("CL:2000001", "peripheral blood mononuclear cell")],
    'Gonads -1 Testis': [("UBERON:0000473","testis")],
    'Gonads -2 Epididymis': [("UBERON:0001301","epididymis")],
    "Left cortex": [("NTR:0000646","left cerebral cortex"),],
    "Cortex left": [("NTR:0000646","left cerebral cortex"),],
    "Right cortex": [("NTR:0000647","right cerebral cortex"),],
    "Cortex right": [("NTR:0000647","right cerebral cortex"),],
    "Left hippocampus": [("NTR:0000750", "Hippocampal formation left")],
    "Hippocampus left": [("NTR:0000750", "Hippocampal formation left")],
    "Right hippocampus": [("NTR:0000751", "Hippocampal formation right")],
    "Hippocampus right": [("NTR:0000751", "Hippocampal formation right")],
    # does left & right matter?
    "Testis left": [("UBERON:0000473", "Testis")],
    "Testis right": [("UBERON:0000473", "Testis")],
    "Epididymis left": [("UBERON:0001301", "Epididymis")],
    "Epididymis right": [("UBERON:0001301", "Epididymis")],
    "Pancreas": [("UBERON:0001264", "Pancreas")],
    "Lungs": [("UBERON:0002048", "Lung")],
}

print(set(seen_tissues).difference(tissue_dissection_to_ontology_map.keys()))

assert len(set(seen_tissues).difference(tissue_dissection_to_ontology_map.keys())) == 0, "Add in more names to term ids"

set()


In [27]:
def load_ontology_terms():
    term_details = pandas.read_csv("obo.tsv.bz2", compression="bz2", sep="\t", index_col="term_id")

    if models.OntologyTerm.objects.count() > 0:
        truncate(models.OntologyTerm)

    for key in tissue_dissection_to_ontology_map:
        for term_curie, term_name in tissue_dissection_to_ontology_map[key]:
            if term_curie.startswith("NTR:"):
                description=None
            else:
                details = term_details.loc[term_curie]
                description = details.description

            record = models.OntologyTerm(
                curie=term_curie,
                name=term_name,
                description=description
            )
            record.save()

load_ontology_terms()

In [28]:
strain_urls = {
    "AJ": "https://www.jax.org/strain/000646",
    "B6J": "https://www.jax.org/strain/000664",
    "129S1J": "https://www.jax.org/strain/002448",
    "NODJ": "https://www.jax.org/strain/001976",
    "NZOJ": "https://www.jax.org/strain/002105",
    "CASTJ": "https://www.jax.org/strain/000928",
    "PWKJ": "https://www.jax.org/strain/003715",
    "WSBJ": "https://www.jax.org/strain/001145",
    'B6129SF1J': "https://www.jax.org/strain/101043",
    'B6AF1J': "https://www.jax.org/strain/100002",
    'B6CASTF1J': None,
    'B6NODF1J': None,
    'B6NZOF1J': None,
    'B6PWKF1J': None,
    'B6WSBF1J': "https://www.jax.org/strain/019019",
    'TREM2R47HNSS_HO': 'https://www.jax.org/strain/034036',
    'CC001': "https://www.jax.org/strain/021238",
    'CC002': "https://www.jax.org/strain/021236",
    'CC003': "https://www.jax.org/strain/021237",
    'CC004': "https://www.jax.org/strain/020944",
    'CC005': "https://www.jax.org/strain/020945",
    'CC006': "https://www.jax.org/strain/022869",
    'CC007': "https://www.jax.org/strain/029625",
    'CC008': "https://www.jax.org/strain/026971",
    'CC009': "https://www.jax.org/strain/018856",
    'CC010': "https://www.jax.org/strain/021889",
    'CC011': "https://www.jax.org/strain/018854",
    'CC012': "https://www.jax.org/strain/028409",
    'CC013': "https://www.jax.org/strain/021892",
    'CC015': "https://www.jax.org/strain/018859",
    'CC017': "https://www.jax.org/strain/022870",
    'CC018': "https://www.jax.org/strain/021890",
    'CC024': "https://www.jax.org/strain/021891",
    'CC025': "https://www.jax.org/strain/018857",
    'CC028': "https://www.jax.org/strain/025126",
    'CC029': "https://www.jax.org/strain/026972",
    'CC030': "https://www.jax.org/strain/025426",
    'CC032': "https://www.jax.org/strain/020946",
    'CC036': "https://www.jax.org/strain/025127",
    'CC037': "https://www.jax.org/strain/025423",
    'CC038': None,
    'CC041': "https://www.jax.org/strain/021893",
    'CC055': None,
    'CC057': "https://www.jax.org/strain/024683",
    'CC060': "https://www.jax.org/strain/026427",
    'CC062': None,
    'CC065': None,
    'CC071': None,
    'CC074': "https://www.jax.org/strain/018855",
}

# used as name. Should this be 
strain_igvf_id = {
    "AJ": "A/J (AJ)",
    "B6J": "C57BL/6J (B6)",
    "129S1J": "129S1/SvImJ (129)",
    "NODJ": "NOD/ShiLtJ (NOD)",
    "NZOJ": "NZO/HlLtJ (NZO)",
    "CASTJ": "CAST/EiJ (CAST)",
    "PWKJ": "PWK/PhJ (PWK)",
    "WSBJ": "WSB/EiJ (WSB)",
    'B6129SF1J': None, #"B6129SF1/J (B6 female x 129S male F1)"
    'B6AF1J':    None, #"B6AF1/J (B6 female x AJ male F1)"
    'B6CASTF1J': None, #"B6CASTF1/J (B6 female x CAST male F1)"
    'B6NODF1J':  None, #"B6NODF1/J (B6 female x NOD male F1)"
    'B6NZOF1J':  None, #"B6NZOF1/J (B6 female x NZO male F1)"
    'B6PWKF1J':  None, #"B6PWKF1/J (B6 female x PWK male F1)"
    'B6WSBF1J':  None, #"B6WSBF1/J (B6 female x WSB male F1)"
    "TREM2R47HNSS_HO": "B6(SJL)-Trem2<em1Aduci>/J", #"B6(SJL)-Tre2ᵉᵐ¹ᴬᵈᵘᶜⁱ/J",
    # commented out CC lines are ones we're not currently using.
    'CC001': 'CC001/Unc',
    'CC002': 'CC002/Unc',
    'CC003': 'CC003/Unc',
    'CC004': 'CC004/TauUnc',
    'CC005': 'CC005/TauUnc',
    'CC006': 'CC006/TauUnc',
    'CC007': 'CC007/Unc',
    'CC008': 'CC008/GeniUnc',
    'CC009': 'CC009/Unc',
    'CC010': 'CC010/GeniUnc',
    'CC011': 'CC011/Unc',
    'CC012': 'CC012/GeniUnc',
    'CC013': 'CC013/GeniUnc',
    'CC015': 'CC015/Unc',
    #'CC016': 'CC016/GeniUnc',
    'CC017': 'CC017/Unc',
    'CC018': 'CC018/Unc',
    #'CC019': 'CC019/TauUnc',
    #'CC020': 'CC020/GeniUnc',
    #'CC021': 'CC021/Unc',
    #'CC022': 'CC022/GeniUnc',
    #'CC023': 'CC023/GeniUnc',
    'CC024': 'CC024/GeniUnc',
    'CC025': 'CC025/GeniUnc',
    #'CC026': 'CC026/GeniUnc',
    #'CC027': 'CC027/GeniUnc',
    'CC028': 'CC028/GeniUnc',
    'CC029': 'CC029/Unc',
    'CC030': 'CC030/GeniUnc',
    #'CC031': 'CC031/GeniUnc',
    'CC032': 'CC032/GeniUnc',
    #'CC033': 'CC033/GeniUnc',
    #'CC034': 'CC034/Unc',
    #'CC035': 'CC035/Unc',
    'CC036': 'CC036/Unc',
    'CC037': 'CC037/TauUnc',
    'CC038': 'CC038/GeniUnc',
    #'CC039': 'CC039/Unc',
    #'CC040': 'CC040/TauUnc',
    'CC041': 'CC041/TauUnc',
    #'CC042': 'CC042/GeniUnc',
    'CC043': 'CC043/GeniUnc',
    #'CC044': 'CC044/Unc',
    #'CC045': 'CC045/GeniUnc',
    #'CC046': 'CC046/Unc',
    #'CC047': 'CC047/Unc',
    #'CC048': 'CC048/Unc',
    #'CC049': 'CC049/TauUnc',
    #'CC050': 'CC050/Unc',
    #'CC051': 'CC051/TauUnc',
    #'CC052': 'CC052/GeniUnc',
    #'CC053': 'CC053/Unc',
    #'CC054': 'CC054/GeniUnc',
    'CC055': 'CC055/TauUnc',
    #'CC056': 'CC056/GeniUnc',
    'CC057': 'CC057/Unc',
    #'CC058': 'CC058/Unc',
    #'CC059': 'CC059/TauUnc',
    'CC060': 'CC060/Unc',
    #'CC061': 'CC061/GeniUnc',
    'CC062': 'CC062/Unc',
    #'CC063': 'CC063/Unc',
    'CC065': 'CC065/Unc',
    #'CC068': 'CC068/TauUnc',
    #'CC070': 'CC070/TauUnc',
    'CC071': 'CC071/TauUnc',
    #'CC072': 'CC072/TauUnc',
    #'CC073': 'CC073/Unc',
    'CC074': 'CC074/Unc',
    #'CC075': 'CC075/Unc',
    #'CC076': 'CC076/Unc',
    #'CC078': 'CC078/TauUnc',
    #'CC079': 'CC079/TauUnc',
    #'CC080': 'CC080/TauUnc',
    #'CC081': 'CC081/Unc',
}


In [29]:
def int_csv_to_hex(x):
    r, g, b = x.split(",")
    
    return "#{:02x}{:02x}{:02x}".format(int(r), int(g), int(b))

def remove_citation(x):
    return x.split(" ")[0]

#strains = pandas.read_excel(
#    lizs_sheet_name, 
#    sheet_name="Line information", 
#    usecols=[
#        "Designation",
#        "Strain",
#        "Note",
#        "Jax Catalog No",
#        "Sample CODE",
#        "Strain notes",
#    ],
#    converters={
#        "Strain": remove_citation,
#        "Jax Catalog No": str,
#        "Strain notes": str_or_empty,
#    }
#).dropna(how="all")
# Remove the extra lines with the bridge samples for now. I don't want to track thta level of detail
#bridge = strains[strains["Designation"] == "Bridge sample"].first_valid_index() - 1
#strains = strains.iloc[0:bridge]

# Add in Strain URL
#strains["see_also"] = strains["Sample CODE"].apply(lambda x: strain_urls.get(x, x))
#strains

#print(strains.to_csv(index=False))

strains_csv = StringIO("""Designation,Strain,Note,Jax Catalog No,Sample CODE,Strain notes,see_also,Source
A,A/J,CC founder,000646,AJ,Yellower adrenal gland.,https://www.jax.org/strain/000646,jackson-labs
B,C57BL/6J,CC founder,000664,B6J,,https://www.jax.org/strain/000664,jackson-labs
C,129S1/SvImJ,CC founder,002448,129S1J,Male skin is tougher than female skin. Gallbladder is more filled in this strain. Adrenals are pale and smaller than A6J.,https://www.jax.org/strain/002448,jackson-labs
D,NOD/ShiLtJ,CC founder,001976,NODJ,,https://www.jax.org/strain/001976,jackson-labs
E,NZO/HlLtJ,CC founder,002105,NZOJ,,https://www.jax.org/strain/002105,jackson-labs
F,CAST/EiJ,CC founder,000928,CASTJ,,https://www.jax.org/strain/000928,jackson-labs
G,PWK/PhJ,CC founder,003715,PWKJ,,https://www.jax.org/strain/003715,jackson-labs
H,WSB/EiJ,CC founder,001145,WSBJ,,https://www.jax.org/strain/001145,jackson-labs
,B6129SF1/J,CC F1,101043,B6129SF1J,,https://www.jax.org/strain/101043,jackson-labs
,B6AF1/J,CC F1,101043,B6AF1J,,https://www.jax.org/strain/100002,jackson-labs
,B6CASTF1/J,CC F1,Custom C56BL/6J crossed CAST/EiJ,B6CASTF1J,,,jackson-labs
,B6NODF1/J,CC F1,Custom C56BL/6J crossed NOD/ShiLtJ,B6NODF1J,,,jackson-labs
,B6NZOF1/J,CC F1,Custom C56BL/6J crossed NZO/HlLtJ,B6NZOF1J,,,jackson-labs
,B6PWKF1/J,CC F1,Custom C56BL/6J crossed PWK/PhJ,B6PWKF1J,,,jackson-labs
,B6WSBF1/J,CC F1,019019,B6WSBF1J,,https://www.jax.org/strain/019019,jackson-labs
,TREM2R47HNSS_HO,CC Mutant,033781,TREM2R47HNSS_HO,,https://www.jax.org/strain/033781,jackson-labs
,CC001,CC Cross,021238,CC001,,https://www.jax.org/strain/021238,unc-csbio
,CC002,CC Cross,021236,CC002,,https://www.jax.org/strain/021236,unc-csbio
,CC003,CC Cross,021237,CC003,,https://www.jax.org/strain/021237,unc-csbio
,CC004,CC Cross,020944,CC004,,https://www.jax.org/strain/020944,unc-csbio
,CC005,CC Cross,020945,CC005,,https://www.jax.org/strain/020945,unc-csbio
,CC006,CC Cross,022869,CC006,,https://www.jax.org/strain/022869,unc-csbio
,CC007,CC Cross,029625,CC007,,https://www.jax.org/strain/029625,unc-csbio
,CC008,CC Cross,026971,CC008,,https://www.jax.org/strain/026971,unc-csbio
,CC009,CC Cross,018856,CC009,,https://www.jax.org/strain/018856,unc-csbio
,CC010,CC Cross,021889,CC010,,https://www.jax.org/strain/021889,unc-csbio
,CC011,CC Cross,018854,CC011,,https://www.jax.org/strain/018854,unc-csbio
,CC012,CC Cross,028409,CC012,,https://www.jax.org/strain/028409,unc-csbio
,CC013,CC Cross,021892,CC013,,https://www.jax.org/strain/021892,unc-csbio
,CC015,CC Cross,018859,CC015,,https://www.jax.org/strain/018859,unc-csbio
,CC017,CC Cross,022870,CC017,,https://www.jax.org/strain/022870,unc-csbio
,CC018,CC Cross,021890,CC018,,https://www.jax.org/strain/021890,unc-csbio
,CC024,CC Cross,021891,CC024,,https://www.jax.org/strain/021891,unc-csbio
,CC025,CC Cross,018857,CC025,,https://www.jax.org/strain/018857,unc-csbio
,CC028,CC Cross,025126,CC028,,https://www.jax.org/strain/025126,unc-csbio
,CC029,CC Cross,026972,CC029,,https://www.jax.org/strain/026972,unc-csbio
,CC030,CC Cross,025426,CC030,,https://www.jax.org/strain/025426,unc-csbio
,CC032,CC Cross,020946,CC032,,https://www.jax.org/strain/020946,unc-csbio
,CC036,CC Cross,025127,CC036,,https://www.jax.org/strain/025127,unc-csbio
,CC037,CC Cross,025423,CC037,,https://www.jax.org/strain/025423,unc-csbio
,CC038,CC Cross,,CC038,,,unc-csbio
,CC041,CC Cross,021893,CC041,,https://www.jax.org/strain/021893,unc-csbio
,CC043,CC Cross,023828,CC043,,https://www.jax.org/strain/023828,unc-csbio
,CC055,CC Cross,,CC055,,,unc-csbio
,CC057,CC Cross,024683,CC057,,https://www.jax.org/strain/024683,unc-csbio
,CC060,CC Cross,026427,CC060,,https://www.jax.org/strain/026427,unc-csbio
,CC062,CC Cross,,CC062,,,unc-csbio
,CC065,CC Cross,,CC065,,,unc-csbio
,CC071,CC Cross,,CC071,,,unc-csbio
,CC074,CC Cross,018855,CC074,,https://www.jax.org/strain/018855,unc-csbio""")

strains = pandas.read_csv(
    strains_csv,
    converters={"Jax Catalog No": str_or_none}
)

for i, row in strains.iterrows():
    assert pandas.notnull(row["Strain"]), f"{row['Strain']} was null {i} {row}"
    assert pandas.notnull(row["Sample CODE"]), f"{row['Sample CODE']} was null {i} {row}"
    assert pandas.notnull(row["Source"]), f"{row['Source']} was null {i} {row}"

strains


Unnamed: 0,Designation,Strain,Note,Jax Catalog No,Sample CODE,Strain notes,see_also,Source
0,A,A/J,CC founder,000646,AJ,Yellower adrenal gland.,https://www.jax.org/strain/000646,jackson-labs
1,B,C57BL/6J,CC founder,000664,B6J,,https://www.jax.org/strain/000664,jackson-labs
2,C,129S1/SvImJ,CC founder,002448,129S1J,Male skin is tougher than female skin. Gallbla...,https://www.jax.org/strain/002448,jackson-labs
3,D,NOD/ShiLtJ,CC founder,001976,NODJ,,https://www.jax.org/strain/001976,jackson-labs
4,E,NZO/HlLtJ,CC founder,002105,NZOJ,,https://www.jax.org/strain/002105,jackson-labs
5,F,CAST/EiJ,CC founder,000928,CASTJ,,https://www.jax.org/strain/000928,jackson-labs
6,G,PWK/PhJ,CC founder,003715,PWKJ,,https://www.jax.org/strain/003715,jackson-labs
7,H,WSB/EiJ,CC founder,001145,WSBJ,,https://www.jax.org/strain/001145,jackson-labs
8,,B6129SF1/J,CC F1,101043,B6129SF1J,,https://www.jax.org/strain/101043,jackson-labs
9,,B6AF1/J,CC F1,101043,B6AF1J,,https://www.jax.org/strain/100002,jackson-labs


In [30]:
strain_code_to_name = strains[["Strain", "Sample CODE"]].set_index("Sample CODE")["Strain"].to_dict()
strain_name_to_code = strains[["Strain", "Sample CODE"]].set_index("Strain")["Sample CODE"].to_dict()

In [31]:
#strain_name_to_code
strain_code_to_name

{'AJ': 'A/J',
 'B6J': 'C57BL/6J',
 '129S1J': '129S1/SvImJ',
 'NODJ': 'NOD/ShiLtJ',
 'NZOJ': 'NZO/HlLtJ',
 'CASTJ': 'CAST/EiJ',
 'PWKJ': 'PWK/PhJ',
 'WSBJ': 'WSB/EiJ',
 'B6129SF1J': 'B6129SF1/J',
 'B6AF1J': 'B6AF1/J',
 'B6CASTF1J': 'B6CASTF1/J',
 'B6NODF1J': 'B6NODF1/J',
 'B6NZOF1J': 'B6NZOF1/J',
 'B6PWKF1J': 'B6PWKF1/J',
 'B6WSBF1J': 'B6WSBF1/J',
 'TREM2R47HNSS_HO': 'TREM2R47HNSS_HO',
 'CC001': 'CC001',
 'CC002': 'CC002',
 'CC003': 'CC003',
 'CC004': 'CC004',
 'CC005': 'CC005',
 'CC006': 'CC006',
 'CC007': 'CC007',
 'CC008': 'CC008',
 'CC009': 'CC009',
 'CC010': 'CC010',
 'CC011': 'CC011',
 'CC012': 'CC012',
 'CC013': 'CC013',
 'CC015': 'CC015',
 'CC017': 'CC017',
 'CC018': 'CC018',
 'CC024': 'CC024',
 'CC025': 'CC025',
 'CC028': 'CC028',
 'CC029': 'CC029',
 'CC030': 'CC030',
 'CC032': 'CC032',
 'CC036': 'CC036',
 'CC037': 'CC037',
 'CC038': 'CC038',
 'CC041': 'CC041',
 'CC043': 'CC043',
 'CC055': 'CC055',
 'CC057': 'CC057',
 'CC060': 'CC060',
 'CC062': 'CC062',
 'CC065': 'CC065',
 'CC

In [32]:
print(strains.to_csv(index=False))

Designation,Strain,Note,Jax Catalog No,Sample CODE,Strain notes,see_also,Source
A,A/J,CC founder,000646,AJ,Yellower adrenal gland.,https://www.jax.org/strain/000646,jackson-labs
B,C57BL/6J,CC founder,000664,B6J,,https://www.jax.org/strain/000664,jackson-labs
C,129S1/SvImJ,CC founder,002448,129S1J,Male skin is tougher than female skin. Gallbladder is more filled in this strain. Adrenals are pale and smaller than A6J.,https://www.jax.org/strain/002448,jackson-labs
D,NOD/ShiLtJ,CC founder,001976,NODJ,,https://www.jax.org/strain/001976,jackson-labs
E,NZO/HlLtJ,CC founder,002105,NZOJ,,https://www.jax.org/strain/002105,jackson-labs
F,CAST/EiJ,CC founder,000928,CASTJ,,https://www.jax.org/strain/000928,jackson-labs
G,PWK/PhJ,CC founder,003715,PWKJ,,https://www.jax.org/strain/003715,jackson-labs
H,WSB/EiJ,CC founder,001145,WSBJ,,https://www.jax.org/strain/001145,jackson-labs
,B6129SF1/J,CC F1,101043,B6129SF1J,,https://www.jax.org/strain/101043,jackson-labs
,B6AF1/J,CC F1,101043,B6AF1J,,https://

In [33]:
strain_type_lookup = {
    "CC founder": models.StrainType.FOUNDER,
    "CC F1": models.StrainType.F1,
    "CC Cross": models.StrainType.CROSS,
    "CC Mutant": models.StrainType.MUTANT,
}

if models.MouseStrain.objects.count() > 0:
    truncate(models.MouseStrain)

# This is the version where I was pulling from Liz's spreadsheet
#for i, row in strains.iterrows():
#    record = models.MouseStrain(
#        name=row["Sample CODE"],
#        display_name=strain_name[row["Sample CODE"]],
#        igvf_id=strain_igvf_id[row["Sample CODE"]],
#        strain_type=strain_type_lookup[row["Note"]],
#        jax_catalog_number=row["Jax Catalog No"],
#        notes=str_or_empty(row["Strain notes"]),
#        see_also=row["see_also"],
#        source=jax_source,
#    )
#    record.save()

sources = {
    "jackson-labs": jax_source,
    "unc-csbio": unc_csbio_source,
}

# this version is from my embedded csv file, so less adjusting needed
for i, row in strains.iterrows():
    record = models.MouseStrain(
        name=row["Sample CODE"],
        display_name=row["Strain"],
        igvf_id=strain_igvf_id[row["Sample CODE"]] if pandas.notnull(row["Sample CODE"]) else None,
        strain_type=strain_type_lookup[row["Note"]],
        jax_catalog_number=str_or_none(row["Jax Catalog No"]),
        notes=str_or_empty(row["Strain notes"]),
        see_also=str_or_none(row["see_also"]),
        source=sources[row["Source"]],
    )
    record.save()


In [34]:
row

Designation                                     NaN
Strain                                        CC074
Note                                       CC Cross
Jax Catalog No                               018855
Sample CODE                                   CC074
Strain notes                                    NaN
see_also          https://www.jax.org/strain/018855
Source                                    unc-csbio
Name: 49, dtype: object

This is closest to the Mouse table

In [35]:
def load_submitted_mice():
    submitted_mice = {
    }

    for accession_prefix in submitted_sheets:
        for i, row in submitted_sheets[accession_prefix]["rodent_donor"].iterrows():
            if not pandas.isnull(row["accession"]):
                rodent_identifier = row["rodent_identifier"]
                if accession_prefix == "igvftst":
                    see_also_template = "https://api.sandbox.igvf.org/rodent-donors/{}/"
                elif accession_prefix == "igvf":
                    see_also_template = "https://api.data.igvf.org/rodent-donors/{}/"
                if pandas.notnull(row["accession"]):
                    submitted_mice.setdefault(rodent_identifier, []).append({
                        "accession_prefix": accession_prefix,
                        "name": row["accession"],
                        "uuid": row["uuid"],
                        "see_also": see_also_template.format(row["accession"]),
                    })

    print(len(submitted_mice))
    return submitted_mice


In [36]:
def estrus_cycle(x):
    if pandas.isnull(x):
        return "NA"
    else:
        return x

    
def load_mouse_tables():
    submitted_mice = load_submitted_mice()
    
    sex_lookup = {
        numpy.nan: models.SexEnum.UNKNOWN,
        "Female": models.SexEnum.FEMALE,
        "Male": models.SexEnum.MALE,
    }

    estrus_stage = {
        "Unknown": models.EstrusCycle.UNKNOWN,
        "Anestrus": models.EstrusCycle.ANESTRUS,
        "Anestrus>Proestrus": models.EstrusCycle.ANESTRUS_PROESTRUS,
        "Proestrus": models.EstrusCycle.PROESTRUS,
        "Proestrus>Estrus": models.EstrusCycle.PROESTRUS_ESTRUS,
        "Estrus": models.EstrusCycle.ESTRUS,
        "Estrus>Metestrus": models.EstrusCycle.ESTRUS_METESTRUS,
        "Metestrus": models.EstrusCycle.METESTRUS,
        "Metestrus>Diestrus": models.EstrusCycle.METESTRUS_DIESTRUS,
        "Diestrus": models.EstrusCycle.DIESTRUS,
        "Diestrus>Proestrus": models.EstrusCycle.DIESTRUS_PROESTRUS,
    }

    mice = pandas.read_excel(
        lizs_sheet_name, 
        sheet_name="mice", 
        usecols=range(0,16),
        converters={
            "Estres cycle stage": estrus_cycle,
            "Dissection ID": int,
            "RMS number": int_or_none,
            "Housing number": int_or_none,
        }
    )

    if models.Mouse.objects.count() > 0:
        truncate(models.Mouse)

    los_angeles_tz = zoneinfo.ZoneInfo("America/Los_Angeles")


    failed = False
    for i, row in mice.iterrows():
        if not (pandas.isnull(row["Mouse Name"]) or pandas.isnull(row["Dissection ID"])):
            name = normalize_strain(row["Mouse Name"])
            strain_code = strain_name_to_code[row["Strain"]]

            if name == "056_WSBJ_10F->PWKJ_9F":
                name = "056_PWKJ_9F"

            try:
                strain = models.MouseStrain.objects.get(name=strain_code)
            except models.MouseStrain.DoesNotExist as e:
                print(f"Unable to find {row['Strain']} for row {i+2} with code {strain_code}")
                failed = True
                continue

            if not name[-1] in ("M", "F"):
                raise ValueError(f"Unrecognized sex field {mouse_name}")

            name_fields = name.split("_")

            # deal with TERM2R47HNSS_HO
            if len(name_fields) > 3:
                name_fields = [name_fields[0], "_".join(name_fields[1:-1]), name_fields[-1]]
            #if len(name_fields) != 3:
            #    raise ValueError(f"Wrong number of attributes in {name} line {i+1}")

            if not name_fields[1] in strain_igvf_id:
                raise ValueError(f"strain background in {name} not recognized line {i+1}")

            housing_number = None if pandas.isnull(row["Housing number"]) else row["Housing number"]

            record = models.Mouse(
                # should i use liz's disection id?
                name=name,
                strain=strain,
                sex=sex_lookup[row["Sex"]],
                weight_g=row["Weight (g)"],
                date_of_birth=row["DOB"].date() if not pandas.isnull(row["DOB"]) else None,
                harvest_date=row["Harvest Date"].date() if not pandas.isnull(row["Harvest Date"]) else None,
                timepoint_description=row["Timepoint"],
                life_stage=models.LifeStageEnum.ADULT,                
                operator=str_or_empty(row["Operator"]),
                notes=str_or_empty(row["Comments"]),
                housing_number=housing_number,
            )

            if not pandas.isnull(row["Estres cycle stage"]):
                # Ignore those transitions.
                stage = row["Estres cycle stage"]
                if stage.endswith("?"):
                    stage = stage[:-1]
                record.estrus_cycle=estrus_stage[stage]

            record.save()

            if name in submitted_mice:
                accessions = []
                for accession_record in submitted_mice[name]:
                    accession = models.Accession(
                        accession_prefix=accession_record["accession_prefix"],
                        name=accession_record["name"],
                        uuid=accession_record["uuid"],
                        see_also=accession_record["see_also"],
                    )
                    accession.save()
                    accessions.append(accession)
                record.accession.set(accessions)
                record.save()

    assert not failed, "Check warning messages"
    
load_mouse_tables()

121


In [37]:
%debug

ERROR:root:No traceback has been produced, nothing to debug.


# Tissues

In [38]:

def load_tissue_sheets():
    tissue_sheets = [
        "Samples - 8 founders", 
        "Samples - Bridge", 
        "Samples - F1s",
        "Samples - CClines",
    ]
    tissues = []
    for tissue_sheet in tissue_sheets:
        tissue = pandas.read_excel(
            lizs_sheet_name, 
            sheet_name=tissue_sheet,
            header=0,
        ).dropna(axis=0, how="all")
        if "tube+tissue wight (g)" in tissue.columns:
            print(f"{tissue_sheet} has typoed wight column, renaming to weight")
            tissue = tissue.rename(columns={
                "tube+tissue wight (g)": "tube+tissue weight (g)"
            })
        print(tissue_sheet, tissue.shape)
        tissues.append(tissue)

    combined = pandas.concat(tissues)
    assert "tube+tissue wight (g)" not in combined.columns
    return combined

_ = load_tissue_sheets()

Samples - 8 founders (1621, 16)
Samples - Bridge (416, 23)
Samples - F1s (1920, 18)
Samples - CClines (5720, 18)


In [39]:
def parse_timepoint(value):
    value, units = value.split(" ")
    value = float(value)
    
    unit_map = {
        "day": models.AgeUnitsEnum.DAY,
        "days": models.AgeUnitsEnum.DAY,
        "week": models.AgeUnitsEnum.WEEK,
        "weeks": models.AgeUnitsEnum.WEEK,
        "month": models.AgeUnitsEnum.MONTH,
        "months": models.AgeUnitsEnum.MONTH,
    }
    return (value, unit_map[units])


In [40]:
ontology_map = {}
for record in models.OntologyTerm.objects.all():
    ontology_map[record.curie] = record

In [41]:
def load_submitted_tissues():
    submitted_tissues = {}

    for accession_prefix in submitted_sheets:
        for i, row in submitted_sheets[accession_prefix]["tissue"].iterrows():
            if not pandas.isnull(row["accession"]):
                if accession_prefix == "igvftst":
                    see_also_template = "https://api.sandbox.igvf.org/tissues/{}/"
                elif accession_prefix == "igvf":
                    see_also_template = "https://api.data.igvf.org/tissues/{}/"

                aliases = row["aliases:array"].split(",")
                assert len(aliases) == 1
                alias_fields = aliases[0][len("ali-mortazavi:"):].split("_")
                tissue_id = "_".join(alias_fields[0:4])
                if pandas.notnull(row["accession"]):
                    submitted_tissues.setdefault(tissue_id, []).append({
                        "accession_prefix": accession_prefix,
                        "name": row["accession"],
                        "uuid": row["uuid"],
                        "see_also": see_also_template.format(row["accession"]),
                    })

    print(len(submitted_tissues))
    return submitted_tissues

submitted_tissues = load_submitted_tissues()
#len(submitted_tissues)

692


In [42]:
for key in submitted_tissues:
    value = submitted_tissues[key]
    if len(value) != 1:
        print(key, value)

016_B6J_10F_03 [{'accession_prefix': 'igvftst', 'name': 'TSTSM21751020', 'uuid': '5093a872-bf44-42c5-abfd-a99dd3d6df2c', 'see_also': 'https://api.sandbox.igvf.org/tissues/TSTSM21751020/'}, {'accession_prefix': 'igvftst', 'name': 'TSTSM20935867', 'uuid': '8ac461bf-8d25-4f0e-bd17-f5043cb96ce5', 'see_also': 'https://api.sandbox.igvf.org/tissues/TSTSM20935867/'}, {'accession_prefix': 'igvf', 'name': 'IGVFSM2747ZOJJ', 'uuid': '12089b43-dc3c-40cd-b7b7-de5934949a94', 'see_also': 'https://api.data.igvf.org/tissues/IGVFSM2747ZOJJ/'}, {'accession_prefix': 'igvf', 'name': 'IGVFSM9909OKAR', 'uuid': '3ecee650-301b-481f-bc5e-1a46dafd4542', 'see_also': 'https://api.data.igvf.org/tissues/IGVFSM9909OKAR/'}]
017_B6J_10M_03 [{'accession_prefix': 'igvftst', 'name': 'TSTSM58133825', 'uuid': '56f28252-6e1a-4367-bd0d-3bdd75abb6f2', 'see_also': 'https://api.sandbox.igvf.org/tissues/TSTSM58133825/'}, {'accession_prefix': 'igvftst', 'name': 'TSTSM51811428', 'uuid': 'ce1bfdf4-cacb-4859-a79f-5da312178a27', 'see_a

069_NODJ_10M_03 [{'accession_prefix': 'igvftst', 'name': 'TSTSM70776988', 'uuid': '64c37487-bc8c-4fbf-a70d-4815014ee4e3', 'see_also': 'https://api.sandbox.igvf.org/tissues/TSTSM70776988/'}, {'accession_prefix': 'igvftst', 'name': 'TSTSM04240735', 'uuid': 'b54afd50-51d6-4b6f-b4ae-41e5fb04a26c', 'see_also': 'https://api.sandbox.igvf.org/tissues/TSTSM04240735/'}, {'accession_prefix': 'igvf', 'name': 'IGVFSM2210CUOC', 'uuid': '3237d42d-9f30-412c-be67-97f43cd605bb', 'see_also': 'https://api.data.igvf.org/tissues/IGVFSM2210CUOC/'}, {'accession_prefix': 'igvf', 'name': 'IGVFSM3655BTER', 'uuid': '2b03b94c-c611-44c1-98c6-7aa18f1641c3', 'see_also': 'https://api.data.igvf.org/tissues/IGVFSM3655BTER/'}]
070_NODJ_10F_03 [{'accession_prefix': 'igvftst', 'name': 'TSTSM16736591', 'uuid': '7f713377-43fd-4ea7-901c-35c3e50e8a77', 'see_also': 'https://api.sandbox.igvf.org/tissues/TSTSM16736591/'}, {'accession_prefix': 'igvftst', 'name': 'TSTSM83682213', 'uuid': '7f390718-5684-4145-b3b5-85284e482727', 'see

032_AJ_10F_03 [{'accession_prefix': 'igvftst', 'name': 'TSTSM80577942', 'uuid': '6d8e091d-a63e-4464-b944-2785039b98a6', 'see_also': 'https://api.sandbox.igvf.org/tissues/TSTSM80577942/'}, {'accession_prefix': 'igvftst', 'name': 'TSTSM57113277', 'uuid': '26bdccbd-1bba-4691-aab9-8e6228442673', 'see_also': 'https://api.sandbox.igvf.org/tissues/TSTSM57113277/'}, {'accession_prefix': 'igvf', 'name': 'IGVFSM3195CJCF', 'uuid': 'cf59909d-c385-4e5c-a1c7-10dcdc2661f2', 'see_also': 'https://api.data.igvf.org/tissues/IGVFSM3195CJCF/'}, {'accession_prefix': 'igvf', 'name': 'IGVFSM6726YGIG', 'uuid': '18411a3d-e5dc-40bd-afe5-fcc9d0009ea4', 'see_also': 'https://api.data.igvf.org/tissues/IGVFSM6726YGIG/'}]
035_AJ_10M_03 [{'accession_prefix': 'igvftst', 'name': 'TSTSM59466810', 'uuid': 'de3f2622-b21c-4521-9496-04c96805f19f', 'see_also': 'https://api.sandbox.igvf.org/tissues/TSTSM59466810/'}, {'accession_prefix': 'igvftst', 'name': 'TSTSM93284518', 'uuid': '2eda64f7-e9e3-42fe-a6a7-c68fa659e718', 'see_als

037_129S1J_10M_03 [{'accession_prefix': 'igvftst', 'name': 'TSTSM79359987', 'uuid': '24278b79-3600-4fd8-8ef3-55f78bbe77be', 'see_also': 'https://api.sandbox.igvf.org/tissues/TSTSM79359987/'}, {'accession_prefix': 'igvftst', 'name': 'TSTSM14025651', 'uuid': '2657fe8f-9409-4cdd-b8d3-522bd1135eac', 'see_also': 'https://api.sandbox.igvf.org/tissues/TSTSM14025651/'}, {'accession_prefix': 'igvf', 'name': 'IGVFSM0860OBUU', 'uuid': '9ce6f4ec-0e70-4cce-ab32-b2fe0bc9e97b', 'see_also': 'https://api.data.igvf.org/tissues/IGVFSM0860OBUU/'}, {'accession_prefix': 'igvf', 'name': 'IGVFSM2419NKNV', 'uuid': '44fe44ad-3ce9-466c-97c5-8b8936d14106', 'see_also': 'https://api.data.igvf.org/tissues/IGVFSM2419NKNV/'}]
038_129S1J_10F_03 [{'accession_prefix': 'igvftst', 'name': 'TSTSM46396178', 'uuid': '8ef01892-7f1b-4ea2-9b50-5d827e1f1d72', 'see_also': 'https://api.sandbox.igvf.org/tissues/TSTSM46396178/'}, {'accession_prefix': 'igvftst', 'name': 'TSTSM52735813', 'uuid': '21859d64-6f62-4cfc-b51e-fb4280809d4a', 

090_CASTJ_10F_03 [{'accession_prefix': 'igvftst', 'name': 'TSTSM38489848', 'uuid': '5ec2ee20-ad68-4a12-ada9-ba63bd84b113', 'see_also': 'https://api.sandbox.igvf.org/tissues/TSTSM38489848/'}, {'accession_prefix': 'igvftst', 'name': 'TSTSM57565332', 'uuid': '03d61e1c-9c02-4669-9129-2c0c9ad0ad21', 'see_also': 'https://api.sandbox.igvf.org/tissues/TSTSM57565332/'}, {'accession_prefix': 'igvf', 'name': 'IGVFSM9194WULA', 'uuid': 'e97872f8-da47-4980-8a75-0426a69ea189', 'see_also': 'https://api.data.igvf.org/tissues/IGVFSM9194WULA/'}, {'accession_prefix': 'igvf', 'name': 'IGVFSM3391TANO', 'uuid': '4a773267-7169-43a1-9a01-67fdd84f0351', 'see_also': 'https://api.data.igvf.org/tissues/IGVFSM3391TANO/'}]
091_CASTJ_10M_03 [{'accession_prefix': 'igvftst', 'name': 'TSTSM62101456', 'uuid': 'd7a9a951-4843-4ff0-b909-26d3098c87f7', 'see_also': 'https://api.sandbox.igvf.org/tissues/TSTSM62101456/'}, {'accession_prefix': 'igvftst', 'name': 'TSTSM66782725', 'uuid': 'a9aa2edc-a47e-4dce-9b4e-daa393396a2d', 's

063_WSBJ_10M_03 [{'accession_prefix': 'igvftst', 'name': 'TSTSM55897972', 'uuid': '1e20fa9c-ea22-4766-a1df-d8b53af753b6', 'see_also': 'https://api.sandbox.igvf.org/tissues/TSTSM55897972/'}, {'accession_prefix': 'igvftst', 'name': 'TSTSM15834985', 'uuid': '07bb5012-908c-416a-95ba-64010ff3cf20', 'see_also': 'https://api.sandbox.igvf.org/tissues/TSTSM15834985/'}, {'accession_prefix': 'igvf', 'name': 'IGVFSM8727ZEXW', 'uuid': '857c0d4c-64a5-42bb-8d6f-189ce3460106', 'see_also': 'https://api.data.igvf.org/tissues/IGVFSM8727ZEXW/'}, {'accession_prefix': 'igvf', 'name': 'IGVFSM1039KCGC', 'uuid': 'bbc4bbb1-c7ef-4c6b-9526-ed8daae6c024', 'see_also': 'https://api.data.igvf.org/tissues/IGVFSM1039KCGC/'}]
046_NZOJ_10F_03 [{'accession_prefix': 'igvftst', 'name': 'TSTSM24255023', 'uuid': 'dfff0205-ca9b-47a5-8e77-41fc29ffecf1', 'see_also': 'https://api.sandbox.igvf.org/tissues/TSTSM24255023/'}, {'accession_prefix': 'igvftst', 'name': 'TSTSM72017255', 'uuid': '3adad841-b470-457f-986e-c4b150107e70', 'see

068_NODJ_10F_06 [{'accession_prefix': 'igvftst', 'name': 'TSTSM71072025', 'uuid': '21c29c17-6f6e-47f2-a23e-430fd41d9e30', 'see_also': 'https://api.sandbox.igvf.org/tissues/TSTSM71072025/'}, {'accession_prefix': 'igvf', 'name': 'IGVFSM6061AIFT', 'uuid': 'c8dedd9c-8424-4846-a347-fbfe73c2531a', 'see_also': 'https://api.data.igvf.org/tissues/IGVFSM6061AIFT/'}]
019_B6J_10M_06 [{'accession_prefix': 'igvftst', 'name': 'TSTSM46884051', 'uuid': '4f8c3b79-f670-4d89-adbe-95eb3131854d', 'see_also': 'https://api.sandbox.igvf.org/tissues/TSTSM46884051/'}, {'accession_prefix': 'igvf', 'name': 'IGVFSM3819GVRA', 'uuid': '17a90e67-0181-48a4-ad10-b4eacd8b632d', 'see_also': 'https://api.data.igvf.org/tissues/IGVFSM3819GVRA/'}]
069_NODJ_10M_06 [{'accession_prefix': 'igvftst', 'name': 'TSTSM04142593', 'uuid': '26ec404b-af28-46af-b84c-0300057c8a06', 'see_also': 'https://api.sandbox.igvf.org/tissues/TSTSM04142593/'}, {'accession_prefix': 'igvf', 'name': 'IGVFSM9555SHCP', 'uuid': '2cd63182-4ad1-4d86-8217-4e497

081_PWKJ_10M_06 [{'accession_prefix': 'igvftst', 'name': 'TSTSM92869047', 'uuid': '118f6311-3fc4-4006-8245-e3f16c44616a', 'see_also': 'https://api.sandbox.igvf.org/tissues/TSTSM92869047/'}, {'accession_prefix': 'igvf', 'name': 'IGVFSM3449RSOM', 'uuid': 'aba5e223-75b7-4180-8d99-dc96c28c05cd', 'see_also': 'https://api.data.igvf.org/tissues/IGVFSM3449RSOM/'}]
032_AJ_10F_06 [{'accession_prefix': 'igvftst', 'name': 'TSTSM79829816', 'uuid': '2072f0ab-f6b8-4937-8bc4-d0d441894a51', 'see_also': 'https://api.sandbox.igvf.org/tissues/TSTSM79829816/'}, {'accession_prefix': 'igvf', 'name': 'IGVFSM2048BQHM', 'uuid': 'bc124c8c-160f-4e8b-8cd0-b2d91cc7e03a', 'see_also': 'https://api.data.igvf.org/tissues/IGVFSM2048BQHM/'}]
084_PWKJ_10F_06 [{'accession_prefix': 'igvftst', 'name': 'TSTSM52490317', 'uuid': '3b050af2-f7dd-4ee0-bb77-2480a4b4c565', 'see_also': 'https://api.sandbox.igvf.org/tissues/TSTSM52490317/'}, {'accession_prefix': 'igvf', 'name': 'IGVFSM5687CRRX', 'uuid': '8cc97f35-a489-446c-a1b9-20d291

046_NZOJ_10F_06 [{'accession_prefix': 'igvftst', 'name': 'TSTSM56267464', 'uuid': '7cbf667d-8e6b-4136-a86d-39736d913051', 'see_also': 'https://api.sandbox.igvf.org/tissues/TSTSM56267464/'}, {'accession_prefix': 'igvf', 'name': 'IGVFSM0510YNAJ', 'uuid': '935b1909-c93c-452f-8511-001e4d66958f', 'see_also': 'https://api.data.igvf.org/tissues/IGVFSM0510YNAJ/'}]
057_WSBJ_10M_06 [{'accession_prefix': 'igvftst', 'name': 'TSTSM25214382', 'uuid': 'fe31cebf-74f0-452c-a375-8c39654fc11f', 'see_also': 'https://api.sandbox.igvf.org/tissues/TSTSM25214382/'}, {'accession_prefix': 'igvf', 'name': 'IGVFSM1456DXFQ', 'uuid': 'cd8afd32-4cdd-496a-9147-1c7f9f5f7c49', 'see_also': 'https://api.data.igvf.org/tissues/IGVFSM1456DXFQ/'}]
047_NZOJ_10M_06 [{'accession_prefix': 'igvftst', 'name': 'TSTSM83837273', 'uuid': 'c5def42a-2e25-493d-b51b-fc1b48e0331f', 'see_also': 'https://api.sandbox.igvf.org/tissues/TSTSM83837273/'}, {'accession_prefix': 'igvf', 'name': 'IGVFSM7577ZSZL', 'uuid': '1991df86-8766-43b4-819b-a022

107_CASTJ_10M_21 [{'accession_prefix': 'igvftst', 'name': 'TSTSM56825988', 'uuid': 'cfd1ca9e-6ed3-4eb1-bb1d-7b954b5d58e0', 'see_also': 'https://api.sandbox.igvf.org/tissues/TSTSM56825988/'}, {'accession_prefix': 'igvf', 'name': 'IGVFSM5157GFSA', 'uuid': 'edd70761-6e0b-46d7-bda1-e34c5a3b2266', 'see_also': 'https://api.data.igvf.org/tissues/IGVFSM5157GFSA/'}]
092_CASTJ_10F_03 [{'accession_prefix': 'igvf', 'name': 'IGVFSM4189AKUL', 'uuid': '8cb2f317-879e-4ab1-82fd-0b95c3e660e0', 'see_also': 'https://api.data.igvf.org/tissues/IGVFSM4189AKUL/'}, {'accession_prefix': 'igvf', 'name': 'IGVFSM0322EOHO', 'uuid': '862fc166-261c-4e78-b219-611fe387a33d', 'see_also': 'https://api.data.igvf.org/tissues/IGVFSM0322EOHO/'}]
016_B6J_10F_01 [{'accession_prefix': 'igvf', 'name': 'IGVFSM4465YOJN', 'uuid': 'd296f0d1-265c-426b-a808-95c00b827243', 'see_also': 'https://api.data.igvf.org/tissues/IGVFSM4465YOJN/'}, {'accession_prefix': 'igvf', 'name': 'IGVFSM5984CJUU', 'uuid': 'b28f2e52-2d05-4743-865b-1cbfef1844b

078_PWKJ_10F_01 [{'accession_prefix': 'igvf', 'name': 'IGVFSM9111MIVH', 'uuid': '09bdfe37-27e8-4e14-a8b7-d2ddd7bbc7ac', 'see_also': 'https://api.data.igvf.org/tissues/IGVFSM9111MIVH/'}, {'accession_prefix': 'igvf', 'name': 'IGVFSM2146SQAW', 'uuid': '2524a727-bbc3-48b8-a40f-fadf802233ad', 'see_also': 'https://api.data.igvf.org/tissues/IGVFSM2146SQAW/'}]
031_AJ_10M_01 [{'accession_prefix': 'igvf', 'name': 'IGVFSM6893IKIH', 'uuid': '61a2b143-b341-437e-8117-ea685322676d', 'see_also': 'https://api.data.igvf.org/tissues/IGVFSM6893IKIH/'}, {'accession_prefix': 'igvf', 'name': 'IGVFSM3351WODP', 'uuid': '7e190df1-ae82-4925-bc22-1ec75270b252', 'see_also': 'https://api.data.igvf.org/tissues/IGVFSM3351WODP/'}]
079_PWKJ_10M_01 [{'accession_prefix': 'igvf', 'name': 'IGVFSM3536WDNR', 'uuid': 'd18320cc-ed7a-4d10-8a8f-d918b492ab8d', 'see_also': 'https://api.data.igvf.org/tissues/IGVFSM3536WDNR/'}, {'accession_prefix': 'igvf', 'name': 'IGVFSM3744QPBB', 'uuid': 'eaab5bfc-ca1e-415b-b41d-c652530c842b', 'se

044_129S1J_10F_01 [{'accession_prefix': 'igvf', 'name': 'IGVFSM6738OHZO', 'uuid': '83e218de-7b42-426b-81be-65127043e3e0', 'see_also': 'https://api.data.igvf.org/tissues/IGVFSM6738OHZO/'}, {'accession_prefix': 'igvf', 'name': 'IGVFSM0539MTAN', 'uuid': 'b49b3933-3e97-4507-9a8a-c122e6a8bc06', 'see_also': 'https://api.data.igvf.org/tissues/IGVFSM0539MTAN/'}]
094_CASTJ_10F_01 [{'accession_prefix': 'igvf', 'name': 'IGVFSM0021WQYA', 'uuid': 'da950e21-1ab3-4705-9511-5dfe7dc4176f', 'see_also': 'https://api.data.igvf.org/tissues/IGVFSM0021WQYA/'}, {'accession_prefix': 'igvf', 'name': 'IGVFSM1257AOMZ', 'uuid': 'd959cea7-a785-401c-aa5e-89c07cdb5de7', 'see_also': 'https://api.data.igvf.org/tissues/IGVFSM1257AOMZ/'}]
045_129S1J_10M_01 [{'accession_prefix': 'igvf', 'name': 'IGVFSM6742HLKN', 'uuid': '649ec13f-97a1-4398-9148-f381fddd6df0', 'see_also': 'https://api.data.igvf.org/tissues/IGVFSM6742HLKN/'}, {'accession_prefix': 'igvf', 'name': 'IGVFSM1996CIIR', 'uuid': '9fce9d03-d51b-49cf-b8e1-da310b5fe33

In [43]:
def validate_tissue_genotype(tissue, genotype):
    normalized_genotype = normalize_strain(genotype)
    assert tissue == normalized_genotype, f"{tissue} != {normalized_genotype} ({genotype})"

In [44]:
def load_tissues(strain_code_to_name):
    if models.Tissue.objects.count() > 0:
        truncate(models.Tissue)

    tissues = load_tissue_sheets()
    submitted_tissues = load_submitted_tissues()
    los_angeles_tz = zoneinfo.ZoneInfo("America/Los_Angeles")
    
    failed = False
    for i, row in tissues.iterrows():
        if not pandas.isnull(row["Tissue"]) and row["IGVF Sample BOX"] != "DNA Extraction Optimization":
            mouse_tissue_id_label = "Mouse_Tissue ID"

            mouse_tissue_name = normalize_strain(row[mouse_tissue_id_label])
            mouse_tissue_name_fields = parse_mouse_tissue(mouse_tissue_name)

            if not mouse_tissue_name_fields.mouse_strain in strain_code_to_name:
                raise ValueError(f"tissue strain field in {mouse_tissue_name} not recognized")

            mouse_name = "_".join(mouse_tissue_name_fields[0:3])
            try:
                mouse = models.Mouse.objects.get(name=mouse_name)
            except models.Mouse.DoesNotExist:
                print("row {}, {} was not found".format(i+2, mouse_name))
                failed = True
                continue

            mouse_tissue = "_".join(mouse_tissue_name_fields)

            genotype = row["Genotype"]

            # this is the "label swap" on spreadsheet rows 602-605.
            if mouse_name == "092_CASTJ_10F":
                genotype = "CASTJ"
            # this is the other half the swap on spreadsheet rows 1522-1525
            elif mouse_name == "046_NZOJ_10F":
                genotype = 'NZOJ'

            validate_tissue_genotype(mouse.strain.name, genotype)

            tissue_terms = []
            for term_curie, term_name in tissue_dissection_to_ontology_map[row["Tissue"]]:
                tissue_terms.append(ontology_map[term_curie])

            #age, age_units = parse_timepoint(row["Timepoint"])

            if pandas.isnull(row["Approx. sac time"]):
                sac_time = datetime.time(0,0,0)
            else:
                sac_time = datetime.time(
                    row["Approx. sac time"].hour,
                    row["Approx. sac time"].minute,
                    row["Approx. sac time"].second,    
                )


            record = models.Tissue(
                mouse=mouse,
                name = mouse_tissue,
                description = row["Tissue"],
                #dissection_time=dissection,
                #age=age,
                #age_units=age_units,
                tube_label=row["Tube label"],
                dissector=str_or_empty(row["Dissector"]),
                dissection_notes=str_or_empty(row["Comment"]),
            )

            if not pandas.isnull(row["Dissection date"]):
                record.dissection = datetime.datetime(
                    row["Dissection date"].year,
                    row["Dissection date"].month,
                    row["Dissection date"].day,
                    sac_time.hour,
                    sac_time.minute,
                    sac_time.second,
                    tzinfo=los_angeles_tz,
                )

            tube_weight_label = "tube weight (g)"
            if not pandas.isnull(row[tube_weight_label]):
                record.tube_weight_g = float(row[tube_weight_label])

            total_weight_label = "tube+tissue weight (g)"
            if not pandas.isnull(row[total_weight_label]):
                record.total_weight_g = float(row[total_weight_label])

            record.save()
            record.ontology_term.set(tissue_terms)

            if row[mouse_tissue_id_label] in submitted_tissues:
                accessions = []
                for accession_row in submitted_tissues[row[mouse_tissue_id_label]]:
                    if pandas.notnull(accession_row["name"]):
                        accession = models.Accession(
                            accession_prefix=accession_row["accession_prefix"],
                            name=accession_row["name"],
                            uuid=accession_row["uuid"],
                            see_also=accession_row["see_also"],
                        )
                        accession.save()
                        accessions.append(accession)
                record.accession.set(accessions)
            record.save()

    assert not failed, "Check warning messages."
    
load_tissues(strain_code_to_name)

Samples - 8 founders (1621, 16)
Samples - Bridge (416, 23)
Samples - F1s (1920, 18)
Samples - CClines (5720, 18)
692


In [45]:
#submitted_tissues[row[mouse_tissue_id_label]], type(submitted_tissues[row[mouse_tissue_id_label]])

# Information from Samples into experiment

- Total barcoded nuclei (Samples into experiment)

Is this FixedTissue, FixedSample, FixedBiosample?



In [46]:
if models.FixedSample.objects.count() > 0:
    truncate(models.FixedSample)


In [47]:
def load_samples_into_experiment(book_name, sheet_name, header=0):
    samples_into_experiment = pandas.read_excel(
        book_name, 
        sheet_name=sheet_name,
        header=header,
        index_col=None,
    ).dropna(how="all")

    # was trying to normalize two row hierarchical headers, 
    #level0 = []
    #level1 = []
    #for header in samples_into_experiment.columns:
    #    level0.append("" if header[0].startswith("Unnamed: ") else header[0])
    #    level1.append(header[1])
    #samples_into_experiment.columns = pandas.MultiIndex.from_arrays([level0, level1], names=["phase", "name"])

    one_based = 1
    header_lines = 2
    failed = False
    for i, row in samples_into_experiment.iterrows():
        line_no = i + one_based + header_lines
        box_name = row["IGVF Fixation BOX"]
        if isinstance(box_name, str):
            box_name = box_name.strip()

        if box_name in ["IGVF_FIX_001", "IGVF_FIX_002"]:
            continue
            
        if pandas.isnull(row["Mouse_Tissue ID"]):
            continue

        tissue_id = normalize_strain(row["Mouse_Tissue ID"])
        pooled_from = row.get("pooled_from")

        if pandas.isnull(tissue_id):
            continue

        #if (tissue_id.endswith("_25") or tissue_id.endswith("_26")) and pandas.isnull(pooled_from):
        #    print(f"How are we going to merge these samples? {tissue_id}")
        #    continue

        if pandas.isnull(pooled_from):
            pooled_from = [tissue_id]
        else:
            pooled_from = [normalize_strain(pooled_id.strip()) for pooled_id in pooled_from.split(",")]

        tissues = []
        for pooled_id in pooled_from:
            try:
                tissues.append(models.Tissue.objects.get(name=pooled_id))
            except models.Tissue.DoesNotExist:
                print(f"Tissue {pooled_id} not found in tissue table on {line_no}")
                failed = True

        weight = row["weight (mg)"]
        fixation_date = row["Fixation date"]
        notes = row["Notes"]

        if weight == "#VALUE!":
            pass
        elif weight < 0:
            pass
        elif pandas.isnull(weight):
            pass
        else:
            total_weight = 0
            for tissue in tissues:
                if not pandas.isnull(tissue.weight_mg):
                    total_weight += tissue.weight_mg

            if not numpy.isclose(total_weight, weight):
                print(f"Sum of tissue weights from {pooled_from} {total_weight} doesn't match {weight}. {line_no}")

        sample_id = normalize_strain(row["Sample ID"]) if not pandas.isnull(row["Sample ID"]) else None
        if not (pandas.isnull(sample_id) or pandas.isnull(tissue_id)):
            if tissue_id != sample_id:
                print(f"If defined, {tissue_id} should equal {sample_id} line {line_no}")
                failed = True

        cap_label = row["Cap label"]
        if not (pandas.isnull(cap_label) or pandas.isnull(sample_id)):
            sample_fields = sample_id.split("_")
            predicted_label = "_".join([sample_fields[0], sample_fields[-1]])
            if predicted_label != cap_label:
                print(f"{cap_label} should equal {predicted_label} line {line_no}")
                failed = True

        well_id = row["wells in Barcoding plate"]

        record = models.FixedSample(
            name=tissue_id,
            tube_label=cap_label,
            fixation_name=box_name,
            fixation_date=date_or_none(row["Fixation date"]),
            starting_nuclei=int_or_none(row["Total nuclei (x10^6)"]),
            nuclei_into_fixation=int_or_none(row["Nuclei into fixation (x10^6)"]),
            fixed_nuclei=int_or_none(row["Total fixed nuclei (x10^6)"]),
            aliquots_made=int_or_none(row["# aliquots"]),
            aliquot_volume_ul=float_or_none(row["uL per aliquot"]),
        )
        record.save()
        record.tissue.set(tissues)
        record.save()

    assert not failed, "Check warning messages"

    

In [48]:
load_samples_into_experiment(lizs_sheet_name, "Founder Samples into experiment", header=1)
    

Sum of tissue weights from ['046_NZOJ_10F_17'] 116.0 doesn't match 190.0. 558
Sum of tissue weights from ['095_CASTJ_10M_08'] 11.0 doesn't match 15.0. 749


In [49]:
load_samples_into_experiment(lizs_sheet_name, "Bridge samples into experiment", header=0)

In [50]:
load_samples_into_experiment(lizs_sheet_name, "F1 Samples into experiment", header=1)

Sum of tissue weights from ['127_B6AF1J_10M_05'] 1075.0 doesn't match 966.0. 205
Sum of tissue weights from ['128_B6AF1J_10F_05'] 873.0 doesn't match 1078.0. 206
Sum of tissue weights from ['129_B6AF1J_10M_05'] 1139.0 doesn't match 946.0. 207
Sum of tissue weights from ['130_B6AF1J_10F_05'] 66.1 doesn't match 1577.0. 208
Sum of tissue weights from ['131_B6AF1J_10M_05'] 1525.0 doesn't match 748.0. 209
Sum of tissue weights from ['132_B6AF1J_10F_05'] 888.0 doesn't match 1418.0. 210
Sum of tissue weights from ['133_B6AF1J_10M_05'] 0 doesn't match 857.0. 211
Sum of tissue weights from ['134_B6AF1J_10F_05'] 0 doesn't match 1351.0. 212
Sum of tissue weights from ['143_B6129SF1J_10M_05'] 1288.0 doesn't match 603.0. 213
Sum of tissue weights from ['144_B6129SF1J_10F_05'] 995.0 doesn't match 1248.0. 214
Sum of tissue weights from ['145_B6129SF1J_10M_05'] 1259.0 doesn't match 776.0. 215
Sum of tissue weights from ['146_B6129SF1J_10F_05'] 974.0 doesn't match 1216.0. 216
Sum of tissue weights from

Sum of tissue weights from ['194_B6CASTF1J_10F_01'] 90.0 doesn't match 63.0. 340
Sum of tissue weights from ['195_B6CASTF1J_10M_01'] 87.0 doesn't match 66.0. 341
Sum of tissue weights from ['196_B6CASTF1J_10F_01'] 86.0 doesn't match 82.0. 342
Sum of tissue weights from ['197_B6CASTF1J_10M_01'] 98.0 doesn't match 54.0. 343
Sum of tissue weights from ['198_B6CASTF1J_10F_01'] 84.0 doesn't match 60.0. 344
Sum of tissue weights from ['207_B6PWKF1J_10M_01'] 0 doesn't match 47.0. 345
Sum of tissue weights from ['208_B6PWKF1J_10F_01'] 0 doesn't match 52.0. 346
Sum of tissue weights from ['213_B6PWKF1J_10M_01'] 93.0 doesn't match 66.0. 347
Sum of tissue weights from ['210_B6PWKF1J_10F_01'] 0 doesn't match 57.0. 348
Sum of tissue weights from ['215_B6PWKF1J_10M_01'] 96.0 doesn't match 53.0. 349
Sum of tissue weights from ['212_B6PWKF1J_10F_01'] 0 doesn't match 86.0. 350
Sum of tissue weights from ['217_B6PWKF1J_10M_01'] 107.0 doesn't match 53.0. 351
Sum of tissue weights from ['214_B6PWKF1J_10F_

Sum of tissue weights from ['240_TREM2R47HNSS_HO_10F_10', '240_TREM2R47HNSS_HO_10F_11'] 0 doesn't match 334.0. 462
Sum of tissue weights from ['241_TREM2R47HNSS_HO_10M_27', '241_TREM2R47HNSS_HO_10M_29'] 0 doesn't match 43.0. 463
Sum of tissue weights from ['242_TREM2R47HNSS_HO_10F_10', '242_TREM2R47HNSS_HO_10F_11'] 0 doesn't match 365.0. 464
Sum of tissue weights from ['243_TREM2R47HNSS_HO_10M_27', '243_TREM2R47HNSS_HO_10M_29'] 0 doesn't match 30.0. 465
Sum of tissue weights from ['244_TREM2R47HNSS_HO_10F_10', '244_TREM2R47HNSS_HO_10F_11'] 0 doesn't match 341.0. 466
Sum of tissue weights from ['245_TREM2R47HNSS_HO_10M_27', '245_TREM2R47HNSS_HO_10M_29'] 0 doesn't match 40.0. 467
Sum of tissue weights from ['248_TREM2R47HNSS_HO_10F_10', '248_TREM2R47HNSS_HO_10F_11'] 0 doesn't match 370.0. 468
Sum of tissue weights from ['127_B6AF1J_10M_08'] 0 doesn't match 15.0. 505
Sum of tissue weights from ['128_B6AF1J_10F_08'] 0 doesn't match 13.0. 506
Sum of tissue weights from ['129_B6AF1J_10M_08']

Sum of tissue weights from ['183_B6NZOF1J_10M_31'] 0 doesn't match 240.0. 635
Sum of tissue weights from ['182_B6NZOF1J_10F_31'] 0 doesn't match 326.0. 636
Sum of tissue weights from ['191_B6CASTF1J_10M_31'] 0 doesn't match 267.0. 637
Sum of tissue weights from ['192_B6CASTF1J_10F_31'] 0 doesn't match 400.0. 638
Sum of tissue weights from ['193_B6CASTF1J_10M_31'] 0 doesn't match 316.0. 639
Sum of tissue weights from ['194_B6CASTF1J_10F_31'] 0 doesn't match 415.0. 640
Sum of tissue weights from ['195_B6CASTF1J_10M_31'] 0 doesn't match 276.0. 641
Sum of tissue weights from ['196_B6CASTF1J_10F_31'] 0 doesn't match 409.0. 642
Sum of tissue weights from ['197_B6CASTF1J_10M_31'] 0 doesn't match 310.0. 643
Sum of tissue weights from ['198_B6CASTF1J_10F_31'] 0 doesn't match 410.0. 644
Sum of tissue weights from ['207_B6PWKF1J_10M_31'] 0 doesn't match 178.0. 645
Sum of tissue weights from ['208_B6PWKF1J_10F_31'] 0 doesn't match 230.0. 646
Sum of tissue weights from ['213_B6PWKF1J_10M_31'] 0 doe

In [51]:
# According to Liz this sheet isn't ready yet.
# load_samples_into_experiment(lizs_sheet_name, "CC Samples into experiment", header=1)

# Plate layout

(the hard thing)

In [52]:
WellContent = namedtuple("well_content", ["genotype", "tissue_id"])


def get_genotype_from_tissue(tissue_id):
    assert pandas.notnull(tissue_id)

    fields = tissue_id.split("_")
    assert len(fields) == 4, f"Field label validation fail {tissue_id}, not enough fields {len(fields)}"

    return normalize_strain(fields[1])

assert get_genotype_from_tissue("016_B6J_10F_20") == "B6J"
assert get_genotype_from_tissue("046_NZOJ_10F_03") == "NZOJ"
assert get_genotype_from_tissue("198_B6CASTF1/J_10F_20") == "B6CASTF1J"



def validate_tissue_ids(contents, expected_genotypes):
    for expected, (row_index, row) in zip(expected_genotypes, contents.iterrows()):
        for cell in row:
            fields = cell.split("_")
            assert len(fields) == 4, f"Field label validation fail {cell}"
            # there's a few sample swaps, don't validate them.
            if expected == "Mixed":
                continue
            elif fields[1] not in ("092_CASTJ_10F_03"):
                assert fields[1] == expected, f"{fields[1]} failed to match {expected}"


def is_plate_name(name):
    return not pandas.isnull(name) and name.startswith("IGVF_")

def find_plate_start(sheet, offsets):
    block_headers = [
        "M","F","Mixed","B6J","NODJ","AJ","PWKJ","129S1J","CASTJ","WSBJ","NZOJ","B6CASTF1/J","B6PWKF1/J","B6WSBF1/J","B6NODF1/J"
    ]
    for plate_id_row in sheet[sheet[offsets["plate_label"]].apply(is_plate_name)].index:
        plate_name = normalize_plate_name(sheet.loc[plate_id_row, offsets["plate_label"]])
        for i in range(plate_id_row, plate_id_row + 4):
            cell = sheet.loc[i, offsets["well_start"]]
            if isinstance(cell, str) and (cell.startswith("Tissue") or cell in block_headers):
                yield (plate_name, i)
                
def get_plate_genotype(sheet, start, offsets):
    plate_tissues = []
    for cell in sheet.loc[start+2:start+9, offsets["tissue_label"]].tolist():
        if not pandas.isnull(cell):
            match = re.match("(B6J|NODJ|AJ|PWKJ|129S1J|CASTJ|WSBJ|NZOJ|Mixed)", cell, re.IGNORECASE)
            if match is not None:
                plate_tissues.append(match.group(0))
            else:
                print("Diffculty matching {} in {} {}".format(cell, start+2, start+9))
    
    return plate_tissues


def validate_plate_column_ids(plate_column_ids):
    expected_column_ids = set(range(1,13))
    for column_id in plate_column_ids:
        assert column_id in expected_column_ids, f"{column_id} {type(column_id)} not in expected_column_ids"
    
def validate_plate_row_ids(plate_row_labels):
    expected_row_labels = set(("A", "B", "C", "D", "E", "F", "G", "H"))
    for row_label in plate_row_labels:
        assert row_label in expected_row_labels, f"{row_label} not in expected labels"
    
def parse_single_well_block(sheet, plate_start, offsets):
    # column ids
    simple_plate_column_ids = sheet.loc[plate_start+1, offsets["well_range"]].tolist()
    validate_plate_column_ids(simple_plate_column_ids)

    plate_row_labels = sheet.loc[plate_start+2:plate_start+9, offsets["well_row_label"]].dropna().tolist()
    validate_plate_row_ids(plate_row_labels)

    contents = sheet.loc[plate_start+2:plate_start+len(plate_row_labels)+1, offsets["well_range"]].copy()
    contents.index = plate_row_labels
    contents.columns = simple_plate_column_ids
    contents = contents.dropna()

    genotypes = get_plate_genotype(sheet, plate_start, offsets)
    validate_tissue_ids(contents, genotypes)

    well_contents = {}
    for well_row, row in contents.iterrows():
        for well_column, tissue_cell in zip(simple_plate_column_ids, row):
            genotype = get_genotype_from_tissue(tissue_cell)
            well_contents[(well_row, str(well_column))] = [WellContent(genotype, tissue_cell)]

    return well_contents

def parse_multiplexed_well_block(sheet, start, expected_genotypes, offsets):
    well_re = re.compile("^[A-H]1?[\d]$")
    wells = sheet.loc[start+3, offsets["well_range"]].tolist()
    for cell in wells:
        if pandas.isnull(cell) or well_re.match(cell) is None:
            raise ValueError(f"well value {cell} in row {start+3} does not look correct")

    contents = sheet.loc[start+0:start+1, offsets["well_range"]]
    contents.columns = [(x[0], x[1:]) for x in wells]
    validate_tissue_ids(contents, expected_genotypes)
    contents.index = expected_genotypes

    well_contents = {}
    for expected_genotype, row in contents.dropna().iterrows():
        for well, tissue_id in row.iteritems():
            genotype = get_genotype_from_tissue(tissue_id)
            well_contents.setdefault(well, []).append(WellContent(genotype, tissue_id))

    return well_contents

def parse_plate(sheet):
    max_rows_from_plate_name_to_well_names = 4

    offsets = {
        "plate_label": 1,
        "tissue_label": 1,
        "well_row_label": 2,
        "well_start": 3,
        "well_range": slice(3, 10),
    }
    offsets_simple = offsets.copy()
    offsets_simple["well_range"] = slice(3, 14)

    for plate_name, plate_start in find_plate_start(sheet, offsets):
        if plate_name.endswith("XX"):
            return

        well_contents = {}
        genotypes = get_plate_genotype(sheet, plate_start, offsets)
        
        if plate_name in ("IGVF_B01", "IGVF_012", "IGVF_013", "IGVF_014"):
            well_contents.update(parse_single_well_block(sheet, plate_start, offsets_simple))
            complex_blocks = []        
        else:
            well_contents.update(parse_single_well_block(sheet, plate_start, offsets))

            complex_blocks = [
                (plate_start + 12, slice(0, 2)),
                (plate_start + 12 + 5, slice(2, 4)),
                (plate_start + 12 + 10, slice(4, 6)),
                (plate_start + 12 + 15, slice(6, 8)),
            ]

            for complex_start, genotype_slice in complex_blocks:
                expected_genotypes = genotypes[genotype_slice]
                well_contents.update(parse_multiplexed_well_block(sheet, complex_start, expected_genotypes, offsets))
        
        yield (plate_name, well_contents)
        
offsets = {
    "plate_label": 1,
    "tissue_label": 1,
    "well_row_label": 2,
    "well_start": 3,
    "well_range": slice(3, 14),
}

In [53]:
        
plate_layout = pandas.read_excel(
    lizs_sheet_name, 
    sheet_name="Plate setups",
    header=None,
)


for plate_name, plate_contents in parse_plate(plate_layout):
    print(plate_name, len(plate_contents))

IGVF_003 96
IGVF_004 96
IGVF_005 96
IGVF_006 96
IGVF_007 96
IGVF_008 96
IGVF_008B 96
IGVF_B01 48
IGVF_009 96
IGVF_010 96
IGVF_011 96
IGVF_012 48
IGVF_013 96
IGVF_014 48


In [54]:
# half height 4 row plates
well_contents = parse_single_well_block(plate_layout, 257, offsets)
assert len(well_contents) == 48, f"{len(well_contents)}"

well_contents = parse_single_well_block(plate_layout, 395, offsets)
assert len(well_contents) == 48, f"{len(well_contents)}"

# Full length 8 row plate
well_contents = parse_single_well_block(plate_layout, 406, offsets)
assert len(well_contents) == 96, f"{len(well_contents)}"
#print(well_contents)

In [55]:
models.LibraryConstructionReagent.objects.all()

<QuerySet [<LibraryConstructionReagent: Parse WT Mega v2>, <LibraryConstructionReagent: Parse WT v2>]>

In [56]:
def guess_barcode_reagent_from_plate(plate_name):
    # Depend on a global to avoid hitting the database a bunch
    wt_mega_2_reagent = models.LibraryConstructionReagent.objects.get(name="wt-mega", version="v2")
    wt_regular_2_reagent = models.LibraryConstructionReagent.objects.get(name="wt", version="v2")    
    
    if plate_name in ("IGVF_B01", "IGVF_012"):
        reagent = wt_regular_2_reagent
    else:
        reagent = wt_mega_2_reagent

    return reagent

assert str(guess_barcode_reagent_from_plate("IGVF_003")) == "Parse WT Mega v2"
assert str(guess_barcode_reagent_from_plate("IGVF_012")) == "Parse WT v2"

def load_plates(plate_layout):
    # Validate tissue references:
    failed = False
    for plate_name, plate_contents in parse_plate(plate_layout):
        for well_id in plate_contents:
            well_contents = plate_contents[well_id]

            for well_fraction in well_contents:
                tissue_id = normalize_strain(well_fraction.tissue_id)
                try:
                    biosample = models.FixedSample.objects.get(name=tissue_id)
                except models.FixedSample.DoesNotExist:
                    print(f"Unable to find {tissue_id} on plate {plate_name}")
                    failed = True

    #assert not failed, "Resolve tissues"

    # populate database
    if models.SplitSeqWell.biosample.through.objects.count() > 0:
        truncate(models.SplitSeqWell.biosample.through)

    if models.SplitSeqWell.objects.count() > 0:
        truncate(models.SplitSeqWell)

    if models.SplitSeqPlate.objects.count() > 0:
        truncate(models.SplitSeqPlate)

    errors = 0
    plates_loaded = set()

    for plate_name, plate_contents in parse_plate(plate_layout):
        plates_loaded.add(plate_name)
        plate_record = models.SplitSeqPlate(
            name=plate_name,
            size=models.PlateSizeEnum.size_96,
            pool_location=None,
            date_performed=None,
        )
        plate_record.save()

        for well_id in plate_contents:
            well_contents = plate_contents[well_id]

            biosamples = []
            for well_content in well_contents:
                tissue_id = normalize_strain(well_content.tissue_id)
                try:
                    biosamples.append(models.FixedSample.objects.get(name=tissue_id))
                except models.FixedSample.DoesNotExist:
                    print(f"unable to find tissue {tissue_id} for {plate_name} {well_id}")
                    errors += 1


            reagent = guess_barcode_reagent_from_plate(plate_name)

            barcodes = models.LibraryBarcode.objects.filter(
                reagent=reagent,
                code="{}{}".format(well_id[0], well_id[1]),
            )

            well_record = models.SplitSeqWell(
                plate=plate_record,
                row=well_id[0],
                column=well_id[1],
            )
            well_record.save()
            well_record.biosample.set(biosamples)
            well_record.barcode.set(barcodes)
            well_record.save()

    assert not errors, "Check error messages"
    assert "IGVF_008B" in plates_loaded, "Remember to make copied IGVF_008B plate"
    assert "IGVF_012" in plates_loaded, "Still having trouble with IGVF_012"
    
load_plates(plate_layout)

In [57]:
[x for x in models.Tissue.objects.all() if x.name.startswith("191_B6CAST")]

[<Tissue: 191_B6CASTF1J_10M_01 Hypothalamus/Pituitary>,
 <Tissue: 191_B6CASTF1J_10M_03 Cortex/Hippocampus left>,
 <Tissue: 191_B6CASTF1J_10M_04 Cortex/Hippocampus right>,
 <Tissue: 191_B6CASTF1J_10M_05 Liver>,
 <Tissue: 191_B6CASTF1J_10M_06 Heart>,
 <Tissue: 191_B6CASTF1J_10M_08 Adrenal>,
 <Tissue: 191_B6CASTF1J_10M_19 Tail>,
 <Tissue: 191_B6CASTF1J_10M_20 PBMC - WBC>,
 <Tissue: 191_B6CASTF1J_10M_27 Testis left>,
 <Tissue: 191_B6CASTF1J_10M_28 Testis right>,
 <Tissue: 191_B6CASTF1J_10M_29 Epididymis left>,
 <Tissue: 191_B6CASTF1J_10M_30 Epididymis right>,
 <Tissue: 191_B6CASTF1J_10M_31 Kidney left>,
 <Tissue: 191_B6CASTF1J_10M_32 Kidney right>,
 <Tissue: 191_B6CASTF1J_10M_33 Gastrocnemius left>,
 <Tissue: 191_B6CASTF1J_10M_34 Gastrocnemius right>]

# Submitted measurement sets

In [58]:
def load_submitted_measurement_sets():
    measurement_sets = {}

    for accession_prefix in submitted_sheets:
        for i, row in submitted_sheets[accession_prefix]["measurement_set"].iterrows():
            if not pandas.isnull(row["accession"]):
                aliases = row["aliases:array"].split(',')
                name = aliases[0].replace("ali-mortazavi:", "")

                try:
                    record = models.MeasurementSet.objects.get(name=name)
                except models.MeasurementSet.DoesNotExist:
                    record = models.MeasurementSet(
                        name=name,
                    )
                    record.save()

                    if accession_prefix == "igvftst":
                        see_also_template = "https://api.sandbox.igvf.org/measurment-sets/{}/"
                    elif accession_prefix == "igvf":
                        see_also_template = "https://api.data.igvf.org/measurement-sets/{}/"

                    accession = models.Accession(
                        accession_prefix=accession_prefix,
                        name=row["accession"],
                        uuid=row["uuid"],
                        see_also=see_also_template.format(row["accession"]),
                    )
                    accession.save()
                    record.accession.add(accession)
                    record.save()
                measurement_sets[name] = record
    print("loaded", len(measurement_sets), "measurement_sets")
    return measurement_sets


# Subpool

## Extract meaning from the Experiments tab

- number of nuclei per subpool (Experiment) 
- cDNA amp # PCR cycles (Experiment)
- cDNA ng/ul in 25ul (experiment)
- total cDNA ng (experiment)
- Bioanalyzer cDNA ave bp length (expierment)
- Sub library Index PCR # (experiment)
- Sublibrary Index (experiment)
- Sequence for SampleSheet (experiment)
- library ng/ul (experiment)
- Bioanalyzer library ave bp length (experiment)
- Nextseq run number (experiment)

- number of 67k aliquots  (experiment)
- number of 8k aliquots (experiment)

- QC # raw reads (1 mismatch) (experiment)

In [59]:
experiment = pandas.read_excel(
    lizs_sheet_name, 
    sheet_name="Experiment",
    header=0,
    index_col=None,
    #usecols=range(0, 49)
)
experiment

Unnamed: 0,Experiment,mice,Fixation Box,Split-seq prep start date,Submit,Sublibrary,[Barcoded Nuclei] (nuclei/uL),Barcoded Nuclei Vol. (uL),Total barcoded nuclei (1st 3 rounds),# of backup aliquots,...,Unnamed: 54,Unnamed: 55,Unnamed: 56,Unnamed: 57,Unnamed: 58,Unnamed: 59,Unnamed: 60,Unnamed: 61,Unnamed: 62,Unnamed: 63
0,IGVF_Splitseq_002,B6 (2F+2M),IGVF_FIX_001,2022-08-24,No,002_4A,4540,240.0,1089600,3.0,...,,,,,,,,,,
1,,CAST (2F+2M),IGVF_FIX_002,NaT,No,002_4B,,,,,...,,,,,,,,,,
2,Fixation V1,,,NaT,No,002_64A,,,Target Nuclei,,...,,,,,,,,,,
3,1M V1,,,NaT,No,002_64B,,,1018000,,...,,,,,,,,,,
4,,,,NaT,No,002_64C,,,Excess Nuclei,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1009,,,,NaT,,,,,,,...,,,,,,,,,,
1010,,,,NaT,,,,,,,...,,,,,,,,,,
1011,,,,NaT,,,,,,,...,,,,,,,,,,
1012,,,,NaT,,,,,,,...,,,,,,,,,,


In [60]:
experiment = pandas.read_excel(
    lizs_sheet_name, 
    sheet_name="Experiment",
    header=0,
    index_col=None,
    #usecols=range(0, 49)
).rename(columns={
    # Unnamed: 42 seems like an index or reference to the sample sheet?
    "Unnamed: 43": "Novaseq raw reads",
    "Unnamed: 44": "Novaseq1 L001",
    "Unnamed: 45": "Novaseq1 L002",
    "Unnamed: 46": "Novaseq1 L003",
    "Unnamed: 47": "Novaseq1 L004",
    "Unnamed: 48": "Novaseq2 L001",
    "Unnamed: 49": "Novaseq2 L002",
    "Unnamed: 50": "Novaseq2 L003",
    "Unnamed: 51": "Novaseq2 L004",
})

experiment

Unnamed: 0,Experiment,mice,Fixation Box,Split-seq prep start date,Submit,Sublibrary,[Barcoded Nuclei] (nuclei/uL),Barcoded Nuclei Vol. (uL),Total barcoded nuclei (1st 3 rounds),# of backup aliquots,...,Unnamed: 54,Unnamed: 55,Unnamed: 56,Unnamed: 57,Unnamed: 58,Unnamed: 59,Unnamed: 60,Unnamed: 61,Unnamed: 62,Unnamed: 63
0,IGVF_Splitseq_002,B6 (2F+2M),IGVF_FIX_001,2022-08-24,No,002_4A,4540,240.0,1089600,3.0,...,,,,,,,,,,
1,,CAST (2F+2M),IGVF_FIX_002,NaT,No,002_4B,,,,,...,,,,,,,,,,
2,Fixation V1,,,NaT,No,002_64A,,,Target Nuclei,,...,,,,,,,,,,
3,1M V1,,,NaT,No,002_64B,,,1018000,,...,,,,,,,,,,
4,,,,NaT,No,002_64C,,,Excess Nuclei,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1009,,,,NaT,,,,,,,...,,,,,,,,,,
1010,,,,NaT,,,,,,,...,,,,,,,,,,
1011,,,,NaT,,,,,,,...,,,,,,,,,,
1012,,,,NaT,,,,,,,...,,,,,,,,,,


In [61]:
def is_experiment_name(name):
    not_submittable = ["IGVF_Splitseq_EX"]
    return not pandas.isnull(name) and name.startswith("IGVF_Splitseq")

experiment[experiment["Experiment"].apply(is_experiment_name) & (pandas.notnull(experiment["mice"]))]

Unnamed: 0,Experiment,mice,Fixation Box,Split-seq prep start date,Submit,Sublibrary,[Barcoded Nuclei] (nuclei/uL),Barcoded Nuclei Vol. (uL),Total barcoded nuclei (1st 3 rounds),# of backup aliquots,...,Unnamed: 54,Unnamed: 55,Unnamed: 56,Unnamed: 57,Unnamed: 58,Unnamed: 59,Unnamed: 60,Unnamed: 61,Unnamed: 62,Unnamed: 63
0,IGVF_Splitseq_002,B6 (2F+2M),IGVF_FIX_001,2022-08-24,No,002_4A,4540.0,240.0,1089600.0,3.0,...,,,,,,,,,,
17,IGVF_Splitseq_003,8 founders (4M+4F),IGVF_FIX_003,2022-12-01,No,003_8A,6860.0,250.0,1715000.0,7.0,...,,Total,L001,L002,L003,L004,L001,L002,L003,L004
34,IGVF_Splitseq_004,8 founders (4M+4F),IGVF_FIX_005,2022-12-15,No,004_8A,2460.0,640.0,1574400.0,5.0,...,,Total,L001,L002,L003,L004,L001,L002,L003,L004
51,IGVF_Splitseq_005,8 founders (4M+4F),IGVF_FIX_010,2023-01-23,No,005_8A,4675.0,390.0,1823250.0,6.0,...,,Total,L001,L002,L003,L004,L001,L002,L003,L004
68,IGVF_Splitseq_006,8 founders (4M+4F),IGVF_FIX_006,2023-02-10,No,006_8A,3160.0,780.0,2464800.0,8.0,...,,Total,L001,L002,L003,L004,L001,L002,L003,L004
87,IGVF_Splitseq_007,8 founders (4M+4F),IGVF_FIX_006,2023-03-07,No,007_8A,2810.0,580.0,1629800.0,8.0,...,,Total,L001,L002,L003,L004,L001,L002,L003,L004
105,IGVF_Splitseq_008,8 founders (4M+4F),IGVF_FIX_008,2023-03-24,No,008_8A,2600.0,250.0,650000.0,0.0,...,,Total,L001,L002,L003,L004,,,,
117,IGVF_Splitseq_B01,CASTJ + B6J (2M+2F),,2023-04-03,Yes,B01_13A,4920.0,83.6,411312.0,10.0,...,,,,,,,,,,
126,IGVF_Splitseq_008B,8 founders (4M+4F),IGVF_FIX_008,2023-04-19,Yes,008B_67A,3330.0,250.0,832500.0,0.0,...,2.0,1417529714,355914199,355917666,358863075,346834774,,,,
137,IGVF_Splitseq_009,8 founders (4M+4F),IGVF_FIX_004,2023-05-02,Yes,009_67A,3370.0,285.0,960450.0,0.0,...,2.0,1474101220,177858630,182128617,178436907,177628982,189105418,193500915,191649075,183792676


In [62]:
def find_fixation_start(sheet):
    def is_experiment_name(name):
        not_submittable = ["IGVF_Splitseq_EX"]
        return not pandas.isnull(name) and name.startswith("IGVF_Splitseq") # and not name in not_submittable

    for fixation_id_row in sheet[sheet["Experiment"].apply(is_experiment_name) & (pandas.notnull(experiment["mice"]))].index:
        name = sheet.loc[fixation_id_row, "Experiment"]
        start = fixation_id_row

        stop = sheet.shape[0]
        for i, value in sheet.iloc[start:]["Sublibrary"].iteritems():
            if pandas.isnull(value):
                stop = i - 1
                break
        
        
        yield (name, start, stop)

for name, start, stop in find_fixation_start(experiment):
    print(name, start, stop)

IGVF_Splitseq_002 0 15
IGVF_Splitseq_003 17 32
IGVF_Splitseq_004 34 49
IGVF_Splitseq_005 51 66
IGVF_Splitseq_006 68 85
IGVF_Splitseq_007 87 102
IGVF_Splitseq_008 105 114
IGVF_Splitseq_B01 117 124
IGVF_Splitseq_008B 126 135
IGVF_Splitseq_009 137 151
IGVF_Splitseq_010 153 167
IGVF_Splitseq_011 169 183
IGVF_Splitseq_EX 185 192
IGVF_Splitseq_012 196 203
IGVF_Splitseq_013 208 222
IGVF_Splitseq_014 225 232
IGVF_Splitseq_015 235 250
IGVF_Splitseq_016 253 268
IGVF_Splitseq_017 271 286
IGVF_Splitseq_018 289 304
IGVF_Splitseq_019 307 322
IGVF_Splitseq_020 325 340
IGVF_Splitseq_021 343 358
IGVF_Splitseq_022 361 376


In [63]:
def parse_selection_type(value):
    if value is None:
        return None
    elif value.lower() in ("normal", "no"):
        return models.SublibrarySelectionTypeEnum.no_selection
    elif value in ("capture", "EX"):
        return models.SublibrarySelectionTypeEnum.exome_capture
    else:
        raise ValueError(f"Unrecognized capture type {value}")
        
def parse_sample_type(value):
    if value is None:
        return None
    elif value.lower() in ("nuclei",):
        return models.SubcellularComponentEnum.nuclei
    elif value.lower() in ("cells",):
        return models.SubcellularComponentEnum.cellular
    else:
        raise ValueError(f"Unrecognized sample type {value}")


def parse_fixation(sheet, experiment_name, start, stop):
    block = sheet.loc[start:stop]

    experiment_name_fields = experiment_name.split("_")
    plate_name = f"{experiment_name_fields[0]}_{experiment_name_fields[-1]}"
    
    aliquots = {}
    for i, row in block.iterrows():
        count = row["# of backup aliquots"]
        size = row["Backup Aliquot Size"]
        if not (pandas.isnull(count) or pandas.isnull(size)):
            if isinstance(size, str) and size.endswith("k"):
                size = size[:-1]
            size = int(size) * 1000
            aliquots[size] = int(count)
    
    experiment = {
        "experiment_name": experiment_name,
        "plate_name": plate_name,
        "prep_date": block.loc[start, "Split-seq prep start date"],
        "unused_aliquots": aliquots,
        "subpool": [],
    }

    for i, row in block.iterrows():
        if not pandas.isnull(row["Sublibrary"]):
            if pandas.isnull(row["NovaSeq Sequencing date"]):
                novaseq_run_id = None
            else:
                novaseq_run_id = row["NovaSeq Sequencing date"].isoformat().split("T")[0]

            if pandas.isnull(row.get("Novaseq raw reads")):
                novaseq_raw_reads = None
                novaseq1_raw_reads = None
                novaseq2_raw_reads = None
            elif row["Novaseq raw reads"] == "Total":
                novaseq_raw_reads = None
                novaseq1_raw_reads = None
                novaseq2_raw_reads = None
            else:
                novaseq_raw_reads = int_or_none(row.get("Novaseq raw reads"))
                novaseq1_raw_reads = sum([int_or_none(row.get(x)) for x in["Novaseq1 L001", "Novaseq1 L002", "Novaseq1 L003", "Novaseq1 L004"]])
                novaseq2_raw_reads = sum([int_or_none(row.get(x)) for x in["Novaseq2 L001", "Novaseq2 L002", "Novaseq2 L003", "Novaseq2 L004"]])

            plate_name, subname = row["Sublibrary"].split("_")
            print(row["Sublibrary Type"], row["Sublibrary"])
            subpool = {
                "name": row["Sublibrary"],
                "plate_name": "IGVF_{}".format(plate_name),
                "subpool_subname": subname,
                "submit": row["Submit"],
                "nuclei": int_or_none(row["Nuclei / Sublibrary"]),
                "selection_type": parse_selection_type(row["Sublibrary Type"]),
                "subcellular_component": parse_sample_type(row["Sample Type"]),
                "cdna_pcr_rounds": row["cDNA Amp # PCR Cycles"],
                "cdna_ng_per_ul": row["[cDNA] (ng/uL)"],
                "cdna_volume": row["cDNA Vol. (uL)"],
                #total_cdna = cdna_ng_per_ul_in_25ul * 25
                "bioanalyzer_date": row["BA date"],
                "cdna_average_bp_length": int_or_none(row["cDNA Ave. Length (bp)"]),
                "index_pcr_no": int_or_none(row["Sub library Index PCR #"]),
                "index": str_or_none(row["Sublibrary Index"]),
                "i7_barcode": row["i7 Forward Sequence"],
                "i5_barcode": row["i5 Forward Sequence"],
                "library_ng_per_ul": float_or_none(row["[Library] (ng/ul)"]),
                "library_average_bp_length": float_or_none(row["Library Ave. Length (bp)"]),
                "nextseq_run_date": date_or_none(row["QC Sequencing date"]),
                "nextseq_run_id": str_or_none(int_or_none(row["Nextseq run #"])),
                "nextseq_run_raw_reads": int_or_none(row["QC # raw reads  (1 mismatch)"]),
                "novaseq_run_date": date_or_none(row["NovaSeq Sequencing date"]),
                "novaseq_run_id": novaseq_run_id,
                "novaseq_run_raw_reads": novaseq_raw_reads,
                "novaseq1_raw_reads": novaseq1_raw_reads,
                "novaseq2_raw_reads": novaseq2_raw_reads,
            }
            experiment["subpool"].append(subpool)
            #if not pandas.isnull(subpool["nuclei"]):
            #    nuclei = subpool["nuclei"]
            #    aliquots[nuclei] = aliquots.get(nuclei) + 1

    return experiment

#parse_fixation(experiment, "IGVF_Splitseq_003", 17, 32)
#parse_fixation(experiment, "IGVF_Splitseq_008", 108, 117)
#parse_fixation(experiment, "IGVF_Splitseq_B01", 124, 131)
#parse_fixation(experiment, "IGVF_Splitseq_008B", 133, 143)
parse_fixation(experiment, "IGVF_Splitseq_EX", 186, 197)

EX 004_13A
EX 005_13A
EX 007_13A
EX 008B_13A
EX 009_13A
EX 010_13A
EX 011_13A
NO 012_13A
NO 012_13B


{'experiment_name': 'IGVF_Splitseq_EX',
 'plate_name': 'IGVF_EX',
 'prep_date': NaT,
 'unused_aliquots': {13000: 12},
 'subpool': [{'name': '004_13A',
   'plate_name': 'IGVF_004',
   'subpool_subname': '13A',
   'submit': 'Yes',
   'nuclei': 13000,
   'selection_type': <SublibrarySelectionTypeEnum.exome_capture: 'EX'>,
   'subcellular_component': <SubcellularComponentEnum.nuclei: 'N'>,
   'cdna_pcr_rounds': '5 + 8',
   'cdna_ng_per_ul': 23.8,
   'cdna_volume': 25.0,
   'bioanalyzer_date': NaT,
   'cdna_average_bp_length': None,
   'index_pcr_no': 12,
   'index': 2,
   'i7_barcode': 'ACTTGA',
   'i5_barcode': nan,
   'library_ng_per_ul': 24.4,
   'library_average_bp_length': None,
   'nextseq_run_date': datetime.date(2023, 5, 30),
   'nextseq_run_id': 1070,
   'nextseq_run_raw_reads': None,
   'novaseq_run_date': None,
   'novaseq_run_id': None,
   'novaseq_run_raw_reads': None,
   'novaseq1_raw_reads': None,
   'novaseq2_raw_reads': None},
  {'name': '005_13A',
   'plate_name': 'IGVF_0

In [64]:
platforms = {x.name: x for x in models.Platform.objects.all()}
platforms

{'nextseq2000': <Platform: Nextseq 2000>,
 'novaseq6000': <Platform: Novaseq 6000>,
 'minion': <Platform: Oxford Nanopore MinION>,
 'promethion': <Platform: Oxford Nanopore PromethION>,
 'pacbio': <Platform: Pac Bio>}

In [65]:
class SubpoolName:
    def __init__(self, name=None):
        if isinstance(name, str):
            self._normalized = SubpoolName.subpool_name_to_numbers(name)
        elif isinstance(name, list):
            self._normalized = name
        elif name is None:
            self._normalized = None
        else:
            raise ValueError("Unrecognized type {} for {}".format(type(name), name))

    def __str__(self):
        if self._normalized is None:
            return "None"
        else:
            return SubpoolName.numbers_to_subpool_name(self._normalized)
        
    def __repr__(self):
        if self._normalized is None:
            return str("{}()".format(self.__class__.__name__))
        else:
            return str("{}('{}')".format(self.__class__.__name__, str(self)))
        
    def __eq__(self, cmp):
        if self._normalized is None and cmp is None:
            return True
        elif self._normalized is None and cmp._normalized is None:
            return True
        elif self._normalized is None or cmp._normalized is None:
            return False
        elif len(self._normalized) != len(cmp._normalized):
            return False
        else:
            for left, right in zip(self._normalized, cmp._normalized):
                if left != right:
                    return False
                
        return True
    
    def __lt__(self, cmp):
        if self._normalized is None and cmp is None:
            return False
        elif self._normalized is None and cmp._normalized is None:
            return False
        elif self._normalized is None:
            return True
        elif cmp is None or cmp._normalized is None:
            return False
        elif self._normalized[0] < cmp._normalized[0]:
            return True
        elif len(self._normalized) < len(cmp._normalized):
            return True
        elif len(self._normalized) > len(cmp._normalized):
            return False
        else:
            for left, right in zip(self._normalized, cmp._normalized):
                if left == right:
                    continue
                elif left < right:
                    return True
                else:
                    return False
                
        return False
                
    def __gt__(self, cmp):
        if self._normalized is None and cmp is None:
            return False
        elif self._normalized is None and cmp._normalized is None:
            return False
        elif cmp is None or cmp._normalized is None:
            return True
        elif self._normalized[0] > cmp._normalized[0]:
            return True
        elif len(self._normalized) < len(cmp._normalized):
            return False
        elif len(self._normalized) > len(cmp._normalized):
            return True
        else:
            for left, right in zip(self._normalized, cmp._normalized):
                if left == right:
                    continue
                elif left < right:
                    return False
                else:
                    return True
            
        return False
        
    @classmethod
    def subpool_name_to_numbers(cls, name):
        STATE_DIGITS = 1
        STATE_LETTERS = 2
        STATE_ERROR = -1

        digits = []
        letters = []
        state = STATE_DIGITS
        for char in name:
            if state == STATE_DIGITS:
                if char.isdigit():
                    digits.append(char)
                elif char.isalpha():
                    state = STATE_LETTERS
                    letters.append(char)
                else:
                    raise ValueError("Unrecognized symbol {} in {}".format(char, name))
            elif state == STATE_LETTERS:
                if char.isdigit():
                    raise ValueError("Unexpected digit {} in {}".format(char, name))
                elif char.isalpha():
                    letters.append(char)
                else:
                    raise ValueError("Unrecognized symbol {} in {}".format(char, name))

        result = [int("".join(digits))]
        for number in letters:
            if number < "A" or number > "Z":
                raise ValueError("out of bounds letter code in {}".format(name))
            else:
                result.append(ord(number) - ord("A"))

        return result

    @classmethod
    def numbers_to_subpool_name(cls, subpool_list):

        result = [str(subpool_list[0])]
        for char in subpool_list[1:]:
            result.append(chr(char + ord("A")))

        return "".join(result)

    def next_code(self):
        if self._normalized is None:
            return None

        carry = 0
        advancable = []
        for i, place in enumerate(reversed(self._normalized[1:])):
            if i == 0:
                place += 1
            else:
                place += carry
                carry = 0

            if place > 25:
                carry = 1
                place = 0

            advancable.append(place)

        if carry > 0:
            advancable.append(0)

        result = [self._normalized[0]]
        result.extend(reversed(advancable))
        
        return SubpoolName(result)

s = SubpoolName("8ACZ")
print(s)

decoded = SubpoolName.subpool_name_to_numbers("8ACZ") 
print("decoded", decoded)
encoded = SubpoolName.numbers_to_subpool_name(decoded)
print("encoded", encoded)

print(SubpoolName("8ACZ").next_code())



for pairs in [("8A", "8B"), ("8AA", "8Z"), ("8B", "8E"), ("8B", "8A"), ("8Z", "8AA"), ("67A", "67A"), ("8AA", "8AB")]:
    left = SubpoolName(pairs[0])
    right = SubpoolName(pairs[1])
    print(left, right, left < right, left == right, left > right)
    
print(max(SubpoolName("8A"), SubpoolName("8B")))
print(max(None, SubpoolName("8B")))
print(max(SubpoolName("8A"), None))


8ACZ
decoded [8, 0, 2, 25]
encoded 8ACZ
8ADA
8A 8B True False False
8AA 8Z False False True
8B 8E True False False
8B 8A False False True
8Z 8AA True False False
67A 67A False True False
8AA 8AB True False False
8B
8B
8A


# load subpools, sequencing runs and subpools in sequencing run

In [66]:
def load_submitted_multiplexed_samples():
    multiplexed_samples = {}

    for accession_prefix in submitted_sheets:
        for i, row in submitted_sheets[accession_prefix]["multiplexed_sample"].iterrows():
            if not pandas.isnull(row["accession"]):
                aliases = row["aliases:array"].split(',')
                name = aliases[0].replace("ali-mortazavi:", "")
                if accession_prefix == "igvftst":
                    see_also_template = "https://api.sandbox.igvf.org/multiplexed-samples/{}/"
                elif accession_prefix == "igvf":
                    see_also_template = "https://api.data.igvf.org/multiplexed-samples/{}/"

                multiplexed_samples.setdefault(name, []).append({
                    "accession_prefix": accession_prefix,
                    "name": row["accession"],
                    "uuid": row["uuid"],
                    "see_also": see_also_template.format(row["accession"]),
                })
    print(len(multiplexed_samples))
    
    return multiplexed_samples

In [67]:
def load_split_seq_plates_as_dict():
    plates = {}
    for current_plate in models.SplitSeqPlate.objects.all():
        plates[current_plate.name] = current_plate
    return plates


In [68]:
load_split_seq_plates_as_dict()

{'IGVF_003': <SplitSeqPlate: IGVF_003>,
 'IGVF_004': <SplitSeqPlate: IGVF_004>,
 'IGVF_005': <SplitSeqPlate: IGVF_005>,
 'IGVF_006': <SplitSeqPlate: IGVF_006>,
 'IGVF_007': <SplitSeqPlate: IGVF_007>,
 'IGVF_008': <SplitSeqPlate: IGVF_008>,
 'IGVF_008B': <SplitSeqPlate: IGVF_008B>,
 'IGVF_B01': <SplitSeqPlate: IGVF_B01>,
 'IGVF_009': <SplitSeqPlate: IGVF_009>,
 'IGVF_010': <SplitSeqPlate: IGVF_010>,
 'IGVF_011': <SplitSeqPlate: IGVF_011>,
 'IGVF_012': <SplitSeqPlate: IGVF_012>,
 'IGVF_013': <SplitSeqPlate: IGVF_013>,
 'IGVF_014': <SplitSeqPlate: IGVF_014>}

In [69]:
[vars(x) for x in models.LibraryBarcode.objects.filter(
    reagent=guess_barcode_reagent_from_plate("IGVF_004"),
    barcode_type=None,
    code="10",
)]

[{'_state': <django.db.models.base.ModelState at 0x7efdcf27ea00>,
  'id': 202,
  'reagent_id': 'wt-mega',
  'name': '10',
  'code': '10',
  'i7_sequence': 'CCGTCC',
  'i5_sequence': None,
  'barcode_type': None,
  'well_position': None}]

In [70]:
def load_experiments_return_submittable_subpools():
    compression=r"(?<compression>gz|bz2|xz|zstd)"
    nanopore_library_name_re_template = r"igvf(?P<run>[\d]+)_(?P<library_id>{library_name})_.*_(?P<fragment>[\d]+)\.fastq\.{compression}"
    nextseq_library_name_re_template = r"(?P<run>[\d]+)_(?P<library_id>{library_name})_(?P<read_id>[RI][\d])\.fastq\.{compression}"
    nextseq_library_index_re_template = r"Sublibrary_(?P<library_id>{library_index})_S(?P<sample_id>[\d]+)_(?P<lane_id>L[\d]+)_(?P<read_id>[RI][\d])_(?P<fragment>[\d]+)\.fastq\.{compression}"
    novaseq_library_index_re_template = r"Sublibrary_(?P<library_id>{library_index})_S(?P<sample_id>[\d]+)_(?P<lane_id>L[\d]+)_(?P<read_id>[RI][\d])_(?P<fragment>[\d]+)\.fastq\.{compression}"

    multiplexed_samples = load_submitted_multiplexed_samples()
    split_seq_plates = load_split_seq_plates_as_dict()

    failed_plates = {"IGVF_006",}
    failed = False
    submittable_subpools = {}
    for experiment_name, experiment_start, experiment_stop in find_fixation_start(experiment):
        fixation = parse_fixation(experiment, experiment_name, experiment_start, experiment_stop)

        if fixation["plate_name"] in failed_plates:
            print(f"Skipping failed plate {plate_name}")
            continue

        last_subpools = {}

        reagent = guess_barcode_reagent_from_plate(fixation["plate_name"])

        for line_offset, subpool in enumerate(fixation["subpool"]):
            if subpool["plate_name"] not in split_seq_plates:
                print("Unable to find plate for {}".format(subpool["plate_name"]))
                continue

            try:
                barcodes = models.LibraryBarcode.objects.filter(
                    reagent=reagent, 
                    barcode_type=None, 
                    code=subpool["index"])
            except models.LibraryBarcode.DoesNotExist:
                print(f"Unable to find barcode {subpool['index']}")
                
            if len(barcodes) == 0:
                print(f"Unable to find barcodes with {reagent} None {subpool['index']}")

            # validate i7 barcodes
            library_i7_barcodes_sequence = {b.i7_sequence for b in barcodes if pandas.notnull(b.i7_sequence)}
            if not pandas.isnull(subpool["i7_barcode"]):
                i7_expected_sequence = set(subpool["i7_barcode"].split(","))
            else:
                i7_expected_sequence = set()
            if library_i7_barcodes_sequence != i7_expected_sequence:
                print(f"{subpool['name']} Database lookup of {subpool['index']} i7_barcodes {library_i7_barcodes_sequence} doesn't match human entry {i7_expected_sequence}")
                print(reagent, subpool["index"], barcodes)
                failed = True
                continue

            # validate i5 barcodes
            library_i5_barcodes_sequence = {b.i5_sequence for b in barcodes if pandas.notnull(b.i5_sequence)}
            if len(library_i5_barcodes_sequence) > 0:
                if not pandas.isnull(subpool["i5_barcode"]):
                    i5_expected_sequence = set(subpool["i5_barcode"].split(","))
                else:
                    i5_expected_sequence = set()
                if library_i5_barcodes_sequence != i5_expected_sequence:
                    print(f"{subpool['name']} Database lookup of i5_barcodes {library_i5_barcodes_sequence} doesn't match human entry {i5_expected_sequence}")
                    failed = True
                    continue

            plate = split_seq_plates[subpool["plate_name"]]
            subpool_prefix = subpool["name"].split("_")[0]
            last_subpools[subpool["nuclei"]] = max(
                last_subpools.get(subpool["nuclei"]), 
                SubpoolName(subpool["subpool_subname"]))

            record = models.Subpool(
                name=subpool["name"],
                plate=plate,
                nuclei=subpool["nuclei"],
                selection_type=subpool["selection_type"],
                cdna_pcr_rounds=subpool["cdna_pcr_rounds"],
                cdna_ng_per_ul=float_or_none(subpool["cdna_ng_per_ul"]),
                cdna_volume=float_or_none(subpool["cdna_volume"]),
                bioanalyzer_date=date_or_none(subpool["bioanalyzer_date"]),
                cdna_average_bp_length=int_or_none(subpool["cdna_average_bp_length"]),
                index_pcr_number=int_or_none(subpool["index_pcr_no"]),
                index=str_or_none(subpool["index"]),
                library_ng_per_ul=float_or_none(subpool["library_ng_per_ul"]),
                library_average_bp_length=int_or_none(subpool["library_average_bp_length"]),
            )
            record.save()
            record.barcode.set(barcodes)
            record.save()

            if pandas.notnull(subpool["submit"]):
                submittable_subpools[subpool["name"]] = subpool["submit"]

            name = "subpool_{}".format(subpool["name"].lower())
            if name in multiplexed_samples:
                for accession_row in multiplexed_samples[name]:
                    if pandas.notnull(accession_row["name"]):
                        print(name, record.name, multiplexed_samples[name])
                        accession = models.Accession(**accession_row)
                        accession.save()
                        record.accession.add(accession)
                        record.save()            

        for nuclei in fixation["unused_aliquots"]:
            if nuclei in last_subpools:
                subpool_name = last_subpools[nuclei]
                for i in range(fixation["unused_aliquots"][nuclei]):
                    subpool_name = subpool_name.next_code()

                    record = models.Subpool(
                        name="{}_{}".format(subpool_prefix, subpool_name),
                        plate=plate,
                        nuclei=nuclei,
                    )
                    print(record.name)
                    record.save()
            else:
                print("No unused aliquots {} {}".format(nuclei, last_subpools))

    assert not failed
    return submittable_subpools
    
submittable_subpools = load_experiments_return_submittable_subpools()

171
NO 002_4A
NO 002_4B
NO 002_64A
NO 002_64B
NO 002_64C
NO 002_64D
NO 002_64E
NO 002_64F
NO 002_64G
NO 002_64H
NO 002_64I
NO 002_64J
NO 002_64K
NO 002_64L
NO 002_64M
NO 002_64N
Unable to find plate for IGVF_002
Unable to find plate for IGVF_002
Unable to find plate for IGVF_002
Unable to find plate for IGVF_002
Unable to find plate for IGVF_002
Unable to find plate for IGVF_002
Unable to find plate for IGVF_002
Unable to find plate for IGVF_002
Unable to find plate for IGVF_002
Unable to find plate for IGVF_002
Unable to find plate for IGVF_002
Unable to find plate for IGVF_002
Unable to find plate for IGVF_002
Unable to find plate for IGVF_002
Unable to find plate for IGVF_002
Unable to find plate for IGVF_002
No unused aliquots 67000 {}
NO 003_8A
NO 003_67A
NO 003_67B
NO 003_67C
NO 003_67D
NO 003_67E
NO 003_67F
NO 003_67G
NO 003_67H
NO 003_67I
NO 003_67J
NO 003_67K
NO 003_67L
NO 003_67M
NO 003_67N
NO 003_67O
003_67P
003_67Q
003_67R
003_67S
003_67T
003_67U
003_67V
003_8B
003_8C
NO 00

subpool_b01_13h B01_13H [{'accession_prefix': 'igvftst', 'name': 'TSTSM89995437', 'uuid': '5484bec6-4b0e-4488-b3b1-fd4def0022df', 'see_also': 'https://api.sandbox.igvf.org/multiplexed-samples/TSTSM89995437/'}, {'accession_prefix': 'igvf', 'name': 'IGVFSM2294BOOU', 'uuid': 'f25eb07b-ed33-496f-a8d7-83814b4b642f', 'see_also': 'https://api.data.igvf.org/multiplexed-samples/IGVFSM2294BOOU/'}]
B01_13I
B01_13J
B01_13K
B01_13L
B01_13M
B01_13N
B01_13O
B01_13P
B01_13Q
B01_13R
NO 008B_67A
NO 008B_67B
NO 008B_67C
NO 008B_67D
NO 008B_67E
NO 008B_67F
NO 008B_67G
NO 008B_67H
NO 008B_67I
NO 008B_67J
No unused aliquots 13000 {67000: SubpoolName('67J')}
NO 009_67A
NO 009_67B
NO 009_67C
NO 009_67D
NO 009_67E
NO 009_67F
NO 009_67G
NO 009_67H
NO 009_67I
NO 009_67J
NO 009_67K
NO 009_67L
NO 009_67M
NO 009_67N
NO 009_67O
No unused aliquots 13000 {67000: SubpoolName('67O')}
NO 010_67A
NO 010_67B
NO 010_67C
NO 010_67D
NO 010_67E
NO 010_67F
NO 010_67G
NO 010_67H
NO 010_67I
NO 010_67J
NO 010_67K
NO 010_67L
NO 010

In [71]:
models.LibraryBarcode.objects.get(i7_sequence="TAATCGAC")

<LibraryBarcode: UDI_Plate_WT_20 UDI20 TAATCGAC+TAATCGAC>

In [72]:
# Old creating runs from the sample sheet

#    expected_names = {
#        "IGVF_Splitseq_003": {
#            "nanopore": nanopore_library_name_re_template,
#            "nextseq": nextseq_library_name_re_template,
#            "novaseq": novaseq_library_index_re_template,
#        },
#        "IGVF_Splitseq_004": {
#            "nanopore": nanopore_library_name_re_template,
#            "nextseq": nextseq_library_name_re_template,
#            "novaseq": novaseq_library_index_re_template,
#        },
#        "IGVF_Splitseq_005": {
#            "nanopore": nanopore_library_name_re_template,
#            "nextseq": nextseq_library_index_re_template,
#            "novaseq": novaseq_library_index_re_template,
#        },
#        "IGVF_Splitseq_006": {
#            "nanopore": nanopore_library_name_re_template,
#            "nextseq": nextseq_library_index_re_template,
#            "novaseq": novaseq_library_index_re_template,
#        },
#
#    }
#
#    nanopore_runs = {
#        "IGVF_Splitseq_003": {
#            "name": "2023-01-03-8A",
#            "run_date": datetime.datetime(2023,1,3),
#            "raw_reads": 87784162,
#        }
#    }

#        if experiment_name in nanopore_runs and subpool["nuclei"] == 8000:
#            measurement = get_or_create_sequencing_run(
#                # nanopore_runs[experiment_name]["name"]
#                name="{}/{}".format(plate.name.lower(), "nanopore"),
#                run_date=nanopore_runs[experiment_name]["run_date"],
#                platform=platforms["nanopore"],
#                plate=plate,
#            )
#            
#            pattern = expected_names[experiment_name]["nanopore"].format(
#                library_name=subpool["subpool_subname"],
#                library_index=line_offset+1,
#                compression=compression,
#            )
#            
#            run = models.SubpoolInRun(
#                subpool=record,
#                sequencing_run=measurement,
#                raw_reads=nanopore_runs[experiment_name]["raw_reads"],
#                status=models.RunStatus.PASS,
#            )
#            run.save()            
#
#        if subpool["nextseq_run_date"] is not None:
#            measurement = get_or_create_sequencing_run(
#                #sublibrary["nextseq_run_id"]
#                name="{}/{}".format(plate.name.lower(), "nextseq"),
#                run_date=subpool["nextseq_run_date"],
#                platform=platforms["nextseq"],
#                plate=plate,
#            )
#
#            pattern = expected_names[experiment_name]["nextseq"].format(
#                library_name=subpool["subpool_subname"],
#                library_index=line_offset+1,
#                compression=compression,
#            )            
#            
#            run = models.SubpoolInRun(
#                subpool=record,
#                sequencing_run=measurement,
#                raw_reads=subpool["nextseq_run_raw_reads"],
#                status=models.RunStatus.PASS,
#            )
#            run.save()
#            
#        if subpool["novaseq_run_date"] is not None:
#            for i, nova_raw_reads in enumerate([subpool["novaseq1_raw_reads"], subpool["novaseq2_raw_reads"]]):
#                #nova_run_name = "{}-{}".format(sublibrary["novaseq_run_id"], i+1)
#                nova_run_name = "{}/{}".format(plate.name.lower(), "nova{}".format(i+1))
#                measurement = get_or_create_sequencing_run(
#                    name=nova_run_name,
#                    run_date=subpool["novaseq_run_date"],
#                    platform=platforms["novaseq"],
#                    plate=plate,
#                )
#
#                pattern = expected_names[experiment_name]["novaseq"].format(
#                    library_name=subpool["subpool_subname"],
#                    library_index=line_offset+1,
#                    compression=compression,
#                )
#
#                run = models.SubpoolInRun(
#                    subpool=record,
#                    sequencing_run=measurement,
#                    raw_reads=nova_raw_reads,
#                    status=models.RunStatus.PASS,                    
#                )
#                run.save()



In [73]:
def load_fastq_csv():
    fastqs = pandas.read_csv(
        "fastq_metadata.tsv", sep="\t",
        dtype={
            "sample_id": str,
            "read_id": str,
            "fragment": str,
            "sequencer_run": str,
            "lane": str,
        }).sort_values(["experiment", "subpool_name", "index_id", "lane", "read"])
    return fastqs

fastqs = load_fastq_csv()
fastqs.head()

Unnamed: 0,experiment,family,filename,md5sum,size,ctime,plate_id,subpool_name,fragment,compression,sequencer,sequencer_run,flowcell_id,nuclei,read,barcode,index_id,sample_id,lane,exome_capture
378,igvf_003,illumina,igvf_003/nextseq2/003_13A_R1.fastq.gz,57e977fdfe32850db83ee5f7addb7165,17707636592,2023-10-02,3,13A,,gz,VH00582,6,AAC3V7HHV,13,R1,CAGATC,1,,,EX
377,igvf_003,illumina,igvf_003/nextseq2/003_13A_R2.fastq.gz,148c181e7d81e4aa15611fb2086d5971,10322410034,2023-10-02,3,13A,,gz,VH00582,6,AAC3V7HHV,13,R2,CAGATC,1,,,EX
13,igvf_003,illumina,igvf_003/nextseq/003_67A_R1.fastq.gz,35350d53cf858267d294bce69e475c83,48483277280,2023-09-08,3,67A,,gz,VH00582,1,AAATMGFHV,67,R1,ACTTGA,2,,,
12,igvf_003,illumina,igvf_003/nextseq/003_67A_R2.fastq.gz,00c68ff0f9e0217d422c57e8948d4bb4,32327542019,2023-09-08,3,67A,,gz,VH00582,1,AAATMGFHV,67,R2,ACTTGA,2,,,
14,igvf_003,illumina,igvf_003/nextseq/003_67B_R1.fastq.gz,79db765143bb23e14c613c9e1339e9f7,44182320136,2023-09-08,3,67B,,gz,VH00582,1,AAATMGFHV,67,R1,GATCAG,3,,,


In [74]:
def fastq_metadata_row_to_subpool_name(row):
    if pandas.isnull(row.subpool_name):
        return None
    else:
        return "{}_{}".format(row.plate_id, row.subpool_name)

def get_subpool_from_fastq_row(row):
    plate_name = normalize_plate_name(row.experiment)
    subpool_name = fastq_metadata_row_to_subpool_name(row)

    query = {
        "plate__name": plate_name,
    }
    
    if pandas.notnull(subpool_name):
        query["name"] = subpool_name
    if pandas.notnull(row.index_id):
        query["index"] = str_or_none(row.index_id)
    
    # these are labeled as having X nuclei but actually have fewer nuclei. 
    # Not that you can tell that from the filename
    nuclei_dont_match = {"013_67N", "014_13H"}
    if pandas.notnull(row.nuclei) and subpool_name not in nuclei_dont_match:
        query["nuclei"] = int(row.nuclei)*1000
    if pandas.notnull(row.exome_capture):
        query["selection_type"] = row.exome_capture

    subpools = models.Subpool.objects.filter(**query)
    
    if len(subpools) == 0:
        print("Nothing found for {} {}".format(query, row.filename))
    elif len(subpools) > 1:
        print("Multiple hits for {}: {} {}".format(query, subpools, row.filename))
    else:
        return subpools[0]

from Bio.Seq import Seq
def check_fastq_barcode_is_equal(i7_sequence, i5_sequence, barcode):
    if pandas.isnull(i5_sequence):
        return i7_sequence == barcode
    
    sequences = barcode.split("+")
    if len(sequences) != 2:
        print("Warning: database says dual index, file says single index: {} {} {}".format(i7_sequence, i5_sequence, barcode))
        return False

    i7, i5 = sequences
    # barcodes in the fastq file are reverse complimented.
    i5 = Seq(i5).reverse_complement()
    return i7_sequence == i7 and i5_sequence == i5
    
assert check_fastq_barcode_is_equal("ACTTGA", None, "ACTTGA")
assert not check_fastq_barcode_is_equal("ACTTGA", None, "CTTGTA")
assert check_fastq_barcode_is_equal("GTGAAACT", "AGTCTGTA", "GTGAAACT+TACAGACT")
assert not check_fastq_barcode_is_equal("GTGAAACT", "AGTCTGTA", "ACTTGATC+TTTGGGTG")
assert not check_fastq_barcode_is_equal("GTGAAACT", "AGTCTGTA", "CTTGTA")
print("Finish check_fastq_barcode_is_equal test assertions")

def test_subpool_lookup(fastqs):
    for i, row in fastqs.iterrows():
        subpool = get_subpool_from_fastq_row(row)
        if subpool is None:
            continue
        barcodes = subpool.barcode.filter(barcode_type=None)
        if barcodes.count() != 1:
            print("Barcode count wrong {}".format(barcode))

        barcode = barcodes[0]
        if not check_fastq_barcode_is_equal(barcode.i7_sequence, barcode.i5_sequence, row["barcode"]):
            print(f"Barcode mismatch {subpool.plate_name()} {subpool.index} {barcode.i7_sequence} {barcode.i5_sequence} {row.barcode} {row.filename}")

test_subpool_lookup(fastqs)

Finish check_fastq_barcode_is_equal test assertions
Barcode mismatch IGVF_003 1 CAGATC None nan igvf_003/nanopore/igvf003_8A_lig-ss_11.fastq.gz
Barcode mismatch IGVF_003 1 CAGATC None nan igvf_003/nanopore/igvf003_8A_lig-ss_4.fastq.gz
Barcode mismatch IGVF_003 1 CAGATC None nan igvf_003/nanopore/igvf003_8A_lig-ss_7.fastq.gz
Barcode mismatch IGVF_003 1 CAGATC None nan igvf_003/nanopore/igvf003_8A_lig-ss_6.fastq.gz
Barcode mismatch IGVF_003 1 CAGATC None nan igvf_003/nanopore/igvf003_8A_lig-ss_8.fastq.gz
Barcode mismatch IGVF_003 1 CAGATC None nan igvf_003/nanopore/igvf003_8A_lig-ss_5.fastq.gz
Barcode mismatch IGVF_003 1 CAGATC None nan igvf_003/nanopore/igvf003_8A_lig-ss_1.fastq.gz
Barcode mismatch IGVF_003 1 CAGATC None nan igvf_003/nanopore/igvf003_8A_lig-ss_2.fastq.gz
Barcode mismatch IGVF_003 1 CAGATC None nan igvf_003/nanopore/igvf003_8A_lig-ss_10.fastq.gz
Barcode mismatch IGVF_003 1 CAGATC None nan igvf_003/nanopore/igvf003_8A_lig-ss_9.fastq.gz
Barcode mismatch IGVF_003 1 CAGATC N

In [75]:
[(x.index, x.nuclei, x.selection_type) for x in models.Subpool.objects.filter(plate__name="IGVF_013")]

[('UDI01', 13000, 'EX'),
 ('UDI02', 67000, 'NO'),
 ('UDI03', 67000, 'NO'),
 ('UDI04', 67000, 'NO'),
 ('UDI05', 67000, 'NO'),
 ('UDI06', 67000, 'NO'),
 ('UDI07', 67000, 'NO'),
 ('UDI08', 67000, 'NO'),
 ('UDI09', 67000, 'NO'),
 ('UDI10', 67000, 'NO'),
 ('UDI11', 67000, 'NO'),
 ('UDI12', 67000, 'NO'),
 ('UDI13', 67000, 'NO'),
 ('UDI14', 67000, 'NO'),
 ('UDI15', 48256, 'NO')]

Load information about previously submitted records

In [76]:
submitted_sequence_file = {}

for accession_prefix in submitted_sheets:
    for i, row in submitted_sheets[accession_prefix]["sequence_file"].iterrows():
        if not pandas.isnull(row["accession"]):
            if accession_prefix == "igvftst":
                see_also_template = "https://api.sandbox.igvf.org/sequence-files/{}/"
            elif accession_prefix == "igvf":
                see_also_template = "https://api.data.igvf.org/sequence-files/{}/"
            
            if pandas.notnull(row["accession"]):
                submitted_sequence_file.setdefault(row["submitted_file_name"], []).append({
                    "accession_prefix": accession_prefix,
                    "name": row["accession"],
                    "uuid": row["uuid"],
                    "see_also": see_also_template.format(row["accession"]),
                })
        
print("loaded {} submitted sequence_file records".format(len(submitted_sequence_file)))


loaded 2132 submitted sequence_file records


In [77]:
submitted_sheets.keys()

dict_keys(['igvftst', 'igvf'])

In [78]:
def built_in_cache(a, b, cache={}):
    cache.setdefault(a, b)
    print(cache)
    
built_in_cache('a', 1)
built_in_cache('b', 3)
built_in_cache('c', 4)
built_in_cache('a', 2)

{'a': 1}
{'a': 1, 'b': 3}
{'a': 1, 'b': 3, 'c': 4}
{'a': 1, 'b': 3, 'c': 4}


In [79]:
sequencing_experiments = {}
sequencing_subpools = {}

In [80]:
def get_or_create_sequencing_run(name, run_date, platform, plate, accessions=None):
    if name in sequencing_experiments:
        return sequencing_experiments[name]
    
    try:
        sequencing_run = models.SequencingRun.objects.get(name=name)
    except models.SequencingRun.DoesNotExist:
        sequencing_run = models.SequencingRun(
            name=name,
            run_date=run_date,
            platform=platform,
            plate=plate,
            stranded=models.StrandedEnum.REVERSE,
        )
        sequencing_run.save()
        if accessions is not None:
            sequencing_run.accession.set(accessions)
            sequencing_run.save
        
    sequencing_experiments[name] = sequencing_run
    return sequencing_run

def get_or_create_subpool_in_run(subpool, sequencing_run, status, raw_reads=None, measurement_set=None):
    try:
        subpool_in_run = models.SubpoolInRun.objects.get(subpool=subpool, sequencing_run=sequencing_run)
    except models.SubpoolInRun.DoesNotExist:
        subpool_in_run = models.SubpoolInRun(
            subpool=subpool, 
            sequencing_run=sequencing_run,
            raw_reads=None,
            status=status,
            measurement_set=measurement_set,
        )
        subpool_in_run.save()
        
    sequencing_subpools[name] = subpool_in_run
    return subpool_in_run


def lane_or_none(value):
    if pandas.isnull(value):
        return None
    elif isinstance(value, float):
        return int(value)
    elif value.startswith("L"):
        return int(value[1:])
    else:
        return int(value)

def load_fastqs(submittable_subpools):
    fastqs = load_fastq_csv()
    measurement_sets = load_submitted_measurement_sets()

    for i, row in fastqs.iterrows():
        #plate_name = normalize_plate_name(row.experiment)
        #subpool_name = "{}_{}".format(row.plate_id, row.subpool_name)
        #index_id = int(row.index_id)
        fastq_relative = Path(row.filename)
        sequencing_run_name = str(Path(fastq_relative.parts[0])/fastq_relative.parts[1])
        sequencer = row.get("sequencer")
        run_date = row["ctime"]
        md5sum = row["md5sum"]

        #if row["read_id"] in ('I1',):
        #    continue

        subpool = get_subpool_from_fastq_row(row)

        if subpool is None:
            print("Unable to import {row.filename}")
            continue

        current_plate = subpool.plate

        platform = {
            "A00850": models.Platform.objects.get(display_name="Novaseq 6000"),
            "VH00582": models.Platform.objects.get(display_name="Nextseq 2000"),
            "minion": models.Platform.objects.get(name="minion"),
            "promethion": models.Platform.objects.get(name="promethion"),
        }[sequencer]

        subpool_status = {
            "Yes": models.RunStatusEnum.PASS,
            "No": models.RunStatusEnum.FAILED,
            None: models.RunStatusEnum.FAILED,
        }[submittable_subpools.get(subpool.name)]

        sequencing_run = get_or_create_sequencing_run(sequencing_run_name, run_date, platform, current_plate, accessions=None)
        subpool_run = get_or_create_subpool_in_run(subpool=subpool, sequencing_run=sequencing_run, status=subpool_status)

        for accession_prefix in ["igvftst", "igvf"]:
            file_set_name = None
            submitted_md5s = submitted_sheets[accession_prefix]["sequence_file"].set_index("md5sum")
            if row.md5sum in submitted_md5s.index:
                file_set_alias = submitted_md5s.loc[row.md5sum]["file_set"]
                file_set_name = file_set_alias.split(":")[1]

        if file_set_name is None:
            pass
        elif file_set_name not in measurement_sets:
            # skip if there hasn't been a submitted measurement_set
            pass
        elif subpool_run.measurement_set is None:
            subpool_run.measurement_set = measurement_sets[file_set_name]
            subpool_run.save()
            print(f"updating {subpool_run} measurement set to {measurement_sets[file_set_name]}")
        else:
            assert subpool_run.measurement_set.name == file_set_name, f"SubpoolInRun measurement_set mismatch {subpool_run.measurement_set.name} {file_set_name}"


        subpool_file = models.SubpoolInRunFile(
            sequencing_run=sequencing_run,
            subpool_run=subpool_run,
            md5sum=md5sum,
            filename=str(fastq_relative),
            flowcell_id=row["flowcell_id"],
        )
        if not pandas.isnull(row["lane"]):
            subpool_file.lane = lane_or_none(row["lane"])
        if not pandas.isnull(row["read"]):
            subpool_file.read = row["read"]
        subpool_file.save()

        for accession_info in submitted_sequence_file.get(str(fastq_relative), []):
            if pandas.notnull(accession_info["name"]):
                accession = models.Accession(
                    accession_prefix=accession_info["accession_prefix"],
                    name=accession_info["name"],
                    uuid=accession_info["uuid"],
                    see_also=accession_info["see_also"],
                )
                accession.save()                
                subpool_file.accession.add(accession)
                subpool_file.save()

load_fastqs(submittable_subpools)

print("total plates", models.SplitSeqPlate.objects.count())
print("total sequencing runs", models.SequencingRun.objects.count())
print("total subpools", models.Subpool.objects.count())
print("total subpool in run", models.SubpoolInRun.objects.count())
print("total subpool in run files", models.SubpoolInRunFile.objects.count())


loaded 173 measurement_sets
updating 003_13A igvf_003/nextseq2 measurement set to 003_13A_illumina
updating 003_67A igvf_003/nextseq measurement set to 003_67A_illumina
updating 003_67B igvf_003/nextseq measurement set to 003_67B_illumina
updating 003_67I igvf_003/nova1 measurement set to 003_67I_illumina
updating 003_67I igvf_003/nova2 measurement set to 003_67I_illumina
updating 003_67J igvf_003/nova1 measurement set to 003_67J_illumina
updating 003_67J igvf_003/nova2 measurement set to 003_67J_illumina
updating 003_67K igvf_003/nova1 measurement set to 003_67K_illumina
updating 003_67K igvf_003/nova2 measurement set to 003_67K_illumina
updating 003_67L igvf_003/nova1 measurement set to 003_67L_illumina
updating 003_67L igvf_003/nova2 measurement set to 003_67L_illumina
updating 003_67M igvf_003/nova1 measurement set to 003_67M_illumina
updating 003_67M igvf_003/nova2 measurement set to 003_67M_illumina
updating 003_67N igvf_003/nova1 measurement set to 003_67N_illumina
updating 003_

updating 007_67D igvf_007/nova2 measurement set to 007_67D_illumina
updating 007_67E igvf_007/nova1 measurement set to 007_67E_illumina
updating 007_67E igvf_007/nova2 measurement set to 007_67E_illumina
updating 007_67F igvf_007/nova1 measurement set to 007_67F_illumina
updating 007_67F igvf_007/nova2 measurement set to 007_67F_illumina
updating 007_67G igvf_007/nova1 measurement set to 007_67G_illumina
updating 007_67G igvf_007/nova2 measurement set to 007_67G_illumina
updating 007_67H igvf_007/nova1 measurement set to 007_67H_illumina
updating 007_67H igvf_007/nova2 measurement set to 007_67H_illumina
updating 008_67C igvf_008/nextseq measurement set to 008_67C_illumina
updating 008_67G igvf_008/nextseq measurement set to 008_67G_illumina
updating 008_67I igvf_008/nova1 measurement set to 008_67I_illumina
updating 008_67A igvf_008/nova1 measurement set to 008_67A_illumina
updating 008_67B igvf_008/nova1 measurement set to 008_67B_illumina
updating 008_67C igvf_008/nova1 measurement 

updating 011_67B igvf_011/nova2 measurement set to 011_67B_illumina
updating 011_67C igvf_011/nova1 measurement set to 011_67C_illumina
updating 011_67C igvf_011/nova2 measurement set to 011_67C_illumina
updating 011_67D igvf_011/nova1 measurement set to 011_67D_illumina
updating 011_67D igvf_011/nova2 measurement set to 011_67D_illumina
updating 011_67E igvf_011/nova1 measurement set to 011_67E_illumina
updating 011_67E igvf_011/nova2 measurement set to 011_67E_illumina
updating 011_67F igvf_011/nova1 measurement set to 011_67F_illumina
updating 011_67F igvf_011/nova2 measurement set to 011_67F_illumina
updating 011_67G igvf_011/nova1 measurement set to 011_67G_illumina
updating 011_67G igvf_011/nova2 measurement set to 011_67G_illumina
updating 011_67H igvf_011/nova1 measurement set to 011_67H_illumina
updating 011_67H igvf_011/nova2 measurement set to 011_67H_illumina
updating 012_13A igvf_012/nextseq measurement set to 012_13A_illumina
updating 012_13B igvf_012/nextseq measurement 

In [81]:
for q in models.Subpool.objects.filter(plate__name="IGVF_014", index="UDI24"):
    print(q.name, q.index, q.nuclei, q.selection_type)

014_13H UDI24 10400 NO


In [82]:
[(x.name, x.index, x.selection_type) for x in models.Subpool.objects.filter(plate__name="IGVF_013")]

[('013_13A', 'UDI01', 'EX'),
 ('013_67A', 'UDI02', 'NO'),
 ('013_67B', 'UDI03', 'NO'),
 ('013_67C', 'UDI04', 'NO'),
 ('013_67D', 'UDI05', 'NO'),
 ('013_67E', 'UDI06', 'NO'),
 ('013_67F', 'UDI07', 'NO'),
 ('013_67G', 'UDI08', 'NO'),
 ('013_67H', 'UDI09', 'NO'),
 ('013_67I', 'UDI10', 'NO'),
 ('013_67J', 'UDI11', 'NO'),
 ('013_67K', 'UDI12', 'NO'),
 ('013_67L', 'UDI13', 'NO'),
 ('013_67M', 'UDI14', 'NO'),
 ('013_67N', 'UDI15', 'NO')]

In [83]:
def print_subpool_submission_status(fastqs):
    for i, row in fastqs.iterrows():
        plate_name = normalize_plate_name(row.experiment)
        index_id = row.index_id
        fastq_relative = Path(row.filename)
        sequencing_run_name = str(Path(fastq_relative.parts[0])/fastq_relative.parts[1])
        sequencer = row.get("sequencer")
        run_date = row["ctime"]
        md5sum = row["md5sum"]

        #if row["read_id"] in ('I1',):
        #    continue

        subpool = get_subpool_from_fastq_row(row)
        current_plate = subpool.plate

        platform = {
            "A00850": models.Platform.objects.get(display_name="Novaseq 6000"),
            "VH00582": models.Platform.objects.get(display_name="Nextseq 2000"),
            "minion": models.Platform.objects.get(name="minion"),
            "promethion": models.Platform.objects.get(name="promethion"),
        }[sequencer]

        subpool_status = {
            "Yes": models.RunStatus.PASS,
            "No": models.RunStatus.FAILED,
            None: models.RunStatus.FAILED,
        }[submittable_subpools.get(subpool.name)]

        #sequencing_run = get_or_create_sequencing_run(sequencing_run_name, run_date, platform, current_plate, accessions=None)
        #subpool_run = get_or_create_subpool_in_run(subpool=subpool, sequencing_run=sequencing_run, status=subpool_status)
        print(row.filename, row.index_id, sequencing_run_name, run_date, subpool, subpool_status)

#print_subpool_submission_status(fastqs)

In [84]:
md5counts = Counter(fastqs["md5sum"])
dupmd5s = set()
for md5sum in md5counts:
    if md5counts[md5sum] > 1:
        dupmd5s.add(md5sum)
        
dupfiles = fastqs[fastqs["md5sum"].isin(dupmd5s)].sort_values(["experiment", "md5sum", "filename"])
print(dupfiles.shape)
dupfiles

(0, 20)


Unnamed: 0,experiment,family,filename,md5sum,size,ctime,plate_id,subpool_name,fragment,compression,sequencer,sequencer_run,flowcell_id,nuclei,read,barcode,index_id,sample_id,lane,exome_capture


In [85]:
print(dupfiles.to_csv(index=False))

experiment,family,filename,md5sum,size,ctime,plate_id,subpool_name,fragment,compression,sequencer,sequencer_run,flowcell_id,nuclei,read,barcode,index_id,sample_id,lane,exome_capture



In [86]:
models.SexEnum.choices

[('M', 'male'), ('F', 'female'), ('U', 'unknown'), ('O', 'other')]

In [87]:
# In case of interrupted upload

if 0:
    EC = str(Path("~/proj/encoded_client").expanduser())
    if EC not in sys.path:
        sys.path.append(EC)

    from encoded_client.encoded import ENCODED

    server = ENCODED("api.data.igvf.org")

    submitted_tissues = load_submitted_tissues()
    for tissue_id in submitted_tissues:
        for record in submitted_tissues[tissue_id]:
            if pandas.isnull(record["uuid"] ):
                tissue = server.get_json(record["see_also"])
                print("{}\t{}".format(record["name"], tissue["uuid"]))

In [88]:
models.SubpoolInRunFile.objects.get(md5sum="2a5a07155767a2f541cfb20e7d0636e0")

<SubpoolInRunFile: igvf_b01/next2/B01_13H_R2.fastq.gz>