# Introduction

This notebook is a simplified record of how I worked out how to submit seqspec files to the IGVF DACC.

- [Python environment](#Setup)
- [Seqspec template function](#Template)
- [Working out steps needed to create seqspec objects](#Working-out-steps-needed-to-create-seqspec-objects)
- [Exploring boto3](#Exploring-boto3)
- [Seqspec submission functions](#Seqspec-Submission-functions)
- [Create seqspec objects for remaining fastqs](#Create-seqspec-objects-for-remaining-fastqs)

Information about barcodes.

cell barcode whitelists for 10x multiome GEX and ATAC:
- atac_737K-arc-v1.txt.gz
- gex_737K-arc-v1.txt.gz

MULTI-seq barcodes (custom)
- MULTIbc_onlist.txt 

MULTI-seq I7 indices (custom):
- index7_onlist_MULTI.txt 

10x Multiome 
- snRNA-seq I5 and I7 indices: https://cdn.10xgenomics.com/raw/upload/v1655151897/support/in-line%20documents/Dual_Index_Kit_TT_Set_A.csv
  - i7_dual_index_set_a.txt (10 bp)
  - i5_dual_index_set_a.txt (10 bp)
- ATAC-seq I7 indices: https://cdn.10xgenomics.com/raw/upload/v1655155348/support/in-line%20documents/Single_Index_Kit_N_Set_A.csv
  - i7_single_index_set_a.txt (8 bp)


# Setup

First we start with general imports

In [None]:
import gzip
import hashlib
from io import StringIO, BytesIO
import logging
from matplotlib.pyplot import show
import numpy
import os
import pandas
from pathlib import Path
import requests
import sys
import tarfile
import tqdm
from urllib.parse import urlparse
import yaml

In [None]:
try:
    import boto3
except ImportError:
    !{sys.executable} -m pip install --user boto3
    import boto3
    
from botocore.exceptions import ClientError
    

In [None]:
try:
    from jinja2 import Environment
except ImportError:
    !{sys.executable} -m pip install --user jinja2
    from jinja2 import Environment

from jinja2 import FileSystemLoader, select_autoescape, Undefined, StrictUndefined, make_logging_undefined

# Module-level logger; it is handed to jinja2 below so that problems with
# template variables are reported through the standard logging machinery.
logger = logging.getLogger(__name__)
# Undefined type built by jinja2's make_logging_undefined(): it is based on the
# plain `Undefined` class and reports its failures to `logger`.
LoggingUndefined = make_logging_undefined(
    logger=logger,
    base=Undefined
)

# Shared jinja2 environment. Templates are looked up in the local "templates"
# directory; create_seqspec() fetches its template through this object.
env = Environment(
    loader=FileSystemLoader("templates"),
    autoescape=select_autoescape(),
    undefined=LoggingUndefined,
)

I wanted to be able to use the seqspec validator while I was writing my seqspec file.

I have the repository checked out into ~/proj/seqspec. This block should either import it for me, or install it if someone else runs it.

Import pieces of seqspec that we need for this notebook.

In [None]:
try:
    import seqspec
except ImportError:
    seqspec_root = Path("~/proj/seqspec").expanduser()
    if seqspec_root.exists() and str(seqspec_root) not in sys.path:
        sys.path.append(str(seqspec_root))
    else:
        !{sys.executable} -m pip install --user seqspec
    import seqspec
    
from seqspec.Assay import Assay
from seqspec.Region import Region
from seqspec.Region import Onlist
from seqspec.utils import load_spec_stream

from seqspec.seqspec_index import run_index, get_index
from seqspec.seqspec_print import print_library_ascii, print_seqspec_png
from seqspec.seqspec_onlist import run_onlist_region, run_onlist_read

In [None]:
from seqspecgen.igvf import (
    get_sequence_file_info,
    get_barcode_info,
)
from seqspecgen.util import seqspec_validate, generate_seqspec_tool_index

I have my own API for interacting with the IGVF database server (which is very much like the old ENCODE database server)

In [None]:
try:
    from encoded_client import encoded
except ImportError:
    encoded_root = Path("~/proj/encoded_client").expanduser()
    if encoded_root.exists() and str(encoded_root) not in sys.path:
        sys.path.append(str(encoded_root))
    else:
        !{sys.executable} -m pip install --user encoded_client
        
    from encoded_client import encoded

encoded_client will pull submitter credentials from either DCC_API_KEY and DCC_SECRET_KEY or from a .netrc file loaded from your home directory. (In the examples below, replace the {DCC_API_KEY} and {DCC_SECRET_KEY} strings with your specific values.)

A .netrc file is a plain text file with records of the format:

<pre>machine api.sandbox.igvf.org login {DCC_API_KEY} password {DCC_SECRET_KEY}</pre>

Use api.data.igvf.org as the machine name instead when submitting to that server.

(it's also possible to list the fields on separate lines, but I think it's easier to read when they're on one line)

Alternatively, after creating the server object, set:

<pre>server.username = "{DCC_API_KEY}"
server.password = "{DCC_SECRET_KEY}"</pre>



## submission variables

In [None]:
# Host name of the IGVF server to talk to. Swap which of the two lines below is
# commented out to switch between the sandbox and data hosts.
#server_name = "api.sandbox.igvf.org"
server_name = "api.data.igvf.org"
# Award and lab paths; save_seqspecs_targz() copies them into every row of the
# generated configuration_file.tsv.
award = "/awards/HG012076/"
lab = "/labs/ansuman-satpathy/"

In [None]:
# Connection object for the selected host. It is passed to get_barcode_info()
# and get_sequence_file_info(), and used for server.get_json() lookups.
server = encoded.ENCODED(server_name)
# Validator built on that connection.
# NOTE(review): nothing visible in this part of the notebook uses
# igvf_validator -- confirm it is still needed.
igvf_validator = encoded.DCCValidator(server)

In [None]:
def list_read_ids(self, modality):
    """Return the ``read_id`` of every read the spec lists for ``modality``.

    ``self`` is any object with a ``get_seqspec(modality)`` method that yields
    items carrying a ``read_id`` attribute. The ids are returned as a list, in
    the order ``get_seqspec`` produced them.
    """
    read_ids = []
    for read in self.get_seqspec(modality):
        read_ids.append(read.read_id)
    return read_ids

def get_barcode_read_id(spec, modality, run_files, barcode_read):
    """Return the first read id of ``modality`` that belongs to the barcode file.

    The accession is taken from ``run_files[barcode_read]["accession"]``; the
    first read id in the spec that starts with that accession is returned.

    Raises ValueError when no read id starts with the accession.
    """
    accession_prefix = run_files[barcode_read]["accession"]

    # A private sentinel distinguishes "nothing matched" from any real read id.
    not_found = object()
    match = next(
        (
            candidate
            for candidate in list_read_ids(spec, modality)
            if candidate.startswith(accession_prefix)
        ),
        not_found,
    )
    if match is not_found:
        raise ValueError("Unable to find barcode read")
    return match

In [None]:
def load_spec(filename):
    """Load a seqspec YAML file and clear the parent id of its assay_spec entries.

    The file is parsed with ``yaml.load`` and every entry of the loaded
    object's ``assay_spec`` has ``set_parent_id(None)`` called on it before the
    object is returned.
    """
    # NOTE(review): yaml.Loader can construct arbitrary Python objects, so this
    # should only be pointed at seqspec files from a trusted source.
    with open(filename, "rt") as stream:
        spec = yaml.load(stream, Loader=yaml.Loader)

    for entry in spec.assay_spec:
        entry.set_parent_id(None)
    return spec

# Template

First build up the lists of barcode onlists needed for this protocol; the names will be passed to the template.

In [None]:
# template for multiseq seqspecs
# The {modality} placeholder is filled in by create_seqspec() from each row's
# "modality" column before the template is looked up.
satpathy_template = "igvf-multiseq-{modality}.yaml.j2"

# First generation satpathy multiseq files
#metadata = pandas.read_csv("satpathy_seqspec_files.tsv", sep="\t")

#Feb 2025 satpathy multiseq_files
#metadata = pandas.read_csv("seqspec_info_feb2025.tsv", sep="\t")

#Feb 2025 satpathy 10x multiome
#metadata = pandas.read_csv("seqspec_info_feb2025_nomultiseq.tsv", sep="\t")

# Jun 2025 satpathy
# Active metadata table (tab separated). Code in this notebook reads the
# "modality" and "identifier" columns plus the fastq columns R1, R2, R3, I1
# and I2 from each row.
metadata = pandas.read_csv("seqspec_info_igvf7.tsv", sep="\t")

# Bare expression so the notebook displays the table.
metadata

In [None]:
# Barcode information per modality, keyed by the values used in the metadata
# "modality" column. create_seqspec() merges the matching entry into the
# template context.
barcode_sets = {
    "rna": {},
    "atac": {},
    "multi": {},
}

# NOTE(review): the i7 and i5 rna index entries below use the same accession
# (IGVFFI4565KANH) -- confirm that one file really provides both onlists.
barcode_sets["rna"].update(get_barcode_info(server, "cell_barcode", "IGVFFI8751YQRY"))
barcode_sets["rna"].update(get_barcode_info(server, "i7_rnaseq_index", "IGVFFI4565KANH"))
barcode_sets["rna"].update(get_barcode_info(server, "i5_rnaseq_index", "IGVFFI4565KANH"))

barcode_sets["atac"].update(get_barcode_info(server, "cell_barcode", "IGVFFI7587TJLC"))
barcode_sets["atac"].update(get_barcode_info(server, "i7_atac_index", "IGVFFI1608YDWY"))

# The multi modality uses the same cell_barcode accession as rna.
barcode_sets["multi"].update(get_barcode_info(server, "cell_barcode", "IGVFFI8751YQRY"))
barcode_sets["multi"].update(get_barcode_info(server, "multi_index", "IGVFFI5410CKWT"))
barcode_sets["multi"].update(get_barcode_info(server, "i7_multi_index", "IGVFFI3231EAMC"))


In [None]:
barcode_sets

Define the file information needed to define this seqspec.

In [None]:
# load the template and instantiate it with the context for this specific measurement set.
# this version needs the libspec branch that separates the sequence and library specifications

def create_seqspec(row, barcode_read="R1", verbose=False):
    """Render, validate and return the seqspec for one metadata row.

    The template is chosen from the row's "modality" value and rendered with
    that modality's entry of ``barcode_sets``, the row's own columns, and the
    sequence file information of every fastq column (R1, R2, R3, I1, I2) that
    is not null. The rendered YAML is loaded and validated, and
    ``update_spec()`` is called on the result before it is returned.

    Parameters:
        row: one row of the metadata table; must provide "modality", the
            fastq columns and ``to_dict()``.
        barcode_read: name of the row column whose value is passed to
            ``run_onlist_read``; only consulted when ``verbose`` is true.
        verbose: when true, also print the template name, library tree, tool
            settings and onlist file names, and show the seqspec figure.

    Raises:
        RuntimeError: ``seqspec_validate`` reported at least one error.
        ValueError: verbose mode only, when the modality is None and the spec
            does not list exactly one modality.
    """
    #template_name = "parse-wt-mega-v2-single-index-libspec-1.yaml.j2"
    modality = row["modality"]
    template_name = satpathy_template.format(modality=modality)
    template = env.get_template(template_name)

    # Barcode information goes in first, so the row's own columns win on any
    # key collision.
    context = dict(barcode_sets[modality])
    context.update(row.to_dict())

    # Fetch the server record and template variables for every fastq present.
    run_files = {}
    for column in ("R1", "R2", "R3", "I1", "I2"):
        accession = row[column]
        if pandas.isnull(accession):
            continue
        run_files[accession] = server.get_json(accession)
        context.update(get_sequence_file_info(server, accession, column))
        #lengths = get_read_lengths(server, row[column], reads_to_check=10)
        #context[f"{column}_min_len"] = min(lengths)
        #context[f"{column}_max_len"] = max(lengths)

    rendered_yaml = template.render(context)

    # validate the generated seqspec file.
    spec = load_spec_stream(StringIO(rendered_yaml))
    errors = seqspec_validate(spec)
    if len(errors) > 0:
        print(errors)
        raise RuntimeError("Validation failures")

    spec.update_spec()
    if not verbose:
        return spec

    # Verbose report of what was generated.
    print(template_name)
    print("tree")
    library_tree = print_library_ascii(spec)
    print(library_tree)
    tool_settings = generate_seqspec_tool_index(spec, run_files)
    print("tool settings:", tool_settings)

    if modality is None:
        available = spec.list_modalities()
        if len(available) == 1:
            modality = available[0]
        elif len(available) > 1:
            raise ValueError("Need to specify modality")
        else:
            raise ValueError("There needs to be a modality in the seqspec file.")

    #barcode_read_id = get_barcode_read_id(example_spec, modality, run_files, barcode_read)
    barcode_read_id = row[barcode_read]
    onlist_files = run_onlist_read(spec, modality, barcode_read_id)
    onlist_names = [onlist.filename for onlist in onlist_files]
    print("onlist", onlist_names)
    show(print_seqspec_png(spec))
    print()

    return spec

In [None]:
metadata[metadata["modality"] == "rna"].iloc[0]

In [None]:
# rna example
example_spec = create_seqspec(metadata[metadata["modality"] == "rna"].iloc[0], verbose=True)

# atac example
# NOTE: this rebinds example_spec, so later cells that use example_spec see
# the atac spec rather than the rna one.
example_spec = create_seqspec(metadata[metadata["modality"] == "atac"].iloc[0], verbose=True)

# multi example
#example_spec = create_seqspec(metadata[metadata["modality"] == "multi"].iloc[0])

In [None]:
print(example_spec.to_YAML())

In [None]:
#print(run_print_library_tree(example_spec))

In [None]:
#_ = run_print_library_png(example_spec)

In [None]:
def _add_bytes_member(archive, member_name, payload):
    """Store the bytes ``payload`` in the open tar ``archive`` as ``member_name``."""
    member = tarfile.TarInfo(member_name)
    member.size = len(payload)
    archive.addfile(member, fileobj=BytesIO(payload))


def save_seqspecs_targz(table, destination):
    """Write a gzipped tar of seqspec files plus a configuration table.

    For every row of ``table`` a seqspec is created with ``create_seqspec``,
    serialised to YAML, gzip-compressed and stored in the archive as
    "<row name>_<modality>_seqspec.yaml.gz". A final member,
    "configuration_file.tsv", holds one tab-separated record per seqspec
    (file set, md5sum of the compressed file, the fastqs it describes, award,
    lab, ...).

    Parameters:
        table: metadata DataFrame with "identifier", "modality" and the fastq
            columns R1, R2, R3, I1 and I2.
        destination: path of the .tar.gz file to create.
    """
    fastq_columns = ("R1", "R2", "R3", "I1", "I2")
    records = []

    with tarfile.open(destination, "w:gz") as archive:
        for _, row in tqdm.tqdm(table.iterrows(), total=table.shape[0]):
            row_name = row["identifier"].replace("ansuman-satpathy:", "")
            modality = row["modality"]
            #print("Processing {}".format(row_name))
            spec = create_seqspec(row)

            # mtime=0 is passed so no timestamp ends up in the gzip header.
            compressed_yaml = gzip.compress(spec.to_YAML().encode("utf-8"), mtime=0)
            spec_filename = f"{row_name}_{modality}_seqspec.yaml.gz"
            _add_bytes_member(archive, str(spec_filename), compressed_yaml)

            checksum = hashlib.md5(compressed_yaml).hexdigest()
            described_fastqs = ",".join(
                row[name] for name in fastq_columns if pandas.notnull(row[name])
            )
            # Key order here becomes the column order of the TSV.
            records.append({
                "accession": None,
                "file_set": row["identifier"],
                "content_type": "seqspec",
                "file_format": "yaml",
                "submitted_file_name": spec_filename,
                "md5sum": checksum,
                "seqspec_of:array": described_fastqs,
                "award": award,
                "lab": lab,
            })

        configuration_table = pandas.DataFrame(records)
        configuration_tsv = configuration_table.to_csv(index=False, sep="\t").encode("utf-8")
        _add_bytes_member(archive, "configuration_file.tsv", configuration_tsv)


def save_team_seqspecs_directory(destination, table):
    """Write one seqspec YAML file per row of ``table`` into ``destination``.

    ``destination`` is joined to the file name with ``/``, so it has to be a
    path object that supports that operator.

    Returns a list with one dict per row holding "team_id", "dataset_id" and
    "url", updated with whatever generate_seqspec_index() returns for the row.

    NOTE(review): this function reads ``team_id`` and calls ``render_seqspec``
    and ``generate_seqspec_index``, none of which are defined or imported in
    the visible part of this notebook, and nothing visible calls it. It looks
    like a leftover from an earlier notebook -- confirm before relying on it.
    """
    arguments = []
    # URL pattern under which the written file is expected to be published.
    raw_url = "https://raw.githubusercontent.com/detrout/y2ave_seqspecs/main/{filename}"
    
    for i, row in table.iterrows():
        # NOTE(review): presumably the first column of the row; verify, since
        # integer lookup with [] on a labelled Series may be treated as a label.
        row_name = row[0]
        print("Processing {} {}".format(team_id, row_name))
        spec_filename = f"{team_id}_{row_name}_seqspec.yaml"
        spec_yaml = render_seqspec(team_id, row)
        with open(destination/spec_filename, "wt") as outstream:
            outstream.write(spec_yaml)
            
        # Re-parse the rendered text so the index can be generated from a spec object.
        spec = load_spec_stream(StringIO(spec_yaml))
        arg_row = {
            "team_id": team_id,
            "dataset_id": row_name,
            "url": raw_url.format(filename=spec_filename),
        }
        # generate_seqspec_index() is called twice: once to print, once to merge.
        print(generate_seqspec_index(spec, row))
        arg_row.update(generate_seqspec_index(spec, row))
        arguments.append(arg_row)

    return arguments


In [None]:
#print("All", metadata.shape[0])
#metadata_rna_atac = metadata[metadata["modality"].isin(["atac", "rna"])]
#print("Removed multi modality", metadata_rna_atac.shape[0])

In [None]:
# Previous invocations, kept for reference.
#save_seqspecs_targz(metadata, "mcginnis-seqspecs-20241125.tgz")
#save_seqspecs_targz(metadata, "seqspecs_202505_multiseq_209.tar.gz")
#save_seqspecs_targz(metadata, "seqspecs_feb2025_nomultiseq.tar.gz")
# Build the archive for the metadata table that is currently loaded.
save_seqspecs_targz(metadata, "seqspecs_202506_multiseqs.tar.gz")