# Introduction

This is a simplification of my figuring out how to submit seqspec files to the IGVF DACC.

- [Python environment](#Setup)
- [Seqspec template function](#Template)
- [Working out steps needed to create seqspec objects](#Working-out-steps-needed-to-create-seqspec-objects)
- [Exploring boto3](#Exploring-boto3)
- [Seqspec submission functions](#Seqspec-Submission-functions)
- [Create seqspec objects for remaining fastqs](#Create-seqspec-objects-for-remaining-fastqs)

Information about barcodes.

cell barcode whitelists for 10x multiome GEX and ATAC:
- atac_737K-arc-v1.txt.gz
- gex_737K-arc-v1.txt.gz

MULTI-seq barcodes (custom)
- MULTIbc_onlist.txt 

MULTI-seq I7 indices (custom):
- index7_onlist_MULTI.txt 

10x Multiome 
- snRNA-seq I5 and I7 indices: https://cdn.10xgenomics.com/raw/upload/v1655151897/support/in-line%20documents/Dual_Index_Kit_TT_Set_A.csv
  - i7_dual_index_set_a.txt (10 bp)
  - i5_dual_index_set_a.txt (10 bp)
- ATAC-seq I7 indices: https://cdn.10xgenomics.com/raw/upload/v1655155348/support/in-line%20documents/Single_Index_Kit_N_Set_A.csv
  - i7_single_index_set_a.txt (8 bp)


# Setup

First we start with general imports

In [1]:
import gzip
import hashlib
from io import StringIO, BytesIO
import logging
import numpy
import os
import pandas
from pathlib import Path
import requests
import sys
import tarfile
import tqdm
from urllib.parse import urlparse
import yaml

In [2]:
try:
    import boto3
except ImportError:
    !{sys.executable} -m pip install --user boto3
    import boto3
    
from botocore.exceptions import ClientError
    

In [3]:
try:
    from jinja2 import Environment
except ImportError:
    !{sys.executable} -m pip install --user jinja2
    from jinja2 import Environment

from jinja2 import FileSystemLoader, select_autoescape, Undefined, StrictUndefined, make_logging_undefined

# Logger for this notebook; it is the logger handed to jinja2 below.
logger = logging.getLogger(__name__)

# Undefined subclass generated by jinja2's make_logging_undefined, based on the
# plain Undefined class (StrictUndefined is imported above but not used here)
# and reporting through `logger`.
LoggingUndefined = make_logging_undefined(logger=logger, base=Undefined)

# Template environment: templates are looked up in the local "templates"
# directory, and variables missing from the context become LoggingUndefined.
env = Environment(
    undefined=LoggingUndefined,
    loader=FileSystemLoader("templates"),
    autoescape=select_autoescape(),
)

I want to be able to use the seqspec validator while I am writing my seqspec file.

I have the repository checked out into ~/proj/seqspec. This block should either import it for me, or install it if someone else runs it.

Import pieces of seqspec that we need for this notebook.

In [4]:
try:
    import seqspec
except ImportError:
    seqspec_root = Path("~/proj/seqspec").expanduser()
    if seqspec_root.exists() and str(seqspec_root) not in sys.path:
        sys.path.append(str(seqspec_root))
    else:
        !{sys.executable} -m pip install --user seqspec
    import seqspec
    
from seqspec.Assay import Assay
from seqspec.Region import Region
from seqspec.Region import Onlist
from seqspec.utils import load_spec_stream

#from seqspec.seqspec_index import run_index, get_index
#from seqspec.seqspec_print import run_print_sequence_spec, run_print_library_tree, run_print_library_png
#from seqspec.seqspec_onlist import run_list_onlists, run_onlist_read, run_find_by_type    

In [5]:
from seqspecgen.igvf import (
    get_sequence_file_info,
    get_barcode_info,
)
from seqspecgen.util import seqspec_validate

I have my own API for interacting with the IGVF database server (which is very much like the old ENCODE database server)

In [6]:
try:
    from encoded_client import encoded
except ImportError:
    encoded_root = Path("~/proj/encoded_client").expanduser()
    if encoded_root.exists() and str(encoded_root) not in sys.path:
        sys.path.append(str(encoded_root))
    else:
        !{sys.executable} -m pip install --user encoded_client
        
    from encoded_client import encoded

encoded_client will pull submitter credentials from either DCC_API_KEY and DCC_SECRET_KEY or from a .netrc file loaded from your home directory.

A .netrc file is a plain text file with records of the following format (replace the {DCC_API_KEY} and {DCC_SECRET_KEY} strings with your specific values):

<pre>machine api.sandbox.igvf.org login {DCC_API_KEY} password {DCC_SECRET_KEY}</pre>

Or api.data.igvf.org

(it's also possible to list the fields on separate lines, but I think it's easier to read when they're on one line)

or after creating the server object call:

<pre>server.username = "{DCC_API_KEY}"
server.password = "{DCC_SECRET_KEY}"</pre>



## submission variables

In [7]:
# Host name of the server to talk to. The commented-out line targets the
# sandbox server instead; exactly one of the two should be active.
#server_name = "api.sandbox.igvf.org"
server_name = "api.data.igvf.org"
# Award and lab paths; save_seqspecs_targz writes them into the "award" and
# "lab" columns of every configuration-file row.
award = "/awards/HG012076/"
lab = "/labs/ansuman-satpathy/"

In [8]:
# Client object for the server named in `server_name`; it is passed to
# get_barcode_info and get_sequence_file_info in the cells below.
server = encoded.ENCODED(server_name)
# Validator built on top of that server object.
# NOTE(review): not used in the visible cells — presumably for checking
# objects against the server's schemas before submission; confirm.
igvf_validator = encoded.DCCValidator(server)

In [9]:
def load_spec(filename):
    """Read a seqspec YAML file and clear the parent id of its assay_spec entries.

    NOTE(review): yaml.Loader can construct arbitrary Python objects from
    tagged YAML nodes, so this should only be pointed at trusted files.

    filename: path of the YAML file; it is opened in text mode.
    returns: the object produced by yaml.load, after set_parent_id(None) has
        been called on every element of its ``assay_spec`` attribute.
    """
    with open(filename, "rt") as handle:
        spec = yaml.load(handle, Loader=yaml.Loader)

    # Parsing is complete once yaml.load returns, so the file can be closed
    # before the loaded object is touched.
    for region in spec.assay_spec:
        region.set_parent_id(None)
    return spec

# Template

First build up the lists of barcode onlists needed for this protocol; the names will be passed to the template.

In [10]:
metadata = pandas.read_csv("satpathy_seqspec_files.tsv", sep="\t")
metadata

Unnamed: 0,identifier,modality,sequence_kit,sequence_protocol,R1,R2,R3,I1,I2
0,ansuman-satpathy:igvf_exp0_atac_10x1_NGS1,atac,NovaSeq 6000 S4 Reagent Kit v1.5,Illumina NovaSeq 6000 (EFO:0008637),IGVFFI3568SBTV,IGVFFI1986FAVM,IGVFFI8652MKWJ,IGVFFI5795ORFY,
1,ansuman-satpathy:igvf_exp0_multi_10x1_NGS1,multi,NovaSeq 6000 S4 Reagent Kit v1.5,Illumina NovaSeq 6000 (EFO:0008637),IGVFFI5411OJRK,IGVFFI4642YKWI,IGVFFI5223HYRP,IGVFFI6088QALC,
2,ansuman-satpathy:igvf_exp0_rna_10x1_NGS1,rna,NovaSeq 6000 S4 Reagent Kit v1.5,Illumina NovaSeq 6000 (EFO:0008637),IGVFFI5758URSR,IGVFFI1215RCAP,,IGVFFI4517STPG,IGVFFI8677MRVW
3,ansuman-satpathy:igvf_exp1_atac_10x1_NGS1,atac,NovaSeq X Series 10B Reagent Kit,Illumina NovaSeq X (NTR:0000765),IGVFFI1415ZCCR,IGVFFI9193AKLW,IGVFFI4166FQWH,IGVFFI9476DBSV,
4,ansuman-satpathy:igvf_exp1_atac_10x1_NGS2,atac,NovaSeq X Series 10B Reagent Kit,Illumina NovaSeq X (NTR:0000765),IGVFFI2636JAAE,IGVFFI0014KSVS,IGVFFI9875PICN,IGVFFI6845HIAS,
...,...,...,...,...,...,...,...,...,...
408,ansuman-satpathy:igvf_mouse_bridge_114_rna,rna,NovaSeq 6000 S4 Reagent Kit v1.5,Illumina NovaSeq 6000 (EFO:0008637),IGVFFI3400EIAB,IGVFFI5674QKJY,,IGVFFI1603WDWP,IGVFFI1522TXOM
409,ansuman-satpathy:igvf_mouse_bridge_115_atac,atac,NovaSeq 6000 S4 Reagent Kit v1.5,Illumina NovaSeq 6000 (EFO:0008637),IGVFFI5559EVYB,IGVFFI5706AEPS,IGVFFI1771YVVX,IGVFFI1992CETB,
410,ansuman-satpathy:igvf_mouse_bridge_115_rna,rna,NovaSeq 6000 S4 Reagent Kit v1.5,Illumina NovaSeq 6000 (EFO:0008637),IGVFFI7761PXYZ,IGVFFI2504VZJX,,IGVFFI0849BAAC,IGVFFI3865CUKI
411,ansuman-satpathy:igvf_mouse_bridge_116_atac,atac,NovaSeq 6000 S4 Reagent Kit v1.5,Illumina NovaSeq 6000 (EFO:0008637),IGVFFI7842VQRM,IGVFFI0599BUEC,IGVFFI0780GMLK,IGVFFI9383DDZQ,


In [11]:
# One entry per onlist: (modality, key prefix, file accession).
# get_barcode_info returns a dict whose keys start with the given prefix
# (e.g. "cell_barcode_file_id", "cell_barcode_url"), so several onlists can be
# merged into a single per-modality dict without colliding.
_barcode_sources = [
    ("rna", "cell_barcode", "IGVFFI8751YQRY"),
    ("rna", "i7_rnaseq_index", "IGVFFI4565KANH"),
    ("rna", "i5_rnaseq_index", "IGVFFI4565KANH"),
    ("atac", "cell_barcode", "IGVFFI7587TJLC"),
    ("atac", "i7_atac_index", "IGVFFI1608YDWY"),
    ("multi", "cell_barcode", "IGVFFI8751YQRY"),
    ("multi", "multi_index", "IGVFFI5410CKWT"),
    ("multi", "i7_multi_index", "IGVFFI3231EAMC"),
]

# Template context fragments, keyed by modality; create_seqspec merges the
# matching one into the rendering context.
barcode_sets = {"rna": {}, "atac": {}, "multi": {}}
for _modality, _prefix, _accession in _barcode_sources:
    barcode_sets[_modality].update(get_barcode_info(server, _prefix, _accession))



In [12]:
barcode_sets

{'rna': {'cell_barcode_file_id': 'IGVFFI8751YQRY',
  'cell_barcode_file_name': 'IGVFFI8751YQRY.tsv.gz',
  'cell_barcode_file_size': 2142553,
  'cell_barcode_url': 'https://api.data.igvf.org/tabular-files/IGVFFI8751YQRY/@@download/IGVFFI8751YQRY.tsv.gz',
  'cell_barcode_md5': '95ca0127739965ae57d09c6c1cfb20b4',
  'i7_rnaseq_index_file_id': 'IGVFFI4565KANH',
  'i7_rnaseq_index_file_name': 'IGVFFI4565KANH.csv.gz',
  'i7_rnaseq_index_file_size': 1752,
  'i7_rnaseq_index_url': 'https://api.data.igvf.org/tabular-files/IGVFFI4565KANH/@@download/IGVFFI4565KANH.csv.gz',
  'i7_rnaseq_index_md5': '9bbbf8c40bf751cea0505fc85a66ab08',
  'i5_rnaseq_index_file_id': 'IGVFFI4565KANH',
  'i5_rnaseq_index_file_name': 'IGVFFI4565KANH.csv.gz',
  'i5_rnaseq_index_file_size': 1752,
  'i5_rnaseq_index_url': 'https://api.data.igvf.org/tabular-files/IGVFFI4565KANH/@@download/IGVFFI4565KANH.csv.gz',
  'i5_rnaseq_index_md5': '9bbbf8c40bf751cea0505fc85a66ab08'},
 'atac': {'cell_barcode_file_id': 'IGVFFI7587TJLC',
 

Define the file information needed to define this seqspec.

In [13]:
# load the template and instantiate it with the context for this specific measurement set.
# this version needs the libspec branch that separates the sequence and library specifications

def create_seqspec(row, verbose=False):
    """Render, validate and return the seqspec for one metadata row.

    The template is selected by the row's "modality" value. The rendering
    context is the barcode information for that modality, overlaid with the
    row's own columns, plus whatever get_sequence_file_info returns for each
    accession present in the R1, R2, R3, I1 and I2 columns.

    row: one metadata record supporting ``row["column"]`` and ``to_dict()``.
    verbose: when true, print the YAML of the resulting spec.
    returns: the object returned by load_spec_stream, after seqspec_validate
        has been run on its dict form and update_spec() has been called.
    """
    modality = row["modality"]
    template = env.get_template(
        "igvf-multiseq-{modality}.yaml.j2".format(modality=modality)
    )

    # Later entries win: a row column overrides a barcode key of the same name.
    context = {**barcode_sets[modality], **row.to_dict()}

    for read_name in ("R1", "R2", "R3", "I1", "I2"):
        accession = row[read_name]
        if pandas.isnull(accession):
            # No accession recorded for this read in this row.
            continue
        context.update(get_sequence_file_info(server, accession, read_name))

    rendered = template.render(context)

    # Parse the rendered text and validate it before update_spec() is called.
    spec = load_spec_stream(StringIO(rendered))
    seqspec_validate(spec.to_dict())
    spec.update_spec()

    if verbose:
        print(spec.to_YAML())
    return spec

In [14]:
metadata[metadata["modality"] == "multi"].iloc[0]

identifier           ansuman-satpathy:igvf_exp0_multi_10x1_NGS1
modality                                                  multi
sequence_kit                   NovaSeq 6000 S4 Reagent Kit v1.5
sequence_protocol           Illumina NovaSeq 6000 (EFO:0008637)
R1                                               IGVFFI5411OJRK
R2                                               IGVFFI4642YKWI
R3                                               IGVFFI5223HYRP
I1                                               IGVFFI6088QALC
I2                                                          NaN
Name: 1, dtype: object

In [15]:
# rna example
#example_spec = create_seqspec(metadata[metadata["modality"] == "rna"].iloc[0])

# atac example
#example_spec = create_seqspec(metadata[metadata["modality"] == "atac"].iloc[0])

# multi example
example_spec = create_seqspec(metadata[metadata["modality"] == "multi"].iloc[0])

In [16]:
print(example_spec.to_YAML())

!Assay
seqspec_version: 0.3.0
assay_id: 10x-ATAC-RNA-MULTI
name: 10x-ATAC-RNA-MULTI/Illumina
doi: https://doi.org/10.1038/s41592-019-0433-8
date: 17 June 2019
description: ansuman-satpathy:igvf_exp0_multi_10x1_NGS1 MULTI-seq
modalities:
- tag
lib_struct: https://igvf.github.io/seqspec/
library_protocol: Custom
library_kit: Illumina Truseq Dual Index
sequence_protocol: Illumina NovaSeq 6000 (EFO:0008637)
sequence_kit: NovaSeq 6000 S4 Reagent Kit v1.5
sequence_spec:
- !Read
  read_id: IGVFFI5411OJRK
  name: Read 1
  modality: tag
  primer_id: multi-truseq_read1
  min_len: 50
  max_len: 50
  strand: pos
  files:
  - !File
    file_id: IGVFFI5411OJRK
    filename: IGVFFI5411OJRK.fastq.gz
    filetype: fastq
    filesize: 3744591369
    url: https://api.data.igvf.org/sequence-files/IGVFFI5411OJRK/@@download/IGVFFI5411OJRK.fastq.gz
    urltype: https
    md5: 1e97130b5dfa0cdd859e95b39af60494
- !Read
  read_id: IGVFFI6088QALC
  name: Index 1 (i7 index)
  modality: tag
  primer_id: multi-truse

In [17]:
#print(run_print_library_tree(example_spec))

In [18]:
#_ = run_print_library_png(example_spec)

In [19]:
def _append_bytes(archive, name, payload):
    """Add the bytes in `payload` to the open tar `archive` as member `name`."""
    member = tarfile.TarInfo(name)
    member.size = len(payload)
    archive.addfile(member, fileobj=BytesIO(payload))


def save_seqspecs_targz(table, destination):
    """Write a gzip-compressed tar of seqspec files plus a configuration TSV.

    For every row of `table` a seqspec is generated with create_seqspec,
    serialised to YAML, gzip-compressed and stored in the archive. A final
    member, "configuration_file.tsv", holds one tab-separated record per
    seqspec describing it.

    table: DataFrame with "identifier", "modality" and the R1/R2/R3/I1/I2
        accession columns.
    destination: path of the archive to create.
    """
    read_columns = ("R1", "R2", "R3", "I1", "I2")
    records = []

    with tarfile.open(destination, "w:gz") as archive:
        for _, row in tqdm.tqdm(table.iterrows(), total=len(table)):
            modality = row["modality"]
            row_name = row["identifier"].replace("ansuman-satpathy:", "")
            spec_filename = f"{row_name}_{modality}_seqspec.yaml.gz"

            # mtime=0 leaves the timestamp out of the gzip header, so the
            # compressed bytes (and the md5 below) depend only on the YAML.
            yaml_text = create_seqspec(row).to_YAML()
            compressed = gzip.compress(yaml_text.encode("utf-8"), mtime=0)
            _append_bytes(archive, spec_filename, compressed)

            accessions = [
                row[column] for column in read_columns if pandas.notnull(row[column])
            ]
            records.append({
                "accession": None,
                "file_set": row["identifier"],
                "content_type": "seqspec",
                "file_format": "yaml",
                "submitted_file_name": spec_filename,
                # checksum of the compressed member exactly as stored
                "md5sum": hashlib.md5(compressed).hexdigest(),
                "seqspec_of:array": ",".join(accessions),
                "award": award,
                "lab": lab,
            })

        configuration_tsv = pandas.DataFrame(records).to_csv(index=False, sep="\t")
        _append_bytes(
            archive, "configuration_file.tsv", configuration_tsv.encode("utf-8")
        )


def save_team_seqspecs_directory(destination, table):
    """Write one seqspec YAML file per table row and collect index arguments.

    NOTE(review): this function reads a global `team_id` and calls
    `render_seqspec` and `generate_seqspec_index`; none of them is defined in
    the visible part of this notebook, so they must already exist in the
    namespace before this is called.

    destination: directory to write the YAML files into (str or Path).
    table: DataFrame; the first column of each row is used as the dataset name.
    returns: a list with one dict per row holding "team_id", "dataset_id",
        "url" and whatever generate_seqspec_index returns for that row.
    """
    # Accept plain strings as well as Path objects for the `/` join below.
    destination = Path(destination)
    raw_url = "https://raw.githubusercontent.com/detrout/y2ave_seqspecs/main/{filename}"
    arguments = []

    for _, row in table.iterrows():
        # Explicit positional access: row[0] on a label-indexed Series relies
        # on pandas' deprecated integer-as-position fallback.
        row_name = row.iloc[0]
        print("Processing {} {}".format(team_id, row_name))
        spec_filename = f"{team_id}_{row_name}_seqspec.yaml"
        spec_yaml = render_seqspec(team_id, row)
        with open(destination / spec_filename, "wt") as outstream:
            outstream.write(spec_yaml)

        spec = load_spec_stream(StringIO(spec_yaml))
        # Compute the index once and reuse it for both the log line and the row.
        index_info = generate_seqspec_index(spec, row)
        print(index_info)

        arg_row = {
            "team_id": team_id,
            "dataset_id": row_name,
            "url": raw_url.format(filename=spec_filename),
        }
        arg_row.update(index_info)
        arguments.append(arg_row)

    return arguments


In [20]:
#print("All", metadata.shape[0])
#metadata_rna_atac = metadata[metadata["modality"].isin(["atac", "rna"])]
#print("Removed multi modality", metadata_rna_atac.shape[0])

In [21]:
#save_seqspecs_targz(metadata, "mcginnis-seqspecs-20241125.tgz")