# Introduction

The Y2AVE project needs seqspecs that use filenames available on synapse.

- [Python environment](#Setup)
- [Defining the template](#Defining-the-template)
  - [Print example seqspec](#Print-example-seqspec)
- [Generate team seqspec tarfile](#Generate-team-seqspec-tarfile)

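There are roughly two sections to this notebook: the first is designed for developing and checking a seqspec template against a single example row, and the second generates a tarfile of seqspecs for every row of a team's table.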


# Connecting teams to seqspecs

This spreadsheet has the longer descriptions of the datasets and what team they were assigned to. 
[IGVF Single Cell datasets for Y2AVE](https://docs.google.com/spreadsheets/d/1QWa1JUzs7pR02P8uS95MWIqW8-Ldh7BX6D990YXXQbQ/edit#gid=0)

<table>
    <thead>
        <tr><th>Team</th><th>Chemistry</th><th>Seqspec</th></tr>
    </thead>
    <tbody>
        <tr><td>Team 1</td><td>10x_multiome</td><td>characterization_mcginnis</td></tr>
        <tr><td>Team 2</td><td>10x_multiome</td><td>characterization_mcginnis (are there 2 for team 2?)</td></tr>
        <tr><td>Team 3</td><td>10x_multiome</td><td></td></tr>
        <tr><td>Team 4</td><td>10x_v2</td>      <td></td></tr>
        <tr><td>Team 5</td><td>10x_multiome</td><td></td></tr>
        <tr><td>Team 6</td><td>10x_v2</td>      <td>10xsnatac</td></tr>
        <tr><td>Team 7</td><td>10x_multiome</td><td>10xmultiome_bridge_satpathy</td></tr>
        <tr><td>Team 7</td><td>parse</td>       <td>parse_bridge</td></tr>
        <tr><td>Team 8</td><td>shareseq</td>    <td>shareseq_bmmc_single_donor</td></tr>
    </tbody>
</table>

This Google Drive folder contains snapshots of the AnVIL tables created during the first Y2AVE jamboree.
[Tables from past jamborees](https://drive.google.com/drive/folders/11LwkduQZTqKolIodTZywoUj0N2apIz41)
As currently written, the notebook expects to find those files (`Team_1.tsv` through `Team_8.tsv`) in the current directory.

More work needs to be done to match datasets to seqspec templates.

# Setup

First we start with general imports

In [None]:
import hashlib
import requests
from pathlib import Path
from io import StringIO, BytesIO
import sys
import json
from jsonschema import Draft4Validator
import logging
import pandas
import os
import sys
import tarfile
from urllib.parse import urlparse
import yaml

In [None]:
# jinja2 renders the seqspec templates. If it is not importable, install it
# into the user site-packages and retry the import.
try:
    from jinja2 import Environment
except ImportError:
    # IPython shell escape: run pip with the same interpreter as this kernel.
    !{sys.executable} -m pip install --user jinja2
    from jinja2 import Environment

from jinja2 import FileSystemLoader, select_autoescape, Undefined, StrictUndefined, make_logging_undefined

logger = logging.getLogger(__name__)
# Undefined subclass that reports, through `logger`, template variables that
# are used but missing from the render context. With base=Undefined rendering
# itself is still lenient (per the jinja2 API docs).
LoggingUndefined = make_logging_undefined(
    logger=logger,
    base=Undefined
)

# Template environment shared by the rest of the notebook. Templates are
# looked up in the "templates" directory, a path relative to the current
# working directory.
env = Environment(
    loader=FileSystemLoader("templates"),
    autoescape=select_autoescape(),
    undefined=LoggingUndefined,
)

I want to be able to use the seqspec validator while I am writing my seqspec file.

I have the repository checked out into ~/proj/seqspec. This block should either import it for me, or install it if someone else runs it.

In [None]:
try:
    import seqspec
except ImportError:
    seqspec_root = Path("~/proj/seqspec").expanduser()
    if seqspec_root.exists() and str(seqspec_root) not in sys.path:
        sys.path.append(str(seqspec_root))
    else:
        !{sys.executable} -m pip install --user seqspec
    import seqspec

Import pieces of seqspec that we need for this notebook.

In [None]:
from seqspec.Assay import Assay
from seqspec.Region import Region
from seqspec.Region import Onlist
from seqspec.utils import load_spec_stream
from seqspec.seqspec_index import run_index, get_index
from seqspec.seqspec_print import run_print_sequence_spec, run_print_library_tree, run_print_library_png
from seqspec.seqspec_onlist import run_list_onlists, run_onlist_read, run_find_by_type

## define seqspec validation functions

In [None]:
def seqspec_validate(schema, spec):
    """Report every way ``spec`` violates the JSON ``schema``.

    The spec is checked with a Draft 4 validator. Each validation error is
    printed on its own line, numbered from 1. Nothing is printed for a
    valid spec, and nothing is returned.
    """
    problems = Draft4Validator(schema).iter_errors(spec)
    for number, problem in enumerate(problems, start=1):
        print(f"[{number}] {problem.message}")

In [None]:
# Locate the schema inside whichever seqspec package was actually imported
# (local checkout or pip install). This avoids depending on a checkout path
# that may not exist, or on a variable that is only set when the first
# import attempt failed.
schema_path = Path(seqspec.__file__).parent / "schema" / "seqspec.schema.json"

with open(schema_path, "rt", encoding="utf-8") as instream:
    seqspec_schema = json.load(instream)

In [None]:
def load_spec(filename):
    """Load a seqspec YAML file and reset the parent id of its top-level regions.

    Returns the object produced by the YAML loader after calling
    ``set_parent_id(None)`` on every entry of its ``assay_spec``.
    """
    with open(filename, "rt") as instream:
        # NOTE: yaml.Loader can construct arbitrary Python objects, so this
        # should only be pointed at trusted seqspec files.
        spec = yaml.load(instream, Loader=yaml.Loader)

    for region in spec.assay_spec:
        region.set_parent_id(None)
    return spec

## Functions for loading the team tables

In [None]:
def parse_list_field(cell):
    """Convert a bracketed list cell from a team table into a list of strings.

    Cells that look like ``["a", "b"]`` are split on commas. Each element has
    surrounding whitespace removed, then one pair of matching single or
    double quotes if present. Elements that themselves contain a comma are
    not supported.

    Returns:
        - ``""`` when ``cell`` is ``None`` or shorter than two characters,
        - ``[]`` for an empty bracketed list such as ``[]``,
        - a list of strings for a bracketed list,
        - ``cell`` unchanged when it is not wrapped in square brackets.
    """
    if cell is None or len(cell) < 2:
        return ""

    if not (cell[0] == "[" and cell[-1] == "]"):
        return cell

    inner = cell[1:-1].strip()
    if not inner:
        # "[]" holds no elements; splitting "" would yield one empty string.
        return []

    values = []
    for element in inner.split(","):
        # Tolerate optional spaces after the separating commas.
        element = element.strip()
        if len(element) >= 2 and element[0] == element[-1] and element[0] in "\"'":
            element = element[1:-1]
        values.append(element)
    return values


def load_team_table(team_id):
    """Read ``<team_id>.tsv`` (tab separated) into a pandas DataFrame.

    The barcode and fastq columns are passed through ``parse_list_field``
    while the file is being read.
    """
    list_columns = (
        "ATAC_barcode",
        "ATAC_fastq_R1",
        "ATAC_fastq_R2",
        "RNA_fastq_R1",
        "RNA_fastq_R2",
    )
    column_converters = {name: parse_list_field for name in list_columns}
    table_path = Path(f"{team_id}.tsv")
    return pandas.read_csv(table_path, sep="\t", converters=column_converters)

## Load all the teams to see what the columns and chemistry values look like

In [None]:
# Team tables are named Team_1 through Team_8.
teams = [f"Team_{number}" for number in range(1, 9)]

# Show each team's distinct Chemistry values and its column names.
for team_id in teams:
    table = load_team_table(team_id)
    chemistries = set(table["Chemistry"])
    print(team_id, chemistries, table.columns)

# Defining the template

First build up the lists of barcode onlists needed for this protocol; the names will be passed to the template.

In [None]:
# Nested mapping: team_id -> Chemistry value from the team table -> jinja2
# template filename. render_seqspec passes the filename to env.get_template.
# None marks a team/chemistry pair with no template filename assigned here.

y2ave_templates = {
    "Team_1": { # filename in example seqspec matched Team_1.tsv
        "10x_multiome": "y2ave_characterization_mcginnis.yaml.j2",
    },
    "Team_2": { # filenames in table reference CharacterizationMcGinnis
        "10x_multiome": "y2ave_characterization_mcginnis.yaml.j2",        
    },
    "Team_3": {
        "10x_multiome": None,
        "10x_v2": None,
    },
    "Team_4": {
        "10x_v2": None,
    },
    "Team_5": {
        "10x_multiome": None,
        "10x_v2": None,
    },
    "Team_6": { # filename in example seqspec matched Team_6.tsv
        "10x_v2": "y2ave_10xsnatac.yaml.j2",
    },
    "Team_7": { # my team
        "10x_multiome": "y2ave_10xmultiome_bridge_satpathy.yaml.j2",
        "parse": "y2ave_parse_bridge.yaml.j2",
    },
    "Team_8": { # only one shareseq team
        "shareseq": "y2ave_shareseq_bmmc_single_donor.yaml.j2",
    }
}

def format_template_filename(filenames):
    """Return the base name of the first file named in a parsed table cell.

    ``filenames`` is the value produced by ``parse_list_field``: a list of
    paths, a single path string (for cells that were not bracketed lists),
    or an empty value.

    Returns ``""`` when there is nothing to format, otherwise the final path
    component of the first (or only) entry.
    """
    if not filenames:
        return ""

    # A bare string is one path, not a sequence of paths; indexing it would
    # pick up only its first character.
    first = filenames if isinstance(filenames, str) else filenames[0]
    return Path(first).name

def render_seqspec(team_id, row):
    """Render and validate the seqspec YAML for one row of a team table.

    Parameters:
        team_id: key into ``y2ave_templates`` (e.g. ``"Team_7"``).
        row: a table row providing ``Chemistry`` plus the barcode and fastq
            columns listed below.

    Returns:
        The rendered seqspec as a YAML string. Schema validation problems
        are printed by ``seqspec_validate``; they do not stop the function.

    Raises:
        KeyError: if the team, chemistry, or a required column is missing.
        ValueError: if the team/chemistry pair has no template assigned.
    """
    chemistry = row["Chemistry"]
    template_name = y2ave_templates[team_id][chemistry]
    if template_name is None:
        # Fail with a readable message instead of an obscure error from
        # inside the jinja2 loader.
        raise ValueError(
            f"No seqspec template is defined for {team_id} chemistry {chemistry!r}"
        )

    filename_columns = (
        "ATAC_barcode",
        "ATAC_fastq_R1",
        "ATAC_fastq_R2",
        "RNA_fastq_R1",
        "RNA_fastq_R2",
    )
    context = {
        column: format_template_filename(row[column])
        for column in filename_columns
    }

    # Load the template and instantiate it with the context for this specific
    # measurement set. These templates need the libspec branch of seqspec,
    # which separates the sequence and library specifications. (The older
    # "parse-wt-mega-v2-single-index-seqspec-0.yaml.j2" template instead needs
    # the original version that merges those two concepts.)
    template = env.get_template(template_name)
    example_yaml = template.render(context)

    # Validate the generated seqspec file.
    example_spec = load_spec_stream(StringIO(example_yaml))
    seqspec_validate(seqspec_schema, example_spec.to_dict())

    return example_yaml

## print example seqspec

Change the team id and iloc (0-based row number) to generate different specific seqspecs

In [None]:
# Print the example for evaluation.
# Renders the second row (iloc[1], 0-based) of the chosen team's table;
# validation messages, if any, are printed before the YAML itself.
team_id = "Team_2"
example_yaml = render_seqspec(team_id, load_team_table(team_id).iloc[1])
print(example_yaml)

## generate seqspec charts

Intended to help double check the seqspec file

In [None]:
# Re-parse the rendered YAML (from the example cell) into a seqspec object and
# print seqspec's library tree for it.
example_spec = load_spec_stream(StringIO(example_yaml))
print(run_print_library_tree(example_spec))

In [None]:
# Presumably draws the library structure of example_spec as an image -
# confirm against the seqspec_print module.
run_print_library_png(example_spec)

# Generate team seqspec tarfile

Once everything looks good, generate a tarfile of all the seqspecs for a team table. Change `team_id` here to pick the table you want.

In [None]:
def save_team_seqspecs(team_id, table):
    """Write one rendered seqspec per table row into ``<team_id>_seqspecs.tar.gz``.

    Parameters:
        team_id: team identifier, used for template lookup and in the archive
            and member file names.
        table: DataFrame for that team; the value in each row's first column
            names the row's seqspec file.

    The archive is created in the current directory. Each member is named
    ``<team_id>_<row name>_seqspec.yaml``.
    """
    destination = f"{team_id}_seqspecs.tar.gz"
    with tarfile.open(destination, "w:gz") as archive:
        for _, row in table.iterrows():
            # Positional access to the first column; plain row[0] is
            # deprecated for label-indexed Series.
            row_name = row.iloc[0]
            print(f"Processing {row_name}")

            spec_bytes = render_seqspec(team_id, row).encode("utf-8")
            spec_tarinfo = tarfile.TarInfo(f"{team_id}_{row_name}_seqspec.yaml")
            # The tar header needs the encoded byte length. The character
            # count is smaller whenever the YAML contains non-ASCII text,
            # which would truncate the stored file.
            spec_tarinfo.size = len(spec_bytes)
            archive.addfile(spec_tarinfo, fileobj=BytesIO(spec_bytes))


# Build the seqspec archive for the selected team; change team_id to pick a
# different table.
team_id = "Team_7"
save_team_seqspecs(team_id, load_team_table(team_id))