# Type III TA Pipeline Smoke Tests
This notebook creates tiny synthetic datasets to exercise the filtering utilities in `type_iii_ta_sample.py` and to run a fully patched pipeline smoke test. The heavy external tools (Evo sampling, TRF, HMMER, Infernal) are replaced with lightweight stubs so we can validate code paths quickly.

In [2]:

from pathlib import Path
import tempfile
import pandas as pd
import numpy as np
from importlib import reload
from types import SimpleNamespace

# Scratch directory shared by every cell below; mkdtemp creates a new one on each run.
BASE_TMP = Path(tempfile.mkdtemp(prefix="type_iii_ta_tests_"))
print(f"Temporary workspace: {BASE_TMP}")

# Bind the module under test, then re-execute it with importlib.reload so the
# kernel does not keep using a previously imported copy.
import semantic_design.pipelines.type_iii_ta_sample as pipeline
pipeline = reload(pipeline)


Temporary workspace: /tmp/type_iii_ta_tests_hf0inesj


## Helper utilities
Supporting functions to fabricate stub CLI tools and reference tables used throughout the notebook.

In [3]:

import textwrap

def _write_stub(filename, source):
    """Dedent ``source``, save it as ``filename`` under BASE_TMP, and return the path."""
    stub_path = BASE_TMP / filename
    stub_path.write_text(textwrap.dedent(source))
    return stub_path


# Stand-in for the structure filter CLI: keeps candidates whose 'Root ID'
# contains an 'A' (or the first row when none do) and writes them as Query_ID.
structure_stub_path = _write_stub("stub_structure_filter.py", """
    import argparse
    import pandas as pd

    parser = argparse.ArgumentParser()
    parser.add_argument('--query', required=True)
    parser.add_argument('--target', required=True)
    parser.add_argument('--output', required=True)
    parser.add_argument('--structure-type')
    parser.add_argument('--min-similarity')
    parser.add_argument('--pre-filter-threshold')
    parser.add_argument('--batch-size')
    args, unknown = parser.parse_known_args()

    candidates = pd.read_csv(args.query)
    keep = candidates[candidates['Root ID'].str.contains('A')]
    if keep.empty:
        keep = candidates.head(1)
    pd.DataFrame({'Query_ID': keep['Root ID']}).to_csv(args.output, index=False)
""")

# Stand-in for the sequence filter CLI: reports every comparison row with a
# fixed identity_percent of 95.0.
sequence_stub_path = _write_stub("stub_sequence_filter.py", """
    import argparse
    import pandas as pd

    parser = argparse.ArgumentParser()
    parser.add_argument('--reference_csv', required=True)
    parser.add_argument('--comparison_csv', required=True)
    parser.add_argument('--output_csv', required=True)
    parser.add_argument('--min-identity')
    args, unknown = parser.parse_known_args()

    candidates = pd.read_csv(args.comparison_csv).copy()
    scores = candidates[['Root ID']].rename(columns={'Root ID': 'comp_root_id'})
    scores['identity_percent'] = 95.0
    scores.to_csv(args.output_csv, index=False)
""")

print(f"Stub scripts written to {structure_stub_path} and {sequence_stub_path}")


Stub scripts written to /tmp/type_iii_ta_tests_hf0inesj/stub_structure_filter.py and /tmp/type_iii_ta_tests_hf0inesj/stub_sequence_filter.py


## 1. Build synthetic TRF / RNA tables

In [4]:

# Column layout of the synthetic TRF table.
trf_columns = [
    'Root ID', 'Start', 'End', 'Period Size', 'Copy Number', 'Consensus Size',
    'Percent Match', 'Percent Indels', 'Alignment Score',
    'A', 'C', 'G', 'T', 'Entropy', 'Repeat Sequence', 'Full TRF Region',
]
trf_sample = pd.DataFrame(
    data=[
        ('rootA', 1, 60, 10, 3, 30, 95.0, 2.0, 200, 15, 15, 15, 15, 1.2, 'ATGCATGCAT', 'ATGCGTACGATCGTACGATCGTAAACCGGTT'),
        ('rootB', 5, 80, 12, 4, 36, 90.0, 4.0, 180, 20, 10, 20, 10, 1.5, 'TTAACCGG', 'TTAACCGGTTAACCGGTTAACCGGTTAA'),
        ('rootC', 10, 55, 8, 5, 24, 88.0, 5.0, 150, 12, 12, 12, 12, 1.7, 'GGCC', 'GGCCATATGGCCATATGGCCATATGGCC'),
    ],
    columns=trf_columns,
)

# Fold records exist only for rootA and rootB; rootB carries no hairpins and
# rootC has no fold row at all.
fold_sample = pd.DataFrame([
    {
        'Evo Sequence ID': 'rootA',
        'Description': 'rootA test',
        'DNA Sequence': trf_sample.at[0, 'Full TRF Region'],
        'RNA Sequence': 'AUGCGUACGAUCGUACGAUCGUAAACCGGUU',
        'Secondary Structure': '((((....))))....((((....))))',
        'MFE': -12.0,
        'Hairpins': [(2, 12, 3, 11)],
    },
    {
        'Evo Sequence ID': 'rootB',
        'Description': 'rootB test',
        'DNA Sequence': trf_sample.at[1, 'Full TRF Region'],
        'RNA Sequence': 'UUAAUUGGCUUAAUUGGCUUAAUUGGCUU',
        'Secondary Structure': '...(((....)))...',
        'MFE': -4.5,
        'Hairpins': [],
    },
])

# Minimal config object exposing just the attributes set here for the filter call.
filter_cfg = SimpleNamespace(
    rna_require_hairpin=True,
    rna_minimum_mfe=-3.0,
    rna_require_all_bases=True,
    rna_fold_csv=BASE_TMP / 'filtered_rna_fold.csv',
)

fold_filter_result = pipeline.filter_folded_trfs(trf_sample, fold_sample, filter_cfg)
filtered_fold, passing_ids = fold_filter_result
print('Passing TRF roots:', passing_ids)

# Persist the surviving candidates as a CSV table ...
candidate_csv_path = BASE_TMP / 'rna_candidates.csv'
candidate_table = pipeline.prepare_rna_candidate_table(trf_sample, filtered_fold, candidate_csv_path)
print(candidate_table.loc[:, ['Root ID', 'Sequence_ID']])

# ... and as FASTA, echoing the file so the record layout is visible.
candidate_fasta_path = BASE_TMP / 'rna_candidates.fasta'
pipeline.write_rna_candidates_fasta(candidate_table, candidate_fasta_path)
print(candidate_fasta_path.read_text())


Passing TRF roots: {'rootA'}
  Root ID Sequence_ID
0   rootA  rootA_1_60
>rootA_1_60 rootA
AUGCGUACGAUCGUACGAUCGUAAACCGGUU



## 2. Test RNA structure/sequence filters via stubs

In [10]:

# The candidate table doubles as the reference CSV handed to both filters.
reference_rna_path = BASE_TMP / 'reference_rnas.csv'
reference_df = candidate_table.copy()
reference_df.to_csv(reference_rna_path, index=False)

# --- structure filter, backed by the stub script ---
structure_cfg = SimpleNamespace(
    rna_structure_filter_script=Path(structure_stub_path),
    rna_structure_filter_reference_csv=reference_rna_path,
    rna_structures_reference_csv=reference_rna_path,
    rna_structure_matches_csv=BASE_TMP / 'structure_hits.csv',
    rna_structure_filter_structure_type='both',
    rna_structure_filter_min_similarity=0.7,
    rna_structure_filter_pre_filter_threshold=0.7,
    rna_structure_filter_batch_size=10,
    rna_structure_filter_max_results=None,
    rna_structure_filter_cpus=None,
)
structure_hits = pipeline.run_rna_structure_filter(candidate_csv_path, structure_cfg)
print('Structure hits:', structure_hits)

# --- sequence filter, backed by the stub script ---
sequence_cfg = SimpleNamespace(
    rna_sequence_filter_script=Path(sequence_stub_path),
    rna_sequence_filter_reference_csv=reference_rna_path,
    rna_structures_reference_csv=reference_rna_path,
    rna_sequence_matches_csv=BASE_TMP / 'sequence_hits.csv',
    rna_sequence_filter_min_identity=70.0,
    rna_sequence_filter_processes=None,
)
sequence_hits = pipeline.run_rna_sequence_filter(candidate_csv_path, sequence_cfg)
print('Sequence hits:', sequence_hits)


Structure hits: {'rootA'}
Sequence hits: {'rootA'}


## 3. Test Pfam hmmscan filtering

In [11]:

from unittest.mock import patch
from subprocess import CompletedProcess

# One canned, space-separated domtblout record: a ToxN_toxin hit on rootA_0.
_domtbl_fields = [
    'ToxN_toxin', 'PF13958.10', '159', 'rootA_0', '-', '150', '1e-20', '50.0', '0.0',
    '1', '1', '1e-25', '1e-25', '45.2', '0.1', '5', '120', '3', '128', '3', '128', '0.95', 'ToxN_toxin',
]
domtbl_line = " ".join(_domtbl_fields)

# Placeholder input files; hmmscan itself is never executed in this notebook.
aaid_fasta = BASE_TMP / 'dummy_proteins.faa'
aaid_fasta.write_text('>rootA_0\nMSTNKKLLDN')
pfam_db_path = BASE_TMP / 'Pfam-A.hmm'
pfam_db_path.write_text('# dummy Pfam DB')
pfam_reference_path = BASE_TMP / 'pfam_reference.csv'
pd.DataFrame([{'pfam_name': 'ToxN_toxin'}]).to_csv(pfam_reference_path, index=False)

hmmscan_cfg = SimpleNamespace(
    # inputs
    hmmscan_pfam_db_path=pfam_db_path,
    filtered_proteins_file=aaid_fasta,
    # tool invocation
    hmmscan_binary='hmmscan',
    hmmscan_cpu=1,
    # outputs
    hmmscan_domtblout=BASE_TMP / 'hmmscan.domtblout',
    hmmscan_hits_csv=BASE_TMP / 'hmmscan_hits.csv',
    # hit filtering
    pfam_reference_hits_csv=pfam_reference_path,
    pfam_evalue_threshold=0.05,
)

def fake_hmmscan(cmd, check, **kwargs):
    """Stand-in for ``subprocess.run``: write the canned domtblout and report success.

    The command itself is not inspected; the table is always written to
    ``hmmscan_cfg.hmmscan_domtblout``.
    """
    canned_table = "\n".join(["# dummy", domtbl_line])
    hmmscan_cfg.hmmscan_domtblout.write_text(canned_table)
    return CompletedProcess(args=cmd, returncode=0)

# subprocess.run is swapped for the stub while the filter runs, so no hmmscan
# binary has to be installed.
with patch(
    'semantic_design.pipelines.type_iii_ta_sample.subprocess.run',
    side_effect=fake_hmmscan,
):
    hmmscan_hits = pipeline.run_hmmscan_filter(hmmscan_cfg)

print(hmmscan_hits.loc[:, ['sequence_id', 'pfam_name', 'e_value']])


  sequence_id   pfam_name       e_value
0     rootA_0  ToxN_toxin  1.000000e-25


## 4. Test cmscan filter

In [13]:

# This cell reads the CSV produced by the TRF/RNA preparation cell; fail fast
# with a clear message when that cell has not been run.
if not ('candidate_csv_path' in globals() and candidate_csv_path.exists()):
    raise RuntimeError('Run the TRF/RNA preparation cell before executing the cmscan test.')

candidate_table_snapshot = pd.read_csv(candidate_csv_path)
if candidate_table_snapshot.empty:
    raise RuntimeError('Candidate table is empty; construct synthetic TRF/RNA data before running cmscan.')

# Force both identifier columns to strings after the CSV round trip.
candidate_table_snapshot = candidate_table_snapshot.astype({'Sequence_ID': str, 'Root ID': str})

cmscan_fasta_path = BASE_TMP / 'cmscan_rna_candidates.fasta'
pipeline.write_rna_candidates_fasta(candidate_table_snapshot, cmscan_fasta_path)

# Placeholder covariance model file; cmscan itself is mocked below.
cm_file = BASE_TMP / 'toy.cm'
cm_file.write_text('# mock CM file')
seq_ids = candidate_table_snapshot['Sequence_ID'].to_list()

cmscan_cfg = SimpleNamespace(
    # models and tool invocation
    cmscan_model_paths=[cm_file],
    cmscan_binary='cmscan',
    # outputs
    cmscan_tblout_dir=BASE_TMP / 'cmscan_tblout',
    cmscan_hits_csv=BASE_TMP / 'cmscan_hits.csv',
    # hit filtering
    cmscan_evalue_threshold=0.05,
    cmscan_allowed_families=['ToxI'],
    cmscan_allowed_families_csv=None,
    cmscan_allowed_families_column='Query Name',
    # input sequences
    rna_candidates_fasta=cmscan_fasta_path,
)
cmscan_cfg.cmscan_tblout_dir.mkdir(exist_ok=True)

from subprocess import CompletedProcess

def fake_cmscan(cmd, check, **kwargs):
    """Stand-in for ``subprocess.run`` that fabricates a cmscan tblout file.

    Writes one canned ToxI hit line for the first candidate in ``seq_ids`` and
    returns a successful ``CompletedProcess``. Records are newline-separated
    and the file ends with a newline, so adding further entries keeps one
    record per line.
    """
    # NOTE(review): assumes the pipeline passes the tblout path as the third
    # argv element (e.g. [binary, '--tblout', path, ...]) -- confirm against
    # run_cmscan_filter if the command layout changes.
    tblout_path = Path(cmd[2])
    entries = [
        f"ToxI RF02519 {seq_ids[0]} - cm 1 34 10 40 + no 1 0.5 0.0 40.0 1e-6 ! ToxI antitoxin"
    ]
    tblout_path.write_text('\n'.join(entries) + '\n')
    return CompletedProcess(cmd, 0)

# subprocess.run is swapped for the stub while the filter runs, so no cmscan
# binary has to be installed.
with patch(
    'semantic_design.pipelines.type_iii_ta_sample.subprocess.run',
    side_effect=fake_cmscan,
):
    cmscan_hits = pipeline.run_cmscan_filter(candidate_table_snapshot, cmscan_cfg)

# An empty result means the canned hit was not picked up; stop here.
if cmscan_hits.empty:
    raise RuntimeError('Synthetic cmscan stub returned no hits; ensure the candidate table has at least one entry.')

display(cmscan_hits.loc[:, ['sequence_id', 'target_name', 'Root ID', 'e_value']])


Unnamed: 0,sequence_id,target_name,Root ID,e_value
0,rootA_1_60,ToxI,rootA,1e-06


## 5. Pipeline smoke test with patched heavy steps

In [14]:

import yaml
from contextlib import ExitStack
from unittest.mock import patch

# Locations used by the mocked end-to-end run.
pipeline_output_dir = BASE_TMP / 'pipeline_run'
pipeline_config_path = BASE_TMP / 'pipeline_smoke.yaml'
pipeline_prompts = BASE_TMP / 'prompts.csv'
pipeline_prompts.write_text('prompt\nDummy context')

config_payload = {
    # inputs, outputs and external tool paths
    'input_prompts': str(pipeline_prompts),
    'output_dir': str(pipeline_output_dir),
    'segmasker_path': '/bin/echo',
    'trf_path': '/bin/echo',
    'rna_structures_reference_csv': str(reference_rna_path),

    # sampling settings
    'model_name': 'dummy',
    'n_tokens': 32,
    'temperature': 0.7,
    'top_k': 4,
    'batched': False,
    'batch_size': 1,
    'n_sample_per_prompt': 1,
    'rc_truth': False,
    'return_both': False,

    # protein filtering / folding thresholds
    'filter_min_length': 50,
    'filter_max_length': 400,
    'filter_partial_bool': False,
    'segmasker_threshold': 0.2,
    'run_esm_fold': True,
    'plddt_threshold': 0.3,
    'ptm_threshold': 0.0,
    'write_trf_to_csv': True,

    # RNA filters point at the stub scripts and the synthetic reference table
    'rna_structure_filter_script': str(structure_stub_path),
    'rna_sequence_filter_script': str(sequence_stub_path),
    'rna_structure_filter_reference_csv': str(reference_rna_path),
    'rna_sequence_filter_reference_csv': str(reference_rna_path),

    # no Pfam database and no covariance models are configured for this run
    'hmmscan_pfam_db_path': None,
    'cmscan_model_paths': [],
}

# Serialise to a string first, then write the YAML file in one call.
pipeline_config_path.write_text(yaml.safe_dump(config_payload))

# Three synthetic roots with matching DNA and protein sequences (same order).
root_ids = ['rootA','rootB','rootC']
dna_sequences = ['ATGCGTACGATCGTACGATCGTAAACCGGTT', 'TTGACCGGTTGACCGGTTGACCGGTTGACC', 'GGCATATGGCATATGGCATATGGCAAA']
protein_sequences = ['MKTAYIAKQRQISFVKSHFSRQ', 'MKKLLPTAAAGLLLLAAQPAMA', 'MADQLTEEQIAEFKEAF']

# TRF fields that are identical for every synthetic root.
_TRF_SHARED_FIELDS = {
    'Start': 1,
    'End': 60,
    'Period Size': 10,
    'Copy Number': 3,
    'Consensus Size': 30,
    'Percent Match': 95.0,
    'Percent Indels': 1.0,
    'Alignment Score': 200,
    'A': 15,
    'C': 15,
    'G': 15,
    'T': 15,
    'Entropy': 1.1,
    'Repeat Sequence': 'ATGC',
}

# One TRF row per root; only the id and the full region differ between rows.
trf_stub = pd.DataFrame([
    {'Root ID': rid, **_TRF_SHARED_FIELDS, 'Full TRF Region': dna}
    for rid, dna in zip(root_ids, dna_sequences)
])

# One RNA fold row per root; the RNA sequence is the DNA with T replaced by U.
fold_stub = pd.DataFrame([
    {
        'Evo Sequence ID': rid,
        'Description': rid,
        'DNA Sequence': dna,
        'RNA Sequence': dna.replace('T','U'),
        'Secondary Structure': '(((())))....(((())))',
        'MFE': -10.0,
        'Hairpins': [(1, 8, 2, 7)],
    }
    for rid, dna in zip(root_ids, dna_sequences)
])

# One protein fold row per root, keyed by '<root>_0'.
filtered_fold_stub = pd.DataFrame([
    {
        'Evo Sequence ID': f'{rid}_0',
        'Average pLDDT': 0.85,
        'pTM': 0.6,
        'Amino Acid Sequence': protein,
    }
    for rid, protein in zip(root_ids, protein_sequences)
])

def fake_read_prompts(path, batched, batch_size):
    """Stand-in for ``read_prompts``: ignore every argument and return one batch with one prompt."""
    single_batch = ['Dummy context']
    return [single_batch]

def fake_model_load(name):
    """Stand-in for ``model_load``: nothing is loaded; a pair of ``None`` values is returned."""
    return (None, None)

def fake_sample_model(**kwargs):
    """Stand-in for ``sample_model`` that returns the synthetic sequences.

    Writes a UUID / Generated Sequence table to ``kwargs['file_save_location']``
    and returns a 4-tuple: placeholder prompts, the DNA sequences, a list of
    zero scores, and the ``<root>_0`` ids.
    """
    sequence_ids = [f'{rid}_0' for rid in root_ids]
    n_roots = len(root_ids)

    generated = pd.DataFrame({'UUID': root_ids, 'Generated Sequence': dna_sequences})
    generated.to_csv(kwargs['file_save_location'], index=False)

    placeholder_prompts = ['prompt' for _ in range(n_roots)]
    zero_scores = [0.0 for _ in range(n_roots)]
    return placeholder_prompts, dna_sequences, zero_scores, sequence_ids

def fake_get_rc(seqs, rc_truth=False, return_both=False):
    """Stand-in for ``get_rc``: hand back ``seqs`` untouched, whatever the flags say."""
    unchanged = seqs
    return unchanged

def fake_make_fasta(final_sequences, prompts, ids, path):
    """Stand-in for ``make_fasta``: write ``ids``/``final_sequences`` to ``path`` as FASTA.

    ``prompts`` is accepted for signature compatibility and ignored. Every
    record ends with a newline so the next header starts on its own line;
    without it the header would be appended to the previous sequence line.
    """
    with open(path, 'w') as handle:
        handle.writelines(
            f'>{seq_id}\n{seq}\n' for seq_id, seq in zip(ids, final_sequences)
        )

def fake_run_prodigal(fasta_path, proteins_path, orfs_path):
    """Stand-in for ``run_prodigal``: write canned protein and ORF FASTA files.

    ``fasta_path`` is ignored. One ``<root>_0`` protein record is written per
    entry in ``root_ids``. Each record ends with a newline so headers never
    run into the preceding sequence line.
    """
    protein_records = [
        f'>{rid}_0\n{protein}\n'
        for rid, protein in zip(root_ids, protein_sequences)
    ]
    Path(proteins_path).write_text(''.join(protein_records))
    Path(orfs_path).write_text('>dummy_orf\nATGCGT\n')

def fake_filter_protein_fasta(input_fasta, output_fasta, *args, **kwargs):
    """Stand-in for ``filter_protein_fasta``: copy the input text verbatim.

    Extra positional and keyword arguments are ignored. The return value is
    ``len(root_ids)``.
    """
    source, destination = Path(input_fasta), Path(output_fasta)
    destination.write_text(source.read_text())
    return len(root_ids)

def fake_fold_proteins(filtered_path, output_csv):
    """Stand-in for ``fold_proteins``: save the canned fold table and return a copy of it.

    ``filtered_path`` is ignored.
    """
    filtered_fold_stub.to_csv(output_csv, index=False)
    canned_folds = filtered_fold_stub.copy(deep=True)
    return canned_folds

def fake_get_tandem_repeats(filtered_folds, sequences_csv, config):
    """Stand-in for ``get_tandem_repeats``: ignore the inputs and return a copy of ``trf_stub``."""
    canned_trfs = trf_stub.copy(deep=True)
    return canned_trfs

def fake_fold_trfs(trf_df, output_csv):
    """Stand-in for ``fold_trfs``: save the canned RNA fold table and return a copy of it.

    ``trf_df`` is ignored.
    """
    fold_stub.to_csv(output_csv, index=False)
    canned_folds = fold_stub.copy(deep=True)
    return canned_folds

def fake_visualize(*args, **kwargs):
    """Stand-in for ``visualize_rna_structures``: do nothing and return two empty lists."""
    return ([], [])

def fake_get_at_pairs(*args, **kwargs):
    """Stand-in for ``get_at_pairs``: ignore every argument and return an empty DataFrame."""
    no_pairs = pd.DataFrame()
    return no_pairs

# Pipeline attribute -> fake that replaces it for the duration of the run.
# The patches are applied in this order.
_PATCHED_STEPS = {
    'read_prompts': fake_read_prompts,
    'model_load': fake_model_load,
    'sample_model': fake_sample_model,
    'get_rc': fake_get_rc,
    'make_fasta': fake_make_fasta,
    'run_prodigal': fake_run_prodigal,
    'filter_protein_fasta': fake_filter_protein_fasta,
    'fold_proteins': fake_fold_proteins,
    'get_tandem_repeats': fake_get_tandem_repeats,
    'fold_trfs': fake_fold_trfs,
    'visualize_rna_structures': fake_visualize,
    'get_at_pairs': fake_get_at_pairs,
}

# ExitStack undoes every patch when the block exits, even if the run fails.
with ExitStack() as stack:
    for step_name, fake_impl in _PATCHED_STEPS.items():
        stack.enter_context(patch.object(pipeline, step_name, side_effect=fake_impl))
    pipeline.run_pipeline(pipeline_config_path)

final_candidates = pd.read_csv(pipeline_output_dir / 'filtered_type_iii_candidates.csv')
print('Final candidates:')
print(final_candidates.loc[:, ['Evo Sequence ID', 'Root ID']])


Pipeline completed successfully.
Final candidates:
  Evo Sequence ID Root ID
0         rootA_0   rootA
1         rootB_0   rootB
2         rootC_0   rootC


The notebook now exercises each filtering routine and runs a mocked end-to-end pipeline. Point the stubs to real binaries/CM files whenever you want to validate against production assets.