In [1]:
from swissisoform.genome import GenomeHandler
from swissisoform.isoform import AlternativeIsoform
from swissisoform.translation import TruncatedProteinGenerator
from typing import Optional, List
import pandas as pd
import os

In [2]:
genome = GenomeHandler(
   '../data/genome_data/hg38.fa',
   '../data/genome_data/hg38.ncbiRefSeq.gtf'
)

alt_isoforms = AlternativeIsoform()
alt_isoforms.load_bed('../data/ribosome_profiling/RiboTISHV6_MD2025_AnnoToTruncation_exonintersect.bed')

protein_generator = TruncatedProteinGenerator(
   genome_handler=genome,
   alt_isoform_handler=alt_isoforms,
   output_dir="test/"
)

In [4]:
# Add before the prepare_deep_learning_dataset call
print("Testing NAXE truncation translation:")
naxe_sequences = protein_generator.generate_for_gene("NAXE")
for transcript_id, variants in naxe_sequences.items():
    for variant_id, seq in variants.items():
        print(f"  {transcript_id} - {variant_id}: {len(seq)} amino acids")

Testing NAXE truncation translation:
Note: Filtered out 1 transcript(s) on non-standard chromosomes
Non-standard chromosomes: {'chr1_MU273335v1_fix'}
Processing 1 transcripts for gene NAXE
Found 1 alternative start/truncation sites for gene NAXE
  Processing transcript NM_144772.3_2
  Generated truncated protein for NM_144772.3_2 - trunc_156591805_156591957
Saved protein sequences to test/NAXE/NAXE_protein_sequences.fasta
  NM_144772.3_2 - canonical: 10 amino acids
  NM_144772.3_2 - trunc_156591805_156591957: 10 amino acids


In [12]:
def run_protein_translation_diagnostic(genome_handler, alt_isoform_handler, gene_name="NAXE"):
    """
    Run a detailed diagnostic on the protein translation process for a single gene.
    
    Args:
        genome_handler: Initialized GenomeHandler instance
        alt_isoform_handler: Initialized AlternativeIsoform instance
        gene_name: The gene to analyze
    """
    from Bio.Seq import Seq
    import re
    
    print(f"\n{'='*80}")
    print(f"DETAILED TRANSLATION DIAGNOSTIC FOR {gene_name}")
    print(f"{'='*80}")
    
    # Step 1: Get transcripts
    print("\n1. RETRIEVING TRANSCRIPTS...")
    transcript_info = genome_handler.get_transcript_ids(gene_name)
    if transcript_info.empty:
        print(f"  No transcripts found for {gene_name}")
        return
    
    print(f"  Found {len(transcript_info)} transcripts:")
    for idx, transcript in transcript_info.iterrows():
        print(f"  {idx+1}. {transcript['transcript_id']} - {transcript['strand']} strand")
    
    # Step 2: Get alternative features
    print("\n2. RETRIEVING ALTERNATIVE FEATURES...")
    alt_features = alt_isoform_handler.get_visualization_features(gene_name)
    if alt_features.empty:
        print(f"  No alternative features found for {gene_name}")
        return
    
    print(f"  Found {len(alt_features)} alternative features:")
    for idx, feature in alt_features.iterrows():
        feature_id = feature.get('feature_id', f"trunc_{feature['start']}_{feature['end']}")
        feature_type = feature.get('feature_type', 'unknown')
        print(f"  {idx+1}. {feature_id} - {feature_type} at {feature['start']}-{feature['end']}")
    
    # Step 3: Process one transcript and one feature
    transcript = transcript_info.iloc[0]
    transcript_id = transcript['transcript_id']
    strand = transcript['strand']
    
    feature = alt_features.iloc[0]
    feature_id = feature.get('feature_id', f"trunc_{feature['start']}_{feature['end']}")
    feature_pos = feature['start']
    
    print(f"\n3. PROCESSING TRANSCRIPT {transcript_id} WITH FEATURE {feature_id}...")
    print(f"  Strand: {strand}")
    print(f"  Feature position: {feature_pos}")
    
    # Step 4: Extract transcript sequence
    print("\n4. EXTRACTING TRANSCRIPT SEQUENCE...")
    sequence_info = genome_handler.get_transcript_sequence(transcript_id)
    if not sequence_info or 'sequence' not in sequence_info:
        print(f"  Failed to retrieve sequence for {transcript_id}")
        return
    
    full_sequence = sequence_info['sequence']
    print(f"  Full sequence length: {len(full_sequence)} bp")
    print(f"  First 50 bp: {full_sequence[:50]}...")
    print(f"  Last 50 bp: ...{full_sequence[-50:]}")
    
    # Step 5: Apply truncation
    print("\n5. APPLYING TRUNCATION...")
    # For diagnostic, we'll use a simple approach
    # In real implementation, you'd use proper genomic coordinate conversion
    truncation_pos = len(full_sequence) // 2  # Simplified for diagnostics
    print(f"  Using placeholder truncation position: {truncation_pos}")
    
    if strand == '+':
        truncated_sequence = full_sequence[:truncation_pos]
        print(f"  Truncated to first {truncation_pos} bp (5' portion)")
    else:
        truncated_sequence = full_sequence[truncation_pos:]
        print(f"  Truncated to last {len(full_sequence) - truncation_pos} bp (3' portion)")
    
    print(f"  Truncated sequence length: {len(truncated_sequence)} bp")
    print(f"  First 50 bp: {truncated_sequence[:50]}...")
    print(f"  Last 50 bp: ...{truncated_sequence[-50:]}")
    
    # Step 6: Translate with start codon requirement
    print("\n6. TRANSLATING WITH START CODON REQUIREMENT...")
    
    # Find start codon
    start_codon_match = re.search(r'ATG', truncated_sequence)
    if not start_codon_match:
        print("  No start codon (ATG) found in the truncated sequence")
    else:
        start_pos = start_codon_match.start()
        print(f"  Start codon found at position {start_pos}")
        
        # Extract coding sequence
        coding_seq = truncated_sequence[start_pos:]
        length_to_use = len(coding_seq) - (len(coding_seq) % 3)
        coding_seq = coding_seq[:length_to_use]
        
        print(f"  Coding sequence length: {len(coding_seq)} bp")
        print(f"  First 50 bp of coding seq: {coding_seq[:50]}...")
        
        # Translate
        seq_obj = Seq(coding_seq)
        protein_seq_with_start = str(seq_obj.translate(to_stop=True))
        
        print(f"  Protein length: {len(protein_seq_with_start)} aa")
        print(f"  Protein sequence: {protein_seq_with_start[:50]}..." if len(protein_seq_with_start) > 50 
              else f"  Protein sequence: {protein_seq_with_start}")
    
    # Step 7: Translate without start codon requirement
    print("\n7. TRANSLATING WITHOUT START CODON REQUIREMENT...")
    
    # Ensure multiple of 3
    length_to_use = len(truncated_sequence) - (len(truncated_sequence) % 3)
    coding_seq = truncated_sequence[:length_to_use]
    
    print(f"  Using entire truncated sequence (adjusted to {length_to_use} bp)")
    
    # Translate all reading frames and find longest
    best_protein = ""
    best_length = 0
    best_frame = 0
    
    for frame in range(3):
        frame_seq = coding_seq[frame:]
        frame_length = len(frame_seq) - (len(frame_seq) % 3)
        if frame_length <= 0:
            continue
            
        frame_seq = frame_seq[:frame_length]
        seq_obj = Seq(frame_seq)
        protein = str(seq_obj.translate(to_stop=True))
        
        if len(protein) > best_length:
            best_protein = protein
            best_length = len(protein)
            best_frame = frame
    
    print(f"  Best reading frame: {best_frame}")
    print(f"  Protein length: {best_length} aa")
    print(f"  Protein sequence: {best_protein[:50]}..." if len(best_protein) > 50 
          else f"  Protein sequence: {best_protein}")
    
    print(f"\n{'='*80}")
    print("DIAGNOSTIC COMPLETE")
    print(f"{'='*80}")

In [13]:
print("Getting transcript information...")
transcript_info = genome.get_transcript_ids("NAXE")

if transcript_info.empty:
    print("No transcript info found")
else:
    print(f"Found {len(transcript_info)} transcripts")
    print("\nTranscript Information:")
    print(transcript_info)

Getting transcript information...
Note: Filtered out 1 transcript(s) on non-standard chromosomes
Non-standard chromosomes: {'chr1_MU273335v1_fix'}
Found 1 transcripts

Transcript Information:
         transcript_id chromosome      start        end strand
4571171  NM_144772.3_2       chr1  156591776  156594299      +


In [11]:
genome.get_transcript_sequence("NM_144772.3_2")

{'transcript_id': 'NM_144772.3_2',
 'chromosome': 'chr1',
 'start': np.int64(156591776),
 'end': np.int64(156594299),
 'strand': '+',
 'sequence': 'gggccgggggcgcgcgctctgcgagctggatgtccaggctgcgggcgctgctgggcctcgggctgctggttgcgggctcgcgcgtgccgcggATCAAAAGCCAGACCATCGCCTGTCGCTCGGGACCCACCTGGTGGGGACCGCAGCGGCTGAACTCGGGTGGCCGCTGGGACTCAGAGGTCATGGCGAGCACGGTGGTGAAGTACCTGAGGTAGGCACGGGTCTCGGGTGGCCTGCTCTGCCCCGGGGCGGGGCCTGGGACGGCCGGGCCACCTGCGCGACAGAGAACACGAGGGGCGGGACTCAGGCCGCGGGTTTTCCTCAGCCAGGAGGAGGCCCAGGCCGTGGACCAGGAGCTATTTAACGAATACCAGTTCAGCGTGGACCAACTTATGGAACTGGCCGGGCTGAGCTGTGCTACAGCCATCGCCAAGGTCAGTGGCACAACTCTCGACCTTTGGGAGCAGCCAGGGAGGAGTCACTGTCCCAGCCCCCTGGCCTAGGCACAAAGGGGTGGGAGAGACAGCTGGGCCAATATGGTCTATTACCGCCTGAAACCCCGCCGAACCACCCTTGACTCTGCCTTCAGGCATATCCCCCCACGTCCATGTCCAGGAGCCCCCCTACTGTCCTGGTCATCTGTGGCCCGGGGAATAATGGAGGAGATGGTCTGGTCTGTGCTCGACACCTCAAACTCTTTGTGAGTATGTGGGGAGGGGCTGTGGGGGAGGAGGGCGTGAGGGCTCTGGGATCTGGGGTTGAATTACCACTTTCTTCCTAGGGCTACGAGCCAACCATCTATTACCCCAAAAGGCCTAACAAGCCCCTCTTCACTGCATTGGTGACCCAGTG

In [8]:
run_protein_translation_diagnostic(genome, alt_isoforms, gene_name="NAXE")


DETAILED TRANSLATION DIAGNOSTIC FOR NAXE

1. RETRIEVING TRANSCRIPTS...
Note: Filtered out 1 transcript(s) on non-standard chromosomes
Non-standard chromosomes: {'chr1_MU273335v1_fix'}
  Found 1 transcripts:
  4571172. NM_144772.3_2 - + strand

2. RETRIEVING ALTERNATIVE FEATURES...
  Found 1 alternative features:
  310. trunc_156591805_156591957 - alternative_start at 156591805-156591957

3. PROCESSING TRANSCRIPT NM_144772.3_2 WITH FEATURE trunc_156591805_156591957...
  Strand: +
  Feature position: 156591805

4. EXTRACTING TRANSCRIPT SEQUENCE...
  Full sequence length: 2524 bp
  First 50 bp: gggccgggggcgcgcgctctgcgagctggatgtccaggctgcgggcgctg...
  Last 50 bp: ...GAGGTTGCTATGGTATTTGGAAACAATGAAAATGGACTGTTAGATGCCAA

5. APPLYING TRUNCATION...
  Using placeholder truncation position: 1262
  Truncated to first 1262 bp (5' portion)
  Truncated sequence length: 1262 bp
  First 50 bp: gggccgggggcgcgcgctctgcgagctggatgtccaggctgcgggcgctg...
  Last 50 bp: ...GAAACCCTTCCTGCAGATGCATGGATTAAGGGATGGGAAA

In [6]:
gene_list = ["NAXE", "NTHL1", "GARS1", "PNPO", "GSR"]

# Generate dataset
print("\nGenerating amino acid sequences for deep learning dataset")
dataset = protein_generator.prepare_deep_learning_dataset(
    gene_list=gene_list,
    output_format="fasta",
    include_canonical=True,
    min_length=0,
    max_length=2000
)


Generating amino acid sequences for deep learning dataset

Processing gene NAXE for dataset
Note: Filtered out 1 transcript(s) on non-standard chromosomes
Non-standard chromosomes: {'chr1_MU273335v1_fix'}
Processing 1 transcripts for gene NAXE
Found 1 alternative start/truncation sites for gene NAXE
  Processing transcript NM_144772.3_2
  Generated truncated protein for NM_144772.3_2 - trunc_156591805_156591957

Processing gene NTHL1 for dataset
Processing 4 transcripts for gene NTHL1
Found 2 alternative start/truncation sites for gene NTHL1
  Processing transcript XM_047434171.1
  Generated truncated protein for XM_047434171.1 - trunc_2047709_2047845
  Generated truncated protein for XM_047434171.1 - trunc_2046202_2046366
  Processing transcript NM_002528.7
  Generated truncated protein for NM_002528.7 - trunc_2047709_2047845
  Generated truncated protein for NM_002528.7 - trunc_2046202_2046366
  Processing transcript NM_001318193.2
  Generated truncated protein for NM_001318193.2 - 