<!-- WARNING: THIS FILE WAS AUTOGENERATED! DO NOT EDIT! -->

In [0]:
#| echo: false
#| output: asis
# Render the generated API documentation for the TranscriptData class.
# The two `#|` cell options above must stay at the very top of the cell.
show_doc(TranscriptData)

---

[source](https://github.com/cobioda/allos/blob/main/allos/transcript_data.py#L20){target="_blank" style="float:right; font-size:smaller"}

### TranscriptData

>      TranscriptData (gtf_file:str, reference_fasta:Optional[str]=None)

*A class for managing transcript and gene information from a GTF file using PyRanges.

Existing Features:
  - Lookup by transcript ID or gene ID/name
  - Support for exons, CDS, UTR queries
  - Intron coordinate calculation
  - Batch queries
  - Transcript length calculation
  - Caching/memoization for repeated queries
  - Basic logging/error handling

NEW Features:
  (1) Nucleotide/protein sequence retrieval for CDS (with optional FASTA)
  (2) Alternative splicing analysis with splice junctions, isoform comparisons,
      and junction-chain interpretation.*

In [None]:
import os
import urllib.request
from pathlib import Path

# Example Ensembl URLs for mouse GRCm39 (release 109)
gtf_url = "ftp://ftp.ensembl.org/pub/release-109/gtf/mus_musculus/Mus_musculus.GRCm39.109.gtf.gz"
fasta_url = "ftp://ftp.ensembl.org/pub/release-109/fasta/mus_musculus/dna/Mus_musculus.GRCm39.dna.primary_assembly.fa.gz"

# Store data in a `data` directory one level above the current working directory
data_dir = Path("..") / "data"
data_dir.mkdir(parents=True, exist_ok=True)

gtf_file_local = data_dir / "Mus_musculus.GRCm39.109.gtf.gz"
fasta_file_local = data_dir / "Mus_musculus.GRCm39.dna.primary_assembly.fa.gz"


def _download_if_missing(url, destination):
    """Download `url` to `destination` unless that file already exists.

    The payload is written to a sibling ``<name>.part`` file first and renamed
    to `destination` only after the transfer finished. An interrupted download
    therefore never leaves a truncated file that the ``is_file()`` check would
    mistake for a complete one on the next run.

    Parameters
    ----------
    url : str
        Remote location to fetch.
    destination : pathlib.Path
        Local path the finished download is stored at.
    """
    if destination.is_file():
        return
    print(f"Downloading {url}...")
    partial = destination.with_name(destination.name + ".part")
    try:
        urllib.request.urlretrieve(url, partial)
        partial.replace(destination)
    finally:
        # After a successful rename the partial file is gone; on failure this
        # removes the incomplete leftover.
        if partial.exists():
            partial.unlink()


# Download if not already present
_download_if_missing(gtf_url, gtf_file_local)
_download_if_missing(fasta_url, fasta_file_local)

# Instantiate your TranscriptData.
# The documented signature takes string paths, so convert the Path objects.
td = TranscriptData(
    gtf_file=str(gtf_file_local),
    reference_fasta=str(fasta_file_local)
)

# Now you can make queries like:
example_transcript_id = "ENSMUST00000070533"  # e.g., for mouse
exons = td.get_exons(example_transcript_id)
print("Exons:", exons)

Downloading ftp://ftp.ensembl.org/pub/release-109/gtf/mus_musculus/Mus_musculus.GRCm39.109.gtf.gz...
Downloading ftp://ftp.ensembl.org/pub/release-109/fasta/mus_musculus/dna/Mus_musculus.GRCm39.dna.primary_assembly.fa.gz...
Exons: +--------------+----------------+------------+-----------+-----------+------------+--------------+------------+--------------------+----------------+-------------+----------------+----------------+--------------------+----------------------+-------------------+---------------------+----------------------+-------------------+------------------------------------+---------------+--------------------+----------------+------------+-------+
|   Chromosome | Source         | Feature    |     Start |       End | Score      | Strand       | Frame      | gene_id            |   gene_version | gene_name   | gene_source    | gene_biotype   | transcript_id      |   transcript_version | transcript_name   | transcript_source   | transcript_biotype   | tag               | tra

In [None]:
# Parse the downloaded GTF directly via `pr.read_gtf` so the raw annotation
# table can be inspected alongside the TranscriptData wrapper.
# NOTE(review): `pr` is not imported in the visible cells; presumably pyranges — confirm.
ranges = pr.read_gtf(gtf_file_local)

In [None]:
# Show which annotation columns were parsed from the GTF.
ranges.columns

Index(['Chromosome', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand',
       'Frame', 'gene_id', 'gene_version', 'gene_name', 'gene_source',
       'gene_biotype', 'transcript_id', 'transcript_version',
       'transcript_name', 'transcript_source', 'transcript_biotype', 'tag',
       'transcript_support_level', 'exon_number', 'exon_id', 'exon_version',
       'ccds_id', 'protein_id', 'protein_version'],
      dtype='object')

In [None]:
# Convert to a DataFrame and show the transcript_id column.
# Some rows have no transcript ID (NaN) — presumably gene-level records; confirm.
ranges.as_df()['transcript_id']

0                         NaN
1          ENSMUST00000194081
2          ENSMUST00000194081
3                         NaN
4          ENSMUST00000194393
                  ...        
1901233    ENSMUST00000189418
1901234    ENSMUST00000189418
1901235                   NaN
1901236    ENSMUST00000186353
1901237    ENSMUST00000186353
Name: transcript_id, Length: 1901238, dtype: object

In [None]:
# Import only the helper that is used, instead of `import *`, so it stays clear
# where `process_mouse_data` comes from and the notebook namespace is not polluted.
from allos.readers_tests import process_mouse_data

# Load the mouse test dataset; the helper logs the files it locates and decompresses.
# NOTE(review): the returned object exposes `.var` and `.obs`, presumably an AnnData — confirm.
mouse_data = process_mouse_data()


🔎 Looking for file at: /data/analysis/data_mcandrew/Allos_new/allos_env/lib/python3.9/site-packages/allos/resources/e18.mouse.clusters.csv
✅ File found at: /data/analysis/data_mcandrew/Allos_new/allos_env/lib/python3.9/site-packages/allos/resources/e18.mouse.clusters.csv
✅ File already exists at: /data/analysis/data_mcandrew/Allos_new/allos_env/lib/python3.9/site-packages/allos/resources/data/mouse_1.txt.gz

🔄 Decompressing /data/analysis/data_mcandrew/Allos_new/allos_env/lib/python3.9/site-packages/allos/resources/data/mouse_1.txt.gz to /data/analysis/data_mcandrew/Allos_new/allos_env/lib/python3.9/site-packages/allos/resources/data/mouse_1.txt...
✅ Decompression complete.
Test data (mouse_1) downloaded successfully
✅ File already exists at: /data/analysis/data_mcandrew/Allos_new/allos_env/lib/python3.9/site-packages/allos/resources/data/mouse_2.txt.gz

🔄 Decompressing /data/analysis/data_mcandrew/Allos_new/allos_env/lib/python3.9/site-packages/allos/resources/data/mouse_2.txt.gz to 

  utils.warn_names_duplicates("obs")


In [None]:
# Collect the variable (feature) index of the test dataset as a plain list;
# these are the transcript IDs looked up against the annotation in later cells.
transcriptIds = mouse_data.var.index.to_list()

In [None]:
# Preview the first ten IDs rather than dumping the whole list.
transcriptIds[:10]

['ENSMUST00000156717.1',
 'ENSMUST00000212520.1',
 'ENSMUST00000025798.12',
 'ENSMUST00000231280.1',
 'ENSMUST00000039286.4',
 'ENSMUST00000144552.7',
 'ENSMUST00000112304.8',
 'ENSMUST00000162041.7',
 'ENSMUST00000053506.6',
 'ENSMUST00000028207.12']

In [None]:
# Map every transcript ID of the dataset to a gene name in one batch call.
# NOTE(review): the dataset IDs carry a version suffix (e.g. ".1") while the GTF
# transcript_id column does not; presumably the method handles this — confirm.
gene_names = td.get_gene_names_for_transcripts(transcript_ids=transcriptIds)

In [None]:
# Preview the first ten results; compare with the first ten transcript IDs shown earlier.
gene_names[:10]

['Klc2',
 'Capn15',
 'Klc2',
 'Eva1c',
 'Atg5',
 'Znhit3',
 'Ppm1b',
 'Gcc2',
 'Bbs1',
 'Crat']