In [1]:
from swissisoform.alternative_isoforms import AlternativeIsoform
from swissisoform.utils import cleanup_bed, update_gencode_gene_names

### GTF cleanup

In [2]:
# Update the gene names in the GENCODE v25 annotation, using the newer
# GENCODE v47 annotation as the reference, and write the result to a new GTF.
# The v25 input file is left in place; only output_gtf is produced here.
input_gtf = "../data/genome_data/gencode.v25.annotation.gtf"
output_gtf = "../data/genome_data/gencode.v25.annotation.ensembl_cleaned.gtf"
reference_gtf = "../data/genome_data/gencode.v47.annotation.gtf"

# output_gtf is reused by the cleanup_bed cells further down, so this cell
# has to run before them. The call returns a summary dict (line/gene counts),
# which is displayed as the cell output.
update_gencode_gene_names(
    input_gtf_path=input_gtf,
    output_gtf_path=output_gtf,
    reference_gtf_path=reference_gtf,
    verbose=True,
)

Creating gene ID to name mappings from reference GTF: ../data/genome_data/gencode.v47.annotation.gtf
Extracted 57992 gene names from GENCODE GTF
Extracted 78724 gene names from reference GTF
Created 21316 gene name updates

GTF Update Summary:
  Total lines processed: 2579822
  Genes processed: 58037
  Genes with updated names: 21324
  Total lines updated: 221358
  Output saved to: ../data/genome_data/gencode.v25.annotation.ensembl_cleaned.gtf


{'total_lines': 2579822,
 'updated_lines': 221358,
 'genes_processed': 58037,
 'genes_updated': 21324}

### Full bed file

In [3]:
# Paths for the full truncations BED: the raw input and the cleaned copy
# that the following cleanup_bed cell is asked to produce.
# NOTE: input_bed / output_bed are rebound in the "Reduced bed file" section,
# so these values only hold until that section runs.
input_bed = "../data/ribosome_profiling/full_truncations_JL.bed"
output_bed = "../data/ribosome_profiling/full_truncations_JL_cleaned.bed"

In [4]:
# Clean the full BED file (input_bed -> output_bed). gtf_path is the
# name-updated GTF (output_gtf) from the GTF cleanup cell, so that cell and the
# path-definition cell directly above must both have been run first.
# With verbose=True a summary is printed (invalid entries, duplicates removed,
# gene names updated); the returned counts dict is shown as the cell output.
cleanup_bed(input_bed, output_bed, gtf_path=output_gtf, verbose=True)

Extracting gene mapping from GTF: ../data/genome_data/gencode.v25.annotation.ensembl_cleaned.gtf
Extracted 58037 unique gene ID to name mappings from GTF
Retrieved 116029 gene name mappings

Cleanup Summary:
  Total entries: 4926
  Invalid entries removed: 0
  Duplicates removed: 38
  Gene names updated: 154
  Valid entries in final file: 4888


{'total': 4926,
 'invalid_format': 0,
 'invalid_ensembl': 0,
 'duplicates': 38,
 'updated': 154,
 'valid': 4888}

In [5]:
# Load the cleaned full BED file and pull out its gene list.
alt_isoforms = AlternativeIsoform()
alt_isoforms.load_bed("../data/ribosome_profiling/full_truncations_JL_cleaned.bed")
gene_list = alt_isoforms.get_gene_list()

# Save the genes as plain text, one gene per line.
with open("../data/ribosome_profiling/gene_list.txt", "w") as f:
    f.writelines(gene + "\n" for gene in gene_list)

### Reduced bed file

In [6]:
# Paths for the reduced (selected) truncations BED: the raw input and the
# cleaned copy that the following cleanup_bed cell is asked to produce.
# NOTE: this rebinds input_bed / output_bed from the "Full bed file" section;
# re-running that section's cleanup_bed cell after this one would operate on
# the reduced file instead of the full one.
input_bed = "../data/ribosome_profiling/selected_truncations_JL.bed"
output_bed = "../data/ribosome_profiling/selected_truncations_JL_cleaned.bed"

In [7]:
# Clean the reduced BED file (input_bed -> output_bed), using the same
# name-updated GTF (output_gtf) as the full-BED cleanup. Relies on the
# path-definition cell directly above having rebound input_bed / output_bed
# to the "selected" files, and on the GTF cleanup cell having been run.
cleanup_bed(input_bed, output_bed, gtf_path=output_gtf, verbose=True)

Extracting gene mapping from GTF: ../data/genome_data/gencode.v25.annotation.ensembl_cleaned.gtf
Extracted 58037 unique gene ID to name mappings from GTF
Retrieved 116029 gene name mappings

Cleanup Summary:
  Total entries: 26
  Invalid entries removed: 0
  Duplicates removed: 0
  Gene names updated: 2
  Valid entries in final file: 26


{'total': 26,
 'invalid_format': 0,
 'invalid_ensembl': 0,
 'duplicates': 0,
 'updated': 2,
 'valid': 26}

In [8]:
# Load the cleaned reduced BED file and pull out its gene list.
# (alt_isoforms and gene_list are rebound here, replacing the full-BED values.)
alt_isoforms = AlternativeIsoform()
alt_isoforms.load_bed("../data/ribosome_profiling/selected_truncations_JL_cleaned.bed")
gene_list = alt_isoforms.get_gene_list()

# Save the genes as plain text, one gene per line.
with open("../data/ribosome_profiling/gene_list_reduced.txt", "w") as f:
    f.writelines(gene + "\n" for gene in gene_list)