In [1]:
from swissisoform.isoform import AlternativeIsoform

In [2]:
# Location of the original (Ly2024) BED annotations
source_bed = (
    "../data/ribosome_profiling/RiboTISHV6_Ly2024_AnnoToTruncation_exonintersect.bed"
)

# Create the isoform handler and load the BED annotations into it
alt_isoforms = AlternativeIsoform()
alt_isoforms.load_bed(source_bed)

# Gene list as reported by the handler for the loaded file
gene_list = alt_isoforms.get_gene_list()

In [3]:
# Save gene_list, one gene per line, in the ribosome profiling directory
with open("../data/ribosome_profiling/gene_list.txt", "w") as handle:
    handle.writelines(gene + "\n" for gene in gene_list)

In [4]:
from biomart import BiomartServer


def get_ensembl_reference():
    """Download the ENSEMBL gene ID to gene name table through BioMart.

    Queries the ``hsapiens_gene_ensembl`` dataset of the
    ``ENSEMBL_MART_ENSEMBL`` database for the ``ensembl_gene_id`` and
    ``external_gene_name`` attributes.

    Returns:
        dict: Gene name keyed by ENSEMBL ID. Rows in which either column is
            empty are left out.
    """
    dataset = (
        BiomartServer("http://www.ensembl.org/biomart")
        .databases["ENSEMBL_MART_ENSEMBL"]
        .datasets["hsapiens_gene_ensembl"]
    )
    response = dataset.search(
        {"attributes": ["ensembl_gene_id", "external_gene_name"]}
    )

    id_to_name = {}
    for raw_row in response.iter_lines():
        # Every row must split into exactly two tab-separated columns;
        # anything else fails the two-way unpacking below.
        ensembl_id, gene_name = raw_row.decode("utf-8").split("\t")
        if not (ensembl_id and gene_name):
            continue
        id_to_name[ensembl_id] = gene_name

    return id_to_name


def parse_bed_line(line):
    """Pull the ENSEMBL ID and gene name out of one tab-separated BED line.

    The fourth column is split on "_": its first piece is taken as the
    (possibly versioned) ENSEMBL ID and its second piece as the gene name.

    Args:
        line (str): Raw line of the BED file.

    Returns:
        dict | None: ``None`` when the line has fewer than four columns or
            the fourth column has fewer than three "_"-separated pieces.
            Otherwise a dict with the keys ``ensembl_id`` (version suffix
            removed), ``full_ensembl_id``, ``gene_name``, ``fields`` (all
            columns of the stripped line) and ``line`` (the stripped input).
    """
    stripped = line.strip()
    columns = stripped.split("\t")

    if len(columns) >= 4:
        pieces = columns[3].split("_")
        if len(pieces) >= 3:
            versioned_id, symbol = pieces[0], pieces[1]
            return {
                # Keep only the part before the first "." (drops the version)
                "ensembl_id": versioned_id.split(".")[0],
                "full_ensembl_id": versioned_id,
                "gene_name": symbol,
                "fields": columns,
                "line": stripped,
            }

    return None


def process_bed_file(bed_file_path, reference_data, callback):
    """Walk a BED file and pass each entry found in the reference to ``callback``.

    Blank lines are ignored and not counted. Every other line is parsed with
    ``parse_bed_line`` and ends up in one of these outcomes:

    * invalid format - the line could not be parsed, or its ENSEMBL ID does
      not start with "ENSG"; the line is recorded and skipped.
    * not found - the ENSEMBL ID is not a key of ``reference_data``; the
      entry is recorded and skipped.
    * processed - ``callback(parsed, ref_gene_name)`` is invoked. When the BED
      gene name differs from the reference name (compared case-insensitively)
      the entry is additionally recorded as a mismatch.

    Args:
        bed_file_path: Path of the BED file to read.
        reference_data: Mapping of ENSEMBL ID to reference gene name.
        callback: Callable receiving the parsed entry dict and the reference
            gene name.

    Returns:
        dict: The counts ``total`` and ``processed`` plus the lists
            ``mismatches`` ``(line_num, ensembl_id, bed_name, ref_name)``,
            ``invalid_format`` ``(line_num, stripped_line)`` and
            ``not_found`` ``(line_num, ensembl_id, bed_name)``. Line numbers
            start at 1 and count blank lines too.
    """
    total = 0
    processed = 0
    mismatches = []
    invalid_format = []
    not_found = []

    with open(bed_file_path, "r") as bed_handle:
        for line_num, raw_line in enumerate(bed_handle, 1):
            text = raw_line.strip()
            if not text:
                continue
            total += 1

            parsed = parse_bed_line(raw_line)
            # Unparseable lines and non-gene identifiers share one bucket
            if not parsed or not parsed["ensembl_id"].startswith("ENSG"):
                invalid_format.append((line_num, text))
                continue

            ensembl_id = parsed["ensembl_id"]
            bed_gene_name = parsed["gene_name"]

            if ensembl_id not in reference_data:
                not_found.append((line_num, ensembl_id, bed_gene_name))
                continue

            ref_gene_name = reference_data[ensembl_id]
            if ref_gene_name.upper() != bed_gene_name.upper():
                mismatches.append(
                    (line_num, ensembl_id, bed_gene_name, ref_gene_name)
                )

            # Mismatching entries are still forwarded to the callback
            callback(parsed, ref_gene_name)
            processed += 1

    return {
        "total": total,
        "processed": processed,
        "mismatches": mismatches,
        "invalid_format": invalid_format,
        "not_found": not_found,
    }


def validate_ensembl_mappings(bed_file_path, reference_data=None):
    """Validate ENSEMBL ID to gene name mappings in a BED file.

    Args:
        bed_file_path: Path of the BED file to check.
        reference_data: Optional dict mapping ENSEMBL IDs to gene names. When
            omitted (``None``) the mapping is downloaded with
            ``get_ensembl_reference()``. Pass a previously fetched mapping to
            avoid repeating the download or to run without network access.

    Returns:
        dict: The statistics returned by ``process_bed_file`` plus a
            ``valid_entries`` list holding one
            ``(ensembl_id, gene_name, line)`` tuple for every entry that was
            passed to the callback.
    """
    # Only hit the network when the caller did not supply a mapping
    if reference_data is None:
        print("Fetching current Ensembl reference data...")
        reference_data = get_ensembl_reference()
        print(f"Retrieved {len(reference_data)} reference mappings")

    valid_entries = []

    def validation_callback(parsed, ref_gene_name):
        # The reference name is not needed here; the BED values are recorded
        valid_entries.append(
            (parsed["ensembl_id"], parsed["gene_name"], parsed["line"])
        )

    stats = process_bed_file(bed_file_path, reference_data, validation_callback)
    return {**stats, "valid_entries": valid_entries}


def update_bed_with_reference_names(
    bed_file_path, output_path, reference_data=None
):
    """Create a new BED file with gene names updated to match Ensembl reference.

    Only entries that ``process_bed_file`` passes to the callback are written;
    lines it classifies as invalid format or not found in the reference do not
    appear in the output file.

    Args:
        bed_file_path: Path of the BED file to read.
        output_path: Path of the BED file to write. It is opened for writing
            (and therefore truncated) before the input is read, so it must
            not be the same file as ``bed_file_path``.
        reference_data: Optional dict mapping ENSEMBL IDs to gene names. When
            omitted (``None``) the mapping is downloaded with
            ``get_ensembl_reference()``. Pass a previously fetched mapping to
            avoid repeating the download or to run without network access.

    Returns:
        dict: The statistics returned by ``process_bed_file``.
    """
    # Only hit the network when the caller did not supply a mapping
    if reference_data is None:
        print("Fetching current Ensembl reference data...")
        reference_data = get_ensembl_reference()
        print(f"Retrieved {len(reference_data)} reference mappings")

    with open(output_path, "w") as f_out:

        def update_callback(parsed, ref_gene_name):
            # Work on a copy so the parsed record itself is left untouched
            fields = parsed["fields"].copy()
            # The gene name is the second "_"-separated piece of column four
            name_parts = fields[3].split("_")
            name_parts[1] = ref_gene_name
            fields[3] = "_".join(name_parts)
            f_out.write("\t".join(fields) + "\n")

        stats = process_bed_file(bed_file_path, reference_data, update_callback)

    return stats


def print_validation_results(results):
    """Write a human-readable validation report to stdout.

    Args:
        results (dict): Must provide ``total``, ``processed`` and the lists
            ``invalid_format`` ``(line_num, line)``, ``not_found``
            ``(line_num, ensembl_id, gene_name)`` and ``mismatches``
            ``(line_num, ensembl_id, bed_gene, ref_gene)``. A section is
            printed only when its list is non-empty.
    """
    print(
        "\nValidation Results:",
        f"Total entries processed: {results['total']}",
        f"Successfully validated: {results['processed']}",
        sep="\n",
    )

    invalid = results["invalid_format"]
    if invalid:
        print(f"\nInvalid format entries ({len(invalid)} found):")
        for line_num, line in invalid:
            print(f"Line {line_num}: {line}")

    missing = results["not_found"]
    if missing:
        print(f"\nENSEMBL IDs not found in reference ({len(missing)} found):")
        for line_num, ensembl_id, gene_name in missing:
            print(f"Line {line_num}: {ensembl_id} ({gene_name})")

    renamed = results["mismatches"]
    if renamed:
        print(f"\nReference mismatches ({len(renamed)} found):")
        for line_num, ensembl_id, bed_gene, ref_gene in renamed:
            # The trailing "\n" leaves a blank line after every mismatch
            print(
                f"Line {line_num}: {ensembl_id}",
                f"  BED gene name: {bed_gene}",
                f"  Reference gene name: {ref_gene}\n",
                sep="\n",
            )


def print_update_results(stats):
    """Write a short summary of an update run to stdout.

    Args:
        stats (dict): Must provide ``total`` and ``processed`` plus the lists
            ``invalid_format``, ``not_found`` and ``mismatches``; only the
            lengths of the lists are reported.
    """
    print(
        "\nUpdate Results:",
        f"Total entries: {stats['total']}",
        f"Successfully updated: {stats['processed']}",
        f"Invalid format entries: {len(stats['invalid_format'])}",
        f"ENSEMBL IDs not found: {len(stats['not_found'])}",
        f"Gene names that were different: {len(stats['mismatches'])}",
        sep="\n",
    )

In [5]:
# Compare the gene names in the original BED file with the Ensembl reference
bed_to_validate = (
    "../data/ribosome_profiling/RiboTISHV6_Ly2024_AnnoToTruncation_exonintersect.bed"
)
results = validate_ensembl_mappings(bed_to_validate)
print_validation_results(results)

Fetching current Ensembl reference data...
Retrieved 48379 reference mappings

Validation Results:
Total entries processed: 4926
Successfully validated: 4922

ENSEMBL IDs not found in reference (4 found):
Line 426: ENSG00000112096 (SOD2)
Line 427: ENSG00000112096 (SOD2)
Line 1894: ENSG00000215271 (HOMEZ)
Line 1895: ENSG00000215271 (HOMEZ)

Reference mismatches (154 found):
Line 45: ENSG00000065427
  BED gene name: KARS
  Reference gene name: KARS1

Line 87: ENSG00000141699
  BED gene name: FAM134C
  Reference gene name: RETREG3

Line 104: ENSG00000119333
  BED gene name: WDR34
  Reference gene name: DYNC2I2

Line 109: ENSG00000112941
  BED gene name: PAPD7
  Reference gene name: TENT4A

Line 113: ENSG00000109685
  BED gene name: WHSC1
  Reference gene name: NSD2

Line 167: ENSG00000146729
  BED gene name: GBAS
  Reference gene name: NIPSNAP2

Line 202: ENSG00000157870
  BED gene name: FAM213B
  Reference gene name: PRXL2B

Line 203: ENSG00000106105
  BED gene name: GARS
  Reference gen

In [7]:
# Rewrite the Ly2024 BED file into the MD2025 file using reference gene names
input_bed = (
    "../data/ribosome_profiling/RiboTISHV6_Ly2024_AnnoToTruncation_exonintersect.bed"
)
output_bed = (
    "../data/ribosome_profiling/RiboTISHV6_MD2025_AnnoToTruncation_exonintersect.bed"
)
stats = update_bed_with_reference_names(input_bed, output_bed)
print_update_results(stats)

Fetching current Ensembl reference data...
Retrieved 48379 reference mappings

Update Results:
Total entries: 4926
Successfully updated: 4922
Invalid format entries: 0
ENSEMBL IDs not found: 4
Gene names that were different: 154


In [8]:
# Location of the updated (MD2025) BED annotations
updated_bed = (
    "../data/ribosome_profiling/RiboTISHV6_MD2025_AnnoToTruncation_exonintersect.bed"
)

# Start from a fresh isoform handler and load the updated BED annotations
alt_isoforms = AlternativeIsoform()
alt_isoforms.load_bed(updated_bed)

# Gene list as reported by the handler for the updated file
gene_list = alt_isoforms.get_gene_list()

In [9]:
# Save gene_list, one gene per line, in the ribosome profiling directory
# (same gene_list.txt path as the earlier export, so that file is replaced)
with open("../data/ribosome_profiling/gene_list.txt", "w") as handle:
    handle.writelines(gene + "\n" for gene in gene_list)