File to go step by step thorugh the NMD-Scanner steps, easier for testing, debugging and adjusting of the code because you can test it here first

# Import Dependencies

In [2]:
conda list

# packages in environment at /opt/modules/i12g/anaconda/envs/NMD_test:
#
# Name                    Version                   Build  Channel
_libgcc_mutex             0.1                 conda_forge    conda-forge
_openmp_mutex             4.5                       2_gnu    conda-forge
_python_abi3_support      1.0                  hd8ed1ab_2    conda-forge
alsa-lib                  1.2.13               hb9d3cd8_0    conda-forge
anyio                     4.9.0              pyh29332c3_0    conda-forge
argcomplete               3.6.0                    pypi_0    pypi
argh                      0.31.3                   pypi_0    pypi
argon2-cffi               23.1.0             pyhd8ed1ab_1    conda-forge
argon2-cffi-bindings      21.2.0          py311h9ecbd09_5    conda-forge
arrow                     1.3.0              pyhd8ed1ab_1    conda-forge
asttokens                 3.0.0              pyhd8ed1ab_1    conda-forge
async-lru                 2.0.5              pyh29332c3_0    conda-forg

In [3]:
import pyranges as pr
import pandas as pd
import os
from pyfaidx import Fasta
import numpy as np
from Bio.Seq import Seq

pd.options.display.max_columns = None # to show all columns of the dataframe

# Read files

In [4]:
ls -ll

total 9068
-rw-r--r--. 1 l_schroeder users   38865 Jan 11 14:06 create_test_VCF.ipynb
-rw-r--r--. 1 l_schroeder users 6252969 Dec 18 14:04 NMD.ipynb
-rw-r--r--. 1 l_schroeder users  249345 Jan 11 14:27 nmd-vep.ipynb
-rw-r--r--. 1 l_schroeder users    2465 Dec 18 14:34 README.md
-rw-r--r--. 1 l_schroeder users 1307856 Dec 18 14:04 train.ipynb
-rw-r--r--. 1 l_schroeder users 1307856 Jan 11 13:57 train_new.ipynb
-rw-r--r--. 1 l_schroeder users   54222 Dec 18 14:04 validation_MMRF_TARGET.ipynb
-rw-r--r--. 1 l_schroeder users   54222 Jan 11 14:05 validation_new.ipynb


In [5]:
vcf_path = "~/NMD/nmd-variant-effect-prediction/resources/TCGA_benchmark/tcga_dataset.vcf"
# vcf_path = "resources/part-00241-61a0abbf-fbf9-444f-8287-4e46ad4b9b7b-c000.vcf"
gtf_path = "/s/genomes/Gencode/Gencode_human/release_40/gencode.v40.annotation.gtf.gz" # new version
gtf_path_old = "/s/genomes/Gencode/Gencode_human/release_33/GRCh37_mapping/gencode.v33lift37.annotation.gtf.gz" # old version
fasta_path = "/s/genomes/Gencode/Gencode_human/release_40/GRCh38.p13.genome.fa"

In [6]:
vcf_path = "~/NMD/nmd-variant-effect-prediction/test_variant/variant.vcf"
gtf_path = "/s/genomes/Gencode/Gencode_human/release_34/GRCh37_mapping/gencode.v34lift37.annotation.gtf.gz"
fasta_path = "/s/genomes/Gencode/Gencode_human/release_34/GRCh37_mapping/GRCh37.primary_assembly.genome.fa"

In [7]:
def read_vcf(vcf_path):
    """
    Read a single VCF file into a PyRanges object with adjusted coordinates.
    """
    df = pd.read_csv(
        vcf_path,
        comment='#',
        sep='\t',
        header=None,
        names=['Chromosome', 'Start', 'ID', 'Ref', 'Alt', 'Qual', 'Filter', 'Info']
    )

    # Adjust coordinates to 0-based, half-open (BED-like)
    df['Start'] = df['Start'] - 1
    df['End'] = df['Start'] + df['Ref'].str.len()

    # Keep only relevant columns
    gr = pr.PyRanges(df[['Chromosome', 'Start', 'End', 'ID', 'Ref', 'Alt', 'Qual', 'Filter', 'Info']])
    return gr

def read_gtf(gtf_path):
    """
    Reads a GTF file into a PyRanges object.
    """
    if not os.path.exists(gtf_path):
        raise FileNotFoundError(f"GTF file not found: {gtf_path}")
    return pr.read_gtf(gtf_path)

def read_fasta(fasta_path):
    """
    Reads a genome FASTA file using pyfaidx.Fasta and returns a pyfaidx.Fasta object.
    """
    if not os.path.exists(fasta_path):
        raise FileNotFoundError(f"FASTA file not found: {fasta_path}")
    return Fasta(fasta_path)

In [8]:
vcf = read_vcf(vcf_path)
print(f"VCF shape: {vcf.df.shape}")
vcf

VCF shape: (3, 9)


Unnamed: 0,Chromosome,Start,End,ID,Ref,Alt,Qual,Filter,Info
0,chr8,42720558,42720559,.,C,T,.,.,.
1,chr8,42706139,42706140,.,C,C,.,.,.
2,chr8,42720557,42720632,.,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,.,.,.


In [9]:
gtf = read_gtf(gtf_path)
print(f"GTF File shape: {gtf.df.shape}")

GTF File shape: (2919180, 32)


In [10]:
gtf_test = gtf[gtf.Feature.isin(["CDS", "transcript", "exon"])]
gtf_test

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,transcript_id,gene_type,gene_name,transcript_type,transcript_name,exon_number,exon_id,level,transcript_support_level,hgnc_id,tag,havana_gene,havana_transcript,remap_original_location,remap_status,remap_num_mappings,remap_target_status,ont,protein_id,ccdsid,gene_status,transcript_status,remap_substituted_missing_target
0,chr1,HAVANA,exon,11868,12227,.,+,.,ENSG00000223972.5_3,ENST00000456328.2_1,transcribed_unprocessed_pseudogene,DDX11L1,processed_transcript,DDX11L1-202,1,ENSE00002234944.1_1,2,1,HGNC:37102,basic,OTTHUMG00000000961.1_3,OTTHUMT00000362751.1_1,chr1:+:11869-12227,full_contig,,,,,,,,
1,chr1,HAVANA,transcript,11868,14409,.,+,.,ENSG00000223972.5_3,ENST00000456328.2_1,transcribed_unprocessed_pseudogene,DDX11L1,processed_transcript,DDX11L1-202,,,2,1,HGNC:37102,basic,OTTHUMG00000000961.1_3,OTTHUMT00000362751.1_1,,full_contig,1,overlap,,,,,,
2,chr1,HAVANA,exon,12009,12057,.,+,.,ENSG00000223972.5_3,ENST00000450305.2_1,transcribed_unprocessed_pseudogene,DDX11L1,transcribed_unprocessed_pseudogene,DDX11L1-201,1,ENSE00001948541.1_1,2,,HGNC:37102,basic,OTTHUMG00000000961.1_3,OTTHUMT00000002844.1_1,chr1:+:12010-12057,full_contig,,,PGO:0000019,,,,,
3,chr1,HAVANA,transcript,12009,13670,.,+,.,ENSG00000223972.5_3,ENST00000450305.2_1,transcribed_unprocessed_pseudogene,DDX11L1,transcribed_unprocessed_pseudogene,DDX11L1-201,,,2,,HGNC:37102,basic,OTTHUMG00000000961.1_3,OTTHUMT00000002844.1_1,,full_contig,1,overlap,PGO:0000019,,,,,
4,chr1,HAVANA,exon,12178,12227,.,+,.,ENSG00000223972.5_3,ENST00000450305.2_1,transcribed_unprocessed_pseudogene,DDX11L1,transcribed_unprocessed_pseudogene,DDX11L1-201,2,ENSE00001671638.2_1,2,,HGNC:37102,basic,OTTHUMG00000000961.1_3,OTTHUMT00000002844.1_1,chr1:+:12179-12227,full_contig,,,PGO:0000019,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2375420,chrY,HAVANA,transcript,59358334,59360548,.,-,.,ENSG00000227159.8_3_PAR_Y,ENST00000507418.6_2_PAR_Y,unprocessed_pseudogene,DDX11L16,unprocessed_pseudogene,DDX11L16-201,,,2,,HGNC:37115,PAR,OTTHUMG00000022678.1_3,OTTHUMT00000058841.1_2,,full_contig,1,overlap,PGO:0000005,,,,,
2375421,chrY,HAVANA,exon,59359354,59359508,.,-,.,ENSG00000227159.8_3_PAR_Y,ENST00000507418.6_2_PAR_Y,unprocessed_pseudogene,DDX11L16,unprocessed_pseudogene,DDX11L16-201,2,ENSE00002036959.1_1,2,,HGNC:37115,PAR,OTTHUMG00000022678.1_3,OTTHUMT00000058841.1_2,chrY:-:57213204-57213357,full_contig,,,PGO:0000005,,,,,
2375422,chrY,HAVANA,exon,59359676,59359753,.,-,.,ENSG00000227159.8_3_PAR_Y,ENST00000507418.6_2_PAR_Y,unprocessed_pseudogene,DDX11L16,unprocessed_pseudogene,DDX11L16-201,3,ENSE00002021169.1_1,2,,HGNC:37115,PAR,OTTHUMG00000022678.1_3,OTTHUMT00000058841.1_2,chrY:-:57213526-57213602,full_contig,,,PGO:0000005,,,,,
2375423,chrY,HAVANA,exon,59360030,59360115,.,-,.,ENSG00000227159.8_3_PAR_Y,ENST00000507418.6_2_PAR_Y,unprocessed_pseudogene,DDX11L16,unprocessed_pseudogene,DDX11L16-201,4,ENSE00002046926.1_1,2,,HGNC:37115,PAR,OTTHUMG00000022678.1_3,OTTHUMT00000058841.1_2,chrY:-:57213880-57213964,full_contig,,,PGO:0000005,,,,,


In [11]:
# fasta = read_fasta(fasta_path)
# or
fasta = Fasta(fasta_path)

In [12]:
# adjust exon number in GTF (recommended for hg19)

def compute_exon_numbers(gtf_df):
    """
    Compute exon numbers for all transcripts in a GTF PyRanges object.
    Exon numbers are assigned based on genomic order per transcript and strand.
    CDS features inherit the exon number of the exon they overlap.

    :param gtf: PyRanges object of the GTF
    :return: PyRanges object with new column 'exon_number_computed'
    """
    gtf_df = gtf.df.copy()

    # Step 1: Compute exon numbers for exon features
    exons = gtf_df[gtf_df.Feature == "exon"].copy()
    for tx, group in exons.groupby("transcript_id"):
        strand = group["Strand"].iloc[0]
        if strand == "+":
            sorted_group = group.sort_values("Start")
        else:
            sorted_group = group.sort_values("Start", ascending=False)
        exons.loc[sorted_group.index, "exon_number"] = range(1, len(sorted_group) + 1)

    # Step 2: Assign exon numbers to CDS features
    cds = gtf_df[gtf_df.Feature == "CDS"].copy()
    for tx, exon_group in exons.groupby("transcript_id"):
        cds_group = cds[cds.transcript_id == tx]
        for idx, cds_row in cds_group.iterrows():
            overlaps = exon_group[(exon_group["Start"] <= cds_row["End"]) & (exon_group["End"] >= cds_row["Start"])]
            if not overlaps.empty:
                # choose exon with maximum overlap
                overlap_idx = overlaps.apply(lambda row: min(row["End"], cds_row["End"]) - max(row["Start"], cds_row["Start"]), axis=1).idxmax()
                gtf_df.loc[idx, "exon_number"] = exon_group.loc[overlap_idx, "exon_number"]

    # Step 3: Update exon features
    gtf_df.loc[exons.index, "exon_number"] = exons["exon_number"]

    return pr.PyRanges(gtf_df)

In [None]:
# test compute_exon_numbers() only with a few rows of gtf so its faster

gtf_df_subset = gtf.df.iloc[:500,:]
gtf_df_subset = pr.PyRanges(gtf_df_subset)

gtf_df_subset = compute_exon_numbers(gtf_df_subset)

In [13]:
# extract CDS regions from the GTF file
cds = gtf[gtf.Feature == "CDS"]
cds_df = cds.df

# extract exon regions from the GTF file and compute exon related metrics:
# exon length & number of exons contained in each transcript
exons = gtf[gtf.Feature == "exon"]
exons_df = exons.df
exons_df["exon_length"] = exons_df["End"] - exons_df["Start"]

# Extract PTC

Create reference and alternative CDS and transcript sequences (+ metadata) and analyze for start and stop codons & -loss

In [14]:
def adjust_last_cds_for_stop_codon(df, transcript_col="transcript_id"):

    """
    Adjusts the genomic coordinates of the last CDS exon in each transcript by adding 3 positions,
    thus to include the stop codon.
    Plus strand: extend last exon (largest start position) at the END (+3 to End)
    Minus strand: extend last exon (smallest start position) at the START (-3 from Start)
    :param df: Dataframe containing CDS annotation
    :param exon_col: The name of the column that indicated the exon number, so we can find out which is the last CDS snippet
    :param transcript_col: The name of the column that indicates the transcript ID
    :return: Modified pandas DataFrame where the last exon of each transcript is extended by 3 bases to include the stop codon.
    """

    df = df.copy()

    adjusted_idx = []
    for tx, group in df.groupby(transcript_col):
        strand = group["Strand"].iloc[0]

        if strand == "+":
            # last exon has max Start position
            idx = group["Start"].idxmax()
            df.at[idx, "End"] += 3
        elif strand == "-":
            # last exon has min Start position
            idx = group["Start"].idxmin()
            df.at[idx, "Start"] -= 3

        adjusted_idx.append(idx)

    return df

cds_df_adj = adjust_last_cds_for_stop_codon(cds_df)
cds_df_adj

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,transcript_id,gene_type,gene_name,transcript_type,transcript_name,exon_number,exon_id,level,transcript_support_level,hgnc_id,tag,havana_gene,havana_transcript,remap_original_location,remap_status,remap_num_mappings,remap_target_status,ont,protein_id,ccdsid,gene_status,transcript_status,remap_substituted_missing_target
0,chr1,HAVANA,CDS,65564,65573,.,+,0,ENSG00000186092.6_5,ENST00000641515.2_2,protein_coding,OR4F5,protein_coding,OR4F5-202,2,ENSE00003813641.1,2,,HGNC:14825,basic,OTTHUMG00000001094.1_5,OTTHUMT00000003223.1_2,chr1:+:65565-65573,full_contig,,,,ENSP00000493376.2,,,,
1,chr1,HAVANA,CDS,69036,70008,.,+,0,ENSG00000186092.6_5,ENST00000641515.2_2,protein_coding,OR4F5,protein_coding,OR4F5-202,3,ENSE00003813949.1,2,,HGNC:14825,basic,OTTHUMG00000001094.1_5,OTTHUMT00000003223.1_2,chr1:+:69037-70005,full_contig,,,,ENSP00000493376.2,,,,
2,chr1,ENSEMBL,CDS,69090,70008,.,+,0,ENSG00000186092.6_5,ENST00000335137.4_2,protein_coding,OR4F5,protein_coding,OR4F5-201,1,ENSE00002319515.2,3,,HGNC:14825,CCDS,OTTHUMG00000001094.1_5,,chr1:+:69091-70005,full_contig,,,,ENSP00000334393.3,CCDS30547.1,,,
3,chr1,HAVANA,CDS,367658,368597,.,+,0,ENSG00000235249.1,ENST00000426406.1,protein_coding,OR4F29,protein_coding,OR4F29-001,1,ENSE00002316283.1,2,,,CCDS,OTTHUMG00000002860.1,OTTHUMT00000007999.1,,,,,,ENSP00000409316.1,CCDS41220.1,KNOWN,KNOWN,V19
4,chr1,HAVANA,CDS,859811,860328,.,+,0,ENSG00000187634.12_9,ENST00000420190.6_5,protein_coding,SAMD11,protein_coding,SAMD11-203,1,ENSE00001637883.2,2,3,HGNC:28706,cds_end_NF,OTTHUMG00000040719.1_9,OTTHUMT00000316521.1_5,chr1:+:924432-924948,full_contig,,,,ENSP00000411579.2,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
763876,chrY,ENSEMBL,CDS,27187915,27188033,.,-,0,ENSG00000185894.8_7,ENST00000618574.1_5,protein_coding,BPY2C,protein_coding,BPY2C-204,3,ENSE00001711311.1,3,1,HGNC:18225,CCDS,OTTHUMG00000045199.1_7,,chrY:-:25041769-25041886,full_contig,,,,ENSP00000480751.1,CCDS44030.1,,,
763877,chrY,HAVANA,CDS,27187915,27188033,.,-,0,ENSG00000185894.8_7,ENST00000382287.5_6,protein_coding,BPY2C,protein_coding,BPY2C-201,5,ENSE00001711311.1,2,1,HGNC:18225,CCDS,OTTHUMG00000045199.1_7,OTTHUMT00000104944.1_6,chrY:-:25041769-25041886,full_contig,,,,ENSP00000371724.1,CCDS44030.1,,,
763878,chrY,ENSEMBL,CDS,27190092,27190170,.,-,0,ENSG00000185894.8_7,ENST00000618574.1_5,protein_coding,BPY2C,protein_coding,BPY2C-204,4,ENSE00003722907.1,3,1,HGNC:18225,CCDS,OTTHUMG00000045199.1_7,,chrY:-:25043946-25044023,full_contig,,,,ENSP00000480751.1,CCDS44030.1,,,
763879,chrY,HAVANA,CDS,27190092,27190170,.,-,0,ENSG00000185894.8_7,ENST00000382287.5_6,protein_coding,BPY2C,protein_coding,BPY2C-201,6,ENSE00003604811.1,2,1,HGNC:18225,CCDS,OTTHUMG00000045199.1_7,OTTHUMT00000104944.1_6,chrY:-:25043946-25044023,full_contig,,,,ENSP00000371724.1,CCDS44030.1,,,


In [15]:
cds_df_adj["exon_number"] = cds_df_adj["exon_number"].astype(int)

In [16]:
intersection_cds_vcf = pr.PyRanges(cds_df_adj).join(vcf, how=None, suffix="_variant").df
intersection_cds_vcf

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,transcript_id,gene_type,gene_name,transcript_type,transcript_name,exon_number,exon_id,level,transcript_support_level,hgnc_id,tag,havana_gene,havana_transcript,remap_original_location,remap_status,remap_num_mappings,remap_target_status,ont,protein_id,ccdsid,gene_status,transcript_status,remap_substituted_missing_target,Start_variant,End_variant,ID,Ref,Alt,Qual,Filter,Info
0,chr8,HAVANA,CDS,42705933,42706140,.,-,0,ENSG00000120925.16_6,ENST00000319104.7_2,protein_coding,RNF170,protein_coding,RNF170-203,1,ENSE00001239742.5,2,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,OTTHUMT00000383165.1_2,chr8:-:42850794-42850997,full_contig,,,,ENSP00000326138.3,CCDS55230.1,,,,42706139,42706140,.,C,C,.,.,.
1,chr8,ENSEMBL,CDS,42720558,42720632,.,-,2,ENSG00000120925.16_6,ENST00000534961.5_2,protein_coding,RNF170,protein_coding,RNF170-209,3,ENSE00003678305.1,3,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,,chr8:-:42865416-42865489,full_contig,,,,ENSP00000445725.1,CCDS6138.1,,,,42720557,42720632,.,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,.,.,.
2,chr8,ENSEMBL,CDS,42720558,42720632,.,-,2,ENSG00000120925.16_6,ENST00000534961.5_2,protein_coding,RNF170,protein_coding,RNF170-209,3,ENSE00003678305.1,3,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,,chr8:-:42865416-42865489,full_contig,,,,ENSP00000445725.1,CCDS6138.1,,,,42720558,42720559,.,C,T,.,.,.
3,chr8,HAVANA,CDS,42720558,42720632,.,-,2,ENSG00000120925.16_6,ENST00000319104.7_2,protein_coding,RNF170,protein_coding,RNF170-203,2,ENSE00003678305.1,2,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,OTTHUMT00000383165.1_2,chr8:-:42865416-42865489,full_contig,,,,ENSP00000326138.3,CCDS55230.1,,,,42720557,42720632,.,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,.,.,.
4,chr8,HAVANA,CDS,42720558,42720632,.,-,2,ENSG00000120925.16_6,ENST00000319104.7_2,protein_coding,RNF170,protein_coding,RNF170-203,2,ENSE00003678305.1,2,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,OTTHUMT00000383165.1_2,chr8:-:42865416-42865489,full_contig,,,,ENSP00000326138.3,CCDS55230.1,,,,42720558,42720559,.,C,T,.,.,.
5,chr8,HAVANA,CDS,42720558,42720632,.,-,2,ENSG00000120925.16_6,ENST00000526349.5_2,protein_coding,RNF170,protein_coding,RNF170-205,3,ENSE00003678305.1,2,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,OTTHUMT00000383168.1_2,chr8:-:42865416-42865489,full_contig,,,,ENSP00000435782.1,CCDS55229.1,,,,42720557,42720632,.,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,.,.,.
6,chr8,HAVANA,CDS,42720558,42720632,.,-,2,ENSG00000120925.16_6,ENST00000526349.5_2,protein_coding,RNF170,protein_coding,RNF170-205,3,ENSE00003678305.1,2,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,OTTHUMT00000383168.1_2,chr8:-:42865416-42865489,full_contig,,,,ENSP00000435782.1,CCDS55229.1,,,,42720558,42720559,.,C,T,.,.,.
7,chr8,HAVANA,CDS,42720558,42720632,.,-,2,ENSG00000120925.16_6,ENST00000527424.6_3,protein_coding,RNF170,protein_coding,RNF170-206,3,ENSE00003678305.1,2,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,OTTHUMT00000383166.1_3,chr8:-:42865416-42865489,full_contig,,,,ENSP00000434797.1,CCDS6138.1,,,,42720557,42720632,.,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,.,.,.
8,chr8,HAVANA,CDS,42720558,42720632,.,-,2,ENSG00000120925.16_6,ENST00000527424.6_3,protein_coding,RNF170,protein_coding,RNF170-206,3,ENSE00003678305.1,2,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,OTTHUMT00000383166.1_3,chr8:-:42865416-42865489,full_contig,,,,ENSP00000434797.1,CCDS6138.1,,,,42720558,42720559,.,C,T,.,.,.
9,chr8,HAVANA,CDS,42720605,42720632,.,-,2,ENSG00000120925.16_6,ENST00000531440.5_2,protein_coding,RNF170,protein_coding,RNF170-208,1,ENSE00002147066.1,2,4,HGNC:25358,cds_end_NF,OTTHUMG00000165277.1_6,OTTHUMT00000383170.1_2,chr8:-:42865466-42865489,full_contig,,,,ENSP00000436416.1,,,,,42720557,42720632,.,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,.,.,.


In [None]:
# fix minus strand variants (only for TCGA / MMREF VCF!)

# Fix REF and ALT for minus-strand CDSs
mask_minus_strand = intersection_cds_vcf["Strand"] == "-"
intersection_cds_vcf.loc[mask_minus_strand, "Ref"] = intersection_cds_vcf.loc[mask_minus_strand, "Ref"].apply(lambda seq: str(Seq(seq).reverse_complement()))
intersection_cds_vcf.loc[mask_minus_strand, "Alt"] = intersection_cds_vcf.loc[mask_minus_strand, "Alt"].apply(lambda seq: str(Seq(seq).reverse_complement()))

intersection_cds_vcf

In [17]:
# Import Dependencies
from functools import lru_cache

# Global cache to map id --> actual fasta object
_fasta_cache = {}

@lru_cache(maxsize=None)
def fetch_seq_cached(chrom, start, end, fasta_id):
    fasta = _fasta_cache[fasta_id]  # lookup the actual fasta object
    return fasta[chrom][start:end].seq.upper()

def add_exon_cds_sequence(df, fasta, chrom_col="Chromosome", start_col="Start", end_col="End", new_col="Exon_CDS_seq"):
    fasta_id = id(fasta)
    _fasta_cache[fasta_id] = fasta  # store for later access

    def extract_seq(row):
        return fetch_seq_cached(row[chrom_col], row[start_col], row[end_col], fasta_id)

    df[new_col] = df.apply(extract_seq, axis=1)
    return df

intersection_cds_vcf = add_exon_cds_sequence(intersection_cds_vcf, fasta)
intersection_cds_vcf

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,transcript_id,gene_type,gene_name,transcript_type,transcript_name,exon_number,exon_id,level,transcript_support_level,hgnc_id,tag,havana_gene,havana_transcript,remap_original_location,remap_status,remap_num_mappings,remap_target_status,ont,protein_id,ccdsid,gene_status,transcript_status,remap_substituted_missing_target,Start_variant,End_variant,ID,Ref,Alt,Qual,Filter,Info,Exon_CDS_seq
0,chr8,HAVANA,CDS,42705933,42706140,.,-,0,ENSG00000120925.16_6,ENST00000319104.7_2,protein_coding,RNF170,protein_coding,RNF170-203,1,ENSE00001239742.5,2,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,OTTHUMT00000383165.1_2,chr8:-:42850794-42850997,full_contig,,,,ENSP00000326138.3,CCDS55230.1,,,,42706139,42706140,.,C,C,.,.,.,TTAACCTGAGAACCAAAGGATGGAAACACAGTCTAGTAACGTGAAG...
1,chr8,ENSEMBL,CDS,42720558,42720632,.,-,2,ENSG00000120925.16_6,ENST00000534961.5_2,protein_coding,RNF170,protein_coding,RNF170-209,3,ENSE00003678305.1,3,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,,chr8:-:42865416-42865489,full_contig,,,,ENSP00000445725.1,CCDS6138.1,,,,42720557,42720632,.,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,.,.,.,CGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACCA...
2,chr8,ENSEMBL,CDS,42720558,42720632,.,-,2,ENSG00000120925.16_6,ENST00000534961.5_2,protein_coding,RNF170,protein_coding,RNF170-209,3,ENSE00003678305.1,3,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,,chr8:-:42865416-42865489,full_contig,,,,ENSP00000445725.1,CCDS6138.1,,,,42720558,42720559,.,C,T,.,.,.,CGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACCA...
3,chr8,HAVANA,CDS,42720558,42720632,.,-,2,ENSG00000120925.16_6,ENST00000319104.7_2,protein_coding,RNF170,protein_coding,RNF170-203,2,ENSE00003678305.1,2,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,OTTHUMT00000383165.1_2,chr8:-:42865416-42865489,full_contig,,,,ENSP00000326138.3,CCDS55230.1,,,,42720557,42720632,.,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,.,.,.,CGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACCA...
4,chr8,HAVANA,CDS,42720558,42720632,.,-,2,ENSG00000120925.16_6,ENST00000319104.7_2,protein_coding,RNF170,protein_coding,RNF170-203,2,ENSE00003678305.1,2,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,OTTHUMT00000383165.1_2,chr8:-:42865416-42865489,full_contig,,,,ENSP00000326138.3,CCDS55230.1,,,,42720558,42720559,.,C,T,.,.,.,CGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACCA...
5,chr8,HAVANA,CDS,42720558,42720632,.,-,2,ENSG00000120925.16_6,ENST00000526349.5_2,protein_coding,RNF170,protein_coding,RNF170-205,3,ENSE00003678305.1,2,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,OTTHUMT00000383168.1_2,chr8:-:42865416-42865489,full_contig,,,,ENSP00000435782.1,CCDS55229.1,,,,42720557,42720632,.,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,.,.,.,CGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACCA...
6,chr8,HAVANA,CDS,42720558,42720632,.,-,2,ENSG00000120925.16_6,ENST00000526349.5_2,protein_coding,RNF170,protein_coding,RNF170-205,3,ENSE00003678305.1,2,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,OTTHUMT00000383168.1_2,chr8:-:42865416-42865489,full_contig,,,,ENSP00000435782.1,CCDS55229.1,,,,42720558,42720559,.,C,T,.,.,.,CGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACCA...
7,chr8,HAVANA,CDS,42720558,42720632,.,-,2,ENSG00000120925.16_6,ENST00000527424.6_3,protein_coding,RNF170,protein_coding,RNF170-206,3,ENSE00003678305.1,2,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,OTTHUMT00000383166.1_3,chr8:-:42865416-42865489,full_contig,,,,ENSP00000434797.1,CCDS6138.1,,,,42720557,42720632,.,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,.,.,.,CGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACCA...
8,chr8,HAVANA,CDS,42720558,42720632,.,-,2,ENSG00000120925.16_6,ENST00000527424.6_3,protein_coding,RNF170,protein_coding,RNF170-206,3,ENSE00003678305.1,2,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,OTTHUMT00000383166.1_3,chr8:-:42865416-42865489,full_contig,,,,ENSP00000434797.1,CCDS6138.1,,,,42720558,42720559,.,C,T,.,.,.,CGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACCA...
9,chr8,HAVANA,CDS,42720605,42720632,.,-,2,ENSG00000120925.16_6,ENST00000531440.5_2,protein_coding,RNF170,protein_coding,RNF170-208,1,ENSE00002147066.1,2,4,HGNC:25358,cds_end_NF,OTTHUMG00000165277.1_6,OTTHUMT00000383170.1_2,chr8:-:42865466-42865489,full_contig,,,,ENSP00000436416.1,,,,,42720557,42720632,.,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,.,.,.,ATCGCCAGTAAGCAATAATGCAGGCAC


In [19]:
# new function to apply variants with <DEL> and <DUP>

def apply_variant_edge_aware_with_lengths(row):

    """
    Applies a variant to a CDS exon sequence, taking into account not only SNVs but also partial overlaps at exon
    boundaries and computing the alternative sequence.

    :param row: A single row from the DataFrame (A pandas.Series) containing among others CDS Start and End, Variant
                Start and End, Ref, Alt, Exon_CDS_seq (Original CDS sequence as string)
    :return: The input pandas.Series with additional information:
             Exon_CDS_length (length of the original CDS),
             Exon_Alt_CDS_seq (alternative CDS after applying the variant / None if invalid),
             Exon_Alt_CDS_length (length of the alternative CDS / None if invalid).
    """

    cds_seq = list(row["Exon_CDS_seq"])
    strand = row["Strand"]
    ref = row["Ref"]
    alt = row["Alt"]
    cds_start = int(row["Start"])
    cds_end = int(row["End"])
    var_start = int(row["Start_variant"])
    var_end = int(row["End_variant"])

    # Special handling for deletions (Ref = N and Alt = <DEL>)
    if ref == "N" and alt == "<DEL>":
        # Clip deletion to the CDS region (only remove overlap part)

        # Determine the overlap between variant and this CDS region
        overlap_start = max(var_start, cds_start)
        overlap_end = min(var_end, cds_end)

        # If there is no overlap between the variant and this CDS region
        if overlap_start >= overlap_end:
            return pd.Series({
                "Exon_CDS_length": len(cds_seq),
                "Exon_Alt_CDS_seq": None,
                "Exon_Alt_CDS_length": None
            })

        cds_index_start = overlap_start - cds_start
        cds_index_end = overlap_end - cds_start

        alt_seq = []
        alt_seq.extend(cds_seq[:cds_index_start])  # keep sequence before deletion
        alt_seq.extend(cds_seq[cds_index_end:])  # keep sequence after deletion

        return pd.Series({
            "Exon_CDS_length": len(cds_seq),
            "Exon_Alt_CDS_seq": "".join(alt_seq),
            "Exon_Alt_CDS_length": len(alt_seq)
        })

    # Special handling for duplications (Ref = N and Alt = <DUP>)
    if ref == "N" and alt == "<DUP>":

        # Determine the overlap between variant and this CDS region
        overlap_start = max(var_start, cds_start)
        overlap_end = min(var_end, cds_end)

        # If there is no overlap between the variant and this CDS region
        if overlap_start >= overlap_end:
            return pd.Series({
                "Exon_CDS_length": len(cds_seq),
                "Exon_Alt_CDS_seq": None,
                "Exon_Alt_CDS_length": None
            })

        cds_index_start = overlap_start - cds_start
        cds_index_end = overlap_end - cds_start

        alt_seq = []
        alt_seq.extend(cds_seq[:cds_index_start])  # sequence up to the end of the overlap
        alt_seq.extend(cds_seq[cds_index_start:cds_index_end]) # overlap-region (original, in CDS)
        alt_seq.extend(cds_seq[cds_index_start:cds_index_end])  # duplicate the overlapped region
        alt_seq.extend(cds_seq[cds_index_end:])  # keep sequence after duplication

        return pd.Series({
            "Exon_CDS_length": len(cds_seq),
            "Exon_Alt_CDS_seq": "".join(alt_seq),
            "Exon_Alt_CDS_length": len(alt_seq)
        })

    # Determine the overlap between variant and this CDS region
    overlap_start = max(var_start, cds_start)
    overlap_end = min(var_end, cds_end)

    # If there is no overlap between the variant and this CDS region
    if overlap_start >= overlap_end:
        return pd.Series({
            "Exon_CDS_length": len(cds_seq),
            "Exon_Alt_CDS_seq": None,
            "Exon_Alt_CDS_length": None
        })

    # Position of overlap within the CDS
    cds_index = overlap_start - cds_start
    overlap_len = overlap_end - overlap_start

    # Offset of the overlapping region within the variant
    ref_offset = overlap_start - var_start
    ref_in_cds = ref[ref_offset:ref_offset + overlap_len]
    alt_in_cds = alt[ref_offset:ref_offset + overlap_len]

    # Determine if there’s leftover alt outside CDS (insertions at end)
    extra_alt = ""
    if len(alt) > len(ref):
        # Limit extra_alt to what corresponds to CDS overlap
        extra_start = ref_offset + overlap_len
        if var_end > cds_end:
            # Only include alt bases that map to CDS
            remaining_cds_len = cds_end - overlap_end
            extra_alt = alt[extra_start:extra_start + remaining_cds_len]
        else:
            extra_alt = alt[extra_start:]

    # Confirm that the reference matches
    cds_ref_part = "".join(cds_seq[cds_index:cds_index + overlap_len])
    if cds_ref_part != ref_in_cds.upper(): # reference mismatch
        return pd.Series({
            "Exon_CDS_length": len(cds_seq),
            "Exon_Alt_CDS_seq": None,
            "Exon_Alt_CDS_length": None
        })

    # Build alternative sequence
    alt_seq = []
    alt_seq.extend(cds_seq[:cds_index]) # Copy the CDS up to the variant position

    if len(ref) == len(alt): # Substitution
        alt_seq.extend(list(alt_in_cds))
    elif len(alt) > len(ref): # Insertion
        alt_seq.extend(list(alt_in_cds))
        alt_seq.extend(list(extra_alt))
    elif len(ref) > len(alt): # Deletion
        alt_seq.extend(list(alt_in_cds))

    # Add remaining CDS sequence after variant
    alt_seq.extend(cds_seq[cds_index + overlap_len:])

    return pd.Series({
        "Exon_CDS_length": len(cds_seq),
        "Exon_Alt_CDS_seq": "".join(alt_seq),
        "Exon_Alt_CDS_length": len(alt_seq)
    })

intersection_cds_vcf[["Exon_CDS_length", "Exon_Alt_CDS_seq", "Exon_Alt_CDS_length"]] = intersection_cds_vcf.apply(
        apply_variant_edge_aware_with_lengths,
        axis=1
    )

intersection_cds_vcf

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,transcript_id,gene_type,gene_name,transcript_type,transcript_name,exon_number,exon_id,level,transcript_support_level,hgnc_id,tag,havana_gene,havana_transcript,remap_original_location,remap_status,remap_num_mappings,remap_target_status,ont,protein_id,ccdsid,gene_status,transcript_status,remap_substituted_missing_target,Start_variant,End_variant,ID,Ref,Alt,Qual,Filter,Info,Exon_CDS_seq,Exon_CDS_length,Exon_Alt_CDS_seq,Exon_Alt_CDS_length
0,chr8,HAVANA,CDS,42705933,42706140,.,-,0,ENSG00000120925.16_6,ENST00000319104.7_2,protein_coding,RNF170,protein_coding,RNF170-203,1,ENSE00001239742.5,2,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,OTTHUMT00000383165.1_2,chr8:-:42850794-42850997,full_contig,,,,ENSP00000326138.3,CCDS55230.1,,,,42706139,42706140,.,C,C,.,.,.,TTAACCTGAGAACCAAAGGATGGAAACACAGTCTAGTAACGTGAAG...,207,TTAACCTGAGAACCAAAGGATGGAAACACAGTCTAGTAACGTGAAG...,207
1,chr8,ENSEMBL,CDS,42720558,42720632,.,-,2,ENSG00000120925.16_6,ENST00000534961.5_2,protein_coding,RNF170,protein_coding,RNF170-209,3,ENSE00003678305.1,3,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,,chr8:-:42865416-42865489,full_contig,,,,ENSP00000445725.1,CCDS6138.1,,,,42720557,42720632,.,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,.,.,.,CGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACCA...,74,,0
2,chr8,ENSEMBL,CDS,42720558,42720632,.,-,2,ENSG00000120925.16_6,ENST00000534961.5_2,protein_coding,RNF170,protein_coding,RNF170-209,3,ENSE00003678305.1,3,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,,chr8:-:42865416-42865489,full_contig,,,,ENSP00000445725.1,CCDS6138.1,,,,42720558,42720559,.,C,T,.,.,.,CGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACCA...,74,TGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACCA...,74
3,chr8,HAVANA,CDS,42720558,42720632,.,-,2,ENSG00000120925.16_6,ENST00000319104.7_2,protein_coding,RNF170,protein_coding,RNF170-203,2,ENSE00003678305.1,2,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,OTTHUMT00000383165.1_2,chr8:-:42865416-42865489,full_contig,,,,ENSP00000326138.3,CCDS55230.1,,,,42720557,42720632,.,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,.,.,.,CGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACCA...,74,,0
4,chr8,HAVANA,CDS,42720558,42720632,.,-,2,ENSG00000120925.16_6,ENST00000319104.7_2,protein_coding,RNF170,protein_coding,RNF170-203,2,ENSE00003678305.1,2,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,OTTHUMT00000383165.1_2,chr8:-:42865416-42865489,full_contig,,,,ENSP00000326138.3,CCDS55230.1,,,,42720558,42720559,.,C,T,.,.,.,CGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACCA...,74,TGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACCA...,74
5,chr8,HAVANA,CDS,42720558,42720632,.,-,2,ENSG00000120925.16_6,ENST00000526349.5_2,protein_coding,RNF170,protein_coding,RNF170-205,3,ENSE00003678305.1,2,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,OTTHUMT00000383168.1_2,chr8:-:42865416-42865489,full_contig,,,,ENSP00000435782.1,CCDS55229.1,,,,42720557,42720632,.,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,.,.,.,CGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACCA...,74,,0
6,chr8,HAVANA,CDS,42720558,42720632,.,-,2,ENSG00000120925.16_6,ENST00000526349.5_2,protein_coding,RNF170,protein_coding,RNF170-205,3,ENSE00003678305.1,2,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,OTTHUMT00000383168.1_2,chr8:-:42865416-42865489,full_contig,,,,ENSP00000435782.1,CCDS55229.1,,,,42720558,42720559,.,C,T,.,.,.,CGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACCA...,74,TGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACCA...,74
7,chr8,HAVANA,CDS,42720558,42720632,.,-,2,ENSG00000120925.16_6,ENST00000527424.6_3,protein_coding,RNF170,protein_coding,RNF170-206,3,ENSE00003678305.1,2,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,OTTHUMT00000383166.1_3,chr8:-:42865416-42865489,full_contig,,,,ENSP00000434797.1,CCDS6138.1,,,,42720557,42720632,.,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,.,.,.,CGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACCA...,74,,0
8,chr8,HAVANA,CDS,42720558,42720632,.,-,2,ENSG00000120925.16_6,ENST00000527424.6_3,protein_coding,RNF170,protein_coding,RNF170-206,3,ENSE00003678305.1,2,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,OTTHUMT00000383166.1_3,chr8:-:42865416-42865489,full_contig,,,,ENSP00000434797.1,CCDS6138.1,,,,42720558,42720559,.,C,T,.,.,.,CGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACCA...,74,TGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACCA...,74
9,chr8,HAVANA,CDS,42720605,42720632,.,-,2,ENSG00000120925.16_6,ENST00000531440.5_2,protein_coding,RNF170,protein_coding,RNF170-208,1,ENSE00002147066.1,2,4,HGNC:25358,cds_end_NF,OTTHUMG00000165277.1_6,OTTHUMT00000383170.1_2,chr8:-:42865466-42865489,full_contig,,,,ENSP00000436416.1,,,,,42720557,42720632,.,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,.,.,.,ATCGCCAGTAAGCAATAATGCAGGCAC,27,,0


In [20]:
# Filter out Variants with a reference mismatch
mismatched_rows = intersection_cds_vcf[intersection_cds_vcf["Exon_Alt_CDS_seq"].isna()]
print(f"\n[Warning] Skipping {len(mismatched_rows)} variants due to reference mismatches")
if not mismatched_rows.empty:
    print(mismatched_rows[["transcript_id", "Chromosome", "Start_variant", "End_variant", "Ref", "Alt"]].to_string(index=False))
    
intersection_cds_vcf = intersection_cds_vcf[intersection_cds_vcf["Exon_Alt_CDS_seq"].notna()].copy()
intersection_cds_vcf




Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,transcript_id,gene_type,gene_name,transcript_type,transcript_name,exon_number,exon_id,level,transcript_support_level,hgnc_id,tag,havana_gene,havana_transcript,remap_original_location,remap_status,remap_num_mappings,remap_target_status,ont,protein_id,ccdsid,gene_status,transcript_status,remap_substituted_missing_target,Start_variant,End_variant,ID,Ref,Alt,Qual,Filter,Info,Exon_CDS_seq,Exon_CDS_length,Exon_Alt_CDS_seq,Exon_Alt_CDS_length
0,chr8,HAVANA,CDS,42705933,42706140,.,-,0,ENSG00000120925.16_6,ENST00000319104.7_2,protein_coding,RNF170,protein_coding,RNF170-203,1,ENSE00001239742.5,2,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,OTTHUMT00000383165.1_2,chr8:-:42850794-42850997,full_contig,,,,ENSP00000326138.3,CCDS55230.1,,,,42706139,42706140,.,C,C,.,.,.,TTAACCTGAGAACCAAAGGATGGAAACACAGTCTAGTAACGTGAAG...,207,TTAACCTGAGAACCAAAGGATGGAAACACAGTCTAGTAACGTGAAG...,207
1,chr8,ENSEMBL,CDS,42720558,42720632,.,-,2,ENSG00000120925.16_6,ENST00000534961.5_2,protein_coding,RNF170,protein_coding,RNF170-209,3,ENSE00003678305.1,3,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,,chr8:-:42865416-42865489,full_contig,,,,ENSP00000445725.1,CCDS6138.1,,,,42720557,42720632,.,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,.,.,.,CGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACCA...,74,,0
2,chr8,ENSEMBL,CDS,42720558,42720632,.,-,2,ENSG00000120925.16_6,ENST00000534961.5_2,protein_coding,RNF170,protein_coding,RNF170-209,3,ENSE00003678305.1,3,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,,chr8:-:42865416-42865489,full_contig,,,,ENSP00000445725.1,CCDS6138.1,,,,42720558,42720559,.,C,T,.,.,.,CGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACCA...,74,TGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACCA...,74
3,chr8,HAVANA,CDS,42720558,42720632,.,-,2,ENSG00000120925.16_6,ENST00000319104.7_2,protein_coding,RNF170,protein_coding,RNF170-203,2,ENSE00003678305.1,2,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,OTTHUMT00000383165.1_2,chr8:-:42865416-42865489,full_contig,,,,ENSP00000326138.3,CCDS55230.1,,,,42720557,42720632,.,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,.,.,.,CGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACCA...,74,,0
4,chr8,HAVANA,CDS,42720558,42720632,.,-,2,ENSG00000120925.16_6,ENST00000319104.7_2,protein_coding,RNF170,protein_coding,RNF170-203,2,ENSE00003678305.1,2,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,OTTHUMT00000383165.1_2,chr8:-:42865416-42865489,full_contig,,,,ENSP00000326138.3,CCDS55230.1,,,,42720558,42720559,.,C,T,.,.,.,CGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACCA...,74,TGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACCA...,74
5,chr8,HAVANA,CDS,42720558,42720632,.,-,2,ENSG00000120925.16_6,ENST00000526349.5_2,protein_coding,RNF170,protein_coding,RNF170-205,3,ENSE00003678305.1,2,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,OTTHUMT00000383168.1_2,chr8:-:42865416-42865489,full_contig,,,,ENSP00000435782.1,CCDS55229.1,,,,42720557,42720632,.,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,.,.,.,CGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACCA...,74,,0
6,chr8,HAVANA,CDS,42720558,42720632,.,-,2,ENSG00000120925.16_6,ENST00000526349.5_2,protein_coding,RNF170,protein_coding,RNF170-205,3,ENSE00003678305.1,2,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,OTTHUMT00000383168.1_2,chr8:-:42865416-42865489,full_contig,,,,ENSP00000435782.1,CCDS55229.1,,,,42720558,42720559,.,C,T,.,.,.,CGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACCA...,74,TGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACCA...,74
7,chr8,HAVANA,CDS,42720558,42720632,.,-,2,ENSG00000120925.16_6,ENST00000527424.6_3,protein_coding,RNF170,protein_coding,RNF170-206,3,ENSE00003678305.1,2,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,OTTHUMT00000383166.1_3,chr8:-:42865416-42865489,full_contig,,,,ENSP00000434797.1,CCDS6138.1,,,,42720557,42720632,.,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,.,.,.,CGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACCA...,74,,0
8,chr8,HAVANA,CDS,42720558,42720632,.,-,2,ENSG00000120925.16_6,ENST00000527424.6_3,protein_coding,RNF170,protein_coding,RNF170-206,3,ENSE00003678305.1,2,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,OTTHUMT00000383166.1_3,chr8:-:42865416-42865489,full_contig,,,,ENSP00000434797.1,CCDS6138.1,,,,42720558,42720559,.,C,T,.,.,.,CGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACCA...,74,TGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACCA...,74
9,chr8,HAVANA,CDS,42720605,42720632,.,-,2,ENSG00000120925.16_6,ENST00000531440.5_2,protein_coding,RNF170,protein_coding,RNF170-208,1,ENSE00002147066.1,2,4,HGNC:25358,cds_end_NF,OTTHUMG00000165277.1_6,OTTHUMT00000383170.1_2,chr8:-:42865466-42865489,full_contig,,,,ENSP00000436416.1,,,,,42720557,42720632,.,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,.,.,.,ATCGCCAGTAAGCAATAATGCAGGCAC,27,,0


In [21]:
# Limit to relevant transcript (to save time)
relevant_transcripts = intersection_cds_vcf["transcript_id"].unique()
cds_df_adj = cds_df_adj[cds_df_adj["transcript_id"].isin(relevant_transcripts)].copy()

In [22]:
cds_df_adj = add_exon_cds_sequence(cds_df_adj, fasta)
cds_df_adj

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,transcript_id,gene_type,gene_name,transcript_type,transcript_name,exon_number,exon_id,level,transcript_support_level,hgnc_id,tag,havana_gene,havana_transcript,remap_original_location,remap_status,remap_num_mappings,remap_target_status,ont,protein_id,ccdsid,gene_status,transcript_status,remap_substituted_missing_target,Exon_CDS_seq
330801,chr8,HAVANA,CDS,42705933,42706140,.,-,0,ENSG00000120925.16_6,ENST00000319104.7_2,protein_coding,RNF170,protein_coding,RNF170-203,1,ENSE00001239742.5,2,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,OTTHUMT00000383165.1_2,chr8:-:42850794-42850997,full_contig,,,,ENSP00000326138.3,CCDS55230.1,,,,TTAACCTGAGAACCAAAGGATGGAAACACAGTCTAGTAACGTGAAG...
330802,chr8,ENSEMBL,CDS,42711301,42711571,.,-,0,ENSG00000120925.16_6,ENST00000534961.5_2,protein_coding,RNF170,protein_coding,RNF170-209,1,ENSE00001239728.3,3,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,,chr8:-:42856162-42856428,full_contig,,,,ENSP00000445725.1,CCDS6138.1,,,,TCATCTAGTTAGCCTTTGGGTTATCACTTCTCGATACATAATAGAG...
330803,chr8,HAVANA,CDS,42711301,42711571,.,-,0,ENSG00000120925.16_6,ENST00000526349.5_2,protein_coding,RNF170,protein_coding,RNF170-205,1,ENSE00002152697.1,2,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,OTTHUMT00000383168.1_2,chr8:-:42856162-42856428,full_contig,,,,ENSP00000435782.1,CCDS55229.1,,,,TCATCTAGTTAGCCTTTGGGTTATCACTTCTCGATACATAATAGAG...
330804,chr8,HAVANA,CDS,42711301,42711571,.,-,0,ENSG00000120925.16_6,ENST00000527424.6_3,protein_coding,RNF170,protein_coding,RNF170-206,1,ENSE00002184629.1,2,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,OTTHUMT00000383166.1_3,chr8:-:42856162-42856428,full_contig,,,,ENSP00000434797.1,CCDS6138.1,,,,TCATCTAGTTAGCCTTTGGGTTATCACTTCTCGATACATAATAGAG...
330805,chr8,ENSEMBL,CDS,42716887,42716998,.,-,0,ENSG00000120925.16_6,ENST00000534961.5_2,protein_coding,RNF170,protein_coding,RNF170-209,2,ENSE00003614933.1,3,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,,chr8:-:42861745-42861855,full_contig,,,,ENSP00000445725.1,CCDS6138.1,,,,AGATCTGGGTTGCCCTGAGAATCTCCGGTTATAATCATTAATATCC...
330806,chr8,HAVANA,CDS,42716887,42716998,.,-,0,ENSG00000120925.16_6,ENST00000526349.5_2,protein_coding,RNF170,protein_coding,RNF170-205,2,ENSE00003614933.1,2,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,OTTHUMT00000383168.1_2,chr8:-:42861745-42861855,full_contig,,,,ENSP00000435782.1,CCDS55229.1,,,,AGATCTGGGTTGCCCTGAGAATCTCCGGTTATAATCATTAATATCC...
330807,chr8,HAVANA,CDS,42716887,42716998,.,-,0,ENSG00000120925.16_6,ENST00000527424.6_3,protein_coding,RNF170,protein_coding,RNF170-206,2,ENSE00003614933.1,2,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,OTTHUMT00000383166.1_3,chr8:-:42861745-42861855,full_contig,,,,ENSP00000434797.1,CCDS6138.1,,,,AGATCTGGGTTGCCCTGAGAATCTCCGGTTATAATCATTAATATCC...
330810,chr8,ENSEMBL,CDS,42720558,42720632,.,-,2,ENSG00000120925.16_6,ENST00000534961.5_2,protein_coding,RNF170,protein_coding,RNF170-209,3,ENSE00003678305.1,3,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,,chr8:-:42865416-42865489,full_contig,,,,ENSP00000445725.1,CCDS6138.1,,,,CGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACCA...
330811,chr8,HAVANA,CDS,42720558,42720632,.,-,2,ENSG00000120925.16_6,ENST00000319104.7_2,protein_coding,RNF170,protein_coding,RNF170-203,2,ENSE00003678305.1,2,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,OTTHUMT00000383165.1_2,chr8:-:42865416-42865489,full_contig,,,,ENSP00000326138.3,CCDS55230.1,,,,CGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACCA...
330812,chr8,HAVANA,CDS,42720558,42720632,.,-,2,ENSG00000120925.16_6,ENST00000526349.5_2,protein_coding,RNF170,protein_coding,RNF170-205,3,ENSE00003678305.1,2,1,HGNC:25358,CCDS,OTTHUMG00000165277.1_6,OTTHUMT00000383168.1_2,chr8:-:42865416-42865489,full_contig,,,,ENSP00000435782.1,CCDS55229.1,,,,CGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACCA...


In [23]:
def create_reference_cds(intersection_cds_vcf, cds_df_test):

    """
    Constructs the whole CDS sequence (multiple exons) for transcripts affected by a variant, both in their reference
    and alternative form.
    :param intersection_cds_vcf: DataFrame containing variant-CDS intersection and corresponding alternative CDS sequences
                                 includes: transcript_id, Exon_CDS_seq + length, Exon_ALT_CDS_seq + length
    :param cds_df_test: Reference exon-level CDS data for all transcripts with exon_number
                        includes: transcript_id, exon_number, Start, End, Strand, Exon_CDS_seq
    :return: DataFrame with one row per variant-transcript pair, containing full reference and alternative CDS + lengths,
             exon-wise CDS information as tuple (exon number, exon-wise CDS length)
    """

    results = []

    for transcript_id, var_df in intersection_cds_vcf.groupby("transcript_id"):  # Only transcripts with a variant

        # 1. Get reference exons
        ref_exons = cds_df_test[cds_df_test["transcript_id"] == transcript_id].copy()
        ref_exons = ref_exons.sort_values("Start")

        # Get reference CDS sequence start und stop position for finding position in transcript sequence
        ref_cds_start = ref_exons["Start"].min()
        ref_cds_stop = ref_exons["End"].max()

        # Join reference exon sequences to form full CDS sequence
        ref_seq = "".join(ref_exons["Exon_CDS_seq"].tolist())

        ######
        # Since I sometimes get errors in the following code snippet because of NaN values,
        # we print them but leave them in our dataframe for now
        #nan_rows = ref_exons[ref_exons["Exon_CDS_seq"].isna()]
        #if not nan_rows.empty:
        #    print(f"\n[Warning] Found {len(nan_rows)} NaN Exon_CDS_seq entries in transcript: {transcript_id}")
        #    print(nan_rows.to_string(index=False))

        # Collect exon numbers and lengths (for tracking exon contribution later on)
        # ref_cds_lengths = [len(seq) for seq in ref_exons["Exon_CDS_seq"].tolist()]
        
        ## old option:
        #ref_cds_info = [
        #    (row["exon_number"], len(row["Exon_CDS_seq"]))
        #    for _, row in ref_exons.iterrows()
        #]

        ## new option:
        ref_cds_info = sorted([
            (row["exon_number"], len(row["Exon_CDS_seq"]))
            for _, row in ref_exons.iterrows()
        ], key=lambda x: x[0])

        ######

        # Get strand info (all should be the same within transcript)
        strand = ref_exons["Strand"].iloc[0]

        for variant, cds_df in var_df.groupby(["Chromosome", "Start_variant", "End_variant", "Ref", "Alt"],
                                              observed=True):

            # Sort variant exons
            cds_df = cds_df.sort_values("Start")

            # Copy ref exons for modification
            alt_exons = ref_exons.copy()

            # Replace affected exon sequences with variant versions
            for _, var_row in cds_df.iterrows():
                exon_nr = var_row["exon_number"]
                alt_exons.loc[alt_exons["exon_number"] == exon_nr, "Exon_CDS_seq"] = var_row["Exon_Alt_CDS_seq"]

            # Join and sort alt CDS
            alt_exons = alt_exons.sort_values("Start")

            ######
            # Since I sometimes get errors in the following code snippet because of NaN values,
            # we print them but leave them in our dataframe for now (same as before)
            #nan_alt_rows = alt_exons[alt_exons["Exon_CDS_seq"].isna()]
            #if not nan_alt_rows.empty:
            #    print(f"\n[Warning] NaN Exon_CDS_seq values found in alt_exons for variant in transcript: {transcript_id}")
            #    print(nan_alt_rows.to_string(index=False))

            # alt_cds_lengths = [len(seq) for seq in alt_exons["Exon_CDS_seq"].tolist()]
            
            ## old option:
            #alt_cds_info = [
            #    (row["exon_number"], len(row["Exon_CDS_seq"]))
            #    for _, row in alt_exons.iterrows()
            #]

            ## new option:
            alt_cds_info = sorted([
                (row["exon_number"], len(row["Exon_CDS_seq"]))
                for _, row in alt_exons.iterrows()
            ], key=lambda x: x[0])
            ######

            # Get alternative CDS sequence start and stop position for finding position in transcript sequence
            alt_cds_start = alt_exons["Start"].min()
            alt_cds_stop = alt_exons["End"].max()

            alt_seq = "".join(alt_exons["Exon_CDS_seq"].tolist())

            # Apply reverse complement if on minus strand
            if strand == "-":
                ref_seq_final = str(Seq(ref_seq).reverse_complement())
                alt_seq_final = str(Seq(alt_seq).reverse_complement())
            else:
                ref_seq_final = ref_seq
                alt_seq_final = alt_seq

            # Append to results
            results.append({
                "transcript_id": transcript_id,
                "variant_id": var_row["ID"],

                "ref_cds_start": ref_cds_start,
                "ref_cds_stop": ref_cds_stop,
                "ref_cds_seq": ref_seq_final,
                "ref_cds_len": len(ref_seq_final),

                "alt_cds_start": alt_cds_start,
                "alt_cds_stop": alt_cds_stop,
                "alt_cds_seq": alt_seq_final,
                "alt_cds_len": len(alt_seq_final),

                "chromosome": var_row["Chromosome"],
                "gene_id": var_row["gene_id"],
                "strand": strand,
                "ref": var_row["Ref"],
                "alt": var_row["Alt"],
                "start_variant": var_row["Start_variant"],
                "end_variant": var_row["End_variant"],

                "ref_cds_info": ref_cds_info,
                "alt_cds_info": alt_cds_info,

                # "ref_cds_lengths": [length for exon_num, length in ref_cds_info],
                # "alt_cds_lengths": [length for exon_num, length in alt_cds_info]
            })

    results_df = pd.DataFrame(results)
    return(results_df)

results_df = create_reference_cds(intersection_cds_vcf, cds_df_adj)
results_df

Unnamed: 0,transcript_id,variant_id,ref_cds_start,ref_cds_stop,ref_cds_seq,ref_cds_len,alt_cds_start,alt_cds_stop,alt_cds_seq,alt_cds_len,chromosome,gene_id,strand,ref,alt,start_variant,end_variant,ref_cds_info,alt_cds_info
0,ENST00000319104.7_2,.,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,chr8,ENSG00000120925.16_6,-,C,C,42706139,42706140,"[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]","[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]"
1,ENST00000319104.7_2,.,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,529,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]","[(1, 207), (2, 0), (3, 109), (4, 76), (5, 137)]"
2,ENST00000319104.7_2,.,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,chr8,ENSG00000120925.16_6,-,C,T,42720558,42720559,"[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]","[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]"
3,ENST00000526349.5_2,.,42711301,42725216,ATGTACTGTCCCATCTGCCTGCACCAAGCCTCCTTCCCGGTGGAGA...,525,42711301,42725216,ATGTACTGTCCCATCTGCCTGCACCAAGCCTCCTTCCCGGTGGAGA...,451,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 270), (2, 111), (3, 74), (4, 70)]","[(1, 270), (2, 111), (3, 0), (4, 70)]"
4,ENST00000526349.5_2,.,42711301,42725216,ATGTACTGTCCCATCTGCCTGCACCAAGCCTCCTTCCCGGTGGAGA...,525,42711301,42725216,ATGTACTGTCCCATCTGCCTGCACCAAGCCTCCTTCCCGGTGGAGA...,525,chr8,ENSG00000120925.16_6,-,C,T,42720558,42720559,"[(1, 270), (2, 111), (3, 74), (4, 70)]","[(1, 270), (2, 111), (3, 74), (4, 70)]"
5,ENST00000527424.6_3,.,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,703,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...","[(1, 270), (2, 111), (3, 0), (4, 109), (5, 76)..."
6,ENST00000527424.6_3,.,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,chr8,ENSG00000120925.16_6,-,C,T,42720558,42720559,"[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...","[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76..."
7,ENST00000531440.5_2,.,42720605,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,349,42720605,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,322,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 27), (2, 109), (3, 76), (4, 137)]","[(1, 0), (2, 109), (3, 76), (4, 137)]"
8,ENST00000534961.5_2,.,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,703,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...","[(1, 270), (2, 111), (3, 0), (4, 109), (5, 76)..."
9,ENST00000534961.5_2,.,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,chr8,ENSG00000120925.16_6,-,C,T,42720558,42720559,"[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...","[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76..."


In [24]:
def get_transcript_sequence(exons_df, fasta):

    """
    Construct full transcript sequences by concatenating the exon sequences from the FASTA genome reference, grouped by transcript.
    Get transcript length and transcript information as well.
    :param exons_df: DataFrame containing exon-level annotations from the GTF file.
                     Must include: transcript_id, strand, chromosome, start, end, exon_number
    :param fasta: Fasta file, reference genome object
    :return: DataFrame with one row per transcript with full transcript sequence, start, end, strand, transcript sequence length, and
             per exon sequence length information for that transcript
    """

    exon_data = []

    # Process each transcript individually
    for transcript_id, group in exons_df.groupby("transcript_id"):
        strand = group.iloc[0]["Strand"]

        if strand not in ["+", "-"]:
            print(f"Unknown strand for {transcript_id}")
            continue

        # Sort by exon start coordinate (strand not considered here yet)
        group_sorted = group.sort_values(by="Start").copy()

        seq_parts = [] # to accumulate exon sequences
        starts = [] # for overall transcript start
        ends = [] # for overall transcript end
        exon_info = [] # for tracking exon_number and length

        # fetch exon sequence and metadata
        for _, row in group_sorted.iterrows():
            chrom = row["Chromosome"]
            start = int(row["Start"])
            end = int(row["End"])
            exon_number = row["exon_number"]

            starts.append(start)
            ends.append(end)

            # Fetch exon sequence from fasta reference genome
            exon_seq = fasta[chrom][start:end] #.seq
            exon_seq_str = str(exon_seq).upper()
            seq_parts.append(exon_seq_str)

            exon_info.append((exon_number, len(exon_seq_str)))

        # join exon sequences into a full transcript sequence
        joined_seq = "".join(seq_parts)

        # Apply reverse complement for minus strand transcripts
        if strand == "-":
            joined_seq = str(Seq(joined_seq).reverse_complement())
            exon_info = exon_info[::-1]

        exon_data.append({
            "Chromosome": chrom,
            "transcript_id": transcript_id,
            "start": min(starts),
            "end": max(ends),
            "strand": strand,
            "transcript_sequence": joined_seq,
            "transcript_length": len(joined_seq),
            "transcript_exon_info": exon_info
        })

    exon_seqs = pd.DataFrame(exon_data)
    return(exon_seqs)


exons_df = exons_df[exons_df["transcript_id"].isin(relevant_transcripts)].copy()
exon_seqs = get_transcript_sequence(exons_df, fasta)

In [25]:
exon_seqs_indexed = exon_seqs.set_index("transcript_id")
exon_seqs_indexed

Unnamed: 0_level_0,Chromosome,start,end,strand,transcript_sequence,transcript_length,transcript_exon_info
transcript_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
ENST00000319104.7_2,chr8,42704779,42751728,-,GACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGGGAGCGGTCCTGGA...,1866,"[(6, 102), (5, 144), (4, 76), (3, 109), (2, 74..."
ENST00000526349.5_2,chr8,42710849,42751849,-,GGCGAGCACGCGCTCCGCCCTGGAGGCTGCGGCGACGGGTCCTCCT...,1338,"[(7, 223), (6, 23), (5, 76), (4, 109), (3, 74)..."
ENST00000527424.6_3,chr8,42708443,42751748,-,GCACTCCAAATTAGAAAGGGGACGTCTAGTGGGTTGCCCGGGAGGG...,3764,"[(7, 122), (6, 144), (5, 76), (4, 109), (3, 74..."
ENST00000531440.5_2,chr8,42720608,42751741,-,AAATTAGAAAGGGGACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGG...,580,"[(6, 115), (5, 112), (4, 144), (3, 76), (2, 10..."
ENST00000534961.5_2,chr8,42708439,42751866,-,AGTTCGCTGCGTGTCGAGGCGAGCACGCGCTCCGCCCTGGAGGCTG...,4116,"[(7, 470), (6, 144), (5, 76), (4, 109), (3, 74..."


In [26]:
def check_cds_in_transcript(row):
    transcript_id = row["transcript_id"]

    # Skip if transcript_id not found
    if transcript_id not in exon_seqs_indexed.index:
        return False

    transcript_seq = exon_seqs_indexed.loc[transcript_id, "transcript_sequence"]
    ref_cds_seq = row["ref_cds_seq"]

    # Check if CDS is a substring of the transcript
    return ref_cds_seq in transcript_seq

results_df["cds_in_transcript"] = results_df.apply(check_cds_in_transcript, axis=1)
results_df

Unnamed: 0,transcript_id,variant_id,ref_cds_start,ref_cds_stop,ref_cds_seq,ref_cds_len,alt_cds_start,alt_cds_stop,alt_cds_seq,alt_cds_len,chromosome,gene_id,strand,ref,alt,start_variant,end_variant,ref_cds_info,alt_cds_info,cds_in_transcript
0,ENST00000319104.7_2,.,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,chr8,ENSG00000120925.16_6,-,C,C,42706139,42706140,"[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]","[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]",True
1,ENST00000319104.7_2,.,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,529,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]","[(1, 207), (2, 0), (3, 109), (4, 76), (5, 137)]",True
2,ENST00000319104.7_2,.,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,chr8,ENSG00000120925.16_6,-,C,T,42720558,42720559,"[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]","[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]",True
3,ENST00000526349.5_2,.,42711301,42725216,ATGTACTGTCCCATCTGCCTGCACCAAGCCTCCTTCCCGGTGGAGA...,525,42711301,42725216,ATGTACTGTCCCATCTGCCTGCACCAAGCCTCCTTCCCGGTGGAGA...,451,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 270), (2, 111), (3, 74), (4, 70)]","[(1, 270), (2, 111), (3, 0), (4, 70)]",True
4,ENST00000526349.5_2,.,42711301,42725216,ATGTACTGTCCCATCTGCCTGCACCAAGCCTCCTTCCCGGTGGAGA...,525,42711301,42725216,ATGTACTGTCCCATCTGCCTGCACCAAGCCTCCTTCCCGGTGGAGA...,525,chr8,ENSG00000120925.16_6,-,C,T,42720558,42720559,"[(1, 270), (2, 111), (3, 74), (4, 70)]","[(1, 270), (2, 111), (3, 74), (4, 70)]",True
5,ENST00000527424.6_3,.,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,703,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...","[(1, 270), (2, 111), (3, 0), (4, 109), (5, 76)...",True
6,ENST00000527424.6_3,.,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,chr8,ENSG00000120925.16_6,-,C,T,42720558,42720559,"[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...","[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...",True
7,ENST00000531440.5_2,.,42720605,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,349,42720605,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,322,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 27), (2, 109), (3, 76), (4, 137)]","[(1, 0), (2, 109), (3, 76), (4, 137)]",False
8,ENST00000534961.5_2,.,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,703,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...","[(1, 270), (2, 111), (3, 0), (4, 109), (5, 76)...",True
9,ENST00000534961.5_2,.,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,chr8,ENSG00000120925.16_6,-,C,T,42720558,42720559,"[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...","[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...",True


In [27]:
def get_exon(cds_pos, exon_info):
    """
    Map a CDS-relative position to the corresponding exon number using exon_info,
    which is a list of (exon_number, exon_length) tuples in CDS order.
    """
    pos_counter = 0
    for exon_number, exon_length in exon_info:
        if cds_pos < pos_counter + exon_length:
            return exon_number
        pos_counter += exon_length
    return exon_info[-1][0]  # fallback

def analyze_sequence(results_df):

    """
    Analyzes reference and alternative CDS for start and stop codons, their positions, and potential premature termination codons (PTCs)

    :param results_df: DataFrame containing CDS sequences and exon information for both reference and alternative sequences, per variant
    :return: DataFrame with added annotation columns for reference and alternative sequence separately:
             such as start codon position / exon, last codon and its validity as stop codon, first in-frame stop codon + position,
             number and information of all available stop codons, premature stop codon flag
    """

    valid_stop_codons = {"TAA", "TAG", "TGA"}
    start_codon = "ATG"

    df = results_df.copy()

    # Initialize result columns for both reference and alternative sequence
    for label in ["ref", "alt"]:
        df[f"{label}_start_codon_pos"] = None
        df[f"{label}_start_codon_exon"] = None  # exon number
        df[f"{label}_last_codon"] = None
        df[f"{label}_valid_stop"] = None
        df[f"{label}_first_stop_codon"] = None
        df[f"{label}_first_stop_pos"] = None
        df[f"{label}_num_stop_codons"] = None
        df[f"{label}_all_stop_codons"] = None
        df[f"{label}_stop_codon_exons"] = None  # exon number
        df[f"{label}_is_premature"] = None

    # Row-wise codon scanning
    for idx, row in df.iterrows():
        for label in ["ref", "alt"]:
            seq = row[f"{label}_cds_seq"]

            exon_info = row[f"{label}_cds_info"]  # for exon number

            # Skip invalid or too-short sequences
            if not isinstance(seq, str) or len(seq) < 3:
                continue

            start_pos = None
            stop_codons = []
            stop_exons = []  # for exon number

            # Scan in codons (step=3)
            for i in range(0, len(seq) - 2, 3):
                codon = seq[i:i + 3]
                if codon == start_codon and start_pos is None: # first start codon position
                    start_pos = i
                if codon in valid_stop_codons: # record all stop codons with their positions and exons
                    stop_codons.append((i, codon))
                    stop_exons.append(get_exon(i, exon_info))  # for exon number

            last_codon = seq[-3:]
            is_valid_stop = last_codon in valid_stop_codons
            first_stop_pos = stop_codons[0][0] if stop_codons else None
            first_stop = stop_codons[0][1] if stop_codons else None
            is_premature = first_stop_pos is not None and first_stop_pos < len(seq) - 3
            start_exon = get_exon(start_pos, exon_info) if start_pos is not None else None  # for exon number

            # Store results
            df.at[idx, f"{label}_start_codon_pos"] = start_pos
            df.at[idx, f"{label}_start_codon_exon"] = start_exon  # exon number
            df.at[idx, f"{label}_last_codon"] = last_codon
            df.at[idx, f"{label}_valid_stop"] = is_valid_stop
            df.at[idx, f"{label}_first_stop_codon"] = first_stop
            df.at[idx, f"{label}_first_stop_pos"] = first_stop_pos
            df.at[idx, f"{label}_num_stop_codons"] = len(stop_codons)
            df.at[idx, f"{label}_all_stop_codons"] = stop_codons
            df.at[idx, f"{label}_stop_codon_exons"] = stop_exons  # exon number
            df.at[idx, f"{label}_is_premature"] = is_premature

    return df

analysis_df = analyze_sequence(results_df)

In [28]:
analysis_df

Unnamed: 0,transcript_id,variant_id,ref_cds_start,ref_cds_stop,ref_cds_seq,ref_cds_len,alt_cds_start,alt_cds_stop,alt_cds_seq,alt_cds_len,chromosome,gene_id,strand,ref,alt,start_variant,end_variant,ref_cds_info,alt_cds_info,cds_in_transcript,ref_start_codon_pos,ref_start_codon_exon,ref_last_codon,ref_valid_stop,ref_first_stop_codon,ref_first_stop_pos,ref_num_stop_codons,ref_all_stop_codons,ref_stop_codon_exons,ref_is_premature,alt_start_codon_pos,alt_start_codon_exon,alt_last_codon,alt_valid_stop,alt_first_stop_codon,alt_first_stop_pos,alt_num_stop_codons,alt_all_stop_codons,alt_stop_codon_exons,alt_is_premature
0,ENST00000319104.7_2,.,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,chr8,ENSG00000120925.16_6,-,C,C,42706139,42706140,"[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]","[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]",True,0,1,TAA,True,TAA,600.0,1,"[(600, TAA)]",[5],False,0,1,TAA,True,TAA,600.0,1,"[(600, TAA)]",[5],False
1,ENST00000319104.7_2,.,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,529,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]","[(1, 207), (2, 0), (3, 109), (4, 76), (5, 137)]",True,0,1,TAA,True,TAA,600.0,1,"[(600, TAA)]",[5],False,0,1,TAA,True,TGA,351.0,2,"[(351, TGA), (447, TGA)]","[4, 5]",True
2,ENST00000319104.7_2,.,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,chr8,ENSG00000120925.16_6,-,C,T,42720558,42720559,"[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]","[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]",True,0,1,TAA,True,TAA,600.0,1,"[(600, TAA)]",[5],False,0,1,TAA,True,TAA,600.0,1,"[(600, TAA)]",[5],False
3,ENST00000526349.5_2,.,42711301,42725216,ATGTACTGTCCCATCTGCCTGCACCAAGCCTCCTTCCCGGTGGAGA...,525,42711301,42725216,ATGTACTGTCCCATCTGCCTGCACCAAGCCTCCTTCCCGGTGGAGA...,451,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 270), (2, 111), (3, 74), (4, 70)]","[(1, 270), (2, 111), (3, 0), (4, 70)]",True,0,1,TGA,True,TGA,522.0,1,"[(522, TGA)]",[4],False,0,1,TGA,True,TGA,96.0,8,"[(96, TGA), (102, TGA), (141, TAA), (144, TGA)...","[1, 1, 1, 1, 1, 2, 2, 4]",True
4,ENST00000526349.5_2,.,42711301,42725216,ATGTACTGTCCCATCTGCCTGCACCAAGCCTCCTTCCCGGTGGAGA...,525,42711301,42725216,ATGTACTGTCCCATCTGCCTGCACCAAGCCTCCTTCCCGGTGGAGA...,525,chr8,ENSG00000120925.16_6,-,C,T,42720558,42720559,"[(1, 270), (2, 111), (3, 74), (4, 70)]","[(1, 270), (2, 111), (3, 74), (4, 70)]",True,0,1,TGA,True,TGA,522.0,1,"[(522, TGA)]",[4],False,0,1,TGA,True,TGA,522.0,1,"[(522, TGA)]",[4],False
5,ENST00000527424.6_3,.,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,703,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...","[(1, 270), (2, 111), (3, 0), (4, 109), (5, 76)...",True,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,0,1,TGA,True,TGA,348.0,8,"[(348, TGA), (354, TGA), (393, TAA), (396, TGA...","[2, 2, 4, 4, 4, 6, 6, 6]",True
6,ENST00000527424.6_3,.,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,chr8,ENSG00000120925.16_6,-,C,T,42720558,42720559,"[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...","[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...",True,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False
7,ENST00000531440.5_2,.,42720605,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,349,42720605,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,322,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 27), (2, 109), (3, 76), (4, 137)]","[(1, 0), (2, 109), (3, 76), (4, 137)]",False,0,1,GAT,False,,,0,[],[],False,0,2,GTG,False,,,0,[],[],False
8,ENST00000534961.5_2,.,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,703,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...","[(1, 270), (2, 111), (3, 0), (4, 109), (5, 76)...",True,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,0,1,TGA,True,TGA,348.0,8,"[(348, TGA), (354, TGA), (393, TAA), (396, TGA...","[2, 2, 4, 4, 4, 6, 6, 6]",True
9,ENST00000534961.5_2,.,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,chr8,ENSG00000120925.16_6,-,C,T,42720558,42720559,"[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...","[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...",True,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False


In [30]:
def start_stop_loss(df):
    df = df.copy()

    # Start codon loss: reference sequence has a start codon, alternative sequence does not or the position is changed
    df["start_loss"] = (
                               (df["ref_start_codon_pos"].notna()) & df["alt_start_codon_pos"].isna()
                       ) | (
                               df["ref_start_codon_pos"] != df["alt_start_codon_pos"]
                       )

    # Stop codon loss: reference sequence had a valid stop codon, the alternative sequence does not or the position is changed
    df["stop_loss"] = (
                              (df["ref_valid_stop"] == True) & (df["alt_valid_stop"] != True)
                      ) | (
                              df["ref_last_codon"] != df["alt_last_codon"]  # Or take this out?
                      )

    return df

loss_df = start_stop_loss(analysis_df)
loss_df

Unnamed: 0,transcript_id,variant_id,ref_cds_start,ref_cds_stop,ref_cds_seq,ref_cds_len,alt_cds_start,alt_cds_stop,alt_cds_seq,alt_cds_len,chromosome,gene_id,strand,ref,alt,start_variant,end_variant,ref_cds_info,alt_cds_info,cds_in_transcript,ref_start_codon_pos,ref_start_codon_exon,ref_last_codon,ref_valid_stop,ref_first_stop_codon,ref_first_stop_pos,ref_num_stop_codons,ref_all_stop_codons,ref_stop_codon_exons,ref_is_premature,alt_start_codon_pos,alt_start_codon_exon,alt_last_codon,alt_valid_stop,alt_first_stop_codon,alt_first_stop_pos,alt_num_stop_codons,alt_all_stop_codons,alt_stop_codon_exons,alt_is_premature,start_loss,stop_loss
0,ENST00000319104.7_2,.,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,chr8,ENSG00000120925.16_6,-,C,C,42706139,42706140,"[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]","[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]",True,0,1,TAA,True,TAA,600.0,1,"[(600, TAA)]",[5],False,0,1,TAA,True,TAA,600.0,1,"[(600, TAA)]",[5],False,False,False
1,ENST00000319104.7_2,.,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,529,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]","[(1, 207), (2, 0), (3, 109), (4, 76), (5, 137)]",True,0,1,TAA,True,TAA,600.0,1,"[(600, TAA)]",[5],False,0,1,TAA,True,TGA,351.0,2,"[(351, TGA), (447, TGA)]","[4, 5]",True,False,False
2,ENST00000319104.7_2,.,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,chr8,ENSG00000120925.16_6,-,C,T,42720558,42720559,"[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]","[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]",True,0,1,TAA,True,TAA,600.0,1,"[(600, TAA)]",[5],False,0,1,TAA,True,TAA,600.0,1,"[(600, TAA)]",[5],False,False,False
3,ENST00000526349.5_2,.,42711301,42725216,ATGTACTGTCCCATCTGCCTGCACCAAGCCTCCTTCCCGGTGGAGA...,525,42711301,42725216,ATGTACTGTCCCATCTGCCTGCACCAAGCCTCCTTCCCGGTGGAGA...,451,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 270), (2, 111), (3, 74), (4, 70)]","[(1, 270), (2, 111), (3, 0), (4, 70)]",True,0,1,TGA,True,TGA,522.0,1,"[(522, TGA)]",[4],False,0,1,TGA,True,TGA,96.0,8,"[(96, TGA), (102, TGA), (141, TAA), (144, TGA)...","[1, 1, 1, 1, 1, 2, 2, 4]",True,False,False
4,ENST00000526349.5_2,.,42711301,42725216,ATGTACTGTCCCATCTGCCTGCACCAAGCCTCCTTCCCGGTGGAGA...,525,42711301,42725216,ATGTACTGTCCCATCTGCCTGCACCAAGCCTCCTTCCCGGTGGAGA...,525,chr8,ENSG00000120925.16_6,-,C,T,42720558,42720559,"[(1, 270), (2, 111), (3, 74), (4, 70)]","[(1, 270), (2, 111), (3, 74), (4, 70)]",True,0,1,TGA,True,TGA,522.0,1,"[(522, TGA)]",[4],False,0,1,TGA,True,TGA,522.0,1,"[(522, TGA)]",[4],False,False,False
5,ENST00000527424.6_3,.,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,703,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...","[(1, 270), (2, 111), (3, 0), (4, 109), (5, 76)...",True,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,0,1,TGA,True,TGA,348.0,8,"[(348, TGA), (354, TGA), (393, TAA), (396, TGA...","[2, 2, 4, 4, 4, 6, 6, 6]",True,False,False
6,ENST00000527424.6_3,.,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,chr8,ENSG00000120925.16_6,-,C,T,42720558,42720559,"[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...","[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...",True,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,False,False
7,ENST00000531440.5_2,.,42720605,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,349,42720605,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,322,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 27), (2, 109), (3, 76), (4, 137)]","[(1, 0), (2, 109), (3, 76), (4, 137)]",False,0,1,GAT,False,,,0,[],[],False,0,2,GTG,False,,,0,[],[],False,False,True
8,ENST00000534961.5_2,.,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,703,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...","[(1, 270), (2, 111), (3, 0), (4, 109), (5, 76)...",True,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,0,1,TGA,True,TGA,348.0,8,"[(348, TGA), (354, TGA), (393, TAA), (396, TGA...","[2, 2, 4, 4, 4, 6, 6, 6]",True,False,False
9,ENST00000534961.5_2,.,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,chr8,ENSG00000120925.16_6,-,C,T,42720558,42720559,"[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...","[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...",True,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,False,False


In [31]:
# Annotate transcript information (transcript start, end, sequence, length, exon info) in case of start or stop loss
# transcript sequences are in: exon_seqs_subset

transcript_starts = exon_seqs.set_index("transcript_id")["start"].to_dict()
loss_df["transcript_start"] = loss_df["transcript_id"].map(transcript_starts)
transcript_ends = exon_seqs.set_index("transcript_id")["end"].to_dict()
loss_df["transcript_end"] = loss_df["transcript_id"].map(transcript_ends)
transcript_sequences = exon_seqs.set_index("transcript_id")["transcript_sequence"].to_dict()  # create map of transcript-id to transcript sequence
loss_df["transcript_seq"] = loss_df["transcript_id"].map(transcript_sequences)
transcript_lengths = exon_seqs.set_index("transcript_id")["transcript_length"].to_dict()
loss_df["transcript_length"] = loss_df["transcript_id"].map(transcript_lengths)

In [32]:
# In case of start or stop loss:
# Splice alternative CDS into reference transcript sequence to create alternative transcript sequence and measure new length    

def splice_alt_cds_into_transcript(row, transcript_seq):

    # Step 1: search for ref_cds_seq match in the transcript, and replace that with the alt_cds_seq
    ref_cds_seq = row["ref_cds_seq"].upper()
    alt_cds_seq = row["alt_cds_seq"].upper()

    # Find the ref CDS in the transcript sequence
    ref_start_idx = transcript_seq.find(ref_cds_seq)

    if ref_start_idx == -1:
        return None  # Cannot find ref CDS, alignment problem

    ref_end_idx = ref_start_idx + len(ref_cds_seq)

    # Replace the reference CDS with the variant-modified / alternative one
    new_transcript_seq = (
        transcript_seq[:ref_start_idx] +
        alt_cds_seq +
        transcript_seq[ref_end_idx:]
    )

    return new_transcript_seq

loss_df["alt_transcript_seq"] = loss_df.apply(
    lambda row: splice_alt_cds_into_transcript(row, row["transcript_seq"])
    if pd.notnull(row["transcript_seq"]) else None,
    axis=1
)
loss_df["alt_transcript_length"] = loss_df["alt_transcript_seq"].apply(
    lambda x: len(x) if pd.notnull(x) else None
)

In [34]:
# Add exon info to dataframe
transcript_exon_info = exon_seqs.set_index("transcript_id")["transcript_exon_info"].to_dict()
loss_df["transcript_exon_info"] = loss_df["transcript_id"].map(transcript_exon_info)

loss_df

Unnamed: 0,transcript_id,variant_id,ref_cds_start,ref_cds_stop,ref_cds_seq,ref_cds_len,alt_cds_start,alt_cds_stop,alt_cds_seq,alt_cds_len,chromosome,gene_id,strand,ref,alt,start_variant,end_variant,ref_cds_info,alt_cds_info,cds_in_transcript,ref_start_codon_pos,ref_start_codon_exon,ref_last_codon,ref_valid_stop,ref_first_stop_codon,ref_first_stop_pos,ref_num_stop_codons,ref_all_stop_codons,ref_stop_codon_exons,ref_is_premature,alt_start_codon_pos,alt_start_codon_exon,alt_last_codon,alt_valid_stop,alt_first_stop_codon,alt_first_stop_pos,alt_num_stop_codons,alt_all_stop_codons,alt_stop_codon_exons,alt_is_premature,start_loss,stop_loss,transcript_start,transcript_end,transcript_seq,transcript_length,alt_transcript_seq,alt_transcript_length,transcript_exon_info
0,ENST00000319104.7_2,.,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,chr8,ENSG00000120925.16_6,-,C,C,42706139,42706140,"[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]","[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]",True,0,1,TAA,True,TAA,600.0,1,"[(600, TAA)]",[5],False,0,1,TAA,True,TAA,600.0,1,"[(600, TAA)]",[5],False,False,False,42704779,42751728,GACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGGGAGCGGTCCTGGA...,1866,GACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGGGAGCGGTCCTGGA...,1866.0,"[(6, 102), (5, 144), (4, 76), (3, 109), (2, 74..."
1,ENST00000319104.7_2,.,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,529,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]","[(1, 207), (2, 0), (3, 109), (4, 76), (5, 137)]",True,0,1,TAA,True,TAA,600.0,1,"[(600, TAA)]",[5],False,0,1,TAA,True,TGA,351.0,2,"[(351, TGA), (447, TGA)]","[4, 5]",True,False,False,42704779,42751728,GACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGGGAGCGGTCCTGGA...,1866,GACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGGGAGCGGTCCTGGA...,1792.0,"[(6, 102), (5, 144), (4, 76), (3, 109), (2, 74..."
2,ENST00000319104.7_2,.,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,chr8,ENSG00000120925.16_6,-,C,T,42720558,42720559,"[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]","[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]",True,0,1,TAA,True,TAA,600.0,1,"[(600, TAA)]",[5],False,0,1,TAA,True,TAA,600.0,1,"[(600, TAA)]",[5],False,False,False,42704779,42751728,GACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGGGAGCGGTCCTGGA...,1866,GACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGGGAGCGGTCCTGGA...,1866.0,"[(6, 102), (5, 144), (4, 76), (3, 109), (2, 74..."
3,ENST00000526349.5_2,.,42711301,42725216,ATGTACTGTCCCATCTGCCTGCACCAAGCCTCCTTCCCGGTGGAGA...,525,42711301,42725216,ATGTACTGTCCCATCTGCCTGCACCAAGCCTCCTTCCCGGTGGAGA...,451,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 270), (2, 111), (3, 74), (4, 70)]","[(1, 270), (2, 111), (3, 0), (4, 70)]",True,0,1,TGA,True,TGA,522.0,1,"[(522, TGA)]",[4],False,0,1,TGA,True,TGA,96.0,8,"[(96, TGA), (102, TGA), (141, TAA), (144, TGA)...","[1, 1, 1, 1, 1, 2, 2, 4]",True,False,False,42710849,42751849,GGCGAGCACGCGCTCCGCCCTGGAGGCTGCGGCGACGGGTCCTCCT...,1338,GGCGAGCACGCGCTCCGCCCTGGAGGCTGCGGCGACGGGTCCTCCT...,1264.0,"[(7, 223), (6, 23), (5, 76), (4, 109), (3, 74)..."
4,ENST00000526349.5_2,.,42711301,42725216,ATGTACTGTCCCATCTGCCTGCACCAAGCCTCCTTCCCGGTGGAGA...,525,42711301,42725216,ATGTACTGTCCCATCTGCCTGCACCAAGCCTCCTTCCCGGTGGAGA...,525,chr8,ENSG00000120925.16_6,-,C,T,42720558,42720559,"[(1, 270), (2, 111), (3, 74), (4, 70)]","[(1, 270), (2, 111), (3, 74), (4, 70)]",True,0,1,TGA,True,TGA,522.0,1,"[(522, TGA)]",[4],False,0,1,TGA,True,TGA,522.0,1,"[(522, TGA)]",[4],False,False,False,42710849,42751849,GGCGAGCACGCGCTCCGCCCTGGAGGCTGCGGCGACGGGTCCTCCT...,1338,GGCGAGCACGCGCTCCGCCCTGGAGGCTGCGGCGACGGGTCCTCCT...,1338.0,"[(7, 223), (6, 23), (5, 76), (4, 109), (3, 74)..."
5,ENST00000527424.6_3,.,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,703,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...","[(1, 270), (2, 111), (3, 0), (4, 109), (5, 76)...",True,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,0,1,TGA,True,TGA,348.0,8,"[(348, TGA), (354, TGA), (393, TAA), (396, TGA...","[2, 2, 4, 4, 4, 6, 6, 6]",True,False,False,42708443,42751748,GCACTCCAAATTAGAAAGGGGACGTCTAGTGGGTTGCCCGGGAGGG...,3764,GCACTCCAAATTAGAAAGGGGACGTCTAGTGGGTTGCCCGGGAGGG...,3690.0,"[(7, 122), (6, 144), (5, 76), (4, 109), (3, 74..."
6,ENST00000527424.6_3,.,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,chr8,ENSG00000120925.16_6,-,C,T,42720558,42720559,"[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...","[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...",True,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,False,False,42708443,42751748,GCACTCCAAATTAGAAAGGGGACGTCTAGTGGGTTGCCCGGGAGGG...,3764,GCACTCCAAATTAGAAAGGGGACGTCTAGTGGGTTGCCCGGGAGGG...,3764.0,"[(7, 122), (6, 144), (5, 76), (4, 109), (3, 74..."
7,ENST00000531440.5_2,.,42720605,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,349,42720605,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,322,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 27), (2, 109), (3, 76), (4, 137)]","[(1, 0), (2, 109), (3, 76), (4, 137)]",False,0,1,GAT,False,,,0,[],[],False,0,2,GTG,False,,,0,[],[],False,False,True,42720608,42751741,AAATTAGAAAGGGGACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGG...,580,,,"[(6, 115), (5, 112), (4, 144), (3, 76), (2, 10..."
8,ENST00000534961.5_2,.,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,703,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...","[(1, 270), (2, 111), (3, 0), (4, 109), (5, 76)...",True,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,0,1,TGA,True,TGA,348.0,8,"[(348, TGA), (354, TGA), (393, TAA), (396, TGA...","[2, 2, 4, 4, 4, 6, 6, 6]",True,False,False,42708439,42751866,AGTTCGCTGCGTGTCGAGGCGAGCACGCGCTCCGCCCTGGAGGCTG...,4116,AGTTCGCTGCGTGTCGAGGCGAGCACGCGCTCCGCCCTGGAGGCTG...,4042.0,"[(7, 470), (6, 144), (5, 76), (4, 109), (3, 74..."
9,ENST00000534961.5_2,.,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,chr8,ENSG00000120925.16_6,-,C,T,42720558,42720559,"[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...","[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...",True,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,False,False,42708439,42751866,AGTTCGCTGCGTGTCGAGGCGAGCACGCGCTCCGCCCTGGAGGCTG...,4116,AGTTCGCTGCGTGTCGAGGCGAGCACGCGCTCCGCCCTGGAGGCTG...,4116.0,"[(7, 470), (6, 144), (5, 76), (4, 109), (3, 74..."


In [36]:
def analyze_transcript(results_df):

    """
    Analyze the alternative transcript sequence in cases of start or stop codons loss due to mutations.
    Scan for new in-frame start or stop codons in the alternative transcript sequence.
    :param results_df: DataFrame containing transcript sequence data and annotations, including start_loss and stop_loss flags
    :return: pandas DataFrame with additional columns for rescued start / stop codon information
    """

    valid_stop_codons = {"TAA", "TAG", "TGA"}
    start_codon = "ATG"

    df = results_df.copy()

    # Add new columns to store results
    df["transcript_start_codon_pos"] = None
    df["transcript_start_codon_exon"] = None  # for exon number
    df["transcript_last_codon"] = None
    df["transcript_valid_stop"] = None
    df["transcript_first_stop_codon"] = None
    df["transcript_first_stop_pos"] = None
    df["transcript_num_stop_codons"] = None
    df["transcript_all_stop_codons"] = None
    df["transcript_stop_codon_exons"] = None  # for exon number

    for idx, row in df.iterrows():
        seq = row["alt_transcript_seq"]
        cds_start = row["alt_cds_start"] - row["transcript_start"]  # start analysis at cds start position
        # cds_stop = cds_start + len(seq)

        exon_info = row["transcript_exon_info"]  # for exon number

        # Skip rows with invalid or too-short sequences
        if not isinstance(seq, str) or len(seq) < 3:
            continue

        start_pos = None
        start_exon = None  # for exon number
        stop_codons = []
        stop_exons = []  # for exon number

        # only analyze rows flagged with start or stop codon loss: skip the others and fill with None values
        # Skips only if we have both start_loss = FALSE and stop_loss = FALSE. If one is true, then don't skip.
        if not (row["start_loss"] or row["stop_loss"]):
            continue

        # START LOSS rescue search
        if row["start_loss"]:
            # Walk through sequence starting at CDS start with +1 positions until start codon is found
            for i in range(cds_start, len(seq) - 2):
                codon = seq[i:i + 3]
                if codon == start_codon:
                    start_pos = i

                    start_exon = get_exon(start_pos, exon_info)  # for exon number

                    # From new start codon, scan codons in frame
                    for j in range(i, len(seq) - 2, 3):
                        codon2 = seq[j:j + 3]
                        if codon2 in valid_stop_codons:
                            stop_codons.append((j, codon2))
                            stop_exons.append(get_exon(j, exon_info))  # for exon number

                    break

        # STOP LOSS rescue search
        elif row["stop_loss"]:
            # Start at cds_stop, scan codons in frame to end --> lets start at cds start so we are in frame
            for i in range(cds_start, len(seq) - 2, 3):
                codon = seq[i:i + 3]
                if codon == start_codon and start_pos is None:
                    start_pos = i
                    start_exon = get_exon(start_pos, exon_info)  # for exon number
                if codon in valid_stop_codons:
                    stop_codons.append((i, codon))
                    stop_exons.append(get_exon(i, exon_info))  # for exon number

        # Set last codon in transcript (use last full codon)
        last_codon = seq[-3:] if len(seq) >= 3 else None
        is_valid_stop = last_codon in valid_stop_codons
        first_stop_pos = stop_codons[0][0] if stop_codons else None
        first_stop = stop_codons[0][1] if stop_codons else None

        # Store results
        df.at[idx, "transcript_start_codon_pos"] = start_pos
        df.at[idx, "transcript_start_codon_exon"] = start_exon  # for exon number
        df.at[idx, "transcript_last_codon"] = last_codon
        df.at[idx, "transcript_valid_stop"] = is_valid_stop
        df.at[idx, "transcript_first_stop_codon"] = first_stop
        df.at[idx, "transcript_first_stop_pos"] = first_stop_pos
        df.at[idx, "transcript_num_stop_codons"] = len(stop_codons)
        df.at[idx, "transcript_all_stop_codons"] = stop_codons
        df.at[idx, "transcript_stop_codon_exons"] = stop_exons  # for exon number

    return df
    
analyze_transcript_df = analyze_transcript(loss_df)
analyze_transcript_df

Unnamed: 0,transcript_id,variant_id,ref_cds_start,ref_cds_stop,ref_cds_seq,ref_cds_len,alt_cds_start,alt_cds_stop,alt_cds_seq,alt_cds_len,chromosome,gene_id,strand,ref,alt,start_variant,end_variant,ref_cds_info,alt_cds_info,cds_in_transcript,ref_start_codon_pos,ref_start_codon_exon,ref_last_codon,ref_valid_stop,ref_first_stop_codon,ref_first_stop_pos,ref_num_stop_codons,ref_all_stop_codons,ref_stop_codon_exons,ref_is_premature,alt_start_codon_pos,alt_start_codon_exon,alt_last_codon,alt_valid_stop,alt_first_stop_codon,alt_first_stop_pos,alt_num_stop_codons,alt_all_stop_codons,alt_stop_codon_exons,alt_is_premature,start_loss,stop_loss,transcript_start,transcript_end,transcript_seq,transcript_length,alt_transcript_seq,alt_transcript_length,transcript_exon_info,transcript_start_codon_pos,transcript_start_codon_exon,transcript_last_codon,transcript_valid_stop,transcript_first_stop_codon,transcript_first_stop_pos,transcript_num_stop_codons,transcript_all_stop_codons,transcript_stop_codon_exons
0,ENST00000319104.7_2,.,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,chr8,ENSG00000120925.16_6,-,C,C,42706139,42706140,"[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]","[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]",True,0,1,TAA,True,TAA,600.0,1,"[(600, TAA)]",[5],False,0,1,TAA,True,TAA,600.0,1,"[(600, TAA)]",[5],False,False,False,42704779,42751728,GACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGGGAGCGGTCCTGGA...,1866,GACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGGGAGCGGTCCTGGA...,1866.0,"[(6, 102), (5, 144), (4, 76), (3, 109), (2, 74...",,,,,,,,,
1,ENST00000319104.7_2,.,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,529,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]","[(1, 207), (2, 0), (3, 109), (4, 76), (5, 137)]",True,0,1,TAA,True,TAA,600.0,1,"[(600, TAA)]",[5],False,0,1,TAA,True,TGA,351.0,2,"[(351, TGA), (447, TGA)]","[4, 5]",True,False,False,42704779,42751728,GACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGGGAGCGGTCCTGGA...,1866,GACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGGGAGCGGTCCTGGA...,1792.0,"[(6, 102), (5, 144), (4, 76), (3, 109), (2, 74...",,,,,,,,,
2,ENST00000319104.7_2,.,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,chr8,ENSG00000120925.16_6,-,C,T,42720558,42720559,"[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]","[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]",True,0,1,TAA,True,TAA,600.0,1,"[(600, TAA)]",[5],False,0,1,TAA,True,TAA,600.0,1,"[(600, TAA)]",[5],False,False,False,42704779,42751728,GACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGGGAGCGGTCCTGGA...,1866,GACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGGGAGCGGTCCTGGA...,1866.0,"[(6, 102), (5, 144), (4, 76), (3, 109), (2, 74...",,,,,,,,,
3,ENST00000526349.5_2,.,42711301,42725216,ATGTACTGTCCCATCTGCCTGCACCAAGCCTCCTTCCCGGTGGAGA...,525,42711301,42725216,ATGTACTGTCCCATCTGCCTGCACCAAGCCTCCTTCCCGGTGGAGA...,451,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 270), (2, 111), (3, 74), (4, 70)]","[(1, 270), (2, 111), (3, 0), (4, 70)]",True,0,1,TGA,True,TGA,522.0,1,"[(522, TGA)]",[4],False,0,1,TGA,True,TGA,96.0,8,"[(96, TGA), (102, TGA), (141, TAA), (144, TGA)...","[1, 1, 1, 1, 1, 2, 2, 4]",True,False,False,42710849,42751849,GGCGAGCACGCGCTCCGCCCTGGAGGCTGCGGCGACGGGTCCTCCT...,1338,GGCGAGCACGCGCTCCGCCCTGGAGGCTGCGGCGACGGGTCCTCCT...,1264.0,"[(7, 223), (6, 23), (5, 76), (4, 109), (3, 74)...",,,,,,,,,
4,ENST00000526349.5_2,.,42711301,42725216,ATGTACTGTCCCATCTGCCTGCACCAAGCCTCCTTCCCGGTGGAGA...,525,42711301,42725216,ATGTACTGTCCCATCTGCCTGCACCAAGCCTCCTTCCCGGTGGAGA...,525,chr8,ENSG00000120925.16_6,-,C,T,42720558,42720559,"[(1, 270), (2, 111), (3, 74), (4, 70)]","[(1, 270), (2, 111), (3, 74), (4, 70)]",True,0,1,TGA,True,TGA,522.0,1,"[(522, TGA)]",[4],False,0,1,TGA,True,TGA,522.0,1,"[(522, TGA)]",[4],False,False,False,42710849,42751849,GGCGAGCACGCGCTCCGCCCTGGAGGCTGCGGCGACGGGTCCTCCT...,1338,GGCGAGCACGCGCTCCGCCCTGGAGGCTGCGGCGACGGGTCCTCCT...,1338.0,"[(7, 223), (6, 23), (5, 76), (4, 109), (3, 74)...",,,,,,,,,
5,ENST00000527424.6_3,.,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,703,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...","[(1, 270), (2, 111), (3, 0), (4, 109), (5, 76)...",True,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,0,1,TGA,True,TGA,348.0,8,"[(348, TGA), (354, TGA), (393, TAA), (396, TGA...","[2, 2, 4, 4, 4, 6, 6, 6]",True,False,False,42708443,42751748,GCACTCCAAATTAGAAAGGGGACGTCTAGTGGGTTGCCCGGGAGGG...,3764,GCACTCCAAATTAGAAAGGGGACGTCTAGTGGGTTGCCCGGGAGGG...,3690.0,"[(7, 122), (6, 144), (5, 76), (4, 109), (3, 74...",,,,,,,,,
6,ENST00000527424.6_3,.,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,chr8,ENSG00000120925.16_6,-,C,T,42720558,42720559,"[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...","[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...",True,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,False,False,42708443,42751748,GCACTCCAAATTAGAAAGGGGACGTCTAGTGGGTTGCCCGGGAGGG...,3764,GCACTCCAAATTAGAAAGGGGACGTCTAGTGGGTTGCCCGGGAGGG...,3764.0,"[(7, 122), (6, 144), (5, 76), (4, 109), (3, 74...",,,,,,,,,
7,ENST00000531440.5_2,.,42720605,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,349,42720605,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,322,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 27), (2, 109), (3, 76), (4, 137)]","[(1, 0), (2, 109), (3, 76), (4, 137)]",False,0,1,GAT,False,,,0,[],[],False,0,2,GTG,False,,,0,[],[],False,False,True,42720608,42751741,AAATTAGAAAGGGGACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGG...,580,,,"[(6, 115), (5, 112), (4, 144), (3, 76), (2, 10...",,,,,,,,,
8,ENST00000534961.5_2,.,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,703,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...","[(1, 270), (2, 111), (3, 0), (4, 109), (5, 76)...",True,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,0,1,TGA,True,TGA,348.0,8,"[(348, TGA), (354, TGA), (393, TAA), (396, TGA...","[2, 2, 4, 4, 4, 6, 6, 6]",True,False,False,42708439,42751866,AGTTCGCTGCGTGTCGAGGCGAGCACGCGCTCCGCCCTGGAGGCTG...,4116,AGTTCGCTGCGTGTCGAGGCGAGCACGCGCTCCGCCCTGGAGGCTG...,4042.0,"[(7, 470), (6, 144), (5, 76), (4, 109), (3, 74...",,,,,,,,,
9,ENST00000534961.5_2,.,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,chr8,ENSG00000120925.16_6,-,C,T,42720558,42720559,"[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...","[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...",True,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,False,False,42708439,42751866,AGTTCGCTGCGTGTCGAGGCGAGCACGCGCTCCGCCCTGGAGGCTG...,4116,AGTTCGCTGCGTGTCGAGGCGAGCACGCGCTCCGCCCTGGAGGCTG...,4116.0,"[(7, 470), (6, 144), (5, 76), (4, 109), (3, 74...",,,,,,,,,


In [38]:
results = analyze_transcript_df.copy()

# Add additional NMD features

In [39]:
results # --> get row for calculation

Unnamed: 0,transcript_id,variant_id,ref_cds_start,ref_cds_stop,ref_cds_seq,ref_cds_len,alt_cds_start,alt_cds_stop,alt_cds_seq,alt_cds_len,chromosome,gene_id,strand,ref,alt,start_variant,end_variant,ref_cds_info,alt_cds_info,cds_in_transcript,ref_start_codon_pos,ref_start_codon_exon,ref_last_codon,ref_valid_stop,ref_first_stop_codon,ref_first_stop_pos,ref_num_stop_codons,ref_all_stop_codons,ref_stop_codon_exons,ref_is_premature,alt_start_codon_pos,alt_start_codon_exon,alt_last_codon,alt_valid_stop,alt_first_stop_codon,alt_first_stop_pos,alt_num_stop_codons,alt_all_stop_codons,alt_stop_codon_exons,alt_is_premature,start_loss,stop_loss,transcript_start,transcript_end,transcript_seq,transcript_length,alt_transcript_seq,alt_transcript_length,transcript_exon_info,transcript_start_codon_pos,transcript_start_codon_exon,transcript_last_codon,transcript_valid_stop,transcript_first_stop_codon,transcript_first_stop_pos,transcript_num_stop_codons,transcript_all_stop_codons,transcript_stop_codon_exons
0,ENST00000319104.7_2,.,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,chr8,ENSG00000120925.16_6,-,C,C,42706139,42706140,"[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]","[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]",True,0,1,TAA,True,TAA,600.0,1,"[(600, TAA)]",[5],False,0,1,TAA,True,TAA,600.0,1,"[(600, TAA)]",[5],False,False,False,42704779,42751728,GACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGGGAGCGGTCCTGGA...,1866,GACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGGGAGCGGTCCTGGA...,1866.0,"[(6, 102), (5, 144), (4, 76), (3, 109), (2, 74...",,,,,,,,,
1,ENST00000319104.7_2,.,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,529,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]","[(1, 207), (2, 0), (3, 109), (4, 76), (5, 137)]",True,0,1,TAA,True,TAA,600.0,1,"[(600, TAA)]",[5],False,0,1,TAA,True,TGA,351.0,2,"[(351, TGA), (447, TGA)]","[4, 5]",True,False,False,42704779,42751728,GACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGGGAGCGGTCCTGGA...,1866,GACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGGGAGCGGTCCTGGA...,1792.0,"[(6, 102), (5, 144), (4, 76), (3, 109), (2, 74...",,,,,,,,,
2,ENST00000319104.7_2,.,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,chr8,ENSG00000120925.16_6,-,C,T,42720558,42720559,"[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]","[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]",True,0,1,TAA,True,TAA,600.0,1,"[(600, TAA)]",[5],False,0,1,TAA,True,TAA,600.0,1,"[(600, TAA)]",[5],False,False,False,42704779,42751728,GACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGGGAGCGGTCCTGGA...,1866,GACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGGGAGCGGTCCTGGA...,1866.0,"[(6, 102), (5, 144), (4, 76), (3, 109), (2, 74...",,,,,,,,,
3,ENST00000526349.5_2,.,42711301,42725216,ATGTACTGTCCCATCTGCCTGCACCAAGCCTCCTTCCCGGTGGAGA...,525,42711301,42725216,ATGTACTGTCCCATCTGCCTGCACCAAGCCTCCTTCCCGGTGGAGA...,451,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 270), (2, 111), (3, 74), (4, 70)]","[(1, 270), (2, 111), (3, 0), (4, 70)]",True,0,1,TGA,True,TGA,522.0,1,"[(522, TGA)]",[4],False,0,1,TGA,True,TGA,96.0,8,"[(96, TGA), (102, TGA), (141, TAA), (144, TGA)...","[1, 1, 1, 1, 1, 2, 2, 4]",True,False,False,42710849,42751849,GGCGAGCACGCGCTCCGCCCTGGAGGCTGCGGCGACGGGTCCTCCT...,1338,GGCGAGCACGCGCTCCGCCCTGGAGGCTGCGGCGACGGGTCCTCCT...,1264.0,"[(7, 223), (6, 23), (5, 76), (4, 109), (3, 74)...",,,,,,,,,
4,ENST00000526349.5_2,.,42711301,42725216,ATGTACTGTCCCATCTGCCTGCACCAAGCCTCCTTCCCGGTGGAGA...,525,42711301,42725216,ATGTACTGTCCCATCTGCCTGCACCAAGCCTCCTTCCCGGTGGAGA...,525,chr8,ENSG00000120925.16_6,-,C,T,42720558,42720559,"[(1, 270), (2, 111), (3, 74), (4, 70)]","[(1, 270), (2, 111), (3, 74), (4, 70)]",True,0,1,TGA,True,TGA,522.0,1,"[(522, TGA)]",[4],False,0,1,TGA,True,TGA,522.0,1,"[(522, TGA)]",[4],False,False,False,42710849,42751849,GGCGAGCACGCGCTCCGCCCTGGAGGCTGCGGCGACGGGTCCTCCT...,1338,GGCGAGCACGCGCTCCGCCCTGGAGGCTGCGGCGACGGGTCCTCCT...,1338.0,"[(7, 223), (6, 23), (5, 76), (4, 109), (3, 74)...",,,,,,,,,
5,ENST00000527424.6_3,.,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,703,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...","[(1, 270), (2, 111), (3, 0), (4, 109), (5, 76)...",True,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,0,1,TGA,True,TGA,348.0,8,"[(348, TGA), (354, TGA), (393, TAA), (396, TGA...","[2, 2, 4, 4, 4, 6, 6, 6]",True,False,False,42708443,42751748,GCACTCCAAATTAGAAAGGGGACGTCTAGTGGGTTGCCCGGGAGGG...,3764,GCACTCCAAATTAGAAAGGGGACGTCTAGTGGGTTGCCCGGGAGGG...,3690.0,"[(7, 122), (6, 144), (5, 76), (4, 109), (3, 74...",,,,,,,,,
6,ENST00000527424.6_3,.,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,chr8,ENSG00000120925.16_6,-,C,T,42720558,42720559,"[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...","[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...",True,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,False,False,42708443,42751748,GCACTCCAAATTAGAAAGGGGACGTCTAGTGGGTTGCCCGGGAGGG...,3764,GCACTCCAAATTAGAAAGGGGACGTCTAGTGGGTTGCCCGGGAGGG...,3764.0,"[(7, 122), (6, 144), (5, 76), (4, 109), (3, 74...",,,,,,,,,
7,ENST00000531440.5_2,.,42720605,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,349,42720605,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,322,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 27), (2, 109), (3, 76), (4, 137)]","[(1, 0), (2, 109), (3, 76), (4, 137)]",False,0,1,GAT,False,,,0,[],[],False,0,2,GTG,False,,,0,[],[],False,False,True,42720608,42751741,AAATTAGAAAGGGGACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGG...,580,,,"[(6, 115), (5, 112), (4, 144), (3, 76), (2, 10...",,,,,,,,,
8,ENST00000534961.5_2,.,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,703,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...","[(1, 270), (2, 111), (3, 0), (4, 109), (5, 76)...",True,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,0,1,TGA,True,TGA,348.0,8,"[(348, TGA), (354, TGA), (393, TAA), (396, TGA...","[2, 2, 4, 4, 4, 6, 6, 6]",True,False,False,42708439,42751866,AGTTCGCTGCGTGTCGAGGCGAGCACGCGCTCCGCCCTGGAGGCTG...,4116,AGTTCGCTGCGTGTCGAGGCGAGCACGCGCTCCGCCCTGGAGGCTG...,4042.0,"[(7, 470), (6, 144), (5, 76), (4, 109), (3, 74...",,,,,,,,,
9,ENST00000534961.5_2,.,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,chr8,ENSG00000120925.16_6,-,C,T,42720558,42720559,"[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...","[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...",True,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,False,False,42708439,42751866,AGTTCGCTGCGTGTCGAGGCGAGCACGCGCTCCGCCCTGGAGGCTG...,4116,AGTTCGCTGCGTGTCGAGGCGAGCACGCGCTCCGCCCTGGAGGCTG...,4116.0,"[(7, 470), (6, 144), (5, 76), (4, 109), (3, 74...",,,,,,,,,


In [40]:
row = results.iloc[0]
row

transcript_id                                                ENST00000319104.7_2
variant_id                                                                     .
ref_cds_start                                                           42705933
ref_cds_stop                                                            42743007
ref_cds_seq                    ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...
ref_cds_len                                                                  603
alt_cds_start                                                           42705933
alt_cds_stop                                                            42743007
alt_cds_seq                    ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...
alt_cds_len                                                                  603
chromosome                                                                  chr8
gene_id                                                     ENSG00000120925.16_6
strand                      

## needed functions

In [41]:
def add_nmd_features(row):

    # 5' and 3' UTR lengths
    utr_lengths = calculate_utr_lengths(row)
    utr3_length = utr_lengths["utr3_length"]
    utr5_length = utr_lengths["utr5_length"]

    # Total, Upstream and Downstream exon count
    exon_features = calculate_exon_features(row)
    total_exon_count = exon_features["total_exon_count"]
    upstream_exon_count = exon_features["upstream_exon_count"]
    downstream_exon_count = exon_features["downstream_exon_count"]

    # Distance between PTC to start codon
    ptc_to_start_codon = calculate_ptc_to_start_distance(row)
    # PTC location < 150nt to start codon
    ptc_less_than_150nt_to_start = (
            ptc_to_start_codon is not None and ptc_to_start_codon < 150
    )

    # PTC exon length
    ptc_exon_length = calculate_ptc_exon_length(row)

    # Distance PTC to normal stop codon
    stop_codon_distance = calculate_stop_codon_dist(row)

    # Distance PTC to downstream exon junction
    ptc_to_intron = calculate_ptc_to_downstream_ej(row)

    # Add likely_misannotated flag
    likely_misannotated = add_likely_misannotated_flag(row)


    return {
        "utr3_length": utr3_length,
        "utr5_length": utr5_length,
        "total_exon_count": total_exon_count,
        "upstream_exon_count": upstream_exon_count,
        "downstream_exon_count": downstream_exon_count,
        # "ptc_pos_codon": ptc_pos_codon,
        "ptc_to_start_codon": ptc_to_start_codon,
        "ptc_less_than_150nt_to_start": ptc_less_than_150nt_to_start,
        "ptc_exon_length": ptc_exon_length,
        "stop_codon_distance": stop_codon_distance,
        "ptc_to_intron": ptc_to_intron,
        "likely_misannotated": likely_misannotated
    }

In [42]:
def calculate_utr_lengths(row):

    strand = row.get("strand")
    ref_cds_info = row.get("ref_cds_info") or []
    transcript_exon_info = row.get("transcript_exon_info") or []

    if not ref_cds_info or not transcript_exon_info:
        return {"utr5_length": None, "utr3_length": None}

    # Convert to dicts for easier lookup
    transcript_exon_dict = {int(k): int(v) for k, v in transcript_exon_info}
    cds_exons_dict = {int(exon): int(length) for exon, length in ref_cds_info}

    # Handle single exon
    if len(transcript_exon_dict) == 1:
        if strand == "+":
            utr5 = row["ref_cds_start"] - row["transcript_start"]
            utr3 = row["transcript_end"] - row["ref_cds_stop"]
        else:
            utr5 = row["transcript_end"] - row["ref_cds_stop"]
            utr3 = row["ref_cds_start"] - row["transcript_start"]

        utr5 = utr5 if utr5 >= 0 else None
        utr3 = utr3 if utr3 >= 0 else None

        return {"utr5_length": utr5, "utr3_length": utr3}

    cds_exon_nums = sorted(cds_exons_dict.keys())
    tx_exon_nums = sorted(transcript_exon_dict.keys())

    utr5 = 0
    utr3 = 0

    for exon in tx_exon_nums:
        exon_len = transcript_exon_dict[exon]
        cds_len = cds_exons_dict.get(exon, 0)
        utr_len = exon_len - cds_len

        if utr_len <= 0:
            continue

        if exon in cds_exons_dict:
            # Exon overlaps CDS, partial UTR
            if strand == "+":
                if exon == cds_exon_nums[0]:
                    utr5 += utr_len
                elif exon == cds_exon_nums[-1]:
                    utr3 += utr_len
            else:
                if exon == cds_exon_nums[0]:
                    utr3 += utr_len
                elif exon == cds_exon_nums[-1]:
                    utr5 += utr_len
        else:
            # Exon is outside CDS
            if strand == "+":
                if exon < cds_exon_nums[0]:
                    utr5 += exon_len
                elif exon > cds_exon_nums[-1]:
                    utr3 += exon_len
            else:
                if exon > cds_exon_nums[-1]:
                    utr5 += exon_len
                elif exon < cds_exon_nums[0]:
                    utr3 += exon_len

    utr5 = utr5 if utr5 >= 0 else None
    utr3 = utr3 if utr3 >= 0 else None
    return {"utr5_length": utr5, "utr3_length": utr3}


In [43]:
def calculate_exon_features(row):

    """
    Calculate exon-related features:
    - total_exon_count: always computed if transcript_exon_info is available
    - upstream_exon_count / downstream_exon_count: only computed if a PTC exists
    """

    exon_info = row.get("transcript_exon_info") or []
    stop_exons = row.get("alt_stop_codon_exons") or []

    total_exons = len(exon_info)

    if not row.get("alt_is_premature") or not stop_exons or not exon_info:
        return {
            "total_exon_count": total_exons if total_exons > 0 else None,
            "upstream_exon_count": None,
            "downstream_exon_count": None
        }

    # Take the PTC exon closest to CDS start
    ptc_exon = min(int(e) for e in stop_exons)

    # get exon numbers from transcript_exon_info
    exon_numbers = [int(e[0]) for e in exon_info]

    # If the PTC exon is not in transcript → cannot compute
    if ptc_exon not in exon_numbers:
        return {
            "total_exon_count": total_exons,
            "upstream_exon_count": None,
            "downstream_exon_count": None
        }

    upstream = sum(1 for e in exon_numbers if e < ptc_exon)
    downstream = sum(1 for e in exon_numbers if e > ptc_exon)

    return {
        "total_exon_count": int(total_exons),
        "upstream_exon_count": int(upstream),
        "downstream_exon_count": int(downstream)
    }

In [44]:
def calculate_ptc_to_start_distance(row):

    if not row.get("alt_is_premature"):
        return None

    start = row.get("alt_start_codon_pos")
    stop = row.get("alt_first_stop_pos")

    if start is None or stop is None:
        return None

    # PTC codon position: is PTC_to_start_codon / 3 --> leave it out
    #offset = stop - start
    #return offset // 3 if offset >= 0 else None

    if stop <= start:
        return None

    return stop-start # distance between the PTC to start codon in nt

In [45]:
def calculate_ptc_exon_length(row):
    """
    Return the length of the exon containing the first premature stop codon (PTC).
    """

    if not row.get("alt_is_premature"):
        return None

    stop_exons = row.get("alt_stop_codon_exons") or []
    exon_info = row.get("transcript_exon_info") or []

    if not stop_exons or not exon_info:
        return None

    # First PTC exon = smallest exon number (transcript-order, strand-corrected)
    ptc_exon = min(int(e) for e in stop_exons)

    exon_dict = {int(e): int(length) for e, length in exon_info}
    return exon_dict.get(ptc_exon)

In [46]:
def calculate_stop_codon_dist(row):

    """
    Calculate the distance between the reference stop codon and the alternative stop codon.
    Positive means the PTC is upstream of the reference stop codon.
    """

    ref_stop = row.get("ref_first_stop_pos")
    alt_stop = row.get("alt_first_stop_pos")

    if ref_stop is None or alt_stop is None:
        return None

    return ref_stop - alt_stop

In [47]:
def calculate_ptc_to_downstream_ej(row):

    """
    Calculate distance from PTC to the downstream exon junction (next exon start/end depending on strand).
    Returns None if not applicable.
    """

    # only calculate if we have PTC
    if not row.get("alt_is_premature"):
        return None

    stop_exons = row.get("alt_stop_codon_exons") or []
    
    # exon_info = row.get("transcript_exon_info") or []
    exon_info = row.get("alt_cds_info") or [] # Assuming that the PTC cannot be outside the CDS, since it needs to come before the original stop codon
    
    ptc_pos = row.get("alt_first_stop_pos")

    if not stop_exons or not exon_info or ptc_pos is None:
        return None

    # Choose the PTC exon (smallest number, closer to start)
    ptc_exon = min(stop_exons)

    # Sum lengths of exons up to and including ptc_exon
    cumulative_length = 0
    for exon_num, length in exon_info:
        cumulative_length += length
        if exon_num == ptc_exon:
            break

    # Distance from PTC to downstream exon junction
    distance = cumulative_length - ptc_pos
    return distance

In [48]:
def add_likely_misannotated_flag(row):

    """
    Flag rows that look inconsistent between CDS and transcript annotations and might be likely misannotated.
    A row is flagged as likely misannotated if any of these conditions apply:
        cds_in_transcript = False (the assembled CDS is not found in the transcript sequence)
        ref_start_codon_pos is defined and not 0 (reference CDS has a start codon not at the very start)
        ref_valid_stop is False (the last reference codon is not a valid stop codon)

    :return: A boolean flag. True if any condition above is met and thus the row is likely misannotated, False otherwise.
    """

    # Add likely_misannotated flag: when
    # "cds_in_transcript" is FALSE
    # "ref_start_codon_pos" is not 0
    # "ref_valid_stop" is FALSE

    cds_in_transcript = row.get("cds_in_transcript")
    ref_start_codon_pos = row.get("ref_start_codon_pos")
    ref_valid_stop = row.get("ref_valid_stop")

    # if any of these are missing entirely, flag as likely misannotated
    if cds_in_transcript is None or ref_start_codon_pos is None or ref_valid_stop is None:
        return True

    flag = (
        (cds_in_transcript is False) or
        ((ref_start_codon_pos is not None) and (ref_start_codon_pos != 0)) or
        (ref_valid_stop is False)
    )

    return(flag)

In [49]:
extra_features = results.apply(add_nmd_features, axis=1, result_type='expand')
results = pd.concat([results, extra_features], axis=1)

In [50]:
results

Unnamed: 0,transcript_id,variant_id,ref_cds_start,ref_cds_stop,ref_cds_seq,ref_cds_len,alt_cds_start,alt_cds_stop,alt_cds_seq,alt_cds_len,chromosome,gene_id,strand,ref,alt,start_variant,end_variant,ref_cds_info,alt_cds_info,cds_in_transcript,ref_start_codon_pos,ref_start_codon_exon,ref_last_codon,ref_valid_stop,ref_first_stop_codon,ref_first_stop_pos,ref_num_stop_codons,ref_all_stop_codons,ref_stop_codon_exons,ref_is_premature,alt_start_codon_pos,alt_start_codon_exon,alt_last_codon,alt_valid_stop,alt_first_stop_codon,alt_first_stop_pos,alt_num_stop_codons,alt_all_stop_codons,alt_stop_codon_exons,alt_is_premature,start_loss,stop_loss,transcript_start,transcript_end,transcript_seq,transcript_length,alt_transcript_seq,alt_transcript_length,transcript_exon_info,transcript_start_codon_pos,transcript_start_codon_exon,transcript_last_codon,transcript_valid_stop,transcript_first_stop_codon,transcript_first_stop_pos,transcript_num_stop_codons,transcript_all_stop_codons,transcript_stop_codon_exons,utr3_length,utr5_length,total_exon_count,upstream_exon_count,downstream_exon_count,ptc_to_start_codon,ptc_less_than_150nt_to_start,ptc_exon_length,stop_codon_distance,ptc_to_intron,likely_misannotated
0,ENST00000319104.7_2,.,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,chr8,ENSG00000120925.16_6,-,C,C,42706139,42706140,"[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]","[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]",True,0,1,TAA,True,TAA,600.0,1,"[(600, TAA)]",[5],False,0,1,TAA,True,TAA,600.0,1,"[(600, TAA)]",[5],False,False,False,42704779,42751728,GACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGGGAGCGGTCCTGGA...,1866,GACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGGGAGCGGTCCTGGA...,1866.0,"[(6, 102), (5, 144), (4, 76), (3, 109), (2, 74...",,,,,,,,,,1154,109,6,,,,False,,0.0,,False
1,ENST00000319104.7_2,.,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,529,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]","[(1, 207), (2, 0), (3, 109), (4, 76), (5, 137)]",True,0,1,TAA,True,TAA,600.0,1,"[(600, TAA)]",[5],False,0,1,TAA,True,TGA,351.0,2,"[(351, TGA), (447, TGA)]","[4, 5]",True,False,False,42704779,42751728,GACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGGGAGCGGTCCTGGA...,1866,GACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGGGAGCGGTCCTGGA...,1792.0,"[(6, 102), (5, 144), (4, 76), (3, 109), (2, 74...",,,,,,,,,,1154,109,6,3.0,2.0,351.0,False,76.0,249.0,41.0,False
2,ENST00000319104.7_2,.,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,chr8,ENSG00000120925.16_6,-,C,T,42720558,42720559,"[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]","[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]",True,0,1,TAA,True,TAA,600.0,1,"[(600, TAA)]",[5],False,0,1,TAA,True,TAA,600.0,1,"[(600, TAA)]",[5],False,False,False,42704779,42751728,GACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGGGAGCGGTCCTGGA...,1866,GACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGGGAGCGGTCCTGGA...,1866.0,"[(6, 102), (5, 144), (4, 76), (3, 109), (2, 74...",,,,,,,,,,1154,109,6,,,,False,,0.0,,False
3,ENST00000526349.5_2,.,42711301,42725216,ATGTACTGTCCCATCTGCCTGCACCAAGCCTCCTTCCCGGTGGAGA...,525,42711301,42725216,ATGTACTGTCCCATCTGCCTGCACCAAGCCTCCTTCCCGGTGGAGA...,451,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 270), (2, 111), (3, 74), (4, 70)]","[(1, 270), (2, 111), (3, 0), (4, 70)]",True,0,1,TGA,True,TGA,522.0,1,"[(522, TGA)]",[4],False,0,1,TGA,True,TGA,96.0,8,"[(96, TGA), (102, TGA), (141, TAA), (144, TGA)...","[1, 1, 1, 1, 1, 2, 2, 4]",True,False,False,42710849,42751849,GGCGAGCACGCGCTCCGCCCTGGAGGCTGCGGCGACGGGTCCTCCT...,1338,GGCGAGCACGCGCTCCGCCCTGGAGGCTGCGGCGACGGGTCCTCCT...,1264.0,"[(7, 223), (6, 23), (5, 76), (4, 109), (3, 74)...",,,,,,,,,,452,361,7,0.0,6.0,96.0,True,722.0,426.0,174.0,False
4,ENST00000526349.5_2,.,42711301,42725216,ATGTACTGTCCCATCTGCCTGCACCAAGCCTCCTTCCCGGTGGAGA...,525,42711301,42725216,ATGTACTGTCCCATCTGCCTGCACCAAGCCTCCTTCCCGGTGGAGA...,525,chr8,ENSG00000120925.16_6,-,C,T,42720558,42720559,"[(1, 270), (2, 111), (3, 74), (4, 70)]","[(1, 270), (2, 111), (3, 74), (4, 70)]",True,0,1,TGA,True,TGA,522.0,1,"[(522, TGA)]",[4],False,0,1,TGA,True,TGA,522.0,1,"[(522, TGA)]",[4],False,False,False,42710849,42751849,GGCGAGCACGCGCTCCGCCCTGGAGGCTGCGGCGACGGGTCCTCCT...,1338,GGCGAGCACGCGCTCCGCCCTGGAGGCTGCGGCGACGGGTCCTCCT...,1338.0,"[(7, 223), (6, 23), (5, 76), (4, 109), (3, 74)...",,,,,,,,,,452,361,7,,,,False,,0.0,,False
5,ENST00000527424.6_3,.,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,703,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...","[(1, 270), (2, 111), (3, 0), (4, 109), (5, 76)...",True,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,0,1,TGA,True,TGA,348.0,8,"[(348, TGA), (354, TGA), (393, TAA), (396, TGA...","[2, 2, 4, 4, 4, 6, 6, 6]",True,False,False,42708443,42751748,GCACTCCAAATTAGAAAGGGGACGTCTAGTGGGTTGCCCGGGAGGG...,3764,GCACTCCAAATTAGAAAGGGGACGTCTAGTGGGTTGCCCGGGAGGG...,3690.0,"[(7, 122), (6, 144), (5, 76), (4, 109), (3, 74...",,,,,,,,,,2858,129,7,1.0,5.0,348.0,False,111.0,426.0,33.0,False
6,ENST00000527424.6_3,.,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,chr8,ENSG00000120925.16_6,-,C,T,42720558,42720559,"[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...","[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...",True,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,False,False,42708443,42751748,GCACTCCAAATTAGAAAGGGGACGTCTAGTGGGTTGCCCGGGAGGG...,3764,GCACTCCAAATTAGAAAGGGGACGTCTAGTGGGTTGCCCGGGAGGG...,3764.0,"[(7, 122), (6, 144), (5, 76), (4, 109), (3, 74...",,,,,,,,,,2858,129,7,,,,False,,0.0,,False
7,ENST00000531440.5_2,.,42720605,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,349,42720605,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,322,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 27), (2, 109), (3, 76), (4, 137)]","[(1, 0), (2, 109), (3, 76), (4, 137)]",False,0,1,GAT,False,,,0,[],[],False,0,2,GTG,False,,,0,[],[],False,False,True,42720608,42751741,AAATTAGAAAGGGGACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGG...,580,,,"[(6, 115), (5, 112), (4, 144), (3, 76), (2, 10...",,,,,,,,,,0,234,6,,,,False,,,,True
8,ENST00000534961.5_2,.,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,703,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...","[(1, 270), (2, 111), (3, 0), (4, 109), (5, 76)...",True,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,0,1,TGA,True,TGA,348.0,8,"[(348, TGA), (354, TGA), (393, TAA), (396, TGA...","[2, 2, 4, 4, 4, 6, 6, 6]",True,False,False,42708439,42751866,AGTTCGCTGCGTGTCGAGGCGAGCACGCGCTCCGCCCTGGAGGCTG...,4116,AGTTCGCTGCGTGTCGAGGCGAGCACGCGCTCCGCCCTGGAGGCTG...,4042.0,"[(7, 470), (6, 144), (5, 76), (4, 109), (3, 74...",,,,,,,,,,2862,477,7,1.0,5.0,348.0,False,111.0,426.0,33.0,False
9,ENST00000534961.5_2,.,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,chr8,ENSG00000120925.16_6,-,C,T,42720558,42720559,"[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...","[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...",True,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,False,False,42708439,42751866,AGTTCGCTGCGTGTCGAGGCGAGCACGCGCTCCGCCCTGGAGGCTG...,4116,AGTTCGCTGCGTGTCGAGGCGAGCACGCGCTCCGCCCTGGAGGCTG...,4116.0,"[(7, 470), (6, 144), (5, 76), (4, 109), (3, 74...",,,,,,,,,,2862,477,7,,,,False,,0.0,,False


# Compute NMD-rules

In [51]:
def evaluate_nmd_escape_rules(row):

    """
    Evaluate whether a premature stop codon in a transcript is likely to escape nonsense-mediated decay (NMD) based on
    established biological rules. This function applies five NMD escape rules to determine if a premature termination
    codon (PTC) is likely to escape degradation:
    1. Last exon rule: The PTC is in the last exon
    2. 50nt penultimate rule: The PTC is within 50 nucleotides upstream of the last exon junction
    3. Long exon rule: The PTC is in an exon with >407 nucleotides
    4. Start proximal rule: The PTC is within 150 nucleotides of the start codon
    5. Single exon rule: The transcript where the PTC lays consists only of a single exon
    A PTC is considered to escape NMD if it satisfies any of the above rules.

    :param row: A row of the DataFrame including alt_is_premature (bool), alt_first_stop_pos (int),
                alt_stop_codon_exons (list[int]), transcript_exon_info (list[tuple[exon_number (int), exon_length (int)]]),
                alt_start_codon_pos (int)
    :return: A dictionary with boolean flags for each rule and overall NMD escape
    """

    # Only relevant for premature stop codons
    if not row.get("alt_is_premature"):
        return {
            "nmd_last_exon_rule": False,
            "nmd_50nt_penultimate_rule": False,
            "nmd_long_exon_rule": False,
            "nmd_start_proximal_rule": False,
            "nmd_single_exon_rule": False,
            "nmd_escape": False
        }

    # Extract relevant data
    stop_pos = row.get("alt_first_stop_pos")
    stop_exons = row.get("alt_stop_codon_exons") or []
    exon_info = row.get("transcript_exon_info") or []
    start_pos = row.get("alt_start_codon_pos")

    total_exons = row.get("total_exon_count")
    downstream_exons = row.get("downstream_exon_count")
    ptc_exon_length = row.get("ptc_exon_length")

    # Preprocess exon info
    sorted_exons = sorted(exon_info, key=lambda x: x[0])
    exon_length_map = {exon_num: length for exon_num, length in sorted_exons}
    exon_offsets = {}
    offset = 0
    for exon_num, length in sorted_exons:
        exon_offsets[exon_num] = (offset, offset + length)
        offset += length

    # Single exon rule
    # rule_single_exon = len(sorted_exons) == 1 # old code
    rule_single_exon = total_exons == 1

    # Last exon rule
    #if not rule_single_exon:
    #    last_exon = sorted_exons[-1][0] if sorted_exons else None
    #    rule_last_exon = bool(stop_exons and max(stop_exons) == last_exon)
    #else: rule_last_exon = False
    rule_last_exon = downstream_exons == 0 if downstream_exons is not None else False

    # 50nt from penultimate exon end
    if len(sorted_exons) >= 2:
        penultimate_exon_num, penultimate_len = sorted_exons[-2]
        pen_start, pen_end = exon_offsets.get(penultimate_exon_num, (None, None))
        rule_50nt_penultimate = pen_end is not None and (stop_pos >= pen_end - 50) and (stop_pos < pen_end)
    else:
        rule_50nt_penultimate = False

    # Long exon rule (with exon longer than >407nt)
    # rule_long_exon = any(exon_length_map.get(exon, 0) > 407 for exon in stop_exons) # old code
    rule_long_exon = ptc_exon_length is not None and ptc_exon_length > 407

    # Start-proximal rule (closer than 150nt from the start codon)
    rule_start_proximal = start_pos is not None and stop_pos is not None and (stop_pos - start_pos) < 150 and (stop_pos - start_pos) >= 0

    # NMD escape if any rule is true
    escape = rule_last_exon or rule_50nt_penultimate or rule_long_exon or rule_start_proximal or rule_single_exon

    return {
        "nmd_last_exon_rule": rule_last_exon,
        "nmd_50nt_penultimate_rule": rule_50nt_penultimate,
        "nmd_long_exon_rule": rule_long_exon,
        "nmd_start_proximal_rule": rule_start_proximal,
        "nmd_single_exon_rule": rule_single_exon,
        "nmd_escape": escape
    }

In [52]:
nmd_results = results.apply(evaluate_nmd_escape_rules, axis=1, result_type='expand')
results = pd.concat([results, nmd_results], axis=1)

In [53]:
results

Unnamed: 0,transcript_id,variant_id,ref_cds_start,ref_cds_stop,ref_cds_seq,ref_cds_len,alt_cds_start,alt_cds_stop,alt_cds_seq,alt_cds_len,chromosome,gene_id,strand,ref,alt,start_variant,end_variant,ref_cds_info,alt_cds_info,cds_in_transcript,ref_start_codon_pos,ref_start_codon_exon,ref_last_codon,ref_valid_stop,ref_first_stop_codon,ref_first_stop_pos,ref_num_stop_codons,ref_all_stop_codons,ref_stop_codon_exons,ref_is_premature,alt_start_codon_pos,alt_start_codon_exon,alt_last_codon,alt_valid_stop,alt_first_stop_codon,alt_first_stop_pos,alt_num_stop_codons,alt_all_stop_codons,alt_stop_codon_exons,alt_is_premature,start_loss,stop_loss,transcript_start,transcript_end,transcript_seq,transcript_length,alt_transcript_seq,alt_transcript_length,transcript_exon_info,transcript_start_codon_pos,transcript_start_codon_exon,transcript_last_codon,transcript_valid_stop,transcript_first_stop_codon,transcript_first_stop_pos,transcript_num_stop_codons,transcript_all_stop_codons,transcript_stop_codon_exons,utr3_length,utr5_length,total_exon_count,upstream_exon_count,downstream_exon_count,ptc_to_start_codon,ptc_less_than_150nt_to_start,ptc_exon_length,stop_codon_distance,ptc_to_intron,likely_misannotated,nmd_last_exon_rule,nmd_50nt_penultimate_rule,nmd_long_exon_rule,nmd_start_proximal_rule,nmd_single_exon_rule,nmd_escape
0,ENST00000319104.7_2,.,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,chr8,ENSG00000120925.16_6,-,C,C,42706139,42706140,"[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]","[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]",True,0,1,TAA,True,TAA,600.0,1,"[(600, TAA)]",[5],False,0,1,TAA,True,TAA,600.0,1,"[(600, TAA)]",[5],False,False,False,42704779,42751728,GACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGGGAGCGGTCCTGGA...,1866,GACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGGGAGCGGTCCTGGA...,1866.0,"[(6, 102), (5, 144), (4, 76), (3, 109), (2, 74...",,,,,,,,,,1154,109,6,,,,False,,0.0,,False,False,False,False,False,False,False
1,ENST00000319104.7_2,.,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,529,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]","[(1, 207), (2, 0), (3, 109), (4, 76), (5, 137)]",True,0,1,TAA,True,TAA,600.0,1,"[(600, TAA)]",[5],False,0,1,TAA,True,TGA,351.0,2,"[(351, TGA), (447, TGA)]","[4, 5]",True,False,False,42704779,42751728,GACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGGGAGCGGTCCTGGA...,1866,GACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGGGAGCGGTCCTGGA...,1792.0,"[(6, 102), (5, 144), (4, 76), (3, 109), (2, 74...",,,,,,,,,,1154,109,6,3.0,2.0,351.0,False,76.0,249.0,41.0,False,False,False,False,False,False,False
2,ENST00000319104.7_2,.,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,chr8,ENSG00000120925.16_6,-,C,T,42720558,42720559,"[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]","[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]",True,0,1,TAA,True,TAA,600.0,1,"[(600, TAA)]",[5],False,0,1,TAA,True,TAA,600.0,1,"[(600, TAA)]",[5],False,False,False,42704779,42751728,GACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGGGAGCGGTCCTGGA...,1866,GACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGGGAGCGGTCCTGGA...,1866.0,"[(6, 102), (5, 144), (4, 76), (3, 109), (2, 74...",,,,,,,,,,1154,109,6,,,,False,,0.0,,False,False,False,False,False,False,False
3,ENST00000526349.5_2,.,42711301,42725216,ATGTACTGTCCCATCTGCCTGCACCAAGCCTCCTTCCCGGTGGAGA...,525,42711301,42725216,ATGTACTGTCCCATCTGCCTGCACCAAGCCTCCTTCCCGGTGGAGA...,451,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 270), (2, 111), (3, 74), (4, 70)]","[(1, 270), (2, 111), (3, 0), (4, 70)]",True,0,1,TGA,True,TGA,522.0,1,"[(522, TGA)]",[4],False,0,1,TGA,True,TGA,96.0,8,"[(96, TGA), (102, TGA), (141, TAA), (144, TGA)...","[1, 1, 1, 1, 1, 2, 2, 4]",True,False,False,42710849,42751849,GGCGAGCACGCGCTCCGCCCTGGAGGCTGCGGCGACGGGTCCTCCT...,1338,GGCGAGCACGCGCTCCGCCCTGGAGGCTGCGGCGACGGGTCCTCCT...,1264.0,"[(7, 223), (6, 23), (5, 76), (4, 109), (3, 74)...",,,,,,,,,,452,361,7,0.0,6.0,96.0,True,722.0,426.0,174.0,False,False,False,True,True,False,True
4,ENST00000526349.5_2,.,42711301,42725216,ATGTACTGTCCCATCTGCCTGCACCAAGCCTCCTTCCCGGTGGAGA...,525,42711301,42725216,ATGTACTGTCCCATCTGCCTGCACCAAGCCTCCTTCCCGGTGGAGA...,525,chr8,ENSG00000120925.16_6,-,C,T,42720558,42720559,"[(1, 270), (2, 111), (3, 74), (4, 70)]","[(1, 270), (2, 111), (3, 74), (4, 70)]",True,0,1,TGA,True,TGA,522.0,1,"[(522, TGA)]",[4],False,0,1,TGA,True,TGA,522.0,1,"[(522, TGA)]",[4],False,False,False,42710849,42751849,GGCGAGCACGCGCTCCGCCCTGGAGGCTGCGGCGACGGGTCCTCCT...,1338,GGCGAGCACGCGCTCCGCCCTGGAGGCTGCGGCGACGGGTCCTCCT...,1338.0,"[(7, 223), (6, 23), (5, 76), (4, 109), (3, 74)...",,,,,,,,,,452,361,7,,,,False,,0.0,,False,False,False,False,False,False,False
5,ENST00000527424.6_3,.,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,703,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...","[(1, 270), (2, 111), (3, 0), (4, 109), (5, 76)...",True,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,0,1,TGA,True,TGA,348.0,8,"[(348, TGA), (354, TGA), (393, TAA), (396, TGA...","[2, 2, 4, 4, 4, 6, 6, 6]",True,False,False,42708443,42751748,GCACTCCAAATTAGAAAGGGGACGTCTAGTGGGTTGCCCGGGAGGG...,3764,GCACTCCAAATTAGAAAGGGGACGTCTAGTGGGTTGCCCGGGAGGG...,3690.0,"[(7, 122), (6, 144), (5, 76), (4, 109), (3, 74...",,,,,,,,,,2858,129,7,1.0,5.0,348.0,False,111.0,426.0,33.0,False,False,False,False,False,False,False
6,ENST00000527424.6_3,.,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,chr8,ENSG00000120925.16_6,-,C,T,42720558,42720559,"[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...","[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...",True,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,False,False,42708443,42751748,GCACTCCAAATTAGAAAGGGGACGTCTAGTGGGTTGCCCGGGAGGG...,3764,GCACTCCAAATTAGAAAGGGGACGTCTAGTGGGTTGCCCGGGAGGG...,3764.0,"[(7, 122), (6, 144), (5, 76), (4, 109), (3, 74...",,,,,,,,,,2858,129,7,,,,False,,0.0,,False,False,False,False,False,False,False
7,ENST00000531440.5_2,.,42720605,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,349,42720605,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,322,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 27), (2, 109), (3, 76), (4, 137)]","[(1, 0), (2, 109), (3, 76), (4, 137)]",False,0,1,GAT,False,,,0,[],[],False,0,2,GTG,False,,,0,[],[],False,False,True,42720608,42751741,AAATTAGAAAGGGGACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGG...,580,,,"[(6, 115), (5, 112), (4, 144), (3, 76), (2, 10...",,,,,,,,,,0,234,6,,,,False,,,,True,False,False,False,False,False,False
8,ENST00000534961.5_2,.,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,703,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...","[(1, 270), (2, 111), (3, 0), (4, 109), (5, 76)...",True,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,0,1,TGA,True,TGA,348.0,8,"[(348, TGA), (354, TGA), (393, TAA), (396, TGA...","[2, 2, 4, 4, 4, 6, 6, 6]",True,False,False,42708439,42751866,AGTTCGCTGCGTGTCGAGGCGAGCACGCGCTCCGCCCTGGAGGCTG...,4116,AGTTCGCTGCGTGTCGAGGCGAGCACGCGCTCCGCCCTGGAGGCTG...,4042.0,"[(7, 470), (6, 144), (5, 76), (4, 109), (3, 74...",,,,,,,,,,2862,477,7,1.0,5.0,348.0,False,111.0,426.0,33.0,False,False,False,False,False,False,False
9,ENST00000534961.5_2,.,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,chr8,ENSG00000120925.16_6,-,C,T,42720558,42720559,"[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...","[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...",True,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,False,False,42708439,42751866,AGTTCGCTGCGTGTCGAGGCGAGCACGCGCTCCGCCCTGGAGGCTG...,4116,AGTTCGCTGCGTGTCGAGGCGAGCACGCGCTCCGCCCTGGAGGCTG...,4116.0,"[(7, 470), (6, 144), (5, 76), (4, 109), (3, 74...",,,,,,,,,,2862,477,7,,,,False,,0.0,,False,False,False,False,False,False,False


# Write output

In [54]:
ls -ll

total 9632
-rw-r--r--. 1 l_schroeder users   38865 Jan 11 14:06 create_test_VCF.ipynb
-rw-r--r--. 1 l_schroeder users 6252969 Dec 18 14:04 NMD.ipynb
-rw-r--r--. 1 l_schroeder users  825558 Jan 11 16:22 nmd-vep.ipynb
-rw-r--r--. 1 l_schroeder users    2465 Dec 18 14:34 README.md
-rw-r--r--. 1 l_schroeder users 1307856 Dec 18 14:04 train.ipynb
-rw-r--r--. 1 l_schroeder users 1307856 Jan 11 13:57 train_new.ipynb
-rw-r--r--. 1 l_schroeder users   54222 Dec 18 14:04 validation_MMRF_TARGET.ipynb
-rw-r--r--. 1 l_schroeder users   54222 Jan 11 14:05 validation_new.ipynb


In [None]:
output_path = "~/NMD/nmd-variant-effect-prediction/test_variant/output.csv"

results.to_csv(output_path, index=False)

In [55]:
rule_cols = [
    "nmd_last_exon_rule",
    "nmd_50nt_penultimate_rule",
    "nmd_long_exon_rule",
    "nmd_start_proximal_rule",
    "nmd_single_exon_rule",
    "nmd_escape",
]

summary = results[rule_cols].apply(lambda x: x.value_counts(dropna=False))
print(summary)

       nmd_last_exon_rule  nmd_50nt_penultimate_rule  nmd_long_exon_rule  \
False                10.0                       10.0                   9   
True                  NaN                        NaN                   1   

       nmd_start_proximal_rule  nmd_single_exon_rule  nmd_escape  
False                        9                  10.0           9  
True                         1                   NaN           1  


In [56]:
results.head(20)

Unnamed: 0,transcript_id,variant_id,ref_cds_start,ref_cds_stop,ref_cds_seq,ref_cds_len,alt_cds_start,alt_cds_stop,alt_cds_seq,alt_cds_len,chromosome,gene_id,strand,ref,alt,start_variant,end_variant,ref_cds_info,alt_cds_info,cds_in_transcript,ref_start_codon_pos,ref_start_codon_exon,ref_last_codon,ref_valid_stop,ref_first_stop_codon,ref_first_stop_pos,ref_num_stop_codons,ref_all_stop_codons,ref_stop_codon_exons,ref_is_premature,alt_start_codon_pos,alt_start_codon_exon,alt_last_codon,alt_valid_stop,alt_first_stop_codon,alt_first_stop_pos,alt_num_stop_codons,alt_all_stop_codons,alt_stop_codon_exons,alt_is_premature,start_loss,stop_loss,transcript_start,transcript_end,transcript_seq,transcript_length,alt_transcript_seq,alt_transcript_length,transcript_exon_info,transcript_start_codon_pos,transcript_start_codon_exon,transcript_last_codon,transcript_valid_stop,transcript_first_stop_codon,transcript_first_stop_pos,transcript_num_stop_codons,transcript_all_stop_codons,transcript_stop_codon_exons,utr3_length,utr5_length,total_exon_count,upstream_exon_count,downstream_exon_count,ptc_to_start_codon,ptc_less_than_150nt_to_start,ptc_exon_length,stop_codon_distance,ptc_to_intron,likely_misannotated,nmd_last_exon_rule,nmd_50nt_penultimate_rule,nmd_long_exon_rule,nmd_start_proximal_rule,nmd_single_exon_rule,nmd_escape
0,ENST00000319104.7_2,.,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,chr8,ENSG00000120925.16_6,-,C,C,42706139,42706140,"[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]","[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]",True,0,1,TAA,True,TAA,600.0,1,"[(600, TAA)]",[5],False,0,1,TAA,True,TAA,600.0,1,"[(600, TAA)]",[5],False,False,False,42704779,42751728,GACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGGGAGCGGTCCTGGA...,1866,GACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGGGAGCGGTCCTGGA...,1866.0,"[(6, 102), (5, 144), (4, 76), (3, 109), (2, 74...",,,,,,,,,,1154,109,6,,,,False,,0.0,,False,False,False,False,False,False,False
1,ENST00000319104.7_2,.,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,529,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]","[(1, 207), (2, 0), (3, 109), (4, 76), (5, 137)]",True,0,1,TAA,True,TAA,600.0,1,"[(600, TAA)]",[5],False,0,1,TAA,True,TGA,351.0,2,"[(351, TGA), (447, TGA)]","[4, 5]",True,False,False,42704779,42751728,GACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGGGAGCGGTCCTGGA...,1866,GACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGGGAGCGGTCCTGGA...,1792.0,"[(6, 102), (5, 144), (4, 76), (3, 109), (2, 74...",,,,,,,,,,1154,109,6,3.0,2.0,351.0,False,76.0,249.0,41.0,False,False,False,False,False,False,False
2,ENST00000319104.7_2,.,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,42705933,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,603,chr8,ENSG00000120925.16_6,-,C,T,42720558,42720559,"[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]","[(1, 207), (2, 74), (3, 109), (4, 76), (5, 137)]",True,0,1,TAA,True,TAA,600.0,1,"[(600, TAA)]",[5],False,0,1,TAA,True,TAA,600.0,1,"[(600, TAA)]",[5],False,False,False,42704779,42751728,GACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGGGAGCGGTCCTGGA...,1866,GACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGGGAGCGGTCCTGGA...,1866.0,"[(6, 102), (5, 144), (4, 76), (3, 109), (2, 74...",,,,,,,,,,1154,109,6,,,,False,,0.0,,False,False,False,False,False,False,False
3,ENST00000526349.5_2,.,42711301,42725216,ATGTACTGTCCCATCTGCCTGCACCAAGCCTCCTTCCCGGTGGAGA...,525,42711301,42725216,ATGTACTGTCCCATCTGCCTGCACCAAGCCTCCTTCCCGGTGGAGA...,451,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 270), (2, 111), (3, 74), (4, 70)]","[(1, 270), (2, 111), (3, 0), (4, 70)]",True,0,1,TGA,True,TGA,522.0,1,"[(522, TGA)]",[4],False,0,1,TGA,True,TGA,96.0,8,"[(96, TGA), (102, TGA), (141, TAA), (144, TGA)...","[1, 1, 1, 1, 1, 2, 2, 4]",True,False,False,42710849,42751849,GGCGAGCACGCGCTCCGCCCTGGAGGCTGCGGCGACGGGTCCTCCT...,1338,GGCGAGCACGCGCTCCGCCCTGGAGGCTGCGGCGACGGGTCCTCCT...,1264.0,"[(7, 223), (6, 23), (5, 76), (4, 109), (3, 74)...",,,,,,,,,,452,361,7,0.0,6.0,96.0,True,722.0,426.0,174.0,False,False,False,True,True,False,True
4,ENST00000526349.5_2,.,42711301,42725216,ATGTACTGTCCCATCTGCCTGCACCAAGCCTCCTTCCCGGTGGAGA...,525,42711301,42725216,ATGTACTGTCCCATCTGCCTGCACCAAGCCTCCTTCCCGGTGGAGA...,525,chr8,ENSG00000120925.16_6,-,C,T,42720558,42720559,"[(1, 270), (2, 111), (3, 74), (4, 70)]","[(1, 270), (2, 111), (3, 74), (4, 70)]",True,0,1,TGA,True,TGA,522.0,1,"[(522, TGA)]",[4],False,0,1,TGA,True,TGA,522.0,1,"[(522, TGA)]",[4],False,False,False,42710849,42751849,GGCGAGCACGCGCTCCGCCCTGGAGGCTGCGGCGACGGGTCCTCCT...,1338,GGCGAGCACGCGCTCCGCCCTGGAGGCTGCGGCGACGGGTCCTCCT...,1338.0,"[(7, 223), (6, 23), (5, 76), (4, 109), (3, 74)...",,,,,,,,,,452,361,7,,,,False,,0.0,,False,False,False,False,False,False,False
5,ENST00000527424.6_3,.,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,703,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...","[(1, 270), (2, 111), (3, 0), (4, 109), (5, 76)...",True,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,0,1,TGA,True,TGA,348.0,8,"[(348, TGA), (354, TGA), (393, TAA), (396, TGA...","[2, 2, 4, 4, 4, 6, 6, 6]",True,False,False,42708443,42751748,GCACTCCAAATTAGAAAGGGGACGTCTAGTGGGTTGCCCGGGAGGG...,3764,GCACTCCAAATTAGAAAGGGGACGTCTAGTGGGTTGCCCGGGAGGG...,3690.0,"[(7, 122), (6, 144), (5, 76), (4, 109), (3, 74...",,,,,,,,,,2858,129,7,1.0,5.0,348.0,False,111.0,426.0,33.0,False,False,False,False,False,False,False
6,ENST00000527424.6_3,.,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,chr8,ENSG00000120925.16_6,-,C,T,42720558,42720559,"[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...","[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...",True,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,False,False,42708443,42751748,GCACTCCAAATTAGAAAGGGGACGTCTAGTGGGTTGCCCGGGAGGG...,3764,GCACTCCAAATTAGAAAGGGGACGTCTAGTGGGTTGCCCGGGAGGG...,3764.0,"[(7, 122), (6, 144), (5, 76), (4, 109), (3, 74...",,,,,,,,,,2858,129,7,,,,False,,0.0,,False,False,False,False,False,False,False
7,ENST00000531440.5_2,.,42720605,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,349,42720605,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,322,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 27), (2, 109), (3, 76), (4, 137)]","[(1, 0), (2, 109), (3, 76), (4, 137)]",False,0,1,GAT,False,,,0,[],[],False,0,2,GTG,False,,,0,[],[],False,False,True,42720608,42751741,AAATTAGAAAGGGGACGTCTAGTGGGTTGCCCGGGAGGGGTGGCGG...,580,,,"[(6, 115), (5, 112), (4, 144), (3, 76), (2, 10...",,,,,,,,,,0,234,6,,,,False,,,,True,False,False,False,False,False,False
8,ENST00000534961.5_2,.,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,703,chr8,ENSG00000120925.16_6,-,CCGTTTGTCTACAGATTGGACAACTGATTGCCCCAAGCCATGAACC...,C,42720557,42720632,"[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...","[(1, 270), (2, 111), (3, 0), (4, 109), (5, 76)...",True,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,0,1,TGA,True,TGA,348.0,8,"[(348, TGA), (354, TGA), (393, TAA), (396, TGA...","[2, 2, 4, 4, 4, 6, 6, 6]",True,False,False,42708439,42751866,AGTTCGCTGCGTGTCGAGGCGAGCACGCGCTCCGCCCTGGAGGCTG...,4116,AGTTCGCTGCGTGTCGAGGCGAGCACGCGCTCCGCCCTGGAGGCTG...,4042.0,"[(7, 470), (6, 144), (5, 76), (4, 109), (3, 74...",,,,,,,,,,2862,477,7,1.0,5.0,348.0,False,111.0,426.0,33.0,False,False,False,False,False,False,False
9,ENST00000534961.5_2,.,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,42711301,42743007,ATGGCCAAATATCAAGGTGAAGTTCAAAGTTTGAAACTGGATGATG...,777,chr8,ENSG00000120925.16_6,-,C,T,42720558,42720559,"[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...","[(1, 270), (2, 111), (3, 74), (4, 109), (5, 76...",True,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,0,1,TGA,True,TGA,774.0,1,"[(774, TGA)]",[6],False,False,False,42708439,42751866,AGTTCGCTGCGTGTCGAGGCGAGCACGCGCTCCGCCCTGGAGGCTG...,4116,AGTTCGCTGCGTGTCGAGGCGAGCACGCGCTCCGCCCTGGAGGCTG...,4116.0,"[(7, 470), (6, 144), (5, 76), (4, 109), (3, 74...",,,,,,,,,,2862,477,7,,,,False,,0.0,,False,False,False,False,False,False,False
