### Validation using an independent dataset (MMRF-TARGET)

In [1]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from scipy import stats
from sklearn.ensemble import RandomForestRegressor

In [2]:
# load dataset
df = pd.read_csv('./resources/MMRF_benchmark/MMRF_TARGET_dataset.csv')
df.shape

(556, 33)

In [3]:
cols = ['downstream_exon_count', 'last_exon', 'PTC_to_start_codon',
        'dist_to_stop_codon', 'PTC_exon_length', 'PTC_to_intron','upstream_exon_count',
        'mRNA_half_life','50nt_to_last_EJ','LOEUF', 'AF', '5UTR_length', '3UTR_length', 'Transcript_length']

df[cols].head(1)

Unnamed: 0,downstream_exon_count,last_exon,PTC_to_start_codon,dist_to_stop_codon,PTC_exon_length,PTC_to_intron,upstream_exon_count,mRNA_half_life,50nt_to_last_EJ,LOEUF,AF,5UTR_length,3UTR_length,Transcript_length
0,19,0,4189,4718,990,31,10,315.133085,0,0.557,0.0,349,4719,9257


In [4]:
df[cols].isnull().sum()

downstream_exon_count     0
last_exon                 0
PTC_to_start_codon        0
dist_to_stop_codon        0
PTC_exon_length           0
PTC_to_intron             0
upstream_exon_count       0
mRNA_half_life            0
50nt_to_last_EJ           0
LOEUF                    74
AF                        0
5UTR_length               0
3UTR_length               0
Transcript_length         0
dtype: int64

In [5]:
df = df.loc[df['LOEUF'].notnull()].reset_index(drop=True)
df.shape

(482, 33)

In [6]:
regr = pickle.load(open('./nmd_eff_predictor', 'rb'))

def predict_NMD_efficiency(downstream_exon_count:int, last_exon:int, PTC_to_start_codon:int, 
                           dist_to_stop_codon:int, PTC_exon_length:int, PTC_to_intron:int, upstream_exon_count:int, 
                           mRNA_half_life:float, c50nt_to_last_EJ:int, LOEUF:float, AF:float,
                           UTR5_length:int, UTR3_length:int, Transcript_length:int) -> float:
    
    input_value =[[downstream_exon_count, last_exon, PTC_to_start_codon,
                   dist_to_stop_codon, PTC_exon_length, PTC_to_intron, upstream_exon_count,
                   mRNA_half_life, c50nt_to_last_EJ, LOEUF, AF,
                   UTR5_length, UTR3_length, Transcript_length]]
    
    assert sum([np.isnan(value) for value in input_value[0]]) == 0, 'Replace NaN value'
    
    pred = regr.predict(input_value)[0]
    
    return pred


def predict_NMD_efficiency_df(examples):
    df = pd.DataFrame(examples)

    assert df.isnull().sum().sum() == 0, 'Replace NaN value'

    pred = regr.predict(df)

    return pred

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [7]:
pred = regr.predict(df[cols])
c, p = stats.pearsonr(pred, df['NMD_efficiency'])
print("Pearson R :" , c)
print("P-value : " , p)

AttributeError: 'DecisionTreeRegressor' object has no attribute 'monotonic_cst'

In [None]:
y_true = df['NMD_efficiency']
y_pred = regr.predict(df[cols]) # = pred

# Pearson correlation
c, p = stats.pearsonr(y_pred, y_true)
print("Pearson R:", c)
print("P-value:", p)
print("R^2:", np.square(c))

# min / max value for line
min_val = min(y_true.min(), y_pred.min())
max_val = max(y_true.max(), y_pred.max())

# Plot: True vs Predicted
plt.figure(figsize=(7, 7))
plt.scatter(y_pred, y_true, alpha=0.3)
plt.plot([min_val, max_val], [min_val, max_val], '--', color='gray')
plt.ylabel("True NMD Efficiency")
plt.xlabel("Predicted NMD Efficiency")
plt.title("True vs. Predicted NMD Efficiency")
plt.grid(True)
plt.tight_layout()
plt.show()

In [12]:
np.square(c)

np.float64(0.21770757270598706)

In [13]:
y_true

0      0.800328
1      0.374699
2     -0.150942
3     -0.096187
4      0.379962
         ...   
477    0.021676
478    0.054982
479    1.263034
480    2.933727
481    1.017487
Name: NMD_efficiency, Length: 482, dtype: float64

In [14]:
y_pred

array([ 1.00970960e+00,  6.77856905e-02,  2.03080875e+00,  4.81471886e-01,
        3.10670642e-01,  2.59207389e-01,  1.80182840e+00,  1.67054339e+00,
        1.14627099e+00,  1.51995204e-01, -2.17854929e-02, -2.98267827e-01,
       -9.09306274e-02,  7.06760913e-01,  2.29664769e+00,  2.21828882e+00,
        2.15191308e+00,  1.92721722e+00,  1.21634655e+00,  1.27582526e+00,
        1.73508285e+00,  1.57790770e+00,  1.42485261e+00,  2.01612605e+00,
        5.53947481e-01,  1.24558830e-01,  2.36933493e+00,  1.43544513e+00,
        1.79099377e+00,  1.74434204e+00,  1.54107255e+00,  1.83404473e+00,
        9.97699899e-02, -2.64403997e-02,  1.03647365e+00, -9.89682183e-02,
        1.48045841e+00,  1.89935797e+00,  1.46421508e+00, -3.80123616e-02,
        2.42031063e-01,  1.85723937e-01,  1.00622269e+00,  1.61639912e+00,
        6.84183957e-01,  2.30876285e+00,  1.57697630e+00,  1.57777752e+00,
       -6.70945910e-02,  1.44733035e+00,  1.57155604e+00,  1.85983519e+00,
        2.22448495e-01,  

In [16]:
from sklearn.metrics import r2_score
r2_score(np.array(y_true), y_pred)

-0.4086085391979781

### Try for own Model

In [17]:
# merge NMD_Scanner Output von MMRF mit dem NMD_efficiency score vom Original TCGA Benchmark
df = pd.read_csv('~/NMD/nmd-variant-effect-prediction/resources/Output/MMRF_TARGET_dataset_final_nmd_results.csv')
df2 = pd.read_csv('~/NMD/nmd-variant-effect-prediction/resources/MMRF_benchmark/MMRF_TARGET_dataset.csv')

# 1. Remove transcript version number from NMD Scanner output (df)
df['transcript_id'] = df['transcript_id'].str.split('.').str[0]

# 2. Adjust start_variant position (+1) to match TCGA coordinates
df['start_variant'] = df['start_variant'] + 1
#df['end_variant'] = df['end_variant'] - 1

# 3. Rename TCGA columns to match NMD Scanner
df2 = df2.rename(columns={
    'Transcript_ID': 'transcript_id',
    'start': 'start_variant'
})

# 4. Select only relevant columns from TCGA
df2_subset = df2[['transcript_id', 'start_variant', 'NMD_efficiency']]

# 5. Merge
df_merged = pd.merge(df, df2_subset, on=['transcript_id', 'start_variant'], how='inner')

# ✅ Inspect result
print("Merged shape:", df_merged.shape)
print("Missing NMD_efficiency values:", df_merged['NMD_efficiency'].isnull().sum())
print(df_merged[['transcript_id', 'start_variant', 'NMD_efficiency']].head())

Merged shape: (549, 74)
Missing NMD_efficiency values: 0
     transcript_id  start_variant  NMD_efficiency
0  ENST00000038176       58609624        0.710493
1  ENST00000040738       13576985        0.022368
2  ENST00000155093        2979509       -0.019109
3  ENST00000156471       34857043        0.040062
4  ENST00000202917      112917610        0.987014


In [18]:
# Select the columns that we want to have as X (our features)
cols = ['start_loss', 'stop_loss', 'nmd_last_exon_rule', 'nmd_50nt_penultimate_rule', 'nmd_long_exon_rule', 'nmd_start_proximal_rule', 'nmd_single_exon_rule', 'nmd_escape', # nmd escape = X or Y?
        'utr3_length', 'utr5_length', 'alt_transcript_length', 'total_exon_count', 'upstream_exon_count', 'downstream_exon_count', 'ptc_to_start_codon', 'ptc_exon_length', 
        'stop_codon_distance'] 

df_merged[cols].head(1)

Unnamed: 0,start_loss,stop_loss,nmd_last_exon_rule,nmd_50nt_penultimate_rule,nmd_long_exon_rule,nmd_start_proximal_rule,nmd_single_exon_rule,nmd_escape,utr3_length,utr5_length,alt_transcript_length,total_exon_count,upstream_exon_count,downstream_exon_count,ptc_to_start_codon,ptc_exon_length,stop_codon_distance
0,False,False,False,False,False,False,False,False,222.0,598,3574.0,31,5.0,25.0,666.0,124.0,2085.0


In [19]:
df_merged[cols].isnull().sum()

start_loss                    0
stop_loss                     0
nmd_last_exon_rule            0
nmd_50nt_penultimate_rule     0
nmd_long_exon_rule            0
nmd_start_proximal_rule       0
nmd_single_exon_rule          0
nmd_escape                    0
utr3_length                  10
utr5_length                   0
alt_transcript_length        39
total_exon_count              0
upstream_exon_count           1
downstream_exon_count         1
ptc_to_start_codon           11
ptc_exon_length               1
stop_codon_distance          39
dtype: int64

In [20]:
# drop non-PTCs: if alt_is_premature = False, und falls alt_is_premature = True dann darf ref_is_premature nicht True sein

# 1. remove rows with ref_is_premature = True weil dann ist Annotations-Fehler und falls alt_is_premature = True dann ist es nicht premature
df_filtered = df_merged[~df_merged["ref_is_premature"]]
rem_rows1 = df_merged.shape[0] - df_filtered.shape[0]

print(f"Rows removed in step 1 (ref_is_premature=True): {rem_rows1}")
print(f"Remaining rows: {df_filtered.shape[0]}")

# 2. remove rows with ref_is_premature = False und alt_is_premature = False
step2_rows = df_filtered.shape[0]
df_filtered = df_filtered[~((df_filtered["ref_is_premature"] == False) & (df_filtered["alt_is_premature"] == False))]
rem_rows2 = step2_rows - df_filtered.shape[0]

print(f"Rows removed in step 2 (both ref and alt not premature): {rem_rows2}")
print(f"Remaining rows: {df_filtered.shape[0]}")

df_filtered = df_filtered.reset_index(drop=True)
df_filtered.head()

Rows removed in step 1 (ref_is_premature=True): 1
Remaining rows: 548
Rows removed in step 2 (both ref and alt not premature): 1
Remaining rows: 547


Unnamed: 0,transcript_id,variant_id,ref_cds_start,ref_cds_stop,ref_cds_seq,ref_cds_len,alt_cds_start,alt_cds_stop,alt_cds_seq,alt_cds_len,...,utr3_length,utr5_length,total_exon_count,upstream_exon_count,downstream_exon_count,ptc_to_start_codon,ptc_less_than_150nt_to_start,ptc_exon_length,stop_codon_distance,NMD_efficiency
0,ENST00000038176,.,58584105,58659631,ATGGCGTTTATCCGGAAGAAGCAGCAGGAGCAGCAGCTGCAGCTCT...,2754,58584105,58659631,ATGGCGTTTATCCGGAAGAAGCAGCAGGAGCAGCAGCTGCAGCTCT...,2754,...,222.0,598,31,5.0,25.0,666.0,False,124.0,2085.0,0.710493
1,ENST00000040738,.,13570010,13627587,ATGGCCACCAACCCACAGCCGCAGCCGCCTCCTCCGGCGCCGCCGC...,9156,13570010,13627587,ATGGCCACCAACCCACAGCCGCAGCCGCCTCCTCCGGCGCCGCCGC...,9156,...,138.0,1273,26,24.0,1.0,8889.0,False,125.0,264.0,0.022368
2,ENST00000155093,.,2953936,2979993,ATGGATGAAGATGAATTTGAATTGCAGCCACAAGAGCCAAACTCAT...,2406,2953936,2979993,ATGGATGAAGATGAATTTGAATTGCAGCCACAAGAGCCAAACTCAT...,2406,...,2513.0,417,8,7.0,0.0,1920.0,False,3697.0,483.0,-0.019109
3,ENST00000156471,.,34856791,34969613,ATGGCAGCCCCTGCGCAGCCCAAGAAGATCGTGGCCCCTACGGTGT...,4458,34856791,34969613,ATGGCAGCCCCTGCGCAGCCCAAGAAGATCGTGGCCCCTACGGTGT...,4458,...,129.0,5010,35,30.0,4.0,4206.0,False,121.0,249.0,0.040062
4,ENST00000202917,.,112907039,112919553,ATGATGGATCTCAGAAATACCCCAGCCAAATCTCTGGACAAGTTCA...,1203,112907039,112919553,ATGATGGATCTCAGAAATACCCCAGCCAAATCTCTGGACAAGTTCA...,1203,...,350.0,78,6,4.0,1.0,945.0,False,154.0,255.0,0.987014


In [21]:
df_filtered[cols].isnull().sum()

start_loss                    0
stop_loss                     0
nmd_last_exon_rule            0
nmd_50nt_penultimate_rule     0
nmd_long_exon_rule            0
nmd_start_proximal_rule       0
nmd_single_exon_rule          0
nmd_escape                    0
utr3_length                   9
utr5_length                   0
alt_transcript_length        38
total_exon_count              0
upstream_exon_count           0
downstream_exon_count         0
ptc_to_start_codon           10
ptc_exon_length               0
stop_codon_distance          38
dtype: int64

In [22]:
# drop null values
df_filtered = df_filtered.loc[df_filtered['alt_transcript_length'].notnull()].reset_index(drop=True)
df_filtered = df_filtered.loc[df_filtered['ptc_to_start_codon'].notnull()].reset_index(drop=True)
df.shape

(2452, 73)

In [23]:
df_filtered[cols].isnull().sum()

start_loss                   0
stop_loss                    0
nmd_last_exon_rule           0
nmd_50nt_penultimate_rule    0
nmd_long_exon_rule           0
nmd_start_proximal_rule      0
nmd_single_exon_rule         0
nmd_escape                   0
utr3_length                  0
utr5_length                  0
alt_transcript_length        0
total_exon_count             0
upstream_exon_count          0
downstream_exon_count        0
ptc_to_start_codon           0
ptc_exon_length              0
stop_codon_distance          0
dtype: int64

In [24]:
regr = pickle.load(open('./best_model', 'rb'))

def predict_NMD_efficiency(start_loss:bool, stop_loss:bool, nmd_last_exon_rule:bool, nmd_50nt_penultimate_rule:bool, 
                           nmd_long_exon_rule:bool, nmd_start_proximal_rule:bool, nmd_single_exon_rule:bool, nmd_escape:bool,
                           utr3_length:int, utr5_length:int, alt_transcript_length:float, total_exon_count:float, 
                           upstream_exon_count:float, downstream_exon_count:float, ptc_to_start_codon:float, ptc_exon_length:float,
                           stop_codon_distance_nmd:float):
    
    input_value =[[start_loss, stop_loss, nmd_last_exon_rule, nmd_50nt_penultimate_rule, nmd_long_exon_rule, nmd_start_proximal_rule, 
                   nmd_single_exon_rule, nmd_escape, utr3_length, utr5_length, alt_transcript_length, total_exon_count, 
                   upstream_exon_count, downstream_exon_count, ptc_to_start_codon, ptc_exon_length, stop_codon_distance_nmd]]
    
    assert sum([np.isnan(value) for value in input_value[0]]) == 0, 'Replace NaN value'
    
    pred = regr.predict(input_value)[0]
    
    return pred


def predict_NMD_efficiency_df(examples):
    df_merged = pd.DataFrame(examples)

    assert df_merged.isnull().sum().sum() == 0, 'Replace NaN value'

    pred = regr.predict(df_merged)

    return pred

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [25]:
pred = regr.predict(df_merged[cols])
c, p = stats.pearsonr(pred, df_merged['NMD_efficiency'])
print("Pearson R :" , c)
print("P-value : " , p)

AttributeError: 'numpy.ndarray' object has no attribute 'predict'

In [26]:
y_true = df_merged['NMD_efficiency']
y_pred = regr.predict(df_merged[cols]) # = pred

# Pearson correlation
c, p = stats.pearsonr(y_pred, y_true)
print("Pearson R:", c)
print("P-value:", p)

# min / max value for line
min_val = min(y_true.min(), y_pred.min())
max_val = max(y_true.max(), y_pred.max())

# Plot: True vs Predicted
plt.figure(figsize=(7, 7))
plt.scatter(y_true, y_pred, alpha=0.3)
plt.plot([min_val, max_val], [min_val, max_val], '--', color='gray')
plt.xlabel("True NMD Efficiency")
plt.ylabel("Predicted NMD Efficiency")
plt.title("True vs. Predicted NMD Efficiency")
plt.grid(True)
plt.tight_layout()
plt.show()

AttributeError: 'numpy.ndarray' object has no attribute 'predict'