# Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats
from sklearn.metrics import roc_curve

from sklearn import metrics
import matplotlib.font_manager as font_manager
from numpy import nan

# Define variables

In [None]:
# Variant set to analyze: 'ABCA4_NCSS', 'ABCA4_DI' or 'MYBPC3_NCSS'
variants = 'ABCA4_NCSS'

# Column headers of the delta dataframe. MMSplice, MTSplice, S-CAP and SPIDEX
# are only kept when an NCSS variant set is analyzed.
column_names = [
    tool
    for tool in (
        'RNA', 'CADD', 'DSSP', 'GeneSplicer', 'MaxEntScan', 'MMSplice',
        'MTSplice', 'NNSPLICE', 'S-CAP', 'SPIDEX', 'SpliceAI', 'SpliceRover',
        'SpliceSiteFinder-like',
    )
    if 'NCSS' in variants
    or tool not in ('MMSplice', 'MTSplice', 'S-CAP', 'SPIDEX')
]

# Import the scores into a dataframe

In [None]:
# Read the sheet named after the selected variant set into a dataframe and
# replace every missing value with 0
di = pd.read_excel('variants_scores.xlsx', sheet_name=variants).replace(nan, 0)

# Calculate the delta values 

In [None]:
def delta_score(name, index, scores=None, max_values=None):
    '''Calculate the absolute delta score of one variant for one tool.

    When the wild-type score is 0 the relative change is undefined, so the
    variant score is divided by the max value of the tool instead.
    Otherwise the result is |var - wt| / |wt|.

    @param name: name of the splice prediction tool; the columns
                 name + '_wt' and name + '_var' are read
    @param index: row index of the variant
    @param scores: dataframe holding the scores; defaults to the global di
    @param max_values: mapping of tool name to its max value; defaults to
                       the global delta dictionary
    @return: the absolute value of the delta score
    '''
    if scores is None:
        scores = di

    wt_score = float(scores.at[index, name + '_wt'])
    var_score = float(scores.at[index, name + '_var'])

    if wt_score == 0:
        if max_values is None:
            # the max values are stored in `delta`; the former lookup in an
            # undefined `alamut` name raised a NameError on this branch
            max_values = delta
        score = var_score / max_values[name]
    else:
        score = (var_score - wt_score) / wt_score
    return np.absolute(score)

In [None]:
# Programs for which a delta score is calculated, with their corresponding
# max value
delta = {
    'SSFL': 100,
    'MES': 12,
    'NNS': 1,
    'GS': 15,
    'SpliceRover': 1,
    'DSSP': 1,
}

# One list of scores per variant, keyed by the row index of di. The order of
# the scores has to match the order of column_names.
delta_scores = {}

for row in di.index:

    row_scores = [
        # RNA label: 1 when the % mutant RNA is above 20, otherwise 0
        1 if di.at[row, '% Mutant RNA'] > 20 else 0,
        # absolute value of the CADD score
        np.absolute(di.at[row, 'CADD']),
        # delta scores of DSSP, GeneSplicer and MaxEntScan
        delta_score('DSSP', row),
        delta_score('GS', row),
        delta_score('MES', row),
    ]

    if 'NCSS' in variants:
        # absolute values of the MMSplice and MTSplice scores
        row_scores += [
            np.absolute(di.at[row, 'MMSplice']),
            np.absolute(di.at[row, 'MTSplice']),
        ]

    # delta score of NNSPLICE
    row_scores.append(delta_score('NNS', row))

    if 'NCSS' in variants:
        # absolute values of the S-CAP and SPIDEX scores
        row_scores += [
            np.absolute(di.at[row, 'SCAP']),
            np.absolute(di.at[row, 'Spidex']),
        ]

    row_scores += [
        # SpliceAI score, used as is
        di.at[row, 'SpliceAI'],
        # delta scores of SpliceRover and SSFL
        delta_score('SpliceRover', row),
        delta_score('SSFL', row),
    ]

    delta_scores[row] = row_scores

# variants become the rows, the tools become the columns
delta_df = pd.DataFrame(delta_scores).T
delta_df.columns = column_names

print(delta_df.head())

# ROC curves

In [None]:
# every column except the RNA label belongs to a prediction tool
names = column_names[1:]

# prepare the data

# 1) Array with the classification (0, 1) of every variant
label = np.array([1 if rna > 0.2 else 0 for rna in delta_df['RNA']])

# 2) One array per splicing prediction program with its predicted scores
probabilities = [np.array(delta_df[tool].tolist()) for tool in names]

# 3) Add the alamut consensus: for every variant the mean of the three
#    highest scores among the four programs selected by loc (the positions
#    of GeneSplicer, MaxEntScan, NNSPLICE and SpliceSiteFinder-like in names)
loc = [2, 3, 6, 11] if 'NCSS' in variants else [2, 3, 4, 7]

alamut3 = []
for candidates in zip(*(probabilities[j] for j in loc)):
    first, second, third = sorted(candidates, reverse=True)[:3]
    alamut3.append((first + second + third) / 3)

# the consensus is listed first, in both the scores and the names
probabilities.insert(0, alamut3)
names.insert(0, 'Alamut 3/4')

# 4) Define the colors for the lines
colors = {
    'Alamut 3/4': 'olive',
    'CADD': 'green',
    'DSSP': 'lawngreen',
    'GeneSplicer': 'cyan',
    'MaxEntScan': 'lightskyblue',
    'MMSplice': 'blue',
    'MTSplice': 'purple',
    'NNSPLICE': 'red',
    'S-CAP': 'hotpink',
    'SPIDEX': 'pink',
    'SpliceAI': 'orange',
    'SpliceRover': 'gold',
    'SpliceSiteFinder-like': 'grey',
}

In [None]:
# AUC of every curve, keyed by the position of the tool in names
aucs = {}

# Plot one ROC curve per tool
plt.figure(figsize=(10, 10))
for position, (tool, scores) in enumerate(zip(names, probabilities)):
    fper, tper, _ = roc_curve(label, scores, pos_label=1)
    aucs[position] = metrics.roc_auc_score(label, scores)
    plt.plot(fper, tper, color=colors[tool], label=tool)

# diagonal reference line from (0, 0) to (1, 1)
plt.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
plt.xlabel('False Positive Rate', size=18)
plt.ylabel('True Positive Rate', size=18)
plt.title('ROC Curve ' + variants + ' variants', size=20)
plt.tick_params(labelsize=18)
plt.legend(prop=font_manager.FontProperties(size=18))

# save before showing the figure
plt.savefig('ROC_' + variants + '.svg', format='svg', dpi=1200)
plt.show()


In [None]:
# show the AUC of every ROC curve, keyed by its position in names
print(aucs)