# Import

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats
from sklearn.metrics import roc_curve
from sklearn import metrics
import matplotlib.font_manager as font_manager
from numpy import nan
from collections import Counter
from sklearn.metrics import confusion_matrix

# Input

In [None]:
# Select the variant set to analyse. The value is passed on as the sheet name
# when the score workbook is read, and the 'NCSS' part of the name switches on
# the splice-site distribution analysis.
# Valid choices: ABCA4_NCSS, ABCA4_DI or MYBPC3_NCSS
variants = "ABCA4_NCSS"

# Read in the data and store it in a pandas dataframe

In [None]:
# Load the sheet selected by `variants` from the score workbook into a DataFrame
di = pd.read_excel('variants_scores.xlsx', sheet_name=variants)

# Get the distribution of variants

In [None]:
# Variants with more than 20 % mutant RNA are counted as splice altering.
# Only rows that have a cDNA variant entry are counted in the two sub-groups.
mutant_rna = di['% Mutant RNA']
cdna_names = di['cDNA variant']

print('# variants: ', len(di))
print('# non splice altering variants: ', cdna_names[mutant_rna <= 20].count())
print('# splice altering variants: ', cdna_names[mutant_rna > 20].count())

# Affects donor/acceptor

In [None]:
# Number of variants (with a cDNA variant entry) per affected splice site
site_counts = {
    site: di.loc[di['affects'] == site, 'cDNA variant'].count()
    for site in ('donor', 'acceptor')
}
print('donor: ', site_counts['donor'])
print('acceptor: ', site_counts['acceptor'])

In [None]:
# Boolean masks: which splice site (donor = SDS, acceptor = SAS) a variant affects
donor = di['affects'] == "donor"
acceptor = di['affects'] == "acceptor"

# Boolean masks: splice altering (> 20 % mutant RNA) or not
sa = di['% Mutant RNA'] > 20
nsa = di['% Mutant RNA'] <= 20

# Cross-tabulate site against effect and print the number of rows per combination
cross_table = [
    ('donor + affects splicing: ', donor & sa),
    ('donor + does not affect splicing: ', donor & nsa),
    ('acceptor + affects splicing: ', acceptor & sa),
    ('acceptor + does not affect splicing: ', acceptor & nsa),
]
for caption, selection in cross_table:
    print(caption, di[selection].shape[0])



# Calculate distribution around splice site for NCSS variants

In [None]:
locations = []
column_names = ['RNA','SSFL','MES','NNS','GS', 'splicerover', 'DSSP', 'spliceai','cagi','cadd', 'spidex', 'mmsplice','mtsplice','scap']


def _intronic_offset(cdna):
    """Return the signed distance of a cDNA variant from the splice site.

    The offset is the run of digits that directly follows the first '-'
    (upstream, returned as a negative number) or, if the description holds no
    '-', the first '+' (downstream, positive). Descriptions without either
    sign are treated as position 0.

    Reading the digits instead of cutting a fixed number of characters off
    the end means the suffix of the description does not matter: 'del',
    'delT', 'dup', 'ins...' and 'X>Y' are all handled, as are ranges written
    with '_' (the start of the range is used).

    Raises
    ------
    ValueError
        If the sign is not followed by at least one digit.
    """
    for sign_char, sign in (('-', -1), ('+', 1)):
        _, found, tail = cdna.partition(sign_char)
        if not found:
            continue
        end = 0
        while end < len(tail) and tail[end] in '0123456789':
            end += 1
        return sign * int(tail[:end])
    return 0


# only calculate the distribution for NCSS variants
if 'NCSS' in variants:
    for index in di.index:
        # mark variants that alter splicing (> 20 % mutant RNA); a separate
        # name is used so the boolean mask `sa` is not overwritten by a string
        tag = 'sa' if di.at[index, '% Mutant RNA'] > 20 else ''

        # label = "<offset> <tag>", e.g. "-3 sa" or "5 "
        offset = _intronic_offset(di.at[index, 'cDNA variant'])
        locations.append(str(offset) + ' ' + tag)

# count how often a variant is located at a certain position in the NCSS motif and show the result
Counter(locations)

# Calculate the missing scores for each tool

In [None]:
# Maximum value of each tool for which a delta (wild type vs. variant) score
# has to be calculated. It is the denominator whenever the wild-type score is 0.
# The insertion order must match the order of the tool columns in column_names.
delta = {
    'SSFL': 100,
    'MES': 12,
    'NNS': 1,
    'GS': 15,
    'SpliceRover': 1,
    'DSSP': 1,
}

# Score columns of which the absolute value is taken as is, in output order
abs_score_columns = ['CAGI', 'CADD', 'Spidex', 'MMSplice', 'MTSplice', 'SCAP']

# one list of values per variant, keyed by the row label in `di`
delta_scores = {}

for index in di.index:

    # first entry: 1 if the variant is splice altering (> 20 % mutant RNA), else 0
    row = [1 if di.at[index, '% Mutant RNA'] > 20 else 0]

    # calculate the delta scores
    for tool, max_score in delta.items():
        wt_score = di.at[index, tool + '_wt']
        var_score = di.at[index, tool + '_var']
        if wt_score == 0:
            # no relative change can be computed, scale by the tool maximum instead
            change = float(var_score) / max_score
        else:
            # change relative to the wild-type score
            change = (float(var_score) - float(wt_score)) / float(wt_score)
        row.append(np.absolute(change))

    # the SpliceAI score is used unchanged
    row.append(di.at[index, 'SpliceAI'])

    # absolute value of the CAGI, CADD, Spidex, MMSplice, MTSplice and SCAP scores
    row.extend(np.absolute(di.at[index, column]) for column in abs_score_columns)

    delta_scores[index] = row

# variants become rows, scores become columns
delta_df = pd.DataFrame(delta_scores).transpose()
delta_df.columns = column_names

# number of missing values per column
delta_df.isnull().sum()

# Confusion Matrix

In [None]:
def Find_Optimal_Cutoff(target, predicted):
    """Return the ROC threshold where sensitivity and specificity are closest.

    The ROC curve of ``predicted`` against ``target`` is computed with
    ``roc_curve``; the threshold with the smallest ``|tpr - (1 - fpr)|`` is
    selected.

    Parameters
    ----------
    target : dependent or target data, one entry per observation

    predicted : predicted data, one entry per observation

    Returns
    -------
    list
        One-element list holding the selected cutoff value.
    """
    fpr, tpr, threshold = roc_curve(target, predicted)

    # distance between sensitivity (tpr) and specificity (1 - fpr) per threshold
    gap = pd.Series(tpr - (1 - fpr)).abs()

    # position of the smallest distance
    best = gap.argsort().iloc[0]

    return [float(threshold[best])]

In [None]:
# replace missing values with 0
delta_df = delta_df.replace(nan, 0)

# Score columns to classify; thresholds are stored in this same order
score_columns = ['SSFL', 'MES', 'NNS', 'GS', 'splicerover', 'DSSP', 'spliceai',
                 'cagi', 'cadd', 'spidex', 'mmsplice', 'mtsplice', 'scap']

# Per tool, the cutoff above which a variant is defined to affect splicing
threshold = [
    Find_Optimal_Cutoff(delta_df['RNA'], delta_df[column])[0]
    for column in score_columns
]

print(threshold)

# create a new dataframe to store the classification, starting with the RNA-based label
classification = pd.DataFrame(delta_df['RNA'])

# add the classification of each tool: 1 if the score is above its threshold, else 0
for column, cutoff in zip(score_columns, threshold):
    classification[column] = (delta_df[column] > cutoff).astype('int')

# consensus: more than one of SSFL, MES, NNS and GS classify the variant as affecting splicing
classification['consensus'] = classification[['SSFL', 'MES', 'NNS', 'GS']].sum(axis=1) > 1

In [None]:
# Calculate and print the confusion matrix of every tool and of the consensus
# against the RNA-based labels.
# Each confusion matrix is in the format [[TN FP][FN TP]]
observed = classification.RNA.values
for name in [*column_names[1:], 'consensus']:
    print(name)
    print(confusion_matrix(observed, classification[name].values))