# Import

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats
from sklearn.metrics import roc_curve
from sklearn import metrics
import matplotlib.font_manager as font_manager
from numpy import nan
from collections import Counter
from sklearn.metrics import confusion_matrix
from functions import read_scores_from_excel

# Input

In [None]:
# Variant set to analyze; one of:
# ABCA4_NCSS, ABCA4_DI or MYBPC3_NCSS
variants = 'ABCA4_NCSS'

# Column labels for the score table: 'RNA' first, then one label per tool
column_names = [
    'RNA',
    'CADD',
    'DSSP',
    'GeneSplicer',
    'MaxEntScan',
    'MMSplice',
    'MTSplice',
    'NNSPLICE',
    'S-CAP',
    'SPIDEX',
    'SpliceAI',
    'SpliceRover',
    'SpliceSiteFinder-like',
]

# Read in the data and store it in a pandas dataframe

In [None]:
# load the sheet selected by `variants` into a pandas dataframe
di = pd.read_excel('variants_scores.xlsx', sheet_name=variants)

# Get the distribution of variants

In [None]:
# A variant counts as splice altering when its '% Mutant RNA' is above 20;
# the per-group numbers count the non-missing 'cDNA variant' entries.
print('# variants: ', len(di))
print('# non splice altering variants: ', di.loc[di['% Mutant RNA'] <= 20, 'cDNA variant'].count())
print('# splice altering variants: ', di.loc[di['% Mutant RNA'] > 20, 'cDNA variant'].count())

# Affects donor/acceptor

In [None]:
# count the non-missing 'cDNA variant' entries per affected splice site type
print('donor: ', di.loc[di['affects'] == 'donor', 'cDNA variant'].count())
print('acceptor: ', di.loc[di['affects'] == 'acceptor', 'cDNA variant'].count())

In [None]:
# Boolean masks: which splice site (SDS/SAS) the variant affects
donor = di['affects'].eq("donor")
acceptor = di['affects'].eq("acceptor")

# Boolean masks: splice altering (> 20 % mutant RNA) or not (<= 20 %)
sa = di['% Mutant RNA'].gt(20)
nsa = di['% Mutant RNA'].le(20)

# Report the number of variants in every site/effect combination
# (the number of True entries in the combined mask)
print('donor + affects splicing: ', (donor & sa).sum())
print('donor + does not affect splicing: ', (donor & nsa).sum())
print('acceptor + affects splicing: ', (acceptor & sa).sum())
print('acceptor + does not affect splicing: ', (acceptor & nsa).sum())


# Calculate distribution around splice site for NCSS variants

In [None]:
# One label per variant: '<position ss> <"sa" or empty> <affects>'
locations = []

# only calculate the distribution for NCSS variants
if 'NCSS' in variants:
    # Walk the three relevant columns in lockstep instead of doing a
    # label-based lookup per cell.
    variant_rows = zip(di['position ss'], di['% Mutant RNA'], di['affects'])

    for pos, mutant_rna, affects in variant_rows:
        # Tag variants that alter splicing (> 20 % mutant RNA).
        # A dedicated name is used so that the module-level boolean mask
        # `sa` is not overwritten by a string.
        splice_tag = 'sa' if mutant_rna > 20 else ''

        locations.append(str(pos) + ' ' + splice_tag + ' ' + affects)

# count how often a variant is located at a certain position in the NCSS motif
# (shown as the cell result when this is the last expression of a notebook cell)
Counter(locations)

# Calculate the missing scores for each tool

In [None]:
# Load the scores for the selected variants without filling missing values
# and label the columns with `column_names`
delta_df = read_scores_from_excel(
    'variants_scores.xlsx',
    variants,
    fillna=False,
    diall=True,
)
delta_df.columns = column_names

# number of missing values per column
delta_df.isna().sum()

# Confusion Matrix

In [None]:
def Find_Optimal_Cutoff(target, predicted):
    """ Select the ROC threshold where sensitivity and specificity are closest.

    The ROC curve of ``predicted`` against ``target`` is computed with
    ``roc_curve``; the threshold whose ``tpr - (1 - fpr)`` is nearest to
    zero is selected.

    @target : target data passed to ``roc_curve``, one entry per observation
    @predicted : predicted data passed to ``roc_curve``, one entry per observation
    Returns list type, holding the single selected cutoff value

    adapted from: https://stackoverflow.com/questions/28719067/roc-curve-and-cut-off-point-python

    """
    fpr, tpr, threshold = roc_curve(target, predicted)

    # zero where the true positive rate equals the true negative rate (1 - fpr)
    balance = pd.Series(tpr - (1 - fpr))
    cutoffs = pd.Series(threshold)

    # position of the ROC point whose balance is closest to zero
    nearest = balance.abs().argsort()[:1]

    return list(cutoffs.iloc[nearest])

In [None]:
# replace missing values with 0
delta_df = delta_df.replace(nan, 0)

# every column except the first ('RNA') holds the scores of one tool
tool_names = column_names[1:]

# set the threshold for the values that are considered to affect splicing.
# One cutoff per tool, derived from the ROC curve of the tool's scores
# against the 'RNA' column.
threshold = [
    Find_Optimal_Cutoff(delta_df['RNA'], delta_df[name])[0]
    for name in tool_names
]

print(threshold)

# create a new dataframe to store the classification
classification = pd.DataFrame(delta_df['RNA'])

# add the classification of the different tools to the dataframe
for name, cutoff in zip(tool_names, threshold):
    # roc_curve reports each operating point for predictions with
    # score >= threshold, so everything at or above the cutoff is defined
    # to affect splicing. A strict '>' would drop the samples lying exactly
    # on the cutoff and no longer match the selected ROC point.
    classification[name] = (delta_df[name] >= cutoff).astype('int')

# consensus: more than one of the four listed tools predicts an effect
consensus_tools = ['SpliceSiteFinder-like', 'MaxEntScan', 'NNSPLICE', 'GeneSplicer']
classification['consensus'] = classification[consensus_tools].sum(axis=1) > 1

# calculate the confusion matrix
# the confusion matrix is in the format [[TN FP][FN TP]]
for name in tool_names:
    print(name)
    print(confusion_matrix(classification.RNA.values, classification[name].values))

print('consensus')
print(confusion_matrix(classification.RNA.values, classification['consensus'].values))