In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np

from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef


import matplotlib.pyplot as plt
import seaborn as sns

import src.evaluation
import src.config

import random

In [2]:
results = pd.read_parquet('./results/df_data.parquet.gzip')

# display(results.head())
# display(results.sample(5))
# print(results.shape)
# print(results.columns.tolist())

In [68]:
def evaluate_mcc(targets, predictions, labels):
    p = {}
    
    for x in labels:
        target = [1 if y == x else 0 for y in targets]
        prediction = [1 if y == x else 0 for y in predictions]

        number_errors = 100
        slice_len = 10000
        errors = np.zeros(number_errors)
        for err in range(number_errors):
            index = random.randint(1, len(target) - slice_len)
            
            target_slice = target[index:index+slice_len]
            pred_slice = prediction[index:index+slice_len]
            # print(target_slice)
            # print(pred_slice)
            
            if target_slice == pred_slice:
                errors[err] = 1
            else:
                errors[err] = matthews_corrcoef(target_slice, pred_slice)
                
            # print(errors[err])
            # if errors[err] == 0:
            #     print(target[index:index+slice_len])
            #     print(prediction[index:index+slice_len])
        error = 0
        avg_err = np.mean(errors)
        error = np.sqrt(1/(errors.shape[0]-1) * np.sum(errors-avg_err)**2)
        print(errors)
        # print(error)

        print()
        p.update({x: [matthews_corrcoef(target, prediction), error]})
        
    return p

In [69]:
expert_labels = ['S', 'L', 'T', 'O', 'I', 'M']
index_names = ['predicted_label_linear_ALL', 'predicted_label_linear_experts', 'predicted_label_linear_experts_imperfect', 'predicted_label_crf_ALL', 'predicted_label_crf_experts', 'predicted_label_crf_experts_imperfect']
real_names = ['Linear Broad', 'Linear Experts Perfect Gate', 'Linear Experts Imperfect Gate', 'CRF Broad', 'CRF Experts Perfect Gate', 'CRF Experts Imperfect Gate']

mcc_values = {}
for index_name, real_name in zip(index_names, real_names):
    # print(index_name, real_name)
    tmp_mcc_values = evaluate_mcc(
        targets=list(''.join(results['Label'].tolist())),
        predictions=list(''.join(results[index_name].tolist())),
        labels=expert_labels
    )
    # tmp_mcc_values['real_names'] = real_name
    mcc_values.update({real_name: tmp_mcc_values})
    break

[1.         1.         0.96867708 0.         0.95755483 0.
 1.         0.95209979 0.         0.         0.95047012 0.
 0.         0.93379707 0.         0.         0.         0.
 1.         0.         0.         0.         1.         0.
 0.95743063 0.         1.         0.93948029 1.         0.95132681
 1.         0.         0.         0.95223754 1.         0.
 0.96308207 0.         0.9516459  0.         1.         0.94248115
 0.         0.         1.         1.         0.         1.
 0.         1.         0.96963019 0.         0.         0.
 1.         0.         0.         0.         1.         0.
 1.         0.         1.         0.94093468 0.95146724 1.
 0.94702015 1.         0.82299335 1.         0.94046342 1.
 1.         0.         0.97788515 0.93345464 0.         0.93540327
 0.         1.         0.         1.         0.         0.
 1.         0.         0.92223077 0.96007122 0.         0.
 1.         0.95052274 0.         0.         1.         1.
 0.         0.82851748 0.       

In [67]:
mcc_values

{'Linear Broad': {'S': [0.9288273276602671, 4.351682880166993e-16],
  'L': [0.9559388624269421, 1.896887409303561e-16],
  'T': [0.9458460216910255, 1.2943467028189004e-15],
  'O': [0.9378387316919236, 4.240101267855019e-16],
  'I': [0.9317978915274424, 1.283188541587703e-16],
  'M': [0.5756976280898956, 1.7853057969915868e-16]}}

In [None]:
df_mcc_values = pd.DataFrame(mcc_values).reset_index().rename(columns={'index': 'Label'}).melt(id_vars=['Label'], var_name='Model', value_name='MCC')
df_mcc_values['Error'] = df_mcc_values['MCC'].apply(lambda x: x[1])
df_mcc_values['MCC'] = df_mcc_values['MCC'].apply(lambda x: x[0])
# sns.set(style="whitegrid")
# ax = sns.barplot(data=df_mcc_values, ci=None)
# ax.set(ylim=(0.5, None))
df_mcc_values['Label'] = df_mcc_values['Label'].map({'S': 'Sec/SPI\nSignal (S)', 'L': 'Sec/SPII\nSignal (L)', 'T': 'Tat/SPI Signal\n(T)', 'O': 'Outer\nRegion (O)', 'I': 'Inner Region (I)', 'M': 'Membrane\nRegion (M)'})

In [None]:
src.evaluation.plot_mcc_split_label(df_mcc_values, 'mako')

---

In [70]:
def evaluate_mcc_simple(targets, predictions, labels):
    p = {}
    
    for x in labels:
        target = [1 if y == x else 0 for y in targets]
        prediction = [1 if y == x else 0 for y in predictions]
        if target == prediction:
            mcc = 1
        else:
            mcc = matthews_corrcoef(target, prediction)
        p.update({x: mcc})
    return p

In [120]:
mcc_results = [None] * results.shape[0]
for index, item in results.iterrows():
    target = item['Label']
    prediction = item['predicted_label_linear_ALL']
    mcc = evaluate_mcc_simple(target, prediction, expert_labels)
    mcc_results[index] = mcc
mcc_results = pd.DataFrame(mcc_results)
    

In [121]:
# mcc_results['M'].std()

In [122]:
def calc_mcc_error(mcc_results_iter):
    mcc_results_iter = [x for x in mcc_results_iter if x is not None]
    mccs = np.array(mcc_results_iter)
    mean_mcc = np.mean(mccs)
    # error = np.sqrt(1/(mccs.shape[0]-1) * np.sum(mccs-mean_mcc)**2)
    error = mccs.std()/np.sqrt(mccs.shape[0])
    return mean_mcc, error

In [123]:
[calc_mcc_error(mcc_results[x]) for x in expert_labels]

[(0.9707824544601842, 0.002483181357098469),
 (0.9925504911555059, 0.0013182052789598947),
 (0.99690723191429, 0.000746330187672169),
 (0.9477849637883972, 0.0030853825462679825),
 (0.9444530625741157, 0.003287752100556466),
 (0.9537025352864126, 0.002841627537207284)]

In [113]:
tmp_res = results.apply(lambda x: evaluate_mcc_simple(
    targets=list(results['Label']),
    predictions=list(results['predicted_label_crf_experts_imperfect']),
    labels=expert_labels), axis=1)

In [None]:
df_tmp_res = pd.DataFrame([pd.Series(x) for x in tmp_res])

In [None]:
df_tmp_res.head()

In [None]:
df_tmp_res.value_counts()

In [None]:
np.sqrt(1/(df_tmp_res['S'].shape[0]-1) * (np.sum(df_tmp_res['S']-df_tmp_res['S'].mean()**2)))

In [None]:
# mcc_values = {}

# for index_name, real_name in zip(index_names, real_names):
#      p.update({real_name: evaluate_mcc_simple()
    

In [None]:
mc_mcc = results.apply(lambda x: 1 if x['Label'] == x['predicted_label_linear_ALL'] else matthews_corrcoef(list(x['Label']), list(x['predicted_label_linear_ALL'])), axis=1)

In [None]:
np.sqrt(1/(mc_mcc.shape[0]-1) * (np.sum(mc_mcc-mc_mcc.mean()**2)))


In [None]:
mc_mcc.shape[0]

In [None]:
(1/(mc_mcc.shape[0]-1)) * np.sum((mc_mcc-mc_mcc.mean())**2)

In [None]:
mc_mcc.std()/np.sqrt(mc_mcc.shape[0])