In [1]:
its_jupyter_notebook = True

In [2]:
import pandas as pd
import os
import time
import numpy as np
import seaborn as sns
import pickle
import torch
from pathlib import Path
import argparse
import math
import matplotlib.pyplot as plt
import sys
from sklearn.metrics import classification_report, roc_curve, roc_auc_score, auc
from scipy.stats import chi2_contingency, fisher_exact
from tqdm.notebook import tqdm
sys.path.insert(0, '..')

from util.xai import gradcam, interpolate_expl_matrix, plot_matrix
from config import *
from models.nt_classifier import build as build_model 
from util.plot_utils import balance_df, obtain_plot, plot_logs, plot_roc_curves
from util.plot_utils import collect_results_based_on_confidence_level_based_on_percentile as collect_results_based_on_confidence_level
if its_jupyter_notebook:
    sys.argv = [''] #Remove this if it's not a jupyter notebook!

<class 'transformers.tokenization_dna.DNATokenizer'>


In [3]:
checkpoint_dir_paths = []

chkpt_folder = os.path.join(ROOT_DIR, 'checkpoints')

models_to_check = os.listdir(chkpt_folder)
for model_name in models_to_check:
    model_folder = os.path.join(chkpt_folder, model_name)
    test_paris = os.path.join(chkpt_folder, model_name, 'test_results500.csv')
    ricseq = os.path.join(chkpt_folder, model_name, 'ricseq_results500.csv')
    splash = os.path.join(chkpt_folder, model_name, 'splash_results500.csv')
    if os.path.exists(test_paris) & os.path.exists(ricseq) & os.path.exists(splash) :
        checkpoint_dir_paths.append(model_folder)

In [4]:
def obtain_auc_nt_intarna(res):
    fpr, tpr, _ = roc_curve(res.ground_truth, res.probability)
    roc_auc_nt = auc(fpr, tpr)
    fpr, tpr, _ = roc_curve(res.ground_truth, abs(res.E_norm))
    roc_auc_intarna = auc(fpr, tpr)
    return roc_auc_nt, roc_auc_intarna

In [5]:
diz_results = {}
name_map = {}

In [6]:
for _, checkpoint_dir in tqdm(enumerate(checkpoint_dir_paths), total = len(checkpoint_dir_paths)):
    row = {}

    # -------------- -------------- -------------- --------------
    #                            PARIS
    # -------------- -------------- -------------- --------------

    HOW = 'test'
    SPECIE = 'human'
    n_conf = 10

    # -------------- -------------- -------------- --------------

    file_train = os.path.join(rna_rna_files_dir, "gene_pairs_training.txt")
    with open(file_train, "rb") as fp:   # Unpickling
        gene_pairs_train_original = pickle.load(fp)

    file_train = os.path.join(rna_rna_files_dir, "gene_pairs_training_nt_HQ.txt")
    with open(file_train, "rb") as fp:   # Unpickling
        gene_pairs_train = pickle.load(fp)

    file_test = os.path.join(rna_rna_files_dir, f"gene_pairs_{HOW}_nt_HQ.txt")
    with open(file_test, "rb") as fp:   # Unpickling
        gene_pairs_test = pickle.load(fp)

    #-------------- -------------- -------------- --------------

    test500 = pd.read_csv(os.path.join(metadata_dir, f'{HOW}500.csv'))
    assert test500.couples.isin(gene_pairs_test).all()

    df_nt = pd.read_csv(os.path.join(metadata_dir, f'df_nt_HQ.csv'))

    assert test500.shape[0] == df_nt[['couples', 'interacting', 'policy']].merge(test500, on = 'couples').shape[0]
    test500 = df_nt[['couples', 'interacting', 'policy', 'where', 'where_x1', 'where_y1', 'simple_repeats', 'sine_alu', 'low_complex']].merge(test500, on = 'couples')

    id_cds_cds = set(test500[test500['where'] == 'CDS-CDS'].couples)

    #-------------- -------------- -------------- --------------

    res = pd.read_csv(os.path.join(checkpoint_dir, f'{HOW}_results500.csv'))
    # Drop all the pairs (they should be 60-70) that are present in the training set.
    res = res[~res.couples.isin(gene_pairs_train_original)]
    # show only results for 1 specie
    res = res[res.specie == SPECIE]

    intarna = pd.read_csv(os.path.join(intarna_dir, f'{HOW}500', f'{HOW}.csv'), sep = ';')
    intarna['key'] = intarna.id1 + '_' + intarna.id2

    # keep only the lower E_norm for each group
    intarna.sort_values('E_norm', ascending = False, inplace=True)
    intarna.drop_duplicates(subset='key', keep='first', inplace=True)
    intarna = intarna.reset_index(drop = True)
    intarna['couples'] = intarna.id1.str.extractall('(.*)_(.*)').reset_index(drop = True)[0]
    intarna['couples'] = intarna['couples'].astype(int)
    intarna = intarna.dropna()

    res = res.merge(intarna[['E','E_norm', 'couples']].rename({'couples':'id_sample'}, axis =1), on = 'id_sample')

    # -------------- -------------- -------------- --------------

    subset_test_nt_sampled = os.path.join(rna_rna_files_dir, f"gene_pairs_{HOW}_sampled_nt_HQ.txt")
    with open(subset_test_nt_sampled, "rb") as fp:  # Unpickling
        subset_test_nt_sampled = pickle.load(fp)

    easypos_smartneg_sampled = res[res.id_sample.isin(subset_test_nt_sampled)]
    easypos_smartneg_sampled = balance_df(easypos_smartneg_sampled)

    row['paris_acc_ES_sampled'] = (easypos_smartneg_sampled.prediction == easypos_smartneg_sampled.ground_truth).sum()/easypos_smartneg_sampled.shape[0]
    row['paris_auc_ES_sampled'], row['paris_aucINTARNA_ES_sampled'] = obtain_auc_nt_intarna(easypos_smartneg_sampled)

    # -------------- -------------- -------------- --------------

    subset_test_nt_sampled = os.path.join(rna_rna_files_dir, f"gene_pairs_{HOW}_HQ.txt")
    with open(subset_test_nt_sampled, "rb") as fp:  # Unpickling
        subset_test_nt_sampled = pickle.load(fp)

    all_val = res[res.couples.isin(subset_test_nt_sampled)]
    assert all_val.shape[0] == res.shape[0]
    easypos_smartneg = all_val[all_val.policy.isin(['smartneg', 'easypos'])]

    easypos_smartneg_balanced = balance_df(easypos_smartneg)

    row['paris_acc_ES'] = (easypos_smartneg_balanced.prediction == easypos_smartneg_balanced.ground_truth).sum()/easypos_smartneg_balanced.shape[0]
    row['paris_auc_ES'], row['paris_aucINTARNA_ES']  = obtain_auc_nt_intarna(easypos_smartneg_balanced)

    # -------------- -------------- -------------- --------------

    easypos_smartneg_balanced = balance_df(easypos_smartneg[easypos_smartneg.id_sample.isin(id_cds_cds)])

    row['paris_acc_ES_cdscds'] = (easypos_smartneg_balanced.prediction == easypos_smartneg_balanced.ground_truth).sum()/easypos_smartneg_balanced.shape[0]
    row['paris_auc_ES_cdscds'], row['paris_aucINTARNA_ES_cdscds'] = obtain_auc_nt_intarna(easypos_smartneg_balanced)

    # -------------- -------------- -------------- --------------

    easypos_smartneg_balanced = balance_df(easypos_smartneg[~easypos_smartneg.id_sample.isin(id_cds_cds)])

    row['paris_acc_ES_nocdscds'] = (easypos_smartneg_balanced.prediction == easypos_smartneg_balanced.ground_truth).sum()/easypos_smartneg_balanced.shape[0]
    row['paris_auc_ES_nocdscds'], row['paris_aucINTARNA_ES_nocdscds'] = obtain_auc_nt_intarna(easypos_smartneg_balanced)

    # -------------- -------------- -------------- --------------

    HQ_TRESHOLD = 35

    subset = test500[ (abs(test500.seed_x1 - test500.seed_x2) >HQ_TRESHOLD) & (abs(test500.seed_y1 - test500.seed_y2) > HQ_TRESHOLD) ]
    easypos_smartneg_HQ = easypos_smartneg[easypos_smartneg.id_sample.isin(subset.couples)]

    assert set(easypos_smartneg_HQ.policy.value_counts().index) == set(['smartneg', 'easypos'])

    easypos_smartneg_HQ_balanced = balance_df(easypos_smartneg_HQ)

    row['paris_acc_ES_HQ'] = (easypos_smartneg_HQ_balanced.prediction == easypos_smartneg_HQ_balanced.ground_truth).sum()/easypos_smartneg_HQ_balanced.shape[0]
    row['paris_auc_ES_HQ'], row['paris_aucINTARNA_ES_HQ'] = obtain_auc_nt_intarna(easypos_smartneg_HQ_balanced)

    # -------------- -------------- -------------- --------------

    ep = res[(res.policy == 'easypos')]
    row['paris_acc_easypos'] = ep[ep.ground_truth == ep.prediction].shape[0]/ep.shape[0]

    sn = res[(res.policy == 'smartneg')]
    row['paris_acc_smartneg'] = sn[sn.ground_truth == sn.prediction].shape[0]/sn.shape[0]

    hn = res[(res.policy == 'hardneg')]
    row['paris_acc_shardneg'] = hn[hn.ground_truth == hn.prediction].shape[0]/hn.shape[0]

    easy = res[(res.policy == 'easyneg')]
    row['paris_acc_easyneg'] = easy[easy.ground_truth == easy.prediction].shape[0]/easy.shape[0]

    # -------------- -------------- -------------- --------------

    ephnen = res[(res.policy == 'easyneg')|(res.policy == 'easypos')|(res.policy == 'hardneg')]

    ephnen_sampled = balance_df(ephnen)

    row['paris_acc_EPENHN'] = ephnen_sampled[ephnen_sampled.ground_truth == ephnen_sampled.prediction].shape[0]/ephnen_sampled.shape[0]
    row['paris_auc_EPENHN'], row['paris_aucINTARNA_EPENHN'] = obtain_auc_nt_intarna(ephnen_sampled)

    # -------------- -------------- -------------- --------------

    poshq = set(easypos_smartneg_HQ[easypos_smartneg_HQ.ground_truth == 1].id_sample)
    ephnen = res[(res.policy == 'easyneg')|(res.id_sample.isin(poshq))|(res.policy == 'hardneg')]

    ephnen_sampled = balance_df(ephnen)

    row['paris_acc_EPENHN_HQ'] = ephnen_sampled[ephnen_sampled.ground_truth == ephnen_sampled.prediction].shape[0]/ephnen_sampled.shape[0]
    row['paris_auc_EPENHN_HQ'], row['paris_aucINTARNA_EPENHN_HQ'] = obtain_auc_nt_intarna(ephnen_sampled)

    # -------------- -------------- -------------- --------------

    df_nt = pd.read_csv(os.path.join(metadata_dir, f'df_nt_HQ.csv'))
    regex = df_nt[df_nt.couples.isin(gene_pairs_train)].couples_id.str.extractall('(.*)_(.*)').reset_index()
    df_train = regex
    df_train['g1'] = regex[0]
    df_train['g2'] = regex[1]
    df_train_genes = set(df_train['g1']).union(set(df_train['g2']))

    not_in_train = res[~(res.gene1_original.isin(df_train_genes) & res.gene2_original.isin(df_train_genes))]
    not_in_train_epsn = not_in_train[(not_in_train.policy == 'easypos')|(not_in_train.policy == 'smartneg')]
    not_in_train_epsn_balanced = balance_df(not_in_train_epsn)

    row['paris_acc_ES_notrain'] = not_in_train_epsn_balanced[not_in_train_epsn_balanced.ground_truth == not_in_train_epsn_balanced.prediction].shape[0]/not_in_train_epsn_balanced.shape[0]
    row['paris_auc_ES_notrain'], row['paris_aucINTARNA_ES_notrain'] = obtain_auc_nt_intarna(not_in_train_epsn_balanced)

    # -------------- -------------- -------------- --------------

    n_values = 15
    MIN_PERC = 0.5

    confidence_level, auc_nt, auc_intarna = collect_results_based_on_confidence_level(easypos_smartneg, how = 'intarna', MIN_PERC = MIN_PERC, balance = False, n_values = n_values, space = 'log')

    for i in range(len(confidence_level)):
        row[f'paris_ES_INTARNAconf{confidence_level[i]}'] = auc_nt[i]
        row[f'paris_ES_INTARNA_INTARNAconf{confidence_level[i]}'] = auc_intarna[i]

    confidence_level, auc_nt, auc_intarna = collect_results_based_on_confidence_level(easypos_smartneg, how = 'nt', MIN_PERC = MIN_PERC, balance = False, n_values = n_values, space = 'log')

    for i in range(len(confidence_level)):
        row[f'paris_ES_NTconf{confidence_level[i]}'] = auc_nt[i]
        row[f'paris_ES_INTARNA_NTconf{confidence_level[i]}'] = auc_intarna[i]


    # -------------- -------------- -------------- --------------
    #                       RICSEQ and SPLASH
    # -------------- -------------- -------------- --------------

    only_test = True
    exclude_train_genes = False
    MIN_N_READS_RICSEQ = 3

    for how in ['ricseq', 'splash']:

        res = pd.read_csv(os.path.join(checkpoint_dir, f'{how}_results500.csv'))

        file_train = os.path.join(rna_rna_files_dir, f'{how}', 'gene_pairs_training.txt')
        with open(file_train, "rb") as fp:   # Unpickling
            train_couples = pickle.load(fp)

        file_test = os.path.join(rna_rna_files_dir, f'{how}', 'gene_pairs_test.txt')
        with open(file_test, "rb") as fp:   # Unpickling
            test_couples = pickle.load(fp)

        tr_genes=pd.Series(train_couples).str.extractall('(.*)_(.*)').reset_index()
        training_genes = set(tr_genes[0]).union(tr_genes[1])

        if only_test:
            res = res[res.couples.isin(test_couples)]
            if exclude_train_genes:
                res = res[~(res.gene1_original.isin(training_genes)) & (res.gene2_original.isin(training_genes))]


        test500 = pd.read_csv(os.path.join(metadata_dir, f'{how}500.csv'))
        df_nt = pd.read_csv(os.path.join(metadata_dir, f'df_nt_{how}.csv'))

        assert test500.shape[0] == df_nt[['couples', 'interacting', 'policy']].merge(test500, on = 'couples').shape[0]

        if how == 'ricseq':
            test500 = df_nt[['couples', 'interacting', 'policy', 'where', 'where_x1', 'where_y1', 'simple_repeats', 'sine_alu', 'low_complex', 'n_reads']].merge(test500, on = 'couples')
            ids_to_keep = set(test500[test500.n_reads >= MIN_N_READS_RICSEQ].couples).union(test500[test500.interacting==False].couples)
            res = res[res.id_sample.isin(ids_to_keep)]
        elif how == 'mario':
            test500 = df_nt[['couples', 'interacting', 'policy', 'where', 'where_x1', 'where_y1', 'simple_repeats', 'sine_alu', 'low_complex', 'n_reads']].merge(test500, on = 'couples')
        elif how == 'splash':
            test500 = df_nt[['couples', 'interacting', 'policy', 'where', 'where_x1', 'where_y1', 'experiment']].merge(test500, on = 'couples')
        else:
            raise NotImplementedError

        id_cds_cds = set(test500[test500['where'] == 'CDS-CDS'].couples)


        intarna = pd.read_csv(os.path.join(intarna_dir, f'{how}500_RANDOM', f'{how}.csv'), sep = ';')
        intarna['key'] = intarna.id1 + '_' + intarna.id2

        # keep only the lower E_norm for each group
        intarna.sort_values('E_norm', ascending = False, inplace=True)
        intarna.drop_duplicates(subset='key', keep='first', inplace=True)
        intarna = intarna.reset_index(drop = True)
        intarna['couples'] = intarna.id1.str.extractall('(.*)_(.*)').reset_index(drop = True)[0]
        intarna['couples'] = intarna['couples'].astype(int)

        intarna = intarna.dropna()
        res = res.merge(intarna[['E','E_norm', 'couples']].rename({'couples':'id_sample'}, axis =1), on = 'id_sample')

        # -------------- -------------- -------------- --------------

        easypos_smartneg = res[res.policy.isin(['smartneg', 'easypos'])]
        balanced_epsn = balance_df(easypos_smartneg)

        row[f'{how}_acc_ES'] = (balanced_epsn.prediction == balanced_epsn.ground_truth).sum()/balanced_epsn.shape[0]
        row[f'{how}_auc_ES'], row[f'{how}_aucINTARNA_ES'] = obtain_auc_nt_intarna(balanced_epsn)

        # -------------- -------------- -------------- --------------

        easypos_smartneg_balanced = balance_df(easypos_smartneg[easypos_smartneg.id_sample.isin(id_cds_cds)])
        row[f'{how}_acc_ES_cdscds'] = (easypos_smartneg_balanced.prediction == easypos_smartneg_balanced.ground_truth).sum()/easypos_smartneg_balanced.shape[0]
        row[f'{how}_auc_ES_cdscds'], row[f'{how}_aucINTARNA_ES_cdscds'] = obtain_auc_nt_intarna(easypos_smartneg_balanced)

        # -------------- -------------- -------------- --------------

        easypos_smartneg_balanced = balance_df(easypos_smartneg[~easypos_smartneg.id_sample.isin(id_cds_cds)])

        row[f'{how}_acc_ES_nocdscds'] = (easypos_smartneg_balanced.prediction == easypos_smartneg_balanced.ground_truth).sum()/easypos_smartneg_balanced.shape[0]
        row[f'{how}_auc_ES_nocdscds'], row[f'{how}_aucINTARNA_ES_nocdscds'] = obtain_auc_nt_intarna(easypos_smartneg_balanced)

        # -------------- -------------- -------------- --------------

        ep = res[(res.policy == 'easypos')]
        row[f'{how}_acc_easypos'] = ep[ep.ground_truth == ep.prediction].shape[0]/ep.shape[0]

        sn = res[(res.policy == 'smartneg')]
        row[f'{how}_acc_smartneg'] = sn[sn.ground_truth == sn.prediction].shape[0]/sn.shape[0]

        hn = res[(res.policy == 'hardneg')]
        row[f'{how}_acc_shardneg'] = hn[hn.ground_truth == hn.prediction].shape[0]/hn.shape[0]

        easy = res[(res.policy == 'easyneg')]
        row[f'{how}_acc_easyneg'] = easy[easy.ground_truth == easy.prediction].shape[0]/easy.shape[0]

        # -------------- -------------- -------------- --------------

        ephnen = res[(res.policy == 'easyneg')|(res.policy == 'easypos')|(res.policy == 'hardneg')]

        ephnen_sampled = balance_df(ephnen)
        row[f'{how}_acc_EPENHN'] = ephnen_sampled[ephnen_sampled.ground_truth == ephnen_sampled.prediction].shape[0]/ephnen_sampled.shape[0]
        row[f'{how}_auc_EPENHN'], row[f'{how}_aucINTARNA_EPENHN'] = obtain_auc_nt_intarna(ephnen_sampled)


        # -------------- -------------- -------------- --------------

        confidence_level, auc_nt, auc_intarna = collect_results_based_on_confidence_level(easypos_smartneg, how = 'intarna', MIN_PERC = MIN_PERC, balance = False, n_values = n_values, space = 'log')

        for i in range(len(confidence_level)):
            row[f'{how}_ES_INTARNAconf{confidence_level[i]}'] = auc_nt[i]
            row[f'{how}_ES_INTARNA_INTARNAconf{confidence_level[i]}'] = auc_intarna[i]

        confidence_level, auc_nt, auc_intarna = collect_results_based_on_confidence_level(easypos_smartneg, how = 'nt', MIN_PERC = MIN_PERC, balance = False, n_values = n_values, space = 'log')

        for i in range(len(confidence_level)):
            row[f'{how}_ES_NTconf{confidence_level[i]}'] = auc_nt[i]
            row[f'{how}_ES_INTARNA_NTconf{confidence_level[i]}'] = auc_intarna[i]

    # -------------- -------------- -------------- --------------
    #                 RICSEQ and SPLASH not in train
    # -------------- -------------- -------------- --------------

    only_test = True
    exclude_train_genes = True
    MIN_N_READS_RICSEQ = 3

    for how in ['ricseq', 'splash']:

        res = pd.read_csv(os.path.join(checkpoint_dir, f'{how}_results500.csv'))

        file_train = os.path.join(rna_rna_files_dir, f'{how}', 'gene_pairs_training.txt')
        with open(file_train, "rb") as fp:   # Unpickling
            train_couples = pickle.load(fp)

        file_test = os.path.join(rna_rna_files_dir, f'{how}', 'gene_pairs_test.txt')
        with open(file_test, "rb") as fp:   # Unpickling
            test_couples = pickle.load(fp)

        tr_genes=pd.Series(train_couples).str.extractall('(.*)_(.*)').reset_index()
        training_genes = set(tr_genes[0]).union(tr_genes[1])

        if only_test:
            res = res[res.couples.isin(test_couples)]
            if exclude_train_genes:
                res = res[~(res.gene1_original.isin(training_genes)) & (res.gene2_original.isin(training_genes))]


        test500 = pd.read_csv(os.path.join(metadata_dir, f'{how}500.csv'))
        df_nt = pd.read_csv(os.path.join(metadata_dir, f'df_nt_{how}.csv'))

        assert test500.shape[0] == df_nt[['couples', 'interacting', 'policy']].merge(test500, on = 'couples').shape[0]

        if how == 'ricseq':
            test500 = df_nt[['couples', 'interacting', 'policy', 'where', 'where_x1', 'where_y1', 'simple_repeats', 'sine_alu', 'low_complex', 'n_reads']].merge(test500, on = 'couples')
            ids_to_keep = set(test500[test500.n_reads >= MIN_N_READS_RICSEQ].couples).union(test500[test500.interacting==False].couples)
            res = res[res.id_sample.isin(ids_to_keep)]
        elif how == 'mario':
            test500 = df_nt[['couples', 'interacting', 'policy', 'where', 'where_x1', 'where_y1', 'simple_repeats', 'sine_alu', 'low_complex', 'n_reads']].merge(test500, on = 'couples')
        elif how == 'splash':
            test500 = df_nt[['couples', 'interacting', 'policy', 'where', 'where_x1', 'where_y1', 'experiment']].merge(test500, on = 'couples')
        else:
            raise NotImplementedError

        id_cds_cds = set(test500[test500['where'] == 'CDS-CDS'].couples)


        intarna = pd.read_csv(os.path.join(intarna_dir, f'{how}500_RANDOM', f'{how}.csv'), sep = ';')
        intarna['key'] = intarna.id1 + '_' + intarna.id2

        # keep only the lower E_norm for each group
        intarna.sort_values('E_norm', ascending = False, inplace=True)
        intarna.drop_duplicates(subset='key', keep='first', inplace=True)
        intarna = intarna.reset_index(drop = True)
        intarna['couples'] = intarna.id1.str.extractall('(.*)_(.*)').reset_index(drop = True)[0]
        intarna['couples'] = intarna['couples'].astype(int)

        intarna = intarna.dropna()
        res = res.merge(intarna[['E','E_norm', 'couples']].rename({'couples':'id_sample'}, axis =1), on = 'id_sample')

        # -------------- -------------- -------------- --------------

        easypos_smartneg = res[res.policy.isin(['smartneg', 'easypos'])]
        balanced_epsn = balance_df(easypos_smartneg)

        row[f'{how}_acc_ES_notrain'] = (balanced_epsn.prediction == balanced_epsn.ground_truth).sum()/balanced_epsn.shape[0]
        row[f'{how}_auc_ES_notrain'], row[f'{how}_aucINTARNA_ES_notrain'] = obtain_auc_nt_intarna(balanced_epsn)

    model_name = f'model{_}'
    name_map[model_name] = checkpoint_dir
    diz_results[model_name] = row

  0%|          | 0/23 [00:00<?, ?it/s]



In [7]:
df = pd.DataFrame.from_dict(diz_results, 'index')
df = df*100
df = df.round(2)
df = df.reset_index().rename({'index':'model'}, axis = 1)

In [12]:
intarna_columns = []
all_columns = list(df.columns)
for col in all_columns:
    col_clean = col.replace('INTARNAconf', '')
    if 'INTARNA' in col_clean:
        intarna_columns.append(col)

In [15]:
df_intarna, other = df.filter(intarna_columns, axis = 1), df.filter(set(df.columns) - set(intarna_columns), axis = 1)
# df_intarna_std = df_intarna.std() #THIS HAS SEVERAL STANDARD DEV > 1, WHERE THERE IS NTconf IS NORMAL
# df_intarna_std[df_intarna_std > 1]

In [31]:
# df_intarna_std = df_intarna.std() #THIS HAS SEVERAL STANDARD DEV > 1, WHERE THERE IS NTconf IS NORMAL
# df_intarna_std[df_intarna_std > 1]

In [36]:
other.columns

Index(['splash_ES_INTARNAconf1.56', 'paris_acc_EPENHN_HQ',
       'splash_ES_NTconf68.49', 'ricseq_ES_INTARNAconf100.0',
       'paris_ES_NTconf46.91', 'paris_ES_NTconf32.13', 'paris_acc_ES_cdscds',
       'ricseq_acc_ES_notrain', 'splash_ES_NTconf100.0',
       'splash_ES_INTARNAconf4.84',
       ...
       'paris_ES_INTARNAconf7.07', 'splash_ES_NTconf0.5',
       'splash_ES_NTconf3.32', 'splash_ES_NTconf0.73', 'paris_acc_ES_sampled',
       'splash_acc_ES_cdscds', 'paris_acc_ES_HQ', 'splash_acc_ES_nocdscds',
       'paris_acc_EPENHN', 'splash_acc_easypos'],
      dtype='object', length=139)

In [34]:
df_intarna

Unnamed: 0,paris_aucINTARNA_ES_sampled,paris_aucINTARNA_ES,paris_aucINTARNA_ES_cdscds,paris_aucINTARNA_ES_nocdscds,paris_aucINTARNA_ES_HQ,paris_aucINTARNA_EPENHN,paris_aucINTARNA_EPENHN_HQ,paris_aucINTARNA_ES_notrain,paris_ES_INTARNA_INTARNAconf100.0,paris_ES_INTARNA_INTARNAconf68.49,...,splash_ES_INTARNA_NTconf7.07,splash_ES_INTARNA_NTconf4.84,splash_ES_INTARNA_NTconf3.32,splash_ES_INTARNA_NTconf2.27,splash_ES_INTARNA_NTconf1.56,splash_ES_INTARNA_NTconf1.07,splash_ES_INTARNA_NTconf0.73,splash_ES_INTARNA_NTconf0.5,ricseq_aucINTARNA_ES_notrain,splash_aucINTARNA_ES_notrain
0,52.16,52.17,54.89,51.78,52.99,57.06,59.43,51.67,52.18,52.71,...,78.3,76.67,83.33,81.25,100.0,100.0,100.0,100.0,53.21,79.4
1,52.18,52.19,54.91,51.77,52.9,56.69,59.45,51.67,52.18,52.71,...,62.5,58.04,44.17,39.06,20.0,0.0,0.0,0.0,54.03,79.38
2,52.17,52.17,54.96,51.77,52.82,56.8,59.33,51.66,52.18,52.71,...,67.99,64.29,66.96,56.67,56.25,42.86,40.0,33.33,54.18,79.21
3,52.15,52.16,54.91,51.77,52.83,56.9,59.34,51.59,52.18,52.71,...,63.9,66.27,74.17,70.31,88.0,93.75,87.5,100.0,55.29,79.32
4,52.15,52.19,54.94,51.76,52.89,56.99,59.92,51.64,52.18,52.71,...,64.52,70.42,63.33,58.73,66.67,66.67,40.0,,53.57,78.85
5,52.17,52.17,54.92,51.74,53.02,56.93,59.79,51.6,52.18,52.71,...,63.74,67.97,65.83,71.67,68.75,75.0,75.0,100.0,52.49,79.18
6,52.16,52.18,54.87,51.78,52.96,56.96,59.64,51.63,52.18,52.71,...,66.35,66.25,67.62,61.82,50.0,46.67,62.5,33.33,53.83,79.45
7,52.17,52.2,54.88,51.78,52.86,56.9,59.39,51.6,52.18,52.71,...,70.1,66.47,83.33,79.69,66.67,60.0,100.0,100.0,54.86,79.18
8,52.16,52.17,54.93,51.78,52.88,56.78,59.36,51.68,52.18,52.71,...,55.95,61.25,77.14,88.33,87.5,81.25,75.0,75.0,54.14,79.27
9,52.13,52.2,54.96,51.81,52.95,57.07,59.42,51.7,52.18,52.71,...,55.58,47.84,57.26,63.49,79.17,68.75,37.5,50.0,53.88,79.28


In [32]:
df_intarna_std[df_intarna_std > 1]

paris_ES_INTARNA_NTconf15.07      1.150058
paris_ES_INTARNA_NTconf10.32      1.450238
paris_ES_INTARNA_NTconf7.07       2.095403
paris_ES_INTARNA_NTconf4.84       2.670780
paris_ES_INTARNA_NTconf3.32       3.694840
paris_ES_INTARNA_NTconf2.27       4.398201
paris_ES_INTARNA_NTconf1.56       5.917345
paris_ES_INTARNA_NTconf1.07       6.609176
paris_ES_INTARNA_NTconf0.73       8.321608
paris_ES_INTARNA_NTconf0.5        9.918021
ricseq_ES_INTARNA_NTconf68.49     1.096763
ricseq_ES_INTARNA_NTconf46.91     1.398328
ricseq_ES_INTARNA_NTconf32.13     1.638393
ricseq_ES_INTARNA_NTconf22.01     2.154086
ricseq_ES_INTARNA_NTconf15.07     2.282553
ricseq_ES_INTARNA_NTconf10.32     2.637067
ricseq_ES_INTARNA_NTconf7.07      3.056657
ricseq_ES_INTARNA_NTconf4.84      4.016753
ricseq_ES_INTARNA_NTconf3.32      5.986194
ricseq_ES_INTARNA_NTconf2.27      7.943445
ricseq_ES_INTARNA_NTconf1.56      9.378858
ricseq_ES_INTARNA_NTconf1.07     10.151719
ricseq_ES_INTARNA_NTconf0.73     11.538162
ricseq_ES_I

In [17]:
other.columns

Index(['splash_ES_INTARNAconf1.56', 'paris_acc_EPENHN_HQ',
       'splash_ES_NTconf68.49', 'ricseq_ES_INTARNAconf100.0',
       'paris_ES_NTconf46.91', 'paris_ES_NTconf32.13', 'paris_acc_ES_cdscds',
       'ricseq_acc_ES_notrain', 'splash_ES_NTconf100.0',
       'splash_ES_INTARNAconf4.84',
       ...
       'paris_ES_INTARNAconf7.07', 'splash_ES_NTconf0.5',
       'splash_ES_NTconf3.32', 'splash_ES_NTconf0.73', 'paris_acc_ES_sampled',
       'splash_acc_ES_cdscds', 'paris_acc_ES_HQ', 'splash_acc_ES_nocdscds',
       'paris_acc_EPENHN', 'splash_acc_easypos'],
      dtype='object', length=139)

In [36]:
important = ['model', 'paris_acc_ES', 'paris_aucNT_ES', 'paris_aucINTARNA_ES', 'ricseq_acc_ES', 'ricseq_aucNT_ES', 'ricseq_aucINTARNA_ES', 'splash_acc_ES', 'splash_aucNT_ES', 'splash_aucINTARNA_ES']

In [37]:
subset = df.filter(important, axis = 1)

Unnamed: 0,paris_acc_ES,paris_aucNT_ES,paris_aucINTARNA_ES,ricseq_acc_ES,ricseq_aucNT_ES,ricseq_aucINTARNA_ES,splash_acc_ES,splash_aucNT_ES,splash_aucINTARNA_ES
/data01/giorgio/RNARNA-NT/checkpoints/all_modelarch2_easypretrain3_paristfinetuning14_ricseq23_splash29,0.523123,0.543448,0.521564,0.570292,0.596984,0.548357,0.557365,0.564659,0.676705
/data01/giorgio/RNARNA-NT/checkpoints/all_modelarch2_easypretrain3_paristfinetuningSPLASHval8,0.556445,0.585841,0.521669,0.527544,0.549748,0.547138,0.524324,0.527821,0.675616
/data01/giorgio/RNARNA-NT/checkpoints/all_modelarch2_easypretrain3_paristfinetuningSPLASHval29,0.563025,0.585674,0.5216,0.560351,0.572419,0.546981,0.531149,0.536867,0.675588
/data01/giorgio/RNARNA-NT/checkpoints/all_modelarch1_paristfinetuningSPLASHval25_ricseqSPLASHval28_splash44,0.523778,0.533499,0.522,0.553977,0.598566,0.551072,0.596824,0.633032,0.676933
/data01/giorgio/RNARNA-NT/checkpoints/all_modelarch2_easypretrain3_paristfinetuningSPLASHval30,0.564422,0.58648,0.5218,0.566257,0.589892,0.543723,0.527635,0.556702,0.676563
/data01/giorgio/RNARNA-NT/checkpoints/all_modelarch2_easypretrain3_paristfinetuningSPLASHval20,0.553276,0.594997,0.521758,0.551462,0.559104,0.551509,0.529595,0.536345,0.67479
/data01/giorgio/RNARNA-NT/checkpoints/all_modelarch2_easypretrain3_paristfinetuningSPLASHval18_ricseqSPLASHvalBEST,0.518106,0.534161,0.52186,0.603509,0.674304,0.545664,0.550743,0.560763,0.675882
/data01/giorgio/RNARNA-NT/checkpoints/all_modelarch1_paristfinetuningSPLASHval25_ricseqSPLASHval105_splashBESTmodel,0.512442,0.519431,0.521988,0.604035,0.653513,0.546686,0.638102,0.672333,0.674446
/data01/giorgio/RNARNA-NT/checkpoints/all_modelarch1_paristfinetuningSPLASHval25_ricseqSPLASHval28_splash32,0.527944,0.537423,0.522088,0.557836,0.599011,0.55224,0.53223,0.577344,0.675007
/data01/giorgio/RNARNA-NT/checkpoints/all_modelarch1_easypretrain7_paristrain22,0.551902,0.5737,0.521899,0.576023,0.603923,0.540641,0.513514,0.534074,0.67536
