## Compare model size/performance correlations using different measures

In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [9]:
# Correlation measures to compare, and the quantile tag that appears in
# the per-method result filenames.
corr_methods = ['pearson', 'spearman', 'ccc']
quantile = 0.01

# Read one table per method, rename its method-specific columns to the
# shared names 'corr' / 'pval', and tag every row with the method so the
# tables can be stacked into a single long-format frame.
corr_methods_df = []
for corr_method in corr_methods:
    corr_method_df = (
        pd.read_csv(f'./{corr_method}_q{quantile}_cancer_type_corrs.tsv', sep='\t')
        .assign(corr_method=corr_method)
        .rename(columns={
            f'{corr_method}_r': 'corr',
            f'{corr_method}_pval': 'pval',
        })
    )
    corr_methods_df.append(corr_method_df)

# Stack the per-method tables row-wise (original row labels are kept).
corr_methods_df = pd.concat(corr_methods_df)

print(corr_methods_df.shape)
corr_methods_df.sort_values(by=['gene', 'cancer_type']).head(10)

(834, 5)


Unnamed: 0,gene,cancer_type,corr,pval,corr_method
237,APC,BLCA,-0.060496,0.598787,pearson
236,APC,BLCA,-0.045524,0.692271,spearman
254,APC,BLCA,0.018865,0.0,ccc
64,APC,COAD,0.379339,0.00052,pearson
76,APC,COAD,0.349422,0.001488,spearman
80,APC,COAD,0.094354,0.0,ccc
244,APC,LUAD,-0.092334,0.418325,pearson
210,APC,LUAD,0.020046,0.860801,spearman
70,APC,LUAD,0.103156,0.0,ccc
246,APC,LUSC,-0.098106,0.395966,pearson


In [10]:
# Goal: find examples where the correlation methods disagree,
# particularly pearson vs. CCC, or spearman vs. CCC.
# Only the coefficients are compared, so p-values are discarded first.
corr_long_df = corr_methods_df.drop(columns='pval')

# Reshape to one row per (gene, cancer_type) and one column per method.
corr_wide_df = corr_long_df.pivot(
    index=['gene', 'cancer_type'],
    columns='corr_method',
    values='corr',
)

corr_wide_df.head()

Unnamed: 0_level_0,corr_method,ccc,pearson,spearman
gene,cancer_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
APC,BLCA,0.018865,-0.060496,-0.045524
APC,COAD,0.094354,0.379339,0.349422
APC,LUAD,0.103156,-0.092334,0.020046
APC,LUSC,0.056537,-0.098106,-0.042405
APC,READ,0.035483,-0.20957,-0.128229


In [23]:
# Rank genes by their median correlation across cancer types, separately
# for each correlation measure (rank 1 = highest median correlation).
#
# Group directly on the 'gene' index level so that only the numeric
# correlation columns are aggregated. The earlier reset_index() +
# agg(np.median) form also passed the string 'cancer_type' column to the
# median and relied on pandas silently dropping it, which pandas >= 2.0
# no longer does (it raises TypeError instead).
gene_ranks_df = (corr_wide_df
    .groupby(level='gene')
    .median()
    .rank(ascending=False)
)

# Show the genes ranked best by Pearson correlation.
gene_ranks_df.sort_values(by=['pearson'], ascending=True).head()

corr_method,ccc,pearson,spearman
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
FBXW7,1.0,1.0,1.0
TP53,5.0,2.0,2.0
ARID1A,4.0,3.0,3.0
BAP1,9.0,4.0,4.0
SETD2,13.0,5.0,6.0


In [24]:
# Absolute gap between each gene's CCC rank and its Pearson rank; large
# values mark genes that the two measures order very differently.
gene_ranks_df['ccc_pearson_diff'] = (
    gene_ranks_df['ccc'].sub(gene_ranks_df['pearson']).abs()
)

# Genes with the largest CCC-vs-Pearson rank disagreement first.
gene_ranks_df.sort_values(by='ccc_pearson_diff', ascending=False).head(8)

corr_method,ccc,pearson,spearman,ccc_pearson_diff
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CDH1,7.0,56.0,56.0,49.0
SETBP1,52.0,21.0,18.0,31.0
SF3B1,12.0,42.0,22.0,30.0
H3F3A,24.0,52.0,55.0,28.0
RNF43,26.0,51.0,53.0,25.0
ATRX,55.0,30.0,30.0,25.0
MAP3K1,28.0,53.0,52.0,25.0
FLT3,8.0,32.0,33.0,24.0


In [25]:
# Absolute gap between each gene's CCC rank and its Spearman rank; large
# values mark genes that the two measures order very differently.
gene_ranks_df['ccc_spearman_diff'] = (
    gene_ranks_df['ccc'].sub(gene_ranks_df['spearman']).abs()
)

# Genes with the largest CCC-vs-Spearman rank disagreement first.
gene_ranks_df.sort_values(by='ccc_spearman_diff', ascending=False).head(8)

corr_method,ccc,pearson,spearman,ccc_pearson_diff,ccc_spearman_diff
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CDH1,7.0,56.0,56.0,49.0,49.0
SETBP1,52.0,21.0,18.0,31.0,34.0
H3F3A,24.0,52.0,55.0,28.0,31.0
RNF43,26.0,51.0,53.0,25.0,27.0
ATRX,55.0,30.0,30.0,25.0,25.0
FLT3,8.0,32.0,33.0,24.0,25.0
JAK3,20.0,36.0,44.0,16.0,24.0
MAP3K1,28.0,53.0,52.0,25.0,24.0
