## Compare model size/performance correlations using different measures

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
corr_methods = ['pearson', 'spearman', 'ccc']
quantile = 0.01

# Read one results table per correlation measure, tag every row with the
# measure that produced it, and map the measure-specific column names
# ('<method>_r', '<method>_pval') onto shared names so the tables can be stacked.
corr_methods_df = []
for corr_method in corr_methods:
    results_file = f'./{corr_method}_q{quantile}_cancer_type_corrs.tsv'
    corr_method_df = pd.read_csv(results_file, sep='\t')
    corr_method_df['corr_method'] = corr_method
    shared_names = {
        f'{corr_method}_r': 'corr',
        f'{corr_method}_pval': 'pval',
    }
    corr_method_df = corr_method_df.rename(columns=shared_names)
    corr_methods_df.append(corr_method_df)

# stack the per-measure tables into a single long-format frame
corr_methods_df = pd.concat(corr_methods_df)

print(corr_methods_df.shape)
corr_methods_df.sort_values(by=['gene', 'cancer_type']).head(10)

(834, 5)


Unnamed: 0,gene,cancer_type,corr,pval,corr_method
237,APC,BLCA,-0.060496,0.598787,pearson
236,APC,BLCA,-0.045524,0.692271,spearman
254,APC,BLCA,0.018865,0.0,ccc
64,APC,COAD,0.379339,0.00052,pearson
76,APC,COAD,0.349422,0.001488,spearman
80,APC,COAD,0.094354,0.0,ccc
244,APC,LUAD,-0.092334,0.418325,pearson
210,APC,LUAD,0.020046,0.860801,spearman
70,APC,LUAD,0.103156,0.0,ccc
246,APC,LUSC,-0.098106,0.395966,pearson


In [3]:
# Goal: find examples where the correlation methods disagree,
# particularly pearson vs. CCC, or spearman vs. CCC.
# Reshape to one row per (gene, cancer_type) and one column per method
# so the methods can be compared side by side.
corr_long_df = corr_methods_df.drop(columns='pval')
corr_wide_df = corr_long_df.pivot(
    index=['gene', 'cancer_type'],
    columns='corr_method',
    values='corr',
)

corr_wide_df.head()

Unnamed: 0_level_0,corr_method,ccc,pearson,spearman
gene,cancer_type,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
APC,BLCA,0.018865,-0.060496,-0.045524
APC,COAD,0.094354,0.379339,0.349422
APC,LUAD,0.103156,-0.092334,0.020046
APC,LUSC,0.056537,-0.098106,-0.042405
APC,READ,0.035483,-0.20957,-0.128229


### Compare using absolute differences between Pearson/Spearman and CCC

Large difference = CCC disagrees more with Pearson/Spearman

In [4]:
# Summarize each gene by its median correlation across cancer types,
# giving one column per correlation measure.
# Group on the 'gene' index level and call .median() directly: resetting the
# index first would turn 'cancer_type' into a string column, and pandas >= 2.0
# raises a TypeError when asked for the median of a non-numeric column rather
# than silently dropping it. Passing the np.median callable to .agg() is also
# deprecated in favour of the built-in groupby median.
gene_diff_df = (corr_wide_df
    .groupby(level='gene')
    .median()
)

# genes with the most negative median Pearson correlation first
gene_diff_df.sort_values(by=['pearson'], ascending=True).head()

corr_method,ccc,pearson,spearman
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
CDH1,0.104627,-0.382411,-0.327914
FGFR2,0.040759,-0.221462,-0.172024
NCOR1,0.016168,-0.150426,0.011873
MAP3K1,0.057779,-0.140612,-0.122424
H3F3A,0.060575,-0.139302,-0.192096


In [5]:
# absolute gap between each gene's median CCC and median Pearson correlation
ccc_pearson_gap = gene_diff_df['ccc'] - gene_diff_df['pearson']
gene_diff_df['ccc_pearson_diff'] = ccc_pearson_gap.abs()

# show the genes where the two measures disagree most
gene_diff_df.sort_values('ccc_pearson_diff', ascending=False).head(8)

corr_method,ccc,pearson,spearman,ccc_pearson_diff
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CDH1,0.104627,-0.382411,-0.327914,0.487038
FBXW7,0.213084,0.63102,0.627639,0.417936
TP53,0.110604,0.423075,0.442092,0.31247
ARID1A,0.11125,0.412406,0.430575,0.301156
SETD2,0.078582,0.362173,0.369508,0.283591
BAP1,0.096766,0.378014,0.411525,0.281248
CDKN2A,0.090116,0.35382,0.396987,0.263704
FGFR2,0.040759,-0.221462,-0.172024,0.262221


In [6]:
# absolute gap between each gene's median CCC and median Spearman correlation
ccc_spearman_gap = gene_diff_df['ccc'] - gene_diff_df['spearman']
gene_diff_df['ccc_spearman_diff'] = ccc_spearman_gap.abs()

# show the genes where the two measures disagree most
gene_diff_df.sort_values('ccc_spearman_diff', ascending=False).head(8)

corr_method,ccc,pearson,spearman,ccc_pearson_diff,ccc_spearman_diff
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CDH1,0.104627,-0.382411,-0.327914,0.487038,0.432541
FBXW7,0.213084,0.63102,0.627639,0.417936,0.414555
TP53,0.110604,0.423075,0.442092,0.31247,0.331488
ARID1A,0.11125,0.412406,0.430575,0.301156,0.319325
BAP1,0.096766,0.378014,0.411525,0.281248,0.31476
CDKN2A,0.090116,0.35382,0.396987,0.263704,0.306871
SETD2,0.078582,0.362173,0.369508,0.283591,0.290927
RB1,0.10584,0.352397,0.364214,0.246557,0.258374


### Compare using rank differences between Pearson/Spearman and CCC

The above analyses tend to skew toward genes with large baseline correlations, so we want to see whether taking ranks results in different top hits.

In [7]:
# Take the rank of each gene's median correlation across cancer types,
# separately for each correlation measure (rank 1 = largest median).
# Spearman/Pearson and CCC have slightly different scales (s/p are
# in [-1, 1] and CCC is in [0, 1]), so comparing ranks could make more sense.
# Group on the 'gene' index level and call .median() directly: resetting the
# index first would turn 'cancer_type' into a string column, and pandas >= 2.0
# raises a TypeError when asked for the median of a non-numeric column rather
# than silently dropping it. Passing the np.median callable to .agg() is also
# deprecated in favour of the built-in groupby median.
gene_ranks_df = (corr_wide_df
    .groupby(level='gene')
    .median()
    .rank(ascending=False)
)

# genes ranked highest by median Pearson correlation first
gene_ranks_df.sort_values(by=['pearson'], ascending=True).head()

corr_method,ccc,pearson,spearman
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
FBXW7,1.0,1.0,1.0
TP53,5.0,2.0,2.0
ARID1A,4.0,3.0,3.0
BAP1,9.0,4.0,4.0
SETD2,13.0,5.0,6.0


In [8]:
# absolute difference between each gene's CCC rank and Pearson rank
ccc_pearson_rank_gap = gene_ranks_df['ccc'] - gene_ranks_df['pearson']
gene_ranks_df['ccc_pearson_diff'] = ccc_pearson_rank_gap.abs()

# show the genes whose rank shifts most between the two measures
gene_ranks_df.sort_values('ccc_pearson_diff', ascending=False).head(8)

corr_method,ccc,pearson,spearman,ccc_pearson_diff
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CDH1,7.0,56.0,56.0,49.0
SETBP1,52.0,21.0,18.0,31.0
SF3B1,12.0,42.0,22.0,30.0
H3F3A,24.0,52.0,55.0,28.0
RNF43,26.0,51.0,53.0,25.0
ATRX,55.0,30.0,30.0,25.0
MAP3K1,28.0,53.0,52.0,25.0
FLT3,8.0,32.0,33.0,24.0


In [9]:
# absolute difference between each gene's CCC rank and Spearman rank
ccc_spearman_rank_gap = gene_ranks_df['ccc'] - gene_ranks_df['spearman']
gene_ranks_df['ccc_spearman_diff'] = ccc_spearman_rank_gap.abs()

# show the genes whose rank shifts most between the two measures
gene_ranks_df.sort_values('ccc_spearman_diff', ascending=False).head(8)

corr_method,ccc,pearson,spearman,ccc_pearson_diff,ccc_spearman_diff
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
CDH1,7.0,56.0,56.0,49.0,49.0
SETBP1,52.0,21.0,18.0,31.0,34.0
H3F3A,24.0,52.0,55.0,28.0,31.0
RNF43,26.0,51.0,53.0,25.0,27.0
ATRX,55.0,30.0,30.0,25.0,25.0
FLT3,8.0,32.0,33.0,24.0,25.0
JAK3,20.0,36.0,44.0,16.0,24.0
MAP3K1,28.0,53.0,52.0,25.0,24.0


Overall, we see that some of the same genes show up in both analyses (e.g. CDH1), but the rank-based comparison also highlights some genes with low overall ranks but high rank differences (e.g. ATRX, H3F3A).

Most of the genes that show up in the absolute difference lists are genes that we've previously seen accurate mutation prediction for (TP53, RB1, ARID1A, SETD2).