## Calculate distance between means/medoids of mutation groupings

Our goal is to find an unsupervised way of calculating distance/similarity between our mutation groupings ("none"/"one"/"both") that is less sensitive to sample size than the differentially expressed gene count was (see `4_de_analysis` notebooks).

Here, we'll try the extremely simple method of:

1) taking the n-dimensional mean (centroid) or feature-wise median (used here as a simple stand-in for the medoid) of each group  
2) calculating distance between the centroids and using this to define "expression similarity"

We'll try this for a few different feature selection/embedding methods, and for both gene expression and RPPA (protein expression) data.

In [1]:
from pathlib import Path
import pickle as pkl
import itertools as it

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import sys; sys.path.append('..')
import config as cfg

%load_ext autoreload
%autoreload 2

In [2]:
# which data modality to analyze: 'expression' (gene expression)
# or 'rppa' (protein expression)
data_type = 'rppa'

# statistic used to summarize each group: 'mean' or 'median'
centroid_method = 'mean'

# keep only this many features, ranked by mean absolute deviation
# (the subsetting step is skipped when this is None)
# TODO try this in PCA/UMAP space too
subset_mad_feats = 100

### Load expression/RPPA data

We'll also subset to the top features by mean absolute deviation, if that option is set (i.e. `subset_mad_feats` is not `None`).

In [3]:
# NOTE(review): these are absolute paths on the original author's machine;
# consider moving them into `config` so the notebook runs elsewhere.
expression_data_file = (
    '/home/jake/research/mpmp/data/tcga_expression_matrix_processed.tsv.gz'
)
expression_sample_info = (
    '/home/jake/research/mpmp/data/sample_info/tcga_expression_sample_identifiers.tsv'
)

rppa_data_file = (
    '/home/jake/research/mpmp/data/tcga_rppa_matrix_processed.tsv'
)
rppa_sample_info = (
    '/home/jake/research/mpmp/data/sample_info/tcga_rppa_sample_identifiers.tsv'
)

# pick the (feature matrix, sample info) file pair for the chosen data type
if data_type == 'expression':
    data_file, sample_info_file = expression_data_file, expression_sample_info
elif data_type == 'rppa':
    data_file, sample_info_file = rppa_data_file, rppa_sample_info
else:
    # fail early with a clear message instead of a NameError further down
    raise ValueError(
        "data_type must be 'expression' or 'rppa', got {!r}".format(data_type)
    )

data_df = pd.read_csv(data_file, sep='\t', index_col=0)
# sample info comes from its own file (previously the data matrix was
# re-read here by mistake, leaving the *_sample_info paths unused)
sample_info_df = pd.read_csv(sample_info_file, sep='\t', index_col=0)

print(data_df.shape)
data_df.iloc[:5, :5]

(7790, 189)


Unnamed: 0_level_0,X1433EPSILON,X4EBP1,X4EBP1_pS65,X4EBP1_pT37T46,X53BP1
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
TCGA-OR-A5J2-01,-0.494,0.592,0.105,-0.135,1.89
TCGA-PA-A5YG-01,-0.535,0.183,-0.106,-0.0401,1.03
TCGA-OR-A5JV-01,-0.109,0.418,0.0247,-0.721,1.66
TCGA-OR-A5JT-01,-0.413,0.259,0.00597,0.563,1.46
TCGA-OR-A5JR-01,-0.288,-0.112,0.0194,-0.00267,2.23


In [4]:
if subset_mad_feats is not None:
    # DataFrame.mad() was deprecated in pandas 1.5 and removed in 2.0, so
    # compute the mean absolute deviation explicitly: the per-column mean
    # of |x - mean(x)| (NaNs are skipped, as they were by .mad())
    mad_ranking = (
        (data_df - data_df.mean(axis=0))
            .abs()
            .mean(axis=0)
            .sort_values(ascending=False)
    )
    # take the first `subset_mad_feats` features of the ranking (positional)
    top_feats = mad_ranking.iloc[:subset_mad_feats].index.astype(str).values
    print(top_feats[:5])
    # NOTE: data_df is replaced by its column subset here; later cells rely
    # on the name data_df referring to the subsetted matrix
    data_df = data_df.reindex(top_feats, axis='columns')

print(data_df.shape)
data_df.head()

['MYH11' 'VHL' 'ECADHERIN' 'ERALPHA' 'CLAUDIN7']
(7790, 100)


Unnamed: 0_level_0,MYH11,VHL,ECADHERIN,ERALPHA,CLAUDIN7,ACETYLATUBULINLYS40,RICTOR,FASN,CAVEOLIN1,GAPDH,...,MSH2,AKT,LCK,KU80,P53,SMAD1,NCADHERIN,G6PD,BAP1C4,TSC1
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-OR-A5J2-01,1.95,-0.315,-1.4,-1.02,0.141,2.48,0.881,0.228,2.16,1.55,...,-0.263,0.696,-0.092,1.68,-0.743,0.0846,-0.245,-0.509,-0.427,2.25
TCGA-PA-A5YG-01,0.602,0.697,-0.223,-0.445,0.517,1.36,0.305,-0.699,0.762,0.587,...,-0.266,0.543,-0.0364,1.44,-0.133,-0.177,-0.181,-0.294,-0.531,1.11
TCGA-OR-A5JV-01,-0.679,-0.301,-1.35,-1.16,0.336,2.22,0.571,-0.2,0.501,1.96,...,-0.402,0.437,0.206,1.32,-0.437,0.115,-0.251,0.0225,-0.301,2.01
TCGA-OR-A5JT-01,-0.14,-0.481,-1.67,-1.21,0.204,2.75,0.141,-0.94,1.45,0.503,...,-0.611,0.533,0.339,1.1,-0.749,0.0357,0.0495,0.779,-0.912,1.51
TCGA-OR-A5JR-01,-1.04,-0.337,-1.23,-1.09,0.265,3.04,0.331,-0.0643,0.708,1.67,...,-0.47,1.19,0.018,1.15,-0.669,-0.113,-0.00368,1.0,-0.631,1.31


### Load Park et al. "hit" data

This was collated/formatted in `0_process_park.ipynb`

In [5]:
# read the pickled copy-gain "hit" annotations; the result is indexed by
# gene/cancer type identifier (e.g. 'TP53_BRCA')
# NOTE: pickle should only be loaded from trusted, locally generated files
with open(cfg.distance_gain_info, 'rb') as gain_file:
    park_gain_info = pkl.load(gain_file)

park_gain_info['TP53_BRCA'].head()

Unnamed: 0,class_name,mutation_status,cnv_status,num_hits
TCGA-3C-AAAU-01,class 4,0,1,one
TCGA-3C-AALI-01,class 4,1,1,both
TCGA-3C-AALJ-01,class 4,0,1,one
TCGA-3C-AALK-01,class 4,0,1,one
TCGA-4H-AAAK-01,class 4,0,1,one


In [6]:
# read the pickled copy-loss "hit" annotations; the result is indexed by
# gene/cancer type identifier (e.g. 'TP53_BRCA')
# NOTE: pickle should only be loaded from trusted, locally generated files
with open(cfg.distance_loss_info, 'rb') as loss_file:
    park_loss_info = pkl.load(loss_file)

park_loss_info['TP53_BRCA'].head()

Unnamed: 0,class_name,mutation_status,cnv_status,num_hits
TCGA-3C-AAAU-01,class 4,0,0,none
TCGA-3C-AALI-01,class 4,1,0,one
TCGA-3C-AALJ-01,class 4,0,0,none
TCGA-3C-AALK-01,class 4,0,0,none
TCGA-4H-AAAK-01,class 4,0,0,none


### Calculate distance between means/medians for given gene + cancer type

In [7]:
from scipy.spatial.distance import pdist, squareform

def get_centroids_and_distance(identifier, info_df, centroid_method='mean', data=None):
    """Summarize each "num_hits" group by a centroid and measure their distances.

    Parameters
    ----------
    identifier : str
        Gene/cancer type label for this comparison. Currently unused; kept so
        existing call sites continue to work.
    info_df : pandas.DataFrame
        Indexed by sample ID, with a 'num_hits' column assigning each sample
        to a group ('both', 'none' or 'one').
    centroid_method : {'mean', 'median'}
        Per-feature statistic used to summarize each group.
    data : pandas.DataFrame, optional
        Samples x features matrix indexed by sample ID. Defaults to the
        notebook-level `data_df`.

    Returns
    -------
    groups : list of str
        Always ['both', 'none', 'one'].
    group_combinations : list of tuple
        All pairs of `groups`, in a fixed order.
    class_counts : list of int
        Number of samples (present in `data`) per entry of `groups`;
        0 for a group with no samples.
    ordered_dists : list of float
        Euclidean distance between group centroids, one per entry of
        `group_combinations`; NaN when either group has no samples.

    Raises
    ------
    NotImplementedError
        If `centroid_method` is not 'mean' or 'median'.
    """
    if data is None:
        # fall back to the feature matrix loaded earlier in the notebook
        data = data_df

    if centroid_method not in ('mean', 'median'):
        raise NotImplementedError(
            'centroid method {} not implemented'.format(centroid_method)
        )

    groups = ['both', 'none', 'one']
    group_combinations = list(it.combinations(groups, 2))

    # only samples that have both hit annotations and feature data are usable
    samples = info_df.index.intersection(data.index)
    if len(samples) == 0:
        # nothing to compare: every group is empty, every distance undefined
        return (groups,
                group_combinations,
                [0] * len(groups),
                [np.nan] * len(group_combinations))

    hits = info_df.reindex(samples)['num_hits']

    # count samples per group directly from the group labels; a group
    # without samples gets an explicit 0
    hit_counts = hits.value_counts()
    class_counts = [int(hit_counts.get(group, 0)) for group in groups]

    # group by number of hits, then calculate centroids
    grouped = data.reindex(samples).groupby(hits)
    if centroid_method == 'mean':
        centroids_df = grouped.mean()
    else:
        centroids_df = grouped.median()

    # pdist returns distances in the same order as it.combinations over the
    # rows, so key each distance by its (unordered) pair of group labels
    present_groups = list(centroids_df.index)
    if len(present_groups) >= 2:
        dists = pdist(centroids_df.values, metric='euclidean')
    else:
        dists = []
    dist_by_pair = {
        frozenset(pair): float(dist)
        for pair, dist in zip(it.combinations(present_groups, 2), dists)
    }

    # report distances in the fixed order of group_combinations, with NaN
    # for pairs where one of the groups has no samples
    ordered_dists = [
        dist_by_pair.get(frozenset(cmb), np.nan)
        for cmb in group_combinations
    ]

    return groups, group_combinations, class_counts, ordered_dists
    
# sanity check on a single gene/cancer type, using median-based centroids
get_centroids_and_distance(
    'TP53_BRCA', park_loss_info['TP53_BRCA'], centroid_method='median'
)

(['both', 'none', 'one'],
 [('both', 'none'), ('both', 'one'), ('none', 'one')],
 [19, 491, 287],
 [5.290758839168914, 2.355077712145397, 3.8760241020922455])

### Calculate centroid distance between "hits", per class

Class 1 = look at both loss and gain (should be one-hit in neither)  
Class 2 = only look at loss (should be one-hit here)  
Class 3 = only look at gain (should be one-hit here)  
Class 4 = look at both loss and gain (should be one-hit in both)

In [8]:
# per-identifier results are accumulated in plain dicts
# (identifier -> list of values) and converted to data frames at the end
class_counts_df = {}
results_df = {}
counts_columns = None
results_columns = None

# get distances for copy loss, for class 1/2/4 genes
for identifier, loss_df in park_loss_info.items():

    # class 3 genes are skipped for copy loss
    if loss_df['class_name'].iloc[0] == 'class 3':
        continue

    # use the centroid method selected in the config cell (previously
    # 'mean' was hard-coded here, so the option had no effect)
    results = get_centroids_and_distance(identifier, loss_df, centroid_method)
    groups, group_combinations, class_counts, dists = results

    # group order must be the same for every identifier, since it becomes
    # the column order of the output data frames
    if counts_columns is None:
        counts_columns = groups
    else:
        assert counts_columns == groups

    pair_labels = ['{}/{}'.format(i, j) for i, j in group_combinations]
    if results_columns is None:
        results_columns = pair_labels
    else:
        assert results_columns == pair_labels

    class_counts_df[identifier] = class_counts
    results_df[identifier] = dists

# one row per identifier, one column per group
class_counts_loss_df = pd.DataFrame.from_dict(
    class_counts_df,
    orient='index',
    columns=counts_columns
)

# one row per identifier, one column per pair of groups
results_loss_df = pd.DataFrame.from_dict(
    results_df,
    orient='index',
    columns=results_columns
)

print(class_counts_loss_df.shape)
class_counts_loss_df.head()

(433, 3)


Unnamed: 0,both,none,one
ACVR1_UCEC,2,309,94
ACVR2A_COADREAD,3,264,96
ACVR2A_LIHC,0,134,29
AJUBA_HNSC,5,132,65
AKT1_BRCA,5,650,142


In [9]:
# centroid distances for copy loss: one row per identifier, one column per
# pair of groups (NaN where one of the two groups has no samples)
print(results_loss_df.shape)
results_loss_df.head()

(433, 3)


Unnamed: 0,both/none,both/one,none/one
ACVR1_UCEC,4.716244,3.975686,1.581885
ACVR2A_COADREAD,5.484907,5.235206,0.809157
ACVR2A_LIHC,,,1.546988
AJUBA_HNSC,2.757378,2.631804,0.824527
AKT1_BRCA,3.488913,3.118687,1.075147


In [10]:
# per-identifier results are accumulated in plain dicts
# (identifier -> list of values) and converted to data frames at the end
class_counts_df = {}
results_df = {}
counts_columns = None
results_columns = None

# get distances for copy gain, for class 1/3/4 genes
for identifier, gain_df in park_gain_info.items():

    # class 2 genes are skipped for copy gain
    if gain_df['class_name'].iloc[0] == 'class 2':
        continue

    # use the centroid method selected in the config cell (previously
    # 'mean' was hard-coded here, so the option had no effect)
    results = get_centroids_and_distance(identifier, gain_df, centroid_method)
    groups, group_combinations, class_counts, dists = results

    # group order must be the same for every identifier, since it becomes
    # the column order of the output data frames
    if counts_columns is None:
        counts_columns = groups
    else:
        assert counts_columns == groups

    pair_labels = ['{}/{}'.format(i, j) for i, j in group_combinations]
    if results_columns is None:
        results_columns = pair_labels
    else:
        assert results_columns == pair_labels

    class_counts_df[identifier] = class_counts
    results_df[identifier] = dists

# one row per identifier, one column per group
class_counts_gain_df = pd.DataFrame.from_dict(
    class_counts_df,
    orient='index',
    columns=counts_columns
)

# one row per identifier, one column per pair of groups
results_gain_df = pd.DataFrame.from_dict(
    results_df,
    orient='index',
    columns=results_columns
)

print(class_counts_gain_df.shape)
class_counts_gain_df.head()

(384, 3)


Unnamed: 0,both,none,one
ACVR1_UCEC,0,362,43
ACVR2A_COADREAD,2,320,41
ACVR2A_LIHC,2,133,28
AKT1_BRCA,0,557,240
AKT1_UCEC,0,347,58


In [11]:
# centroid distances for copy gain: one row per identifier, one column per
# pair of groups (NaN where one of the two groups has no samples)
print(results_gain_df.shape)
results_gain_df.head()

(384, 3)


Unnamed: 0,both/none,both/one,none/one
ACVR1_UCEC,,,1.269823
ACVR2A_COADREAD,4.943627,4.712461,1.454358
ACVR2A_LIHC,2.812433,2.941939,1.276057
AKT1_BRCA,,,1.499204
AKT1_UCEC,,,1.672928


### Plot centroid distance results