# Determining the distance of COAD and GBM tumors in different spaces

This script outputs distance matrices for pairwise comparisons of all colon cancer (COAD) and glioblastoma (GBM) tumors. Distances are computed both for raw gene expression features and for gene expression features encoded by a variational autoencoder. Distance matrices computed after subtracting cancer-type specific means are also output for both feature spaces.

In [1]:
import os
import pandas as pd
from scipy.spatial import distance

In [2]:
# Load the four input tables (tab-separated, first column used as the index):
# raw and VAE-encoded gene expression, each with and without subtraction of
# cancer-type specific means
raw_data_file = os.path.join('data', 'pancan_scaled_zeroone_rnaseq.tsv')
raw_data_df = pd.read_csv(raw_data_file, sep='\t', index_col=0)

raw_data_subtract_file = os.path.join('data', 'cancertype_subtraction_raw.tsv')
raw_data_subtract_df = pd.read_csv(raw_data_subtract_file, sep='\t', index_col=0)

encoded_data_file = os.path.join('data', 'vae_encoded_with_clinical.tsv')
encoded_df = pd.read_csv(encoded_data_file, sep='\t', index_col=0)

encoded_data_subtract_file = os.path.join('data', 'cancertype_subtraction_encoded.tsv')
encoded_subtract_df = pd.read_csv(encoded_data_subtract_file, sep='\t', index_col=0)

  interactivity=interactivity, compiler=compiler, result=result)
  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Read the pan-cancer mutation table, keeping only the sample identifier
# (used as the index) and the TP53, KRAS, and NF1 columns
mutation_data_file = os.path.join('data', 'pancan_mutation.tsv')
mutation_columns = ['#sample', 'TP53', 'KRAS', 'NF1']
mutation_df = pd.read_csv(
    mutation_data_file,
    sep='\t',
    usecols=mutation_columns,
    index_col=0,
)

In [4]:
# Restrict every table to glioblastoma (GBM) and colon cancer (COAD) tumors.
# The encoded tables carry the 'acronym' column, so they define the sample
# selection; both raw tables are then aligned to the encoded subset's samples.
encoded_keep = encoded_df['acronym'].isin(['GBM', 'COAD'])
encoded_subset_df = encoded_df.loc[encoded_keep]

encoded_subtract_keep = encoded_subtract_df['acronym'].isin(['GBM', 'COAD'])
encoded_subtract_subset_df = encoded_subtract_df.loc[encoded_subtract_keep]

raw_data_subset_df = raw_data_df.loc[encoded_subset_df.index, :]
# Columns are taken from raw_data_df so both raw tables share the same
# features in the same order
raw_data_subtract_subset_df = raw_data_subtract_df.loc[
    encoded_subset_df.index, raw_data_df.columns
]

## Get Euclidean distances for each feature space

In [5]:
# Output locations for the four pairwise distance matrices; all of them are
# written under data/distance
distance_dir = os.path.join('data', 'distance')
raw_data_distance_file = os.path.join(distance_dir, 'raw_distance.tsv')
raw_data_subtract_distance_file = os.path.join(distance_dir, 'raw_subtract_distance.tsv')
encoded_distance_file = os.path.join(distance_dir, 'encoded_distance.tsv')
encoded_subtract_distance_file = os.path.join(distance_dir, 'encoded_subtraction_distance.tsv')

In [6]:
# Pairwise Euclidean distance between all selected tumors in raw gene
# expression space. pdist returns the condensed (upper-triangle) vector, so
# squareform expands it to a full sample-by-sample matrix before it is
# labelled with sample ids and written out as a tab-separated file.
data_array_df = distance.pdist(raw_data_subset_df, metric='euclidean')
data_dist_df = pd.DataFrame(
    data=distance.squareform(data_array_df),
    index=raw_data_subset_df.index,
    columns=raw_data_subset_df.index,
)
data_dist_df.to_csv(raw_data_distance_file, sep='\t')

In [7]:
# Same pairwise Euclidean distance computation, applied to the raw expression
# table with cancer-type specific means subtracted. The condensed pdist
# output is expanded to a square matrix indexed by sample id on both axes.
data_subtract_array_df = distance.pdist(raw_data_subtract_subset_df, metric='euclidean')
data_subtract_dist_df = pd.DataFrame(
    data=distance.squareform(data_subtract_array_df),
    index=raw_data_subtract_subset_df.index,
    columns=raw_data_subtract_subset_df.index,
)
data_subtract_dist_df.to_csv(raw_data_subtract_distance_file, sep='\t')

In [8]:
# Pairwise Euclidean distance between tumors in the encoded space. Only the
# first 100 columns enter the distance; presumably these are the VAE latent
# features and the remaining columns are clinical annotations -- TODO confirm
# against the encoded file's layout.
encoded_features_df = encoded_subset_df.iloc[:, range(100)]
encoded_array_df = distance.pdist(encoded_features_df, metric='euclidean')
encoded_dist_df = pd.DataFrame(
    data=distance.squareform(encoded_array_df),
    index=encoded_subset_df.index,
    columns=encoded_subset_df.index,
)
encoded_dist_df.to_csv(encoded_distance_file, sep='\t')

In [9]:
# Pairwise Euclidean distance between tumors in the encoded space after
# subtraction of cancer-type specific means. As with the unsubtracted encoded
# table, only the first 100 columns are used (presumably the latent features;
# TODO confirm).
encoded_sub_features_df = encoded_subtract_subset_df.iloc[:, range(100)]
encoded_sub_array_df = distance.pdist(encoded_sub_features_df, metric='euclidean')
encoded_sub_dist_df = pd.DataFrame(
    data=distance.squareform(encoded_sub_array_df),
    index=encoded_subtract_subset_df.index,
    columns=encoded_subtract_subset_df.index,
)
encoded_sub_dist_df.to_csv(encoded_subtract_distance_file, sep='\t')

## Get NF1-mutated GBM samples and KRAS-mutated COAD samples

In [10]:
# Sample identifiers for each cancer type, taken from the full encoded table
gbm_samples = encoded_df.loc[encoded_df['acronym'].eq('GBM')].index
coad_samples = encoded_df.loc[encoded_df['acronym'].eq('COAD')].index

In [11]:
# Select GBM tumors whose NF1 status equals 1 and COAD tumors whose KRAS
# status equals 1, tag each group with its cancer-type acronym, and stack
# them into one table.
#
# The comparison is parenthesized on purpose: `&` binds more tightly than
# `==`, so `mask & column == 1` is evaluated as `(mask & column) == 1`, which
# is only correct while the column holds strictly 0/1 values.
gbm_is_nf1_mutated = mutation_df.index.isin(gbm_samples) & (mutation_df['NF1'] == 1)
gbm_df = mutation_df[gbm_is_nf1_mutated]
gbm_df = gbm_df.assign(acronym='GBM')

coad_is_kras_mutated = mutation_df.index.isin(coad_samples) & (mutation_df['KRAS'] == 1)
coad_df = mutation_df[coad_is_kras_mutated]
coad_df = coad_df.assign(acronym='COAD')

# pd.concat stacks the rows exactly as DataFrame.append did; append itself
# was removed in pandas 2.0
ras_samples_df = pd.concat([gbm_df, coad_df])

In [12]:
# Write the selected GBM/COAD sample table as a tab-separated file next to
# the distance matrices
ras_samples_file = os.path.join('data', 'distance', 'gbm_coad_rasmutations.tsv')
ras_samples_df.to_csv(path_or_buf=ras_samples_file, sep='\t')