# Curating and Visualizing Sample Correlations

**Gregory Way 2018**

This notebook examines how well each sample's input correlates with its reconstruction across algorithms and bottleneck dimensions.

The data was generated first by running the following:

```bash
bash 2.ensemble-z-analysis/analysis.sh
```

## Structure:

The notebook collects all the data tracking sample correlations and then visualizes the results in a series of plots for each dataset.
For each dataset the following three steps are sequentially performed:

1. Load phenotype data
2. Load sample correlation data
3. Merge phenotype and sample correlation results

After these steps have been performed for all datasets, all the plots are generated in a single loop (Part 4).

## Output:

Several sample correlation figures

In [1]:
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(ggplot2))

In [2]:
# Load helper functions
# NOTE(review): compile_sample_correlation() and plot_sample_correlation() are
# called below but not defined in this notebook - presumably they are defined
# in this file; confirm.
source(file.path("scripts", "util.R"))

In [3]:
# Create theme
# This theme is handed to plot_sample_correlation() through `use_theme`:
# vertical, small x-axis tick labels; a horizontally centered title; and a
# compact legend.
correlation_theme <- ggplot2::theme(
    axis.text.x = ggplot2::element_text(angle = 90, size = 5),
    plot.title = ggplot2::element_text(hjust = 0.5),
    legend.text = ggplot2::element_text(size = 8),
    legend.key.size = ggplot2::unit(0.7, "lines")
)

In [4]:
# Create list for data storage
# Entries are keyed by dataset name ("TARGET", "TCGA", "GTEX"); each entry holds
# that dataset's sample correlation data and is later iterated over by the
# plotting loop.
sample_correlation_list <- list()

## Part 1. TARGET Sample Correlations

In [5]:
# Name passed to compile_sample_correlation() and used as the key under which
# this dataset's results are stored in sample_correlation_list
dataset_name <- "TARGET"

In [6]:
# 1) Load phenotype data
# Every column is read as character (no column type guessing).
target_file <- file.path("..", "0.expression-download", "download", "TARGET_phenotype.gz")
target_pheno_df <- readr::read_tsv(
    file = target_file,
    col_types = readr::cols(.default = readr::col_character())
)

# Rename the second column (by position) to the `sample_type` name that the
# other datasets' phenotype tables also use
names(target_pheno_df)[2] <- "sample_type"

In [7]:
# 2) Load sample correlation data
# NOTE(review): compile_sample_correlation() is not defined in this notebook
# (presumably it comes from scripts/util.R - confirm). Its result is stored
# under the current dataset's name.
sample_correlation_list[[dataset_name]] <- compile_sample_correlation(dataset_name = dataset_name)

In [8]:
# 3) Merge phenotype and sample correlation results
# The full join keeps unmatched rows from both tables with NA in the other
# table's columns; na.omit() then drops every row that contains any NA, so
# unmatched samples and samples with a missing value in any column are removed.
sample_correlation_list[[dataset_name]] <- na.omit(
    dplyr::full_join(
        sample_correlation_list[[dataset_name]],
        target_pheno_df,
        by = c("id" = "sample_id")
    )
)

# Preview the first two merged rows
head(sample_correlation_list[[dataset_name]], n = 2)

algorithm,id,correlation,cor_type,seed,data,num_comp,dataset_id,shuffled,sample_type,_primary_disease,sample_type_code,_sample_type,_PATIENT,_cohort
pca,TARGET-30-PARSBI-01,0.872,pearson,8407,training,10,TARGET,signal,NBL,Neuroblastoma,TP,Primary Solid Tumor,PARSBI,TARGET
ica,TARGET-30-PARSBI-01,0.872,pearson,8407,training,10,TARGET,signal,NBL,Neuroblastoma,TP,Primary Solid Tumor,PARSBI,TARGET


## Part 2. TCGA Sample Correlations

In [9]:
# Name passed to compile_sample_correlation() and used as the key under which
# this dataset's results are stored in sample_correlation_list
dataset_name <- "TCGA"

In [10]:
# 1) Load phenotype data
# Every column is read as character (no column type guessing).
tcga_file <- file.path("..", "0.expression-download", "data", "tcga_sample_identifiers.tsv")
tcga_pheno_df <- readr::read_tsv(
    file = tcga_file,
    col_types = readr::cols(.default = readr::col_character())
)

# Rename the second and third columns by position
names(tcga_pheno_df)[2:3] <- c("sample_class", "sample_type")

In [11]:
# 2) Load sample correlation data
# NOTE(review): compile_sample_correlation() is not defined in this notebook
# (presumably it comes from scripts/util.R - confirm). Its result is stored
# under the current dataset's name.
sample_correlation_list[[dataset_name]] <- compile_sample_correlation(dataset_name = dataset_name)

In [12]:
# 3) Merge phenotype and sample correlation results
# The full join keeps unmatched rows from both tables with NA in the other
# table's columns; na.omit() then drops every row that contains any NA, so
# unmatched samples and samples with a missing value in any column are removed.
sample_correlation_list[[dataset_name]] <- na.omit(
    dplyr::full_join(
        sample_correlation_list[[dataset_name]],
        tcga_pheno_df,
        by = c("id" = "sample_id")
    )
)

# Preview the first two merged rows
head(sample_correlation_list[[dataset_name]], n = 2)

algorithm,id,correlation,cor_type,seed,data,num_comp,dataset_id,shuffled,sample_class,sample_type
pca,TCGA-LL-A73Z-01,0.905,pearson,904824,training,10,TCGA,signal,Primary Solid Tumor,BRCA
ica,TCGA-LL-A73Z-01,0.905,pearson,904824,training,10,TCGA,signal,Primary Solid Tumor,BRCA


## Part 3. GTEX Sample Correlations

In [13]:
# Name passed to compile_sample_correlation() and used as the key under which
# this dataset's results are stored in sample_correlation_list
dataset_name <- "GTEX"

In [14]:
# 1) Load phenotype data
# Every column is read as character (no column type guessing).
gtex_file <- file.path(
    "..", "0.expression-download", "download",
    "GTEx_v7_Annotations_SampleAttributesDS.txt"
)
gtex_pheno_df <- readr::read_tsv(
    file = gtex_file,
    col_types = readr::cols(.default = readr::col_character())
)

# Rename by position: column 1 becomes the `sample_id` join key and column 6
# becomes `sample_type`
names(gtex_pheno_df)[c(1, 6)] <- c("sample_id", "sample_type")

# Subset gtex phenotype file for plotting: keep only the two renamed columns
gtex_pheno_df <- gtex_pheno_df[, c("sample_id", "sample_type")]
head(gtex_pheno_df, n = 2)

sample_id,sample_type
GTEX-1117F-0003-SM-58Q7G,Blood
GTEX-1117F-0003-SM-5DWSB,Blood


In [15]:
# 2) Load sample correlation data
# NOTE(review): compile_sample_correlation() is not defined in this notebook
# (presumably it comes from scripts/util.R - confirm). Its result is stored
# under the current dataset's name.
sample_correlation_list[[dataset_name]] <- compile_sample_correlation(dataset_name = dataset_name)

In [16]:
# 3) Merge phenotype and sample correlation results
# The full join keeps unmatched rows from both tables with NA in the other
# table's columns; na.omit() then drops every row that contains any NA, so
# unmatched samples and samples with a missing value in any column are removed.
sample_correlation_list[[dataset_name]] <- na.omit(
    dplyr::full_join(
        sample_correlation_list[[dataset_name]],
        gtex_pheno_df,
        by = c("id" = "sample_id")
    )
)

# Preview the first two merged rows
head(sample_correlation_list[[dataset_name]], n = 2)

algorithm,id,correlation,cor_type,seed,data,num_comp,dataset_id,shuffled,sample_type
pca,GTEX-T5JC-1526-SM-4DM68,0.74,pearson,802271,training,10,GTEX,signal,Kidney
ica,GTEX-T5JC-1526-SM-4DM68,0.74,pearson,802271,training,10,GTEX,signal,Kidney


## Part 4. Generate and Save all Sample Correlation Plots

In [17]:
# Options iterated over for every dataset; the nesting order below is
# dataset -> data type -> correlation type.
data_types <- c("signal", "shuffled")
correlation_types <- c("pearson", "spearman")

for (dataset_name in names(sample_correlation_list)) {
    # Correlation data belonging to the current dataset
    sample_corr_df <- sample_correlation_list[[dataset_name]]

    for (data_type in data_types) {
        for (correlation_type in correlation_types) {
            # Progress message; print() is required for output inside a loop
            print(paste("Processing... ",
                        dataset_name, data_type, correlation_type))

            # Delegate plotting to the helper. return_figures = FALSE is passed
            # and the call's value is not used here.
            plot_sample_correlation(
                dataset_name = dataset_name,
                data_df = sample_corr_df,
                data_type = data_type,
                correlation_type = correlation_type,
                use_theme = correlation_theme,
                return_figures = FALSE
            )
        }
    }
}


[1] "Processing...  TARGET signal pearson"
[1] "Processing...  TARGET signal spearman"
[1] "Processing...  TARGET shuffled pearson"
[1] "Processing...  TARGET shuffled spearman"
[1] "Processing...  TCGA signal pearson"
[1] "Processing...  TCGA signal spearman"
[1] "Processing...  TCGA shuffled pearson"
[1] "Processing...  TCGA shuffled spearman"
[1] "Processing...  GTEX signal pearson"
[1] "Processing...  GTEX signal spearman"
[1] "Processing...  GTEX shuffled pearson"
[1] "Processing...  GTEX shuffled spearman"
