 # Curating and Visualizing Sample Correlations

**Gregory Way 2018**

This notebook examines how well samples correlate between input and reconstruction across algorithms and bottleneck dimensions.

The data was generated first by running the following:

```bash
bash 2.ensemble-z-analysis/analysis.sh
```

## Structure:

The notebook will collect all the data that track sample correlations and then visualize results in a series of plots for each dataset.
For each dataset the following four steps are sequentially performed:

1. Load phenotype data
2. Load sample correlation data
3. Merge and output phenotype and sample correlation results
4. Output mean and variance summaries for correlations per sample type

After each step is performed for all datasets, all the plots are generated on the fly.

## Output:

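Several sample correlation figures, plus, for each dataset, a merged phenotype/sample-correlation table and a per-sample-type summary table written to `results/`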

In [1]:
suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(ggplot2))

In [2]:
# Load helper functions
source(file.path("scripts", "util.R"))

In [3]:
# Shared theme passed to the plotting helper for every figure:
# rotated, small x-axis labels; centered title; outlined facet strips;
# compact legend
correlation_theme <- theme(
    axis.text.x = element_text(angle = 90, size = 5),
    plot.title = element_text(hjust = 0.5),
    strip.background = element_rect(colour = "black", fill = "#fdfff4"),
    legend.text = element_text(size = 8),
    legend.key.size = unit(0.7, "lines")
)

In [4]:
# Create list for data storage: one entry per dataset, keyed by dataset name.
# Entries are filled in Parts 1-3 and iterated over for plotting in Part 4.
sample_correlation_list <- list()

## Part 1. TARGET Sample Correlations

In [5]:
# Dataset key: used as the name of this dataset's entry in
# sample_correlation_list and as the prefix of the summary output file
dataset_name <- "TARGET"

In [6]:
# 1) Load phenotype data
# Every column is read as character. The second column is renamed to
# `sample_type`, the name the per-sample-type summary groups by.
target_file <- file.path(
    "..", "0.expression-download", "download", "TARGET_phenotype.gz"
)
target_pheno_df <- readr::read_tsv(
    target_file,
    col_types = readr::cols(.default = readr::col_character())
)

names(target_pheno_df)[2] <- "sample_type"

In [7]:
# 2) Load sample correlation data
# compile_sample_correlation() is not defined in this notebook; presumably it
# comes from scripts/util.R sourced above -- confirm there which files it reads.
# The result is stored under the current dataset's key.
sample_correlation_list[[dataset_name]] <- compile_sample_correlation(dataset_name = dataset_name)

In [8]:
# 3) Merge phenotype and sample correlation results
# An inner join keeps only samples present in both tables. Rows unmatched on
# either side would carry NA values and be removed by na.omit() anyway, so
# there is no need to materialize them with a full join first.
# NOTE(review): na.omit() also drops matched rows that have NA in *any*
# column, including phenotype columns not used downstream -- confirm that
# this is intended.
sample_correlation_list[[dataset_name]] <-
    sample_correlation_list[[dataset_name]] %>%
    dplyr::inner_join(target_pheno_df, by = c("id" = "sample_id")) %>%
    na.omit()

# Derive the output path from dataset_name, as the step 4 summary file does,
# so the file name cannot drift from the dataset being processed
out_file <- file.path("results",
                      paste0(dataset_name, "_sample_correlation_phenotype.tsv.gz"))
readr::write_tsv(sample_correlation_list[[dataset_name]], out_file)

# Preview the merged table
head(sample_correlation_list[[dataset_name]], 2)

algorithm,id,correlation,cor_type,seed,data,num_comp,dataset_id,shuffled,sample_type,_primary_disease,sample_type_code,_sample_type,_PATIENT,_cohort
pca,TARGET-30-PARSBI-01,0.872,pearson,486191,training,10,TARGET,signal,NBL,Neuroblastoma,TP,Primary Solid Tumor,PARSBI,TARGET
ica,TARGET-30-PARSBI-01,0.872,pearson,486191,training,10,TARGET,signal,NBL,Neuroblastoma,TP,Primary Solid Tumor,PARSBI,TARGET


In [9]:
# 4) Summarize correlations per sample-type and write to file
# Mean and variance of `correlation` within every combination of the
# grouping columns listed in group_by().
# summarize() only removes the last grouping variable, so ungroup() is applied
# to leave a plain table that later operations will not silently treat as
# grouped.
disease_summary_df <- sample_correlation_list[[dataset_name]] %>%
    dplyr::group_by(algorithm, sample_type, num_comp, cor_type, shuffled, data) %>%
    dplyr::summarize(mean_cor = mean(correlation),
                     var_cor = var(correlation)) %>%
    dplyr::ungroup()

out_file <- file.path("results", paste0(dataset_name, "_sample_correlation_phenotype_summary.tsv.gz"))
readr::write_tsv(disease_summary_df, out_file)

# Preview the summary table
head(disease_summary_df)

algorithm,sample_type,num_comp,cor_type,shuffled,data,mean_cor,var_cor
pca,ALL,2,pearson,shuffled,training,0.1179701,0.0007032166
pca,ALL,2,pearson,shuffled,testing,3.515187e-20,6.614141e-05
pca,ALL,2,pearson,signal,training,0.7132471,0.01361028
pca,ALL,2,pearson,signal,testing,0.6317,0.005577384
pca,ALL,2,spearman,shuffled,training,0.1148954,0.001343776
pca,ALL,2,spearman,shuffled,testing,-0.00074,5.362869e-05


## Part 2. TCGA Sample Correlations

In [10]:
# Dataset key: used as the name of this dataset's entry in
# sample_correlation_list and as the prefix of the summary output file
dataset_name <- "TCGA"

In [11]:
# 1) Load phenotype data
# Every column is read as character
tcga_file <- file.path(
    "..", "0.expression-download", "data", "tcga_sample_identifiers.tsv"
)
tcga_pheno_df <- readr::read_tsv(
    tcga_file,
    col_types = readr::cols(.default = readr::col_character())
)

# Columns 2 and 3 are renamed to `sample_class` and `sample_type`
names(tcga_pheno_df)[2:3] <- c("sample_class", "sample_type")

In [12]:
# 2) Load sample correlation data
# compile_sample_correlation() is not defined in this notebook; presumably it
# comes from scripts/util.R sourced above -- confirm there which files it reads.
# The result is stored under the current dataset's key.
sample_correlation_list[[dataset_name]] <- compile_sample_correlation(dataset_name = dataset_name)

In [13]:
# 3) Merge phenotype and sample correlation results
# An inner join keeps only samples present in both tables. Rows unmatched on
# either side would carry NA values and be removed by na.omit() anyway, so
# there is no need to materialize them with a full join first.
# NOTE(review): na.omit() also drops matched rows that have NA in *any*
# column -- confirm that this is intended.
sample_correlation_list[[dataset_name]] <-
    sample_correlation_list[[dataset_name]] %>%
    dplyr::inner_join(tcga_pheno_df, by = c("id" = "sample_id")) %>%
    na.omit()

# Derive the output path from dataset_name, as the step 4 summary file does,
# so the file name cannot drift from the dataset being processed
out_file <- file.path("results",
                      paste0(dataset_name, "_sample_correlation_phenotype.tsv.gz"))
readr::write_tsv(sample_correlation_list[[dataset_name]], out_file)

# Preview the merged table
head(sample_correlation_list[[dataset_name]], 2)

algorithm,id,correlation,cor_type,seed,data,num_comp,dataset_id,shuffled,sample_class,sample_type
pca,TCGA-LL-A73Z-01,0.905,pearson,486191,training,10,TCGA,signal,Primary Solid Tumor,BRCA
ica,TCGA-LL-A73Z-01,0.905,pearson,486191,training,10,TCGA,signal,Primary Solid Tumor,BRCA


In [14]:
# 4) Summarize correlations per sample-type and write to file
# Mean and variance of `correlation` within every combination of the
# grouping columns listed in group_by().
# summarize() only removes the last grouping variable, so ungroup() is applied
# to leave a plain table that later operations will not silently treat as
# grouped.
disease_summary_df <- sample_correlation_list[[dataset_name]] %>%
    dplyr::group_by(algorithm, sample_type, num_comp, cor_type, shuffled, data) %>%
    dplyr::summarize(mean_cor = mean(correlation),
                     var_cor = var(correlation)) %>%
    dplyr::ungroup()

out_file <- file.path("results", paste0(dataset_name, "_sample_correlation_phenotype_summary.tsv.gz"))
readr::write_tsv(disease_summary_df, out_file)

# Preview the summary table
head(disease_summary_df)

algorithm,sample_type,num_comp,cor_type,shuffled,data,mean_cor,var_cor
pca,ACC,2,pearson,shuffled,training,0.1701577,0.0008099581
pca,ACC,2,pearson,shuffled,testing,0.008725,0.0001592814
pca,ACC,2,pearson,signal,training,0.6447324,0.0019596881
pca,ACC,2,pearson,signal,testing,0.4765,0.0010905128
pca,ACC,2,spearman,shuffled,training,0.3729211,0.0004277056
pca,ACC,2,spearman,shuffled,testing,0.0115,0.0001959487


## Part 3. GTEX Sample Correlations

In [15]:
# Dataset key: used as the name of this dataset's entry in
# sample_correlation_list and as the prefix of the summary output file
dataset_name <- "GTEX"

In [16]:
# 1) Load phenotype data
# Every column is read as character
gtex_file <- file.path(
    "..", "0.expression-download", "download",
    "GTEx_v7_Annotations_SampleAttributesDS.txt"
)
gtex_pheno_df <- readr::read_tsv(
    gtex_file,
    col_types = readr::cols(.default = readr::col_character())
)

# Rename column 1 to `sample_id` and column 6 to `sample_type`, then keep
# only those two columns (subset of the gtex phenotype file for plotting)
names(gtex_pheno_df)[c(1, 6)] <- c("sample_id", "sample_type")
gtex_pheno_df <- gtex_pheno_df[, c("sample_id", "sample_type")]
head(gtex_pheno_df, 2)

sample_id,sample_type
GTEX-1117F-0003-SM-58Q7G,Blood
GTEX-1117F-0003-SM-5DWSB,Blood


In [17]:
# 2) Load sample correlation data
# compile_sample_correlation() is not defined in this notebook; presumably it
# comes from scripts/util.R sourced above -- confirm there which files it reads.
# The result is stored under the current dataset's key.
sample_correlation_list[[dataset_name]] <- compile_sample_correlation(dataset_name = dataset_name)

In [18]:
# 3) Merge phenotype and sample correlation results
# An inner join keeps only samples present in both tables. Rows unmatched on
# either side would carry NA values and be removed by na.omit() anyway, so
# there is no need to materialize them with a full join first.
# NOTE(review): na.omit() also drops matched rows that have NA in *any*
# column -- confirm that this is intended.
sample_correlation_list[[dataset_name]] <-
    sample_correlation_list[[dataset_name]] %>%
    dplyr::inner_join(gtex_pheno_df, by = c("id" = "sample_id")) %>%
    na.omit()

# Derive the output path from dataset_name, as the step 4 summary file does,
# so the file name cannot drift from the dataset being processed
out_file <- file.path("results",
                      paste0(dataset_name, "_sample_correlation_phenotype.tsv.gz"))
readr::write_tsv(sample_correlation_list[[dataset_name]], out_file)

# Preview the merged table
head(sample_correlation_list[[dataset_name]], 2)

algorithm,id,correlation,cor_type,seed,data,num_comp,dataset_id,shuffled,sample_type
pca,GTEX-T5JC-1526-SM-4DM68,0.74,pearson,486191,training,10,GTEX,signal,Kidney
ica,GTEX-T5JC-1526-SM-4DM68,0.74,pearson,486191,training,10,GTEX,signal,Kidney


In [19]:
# 4) Summarize correlations per sample-type and write to file
# Mean and variance of `correlation` within every combination of the
# grouping columns listed in group_by().
# summarize() only removes the last grouping variable, so ungroup() is applied
# to leave a plain table that later operations will not silently treat as
# grouped.
disease_summary_df <- sample_correlation_list[[dataset_name]] %>%
    dplyr::group_by(algorithm,
                    sample_type,
                    num_comp,
                    cor_type,
                    shuffled,
                    data) %>%
    dplyr::summarize(mean_cor = mean(correlation),
                     var_cor = var(correlation)) %>%
    dplyr::ungroup()

out_file <- file.path("results",
                      paste0(dataset_name, "_sample_correlation_phenotype_summary.tsv.gz"))
readr::write_tsv(disease_summary_df, out_file)

# Preview the summary table
head(disease_summary_df)

algorithm,sample_type,num_comp,cor_type,shuffled,data,mean_cor,var_cor
pca,Adipose Tissue,2,pearson,shuffled,training,0.1707905,0.0008277689
pca,Adipose Tissue,2,pearson,shuffled,testing,0.0007,9.116291e-05
pca,Adipose Tissue,2,pearson,signal,training,0.8364449,0.002390166
pca,Adipose Tissue,2,pearson,signal,testing,0.762275,0.002476491
pca,Adipose Tissue,2,spearman,shuffled,training,0.4381693,0.0004390514
pca,Adipose Tissue,2,spearman,shuffled,testing,0.0009275,0.0002319621


## Part 4. Generate and Save all Sample Correlation Plots

In [20]:
# One plot_sample_correlation() call per dataset x data type x correlation type.
# plot_sample_correlation() is not defined in this notebook; presumably it
# comes from scripts/util.R -- confirm there where the figures are written.
data_type_options <- c("signal", "shuffled")
correlation_type_options <- c("pearson", "spearman")

for (dataset_name in names(sample_correlation_list)) {
    # Correlation table stored for this dataset
    sample_corr_df <- sample_correlation_list[[dataset_name]]

    for (data_type in data_type_options) {
        for (correlation_type in correlation_type_options) {

            # Progress line; the trailing space in "Processing... " combined
            # with paste()'s default separator gives the double space in the log
            progress_msg <- paste("Processing... ",
                                  dataset_name, data_type, correlation_type)
            print(progress_msg)

            # Execute the plotting logic
            plot_sample_correlation(
                dataset_name = dataset_name,
                data_df = sample_corr_df,
                data_type = data_type,
                correlation_type = correlation_type,
                use_theme = correlation_theme,
                return_figures = FALSE
            )
        }
    }
}

[1] "Processing...  TARGET signal pearson"
[1] "Processing...  TARGET signal spearman"
[1] "Processing...  TARGET shuffled pearson"
[1] "Processing...  TARGET shuffled spearman"
[1] "Processing...  TCGA signal pearson"
[1] "Processing...  TCGA signal spearman"
[1] "Processing...  TCGA shuffled pearson"
[1] "Processing...  TCGA shuffled spearman"
[1] "Processing...  GTEX signal pearson"
[1] "Processing...  GTEX signal spearman"
[1] "Processing...  GTEX shuffled pearson"
[1] "Processing...  GTEX shuffled spearman"
