In [None]:
library(tidyverse)
library(DESeq2)
library(BiocParallel)
library(RColorBrewer)
library(pheatmap)
library(umap)

# Custom package
library(rutils)

In [None]:
# Leave two cores free for the rest of the system, but never request fewer
# than one worker: detectCores() can return NA (unknown) or a value <= 2.
# The parallel namespace is spelled out because it is not attached above.
available_cores <- parallel::detectCores()
n_cores <- if (is.na(available_cores)) 1L else max(1L, available_cores - 2L)
BiocParallel::register(MulticoreParam(n_cores))

In [None]:
# Project identifiers analysed in this notebook.
projects <- c("TCGA-CESC", "TCGA-OV", "TCGA-UCS", "TCGA-UCEC")
# Data root, relative to the notebook's working directory.
data_root <- "../../../../../mnt/d"
# Glob the count matrices and their matching coldata tables.
count_files <- Sys.glob(
    paths = file.path(data_root, "TCGA", "matrix_count_data", "*counts.tsv")
)
coldata_files <- Sys.glob(
    paths = file.path(data_root, "TCGA", "matrix_count_data", "*coldata.tsv")
)

In [None]:
# Print the matched paths to confirm the globs found the expected inputs.
count_files
coldata_files

In [None]:
# Load the count and coldata files through the custom rutils helper.
# NOTE(review): presumably the per-project count tables are joined on the two
# gene identifier columns named in count_join_symbols — confirm against rutils.
data_ls <- rutils::load_and_combine_count_matrix_data(
    count_files,
    coldata_files,
    count_join_symbols = c("ensembl_gene_id", "external_gene_name")
)

In [None]:
# Unpack the two tables returned by the loader: counts and sample metadata.
counts_df <- data_ls$counts_df
coldata_df <- data_ls$coldata_df

# Normalize counts

In [None]:
# Start from a copy of the counts so the identifier columns are preserved.
vst_counts_df <- counts_df
# Variance-stabilize everything except the first two columns
# (presumably the two gene identifier columns — confirm), overwriting the
# counts in place. blind = TRUE ignores any experimental design.
vst_counts_df[, -(1:2)] <- vst(as.matrix(counts_df[, -(1:2)]), blind = TRUE)

In [None]:
# Display the normalized table for a visual sanity check.
vst_counts_df

# Filter down to matrisome genes only

In [None]:
# Load the matrisome master list through the custom rutils helper.
matrisome_df <- rutils::load_matrisome_df(
    file.path(data_root, "unified_TCGA_GTEx_data", "matrisome", "matrisome_hs_masterlist.tsv")
)

In [None]:
# TRUE for rows of counts_df whose gene symbol appears in the matrisome list.
matrisome_mask <- counts_df$external_gene_name %in% matrisome_df$gene_symbol

In [None]:
# Keep only the matrisome rows of the normalized table. The mask was built
# from counts_df, which vst_counts_df was copied from, so rows line up.
matrisome_vst_counts_df <- vst_counts_df[matrisome_mask, ]

### Any repeat entries among matrisome genes?

In [None]:
# If these lengths are the same, there are no repeat entries
# among matrisome genes. The check runs on matrisome_vst_counts_df, the
# matrisome-filtered table defined in this notebook.
nrow(matrisome_vst_counts_df) == length(unique(matrisome_vst_counts_df$external_gene_name))

### No longer care about Ensembl Gene IDs

In [None]:
# The gene symbol is the only identifier kept from here on.
matrisome_vst_counts_df <- dplyr::select(matrisome_vst_counts_df, -ensembl_gene_id)

# Functions

In [None]:
# Scatter plot of two columns of `df`, coloured by a third column.
#
# df:   data frame holding the columns to plot.
# pc_x: string naming the column mapped to the x axis.
# pc_y: string naming the column mapped to the y axis.
# col:  string naming the column mapped to point colour.
#
# Returns the ggplot object. Axes are labelled with the column names and
# drawn with a fixed aspect ratio.
pca_scatter <- function(df, pc_x, pc_y, col) {
    point_mapping <- aes_string(x = pc_x, y = pc_y, color = col)
    ggplot(df) +
        point_mapping +
        geom_point(size = 1) +
        labs(x = paste0(pc_x), y = paste0(pc_y)) +
        coord_fixed()
}


# Per-project centroid of the expression profiles.
#
# counts:  table with one column per sample (columns are looked up by name).
# coldata: sample metadata with `project` and `sample_name` columns.
# projs:   project labels to compute a centroid for.
#
# Returns a tibble with one column per project; each column holds the
# row-wise median over that project's samples.
get_project_centroids <- function(counts, coldata, projs) {
    per_project <- list()
    for (proj in projs) {
        sample_ids <- coldata$sample_name[coldata$project == proj]
        project_mat <- as.matrix(counts[, sample_ids])
        # Medians rather than means: many outliers are expected
        per_project[[proj]] <- rowMedians(project_mat)
    }
    as_tibble(per_project)
}


# Manhattan (L1) distance: the sum of absolute element-wise differences
# between `x` and `y`.
l1_dist <- function(x, y) {
    abs_diff <- abs(x - y)
    sum(abs_diff)
}


# Standardize `x`: subtract its mean and divide by its sample standard
# deviation. A constant vector has sd 0 and therefore yields NaN.
z_score <- function(x) {
    centered <- x - mean(x)
    centered / sd(x)
}


# Samples (columns of `counts`) nearest to `centroid` in L1 distance.
#
# counts:   table with one column per sample.
# centroid: numeric vector with one value per row of `counts`.
# n:        number of nearest samples requested.
#
# Returns a tibble with `sample_name` and `value` (the distance) columns,
# keeping the rows with the n smallest distances as selected by top_n().
find_n_closest <- function(counts, centroid, n) {
    distances <- apply(counts, 2, l1_dist, y = centroid)
    distance_tbl <- as_tibble(distances, rownames = "sample_name")
    # A negative n makes top_n() keep the smallest values
    top_n(distance_tbl, -n, wt = value)
}



# For each project, pick the samples closest to that project's centroid.
#
# counts:    table with one column per sample.
# coldata:   sample metadata with `project` and `sample_name` columns.
# centroids: per-project centroids, indexable by project label.
# projs:     project labels to sample from.
# n:         number of samples requested per project.
#
# Prints the dimensions of each per-project selection as it goes and
# returns all selections stacked into one table with a `project` column.
heatmap_sample <- function(counts, coldata, centroids, projs, n) {
    picked <- list()
    for (proj in projs) {
        sample_ids <- coldata$sample_name[coldata$project == proj]
        closest <- mutate(
            find_n_closest(counts[, sample_ids], centroids[[proj]], n),
            project = proj
        )
        print(dim(closest))
        picked[[proj]] <- closest
    }
    bind_rows(picked)
}

# PCA

In [None]:
# Build the numeric matrix (rows named by gene symbol, one column per sample)
# here, so this cell does not depend on a later section of the notebook
# having been run first.
matrisome_vst_counts <- matrisome_vst_counts_df %>%
    column_to_rownames(var = "external_gene_name") %>%
    as.matrix()
# Transpose so samples are the observations; genes are centred, not rescaled.
pca_res <- prcomp(t(matrisome_vst_counts), center = TRUE, scale. = FALSE)

In [None]:
# PC coordinates per sample, labelled with each sample's project. The label
# is looked up by sample name rather than by position, so a coldata_df whose
# row order differs from the PCA input cannot mislabel samples.
pca_res_df <- pca_res$x %>%
    as_tibble(rownames = "sample_name") %>%
    mutate(project = coldata_df$project[match(sample_name, coldata_df$sample_name)])

In [None]:
# Variance carried by each principal component, and its share in percent.
pca_var <- pca_res$sdev^2
pca_pct_var <- round(pca_var / sum(pca_var) * 100, 3)
# PC_fact keeps the components in PC1, PC2, ... order when plotted.
pca_var_df <- tibble(
    PC = colnames(pca_res$x),
    var = pca_var,
    pct_var = pca_pct_var,
    PC_fact = factor(PC, levels = PC)
)

# Heatmaps: TODO

## Find centroids
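We want to find centroids in a reduced space, so we would use the PCA results. Note that the cells below currently compute the centroids from the full VST-normalized matrisome counts (`matrisome_vst_counts_df`), not from the principal components.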

In [None]:
# Do this with dimensionally reduced data?
centroids <- get_project_centroids(matrisome_vst_counts_df, coldata_df, sub("-", "_", projects))

In [None]:
# Request the 10 samples nearest to each project's centroid for the heatmap.
hm_sample_meta_df <- heatmap_sample(matrisome_vst_counts_df, coldata_df, centroids, sub("-", "_", projects), 10)

In [None]:
# Keep only the columns of the selected samples.
hm_sample_df <- matrisome_vst_counts_df[, hm_sample_meta_df$sample_name]

## Drop genes where sd is too low

In [None]:
# Drop genes that are (numerically) constant across the selected samples:
# their z-scores divide by an sd of ~0 and their correlations are undefined.
gene_sd_mask <- rowSds(as.matrix(hm_sample_df)) > 1e-10
hm_sample_df <- hm_sample_df[gene_sd_mask, ]

In [None]:
# Metadata rows for just the samples shown in the heatmap.
hm_sample_coldata_df <- dplyr::filter(
    coldata_df,
    sample_name %in% colnames(hm_sample_df)
)

In [None]:
# Rows x columns remaining after the sample selection and the sd filter.
dim(hm_sample_df)

In [None]:
# Correlation-based dissimilarities (1 - r): Spearman between columns
# (samples) and Pearson between rows (genes, via the transpose).
col_dist <- as.dist(1 - cor(hm_sample_df, method = "spearman"))
row_dist <- as.dist(1 - cor(t(hm_sample_df), method = "pearson"))

# Complete-linkage hierarchical clustering for both heatmap axes.
col_clust <- hclust(col_dist, method = "complete")
row_clust <- hclust(row_dist, method = "complete")

In [None]:
# Standardize each row across the selected samples. apply() returns the
# per-row results as columns, hence the transpose back to rows.
z_score_mat <- t(apply(hm_sample_df, 1, z_score))

In [None]:
# One colour per project present among the heatmap samples, stored as a named
# vector inside a list keyed by `project` (presumably for pheatmap's
# annotation_colors — confirm). The palette size follows the data instead of
# being fixed at 4, so the names always match the colours in length.
# brewer.pal() needs n >= 3, so request at least 3 and keep what is needed.
hm_projects <- unique(hm_sample_coldata_df$project)
proj_palette <- brewer.pal(n = max(3, length(hm_projects)), name = "Set1")
proj_colors <- list(
    project = setNames(proj_palette[seq_along(hm_projects)], hm_projects)
)

In [None]:
# Reorder the metadata rows to follow the column order of hm_sample_df.
hm_sample_coldata_df_sorted <- hm_sample_coldata_df[match(colnames(hm_sample_df), hm_sample_coldata_df$sample_name), ]


In [None]:
# Draw the z-scored heatmap using the precomputed row/column clusterings,
# cutting the column dendrogram into 4 groups and hiding all labels.
# NOTE(review): no annotation_col / annotation_colors are passed, so
# proj_colors and hm_sample_coldata_df_sorted built above are unused by this
# call — confirm whether project annotations were intended.
hm <- pheatmap(
    z_score_mat,
    cluster_rows = row_clust,
    cluster_cols = col_clust,
    cutree_cols = 4,
    show_rownames = FALSE,
    show_colnames = FALSE
)

# PCA Plots

In [None]:
# Numeric matrix of the normalized matrisome counts, with gene symbols moved
# from a column into the row names. This matrix is consumed by prcomp() in
# the PCA section and by umap() below.
matrisome_vst_counts <- matrisome_vst_counts_df %>%
    column_to_rownames(var = "external_gene_name") %>%
    as.matrix()

In [None]:
# Scree plot: percent variance for the first 10 principal components.
ggplot(data = pca_var_df[1:10, ], mapping = aes(x = PC_fact, y = pct_var)) +
    geom_col(fill = "steelblue") +
    labs(x = "principal component", y = "% variance") +
    theme(panel.grid.major = element_blank())
ggsave(
    filename = file.path(data_root, "TCGA", "figures", "screeplot.png"),
    plot = last_plot()
)

In [None]:
# PC1 vs PC2, coloured by project; saved right after it is created.
pca_scatter(pca_res_df, pc_x = "PC1", pc_y = "PC2", col = "project")
ggsave(
    filename = file.path(data_root, "TCGA", "figures", "PC2_vs_PC1.png"),
    plot = last_plot()
)
# PC2 vs PC3, coloured by project.
pca_scatter(pca_res_df, pc_x = "PC2", pc_y = "PC3", col = "project")
ggsave(
    filename = file.path(data_root, "TCGA", "figures", "PC3_vs_PC2.png"),
    plot = last_plot()
)

In [None]:
# Long format of the first four PCs: one row per (sample, PC) pair.
# NOTE(review): the value column is named "loadings", but the values come
# from pca_res$x, i.e. per-sample PC coordinates (scores), not the
# rotation/loadings matrix.
pca_pivot_df <- pca_res_df%>%
    dplyr::select(sample_name, PC1:PC4, project) %>%
    pivot_longer(cols = PC1:PC4, names_to = "PC", values_to = "loadings")

In [None]:
# Peek at the reshaped table.
head(pca_pivot_df)

In [None]:
# Small multiples: one horizontal bar per sample, one panel per PC,
# filled by project. The sample axis is stripped of text, ticks and title.
ggplot(pca_pivot_df, aes(x = sample_name, y = loadings, fill = project)) +
    geom_col() +
    facet_wrap(~PC) +
    coord_flip() +
    theme_bw() +
    theme(
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.title.y = element_blank()
    )
ggsave(
    filename = file.path(data_root, "TCGA", "figures", "small_multiples_plot.png"),
    plot = last_plot()
)

In [None]:
# UMAP embedding with samples as observations (hence the transpose), using
# the umap package's default configuration.
res_umap <- umap(t(matrisome_vst_counts))

In [None]:
# Name the two embedding dimensions.
colnames(res_umap$layout) <- c("x1", "x2")
# Attach sample metadata to the embedding. The layout rows follow the column
# order of matrisome_vst_counts, so coldata_df is reordered to that order by
# sample name before binding, instead of relying on its row order.
umap_df <- res_umap$layout %>%
    as_tibble() %>%
    bind_cols(
        coldata_df[match(colnames(matrisome_vst_counts), coldata_df$sample_name), ]
    )
dim(umap_df)
head(umap_df)

In [None]:
# UMAP embedding coloured by project, saved alongside the other figures.
ggplot(umap_df, aes(x = x1, y = x2, color = project)) +
    geom_point(size = 1)
ggsave(
    filename = file.path(data_root, "TCGA", "figures", "umap.png"),
    plot = last_plot()
)