# Cluster genes along pseudotime 

Summary of time series clustering in R  
https://journal.r-project.org/archive/2016/RJ-2016-058/index.html  
https://cran.r-project.org/web/packages/dtwclust/vignettes/dtwclust.pdf  

### Distance measures 

Shape-based measures: lock-step (n = m) and elastic (n != m)  
Feature-based: Fourier or wavelet coefficients, autocorrelation values, etc.  
Structure-based: (i) model-based approaches, where a model is fit to each series and the comparison is made between models, and (ii) complexity-based models, where the similarity between two series is measured based on the quantity of shared information.  
Prediction-based distances analyze the similarity of the forecasts obtained for different time series.  

In [None]:
library_load <- suppressMessages(
    
    list(
        
        # parallelDist
        library(parallelDist), 
        
        # TradeSeq
        library(tradeSeq), 
        
        # GSEA
        library(fgsea), 
        library(msigdbr), 
        
        # Data 
        library(tidyverse), 
        
        # Plotting 
        library(ggplot2), 
        library(patchwork) 
        
    )
)

In [None]:
random_seed <- 42
set.seed(random_seed)

In [None]:
options(warn=-1)

In [None]:
# Set working directory to project root
setwd("/research/peer/fdeckert/FD20200109SPLENO")

In [None]:
# Source files
source("plotting_global.R")
source("bin/tradeseq.R")

In [None]:
# Plotting Theme
ggplot2::theme_set(theme_global_set(size_select=1)) # From project global source()

# Parameter settings and data import 

In [None]:
tradeseq_eb <- readRDS("result/tradeseq/tradeseq_eb.rds")
ptg_eb <- readRDS("result/tradeseq/ptg_eb.rds")

# Get smooth counts and scale

In [None]:
n_points <- 50

In [None]:
# Row names of ptg_eb where ptag_cpg is TRUE
genes <- rownames(ptg_eb[ptg_eb$ptag_cpg==TRUE, ])

# Smoothed fit for every gene of the fitted model on a grid of n_points, in tidy (long) format
cnt_log <- predictSmooth(
    tradeseq_eb[["fitgam"]], 
    rownames(tradeseq_eb[["fitgam"]]), 
    nPoints=n_points, 
    tidy=TRUE
)

# Fold lineage, condition and time into one label; it becomes the column name after spreading
cnt_log$time <- with(cnt_log, paste0("lineage_", lineage, ":", condition, ":", time))

# One wide gene x time-point table per condition (genes as row names)
cnt_log_nacl <- cnt_log[cnt_log$condition=="NaCl", ] %>% 
    tidyr::spread(key=time, value=yhat) %>% 
    dplyr::select(-lineage, -condition) %>% 
    column_to_rownames("gene")

cnt_log_cpg <- cnt_log[cnt_log$condition=="CpG", ] %>% 
    tidyr::spread(key=time, value=yhat) %>% 
    dplyr::select(-lineage, -condition) %>% 
    column_to_rownames("gene")

# Row-wise (per gene) scaling within each condition; used for the background distribution
cnt_scale_nacl <- t(scale(t(cnt_log_nacl)))
cnt_scale_cpg <- t(scale(t(cnt_log_cpg)))

# Both conditions side by side (NaCl columns first), scaled per gene across all columns
cnt_log <- cbind(cnt_log_nacl, cnt_log_cpg)
cnt_scale <- t(scale(t(cnt_log)))

In [None]:
genes_ptag <- rownames(ptg_eb[ptg_eb$ptag_nacl & ptg_eb$ptag_cpg, ]) # Select PTAG genes. The list is also used for ORA 

In [None]:
cnt_log_ptag <- cnt_log[genes_ptag, ]
cnt_scale_ptag <- cnt_scale[genes_ptag, ]

# Pseudotime perturbed genes (PTPG) 

## Generate background distribution per condition to estimate mean and sd of the Euclidean distance from similar behaving genes in cluster

In [None]:
cnt_scale_ptag_nacl <- cnt_scale_nacl[rownames(ptg_eb[ptg_eb$ptag_nacl, ]), ]
cnt_scale_ptag_cpg <- cnt_scale_cpg[rownames(ptg_eb[ptg_eb$ptag_cpg, ]), ]

In [None]:
dist_method <- "euclidean"

In [None]:
dist_ptag_nacl <- parallelDist::parDist(cnt_scale_ptag_nacl, method=dist_method)
dist_ptag_cpg <- parallelDist::parDist(cnt_scale_ptag_cpg, method=dist_method)

In [None]:
hclust_method <- "ward.D"

In [None]:
hclust_ptag_nacl <- hclust(dist_ptag_nacl, hclust_method)
hclust_ptag_cpg <- hclust(dist_ptag_cpg, hclust_method)

In [None]:
k_ptag_nacl <- 10
k_ptag_cpg <- 10

In [None]:
# Elbow plot of the total within-cluster sum of squares (WCSS) against the number of clusters.
#
# scale_data     Numeric matrix; its rows are the observations that were clustered 
#                (rows are selected by the labels returned from cutree()).
# hclust_result  Clustering tree that cutree() is applied to.
# cluster_k      Largest number of clusters to evaluate; WCSS is computed for 1..cluster_k.
# cluster_k_thr  x position of the red dashed vertical line marking the chosen k.
# title          Plot title.
#
# Returns a ggplot object.
elbow_wcss_plot <- function(scale_data, hclust_result, cluster_k=25, cluster_k_thr=15, title="Elbow plot") {

    # seq_len() gives an empty range for cluster_k=0, whereas 1:0 would count down
    cluster_k <- seq_len(cluster_k)

    # WCSS of one cluster: (n-1) * sum of the per-column variances
    wcss_cluster <- function(x) {

        wcss <- (nrow(x)-1)*sum(apply(x, 2, var))

        # var() is NA for a single-row cluster; such a cluster contributes 0
        if(is.na(wcss)) 0 else wcss

    }

    # Total WCSS for every tree cut
    wcss_k <- vapply(cluster_k, function(k) {

        cluster_i <- cutree(hclust_result, k)

        sum(vapply(unique(cluster_i), function(i) {
            wcss_cluster(scale_data[which(cluster_i==i), , drop=FALSE])
        }, numeric(1)))

    }, numeric(1))

    p <- ggplot(data.frame(wcss=wcss_k, cluster_k=cluster_k), aes(x=cluster_k, y=wcss)) + 
        geom_line(size=1) + 
        geom_point(size=3, shape=21, fill="black", color="white") + 
        geom_vline(xintercept=cluster_k_thr, color="red", linetype="longdash") + 
        xlab("Number of Clusters") + ylab("Within groups sum of squares") + ggtitle(title)

    return(p)

}

In [None]:
options(repr.plot.width=10, repr.plot.height=5)

elbow_wcss_plot(cnt_scale_ptag_nacl, hclust_ptag_nacl, 50, k_ptag_nacl, title="PTAG (NaCl)") + 
elbow_wcss_plot(cnt_scale_ptag_cpg, hclust_ptag_cpg, 50, k_ptag_cpg, title="PTAG (CpG)") + 
patchwork::plot_layout(ncol=2)

In [None]:
# Mean pairwise distance between the members of each cluster.
#
# cnt_scale  Matrix whose rows were clustered (rows are selected by the cutree() labels).
# hclust     Clustering tree that is cut into k clusters.
# k          Number of clusters.
#
# Uses dist_method from the calling environment as the parDist() method.
# Returns a data.frame with one row per cluster: cluster (label), dist (mean pairwise 
# distance) and dist_norm (mean of the cube root of the pairwise distances).
cluster_dist <- function(cnt_scale, hclust, k=10) {

    cluster_label <- cutree(hclust, k)

    per_cluster <- lapply(sort(unique(cluster_label)), function(label) {

        # drop=FALSE keeps a single-member cluster as a one-row matrix; it has no pairwise 
        # distances, so both means come out as NaN
        members <- cnt_scale[which(cluster_label==label), , drop=FALSE]
        pairwise <- parallelDist::parDist(members, method=dist_method)

        data.frame(cluster=label, dist=mean(pairwise), dist_norm=mean(pairwise^(1/3)))

    })

    do.call("rbind", per_cluster)

}

In [None]:
cluster_dist_ptag_nacl <- cluster_dist(cnt_scale_ptag_nacl, hclust_ptag_nacl, k=k_ptag_nacl)
cluster_dist_ptag_cpg <- cluster_dist(cnt_scale_ptag_cpg, hclust_ptag_cpg, k=k_ptag_cpg)

In [None]:
# Stack the per-cluster cube-root distances of both conditions and drop clusters without a value
cluster_dist_condition <- na.omit(
    data.frame(
        dist_norm=c(cluster_dist_ptag_nacl$dist_norm, cluster_dist_ptag_cpg$dist_norm), 
        treatment=rep(c("NaCl", "CpG"), times=c(nrow(cluster_dist_ptag_nacl), nrow(cluster_dist_ptag_cpg)))
    )
)

## PTPG scoring for each gene between conditions 

In [None]:
# Per gene: distance between the first n_points columns and the following n_points columns 
# of its scaled profile (the NaCl block is bound before the CpG block when cnt_log is built).
# seq_along() instead of 1:length() so an empty gene list yields no iterations.
dist <- lapply(seq_along(genes_ptag), function(i) {

    parallelDist::parDist(
        list(
            cnt_scale_ptag[i, 1:n_points, drop=FALSE], 
            cnt_scale_ptag[i, (n_points+1):(2*n_points), drop=FALSE]
        ), 
        method=dist_method
    )

})

# stringsAsFactors=FALSE keeps gene as character on R < 4.0 as well; the column is later 
# used to index matrix rows by name, which a factor would turn into integer codes
ptpg <- data.frame(gene=rownames(cnt_scale_ptag), dist=do.call("c", dist), stringsAsFactors=FALSE)

# Cube-root transform of the distance
ptpg$dist_norm <- ptpg$dist^(1/3)

In [None]:
options(repr.plot.width=15, repr.plot.height=5)

# Adds to base_plot: a 25-bin density histogram, the normal density for norm_mean/norm_sd, 
# a solid red line at the mean and dashed red lines at mean-sd and mean+sd. 
# y_limits, when given, is applied with ylim() right after the histogram layer.
histo_density_plot <- function(base_plot, norm_mean, norm_sd, x_label, y_limits=NULL) {

    p <- base_plot + geom_histogram(aes(y=..density..), bins=25, fill="lightblue")

    if(!is.null(y_limits)) {p <- p + ylim(y_limits)}

    p + 
        stat_function(fun=dnorm, args=list(mean=norm_mean, sd=norm_sd)) + 
        geom_vline(xintercept=norm_mean, color="red") + 
        geom_vline(xintercept=c(norm_mean-norm_sd, norm_mean+norm_sd), color="red", linetype="longdash") + 
        xlab(x_label) + ylab("Density") + ggtitle("Density gene-wise distance")

}

# Raw distance against a normal with its own mean/sd
histo_plot_1 <- histo_density_plot(
    ggplot(ptpg, aes(x=dist)), 
    mean(ptpg$dist), sd(ptpg$dist), 
    "Euclidean distance"
)

# Cube-root distance against a normal with its own mean/sd
histo_plot_2 <- histo_density_plot(
    ggplot(ptpg, aes(x=dist_norm)), 
    mean(ptpg$dist_norm), sd(ptpg$dist_norm), 
    "Cube root (Euclidean distance)", 
    y_limits=c(0, 4)
)

# Cube-root distance against the normal derived from the within-cluster background
histo_plot_3 <- histo_density_plot(
    ggplot(ptpg, aes(x=dist_norm)), 
    mean(cluster_dist_condition$dist_norm), sd(cluster_dist_condition$dist_norm), 
    "Cube root (Euclidean distance)", 
    y_limits=c(0, 4)
)

histo_plot_1 + histo_plot_2 + histo_plot_3 + plot_layout(ncol=3)

In [None]:
# Upper-tail probability of each gene's cube-root distance under a normal distribution 
# whose mean and sd come from the within-cluster background distances
ptpg$p_value <- pnorm(
    ptpg$dist_norm, 
    mean=mean(cluster_dist_condition$dist_norm), 
    sd=sd(cluster_dist_condition$dist_norm), 
    lower.tail=FALSE
)
ptpg$fdr <- p.adjust(ptpg$p_value, method="fdr")

# Flag genes with FDR at or below 0.01
ptpg$ptpg <- ptpg$fdr<=0.01

table(ptpg$ptpg)

## PCA of per-gene Euclidean distance 

In [None]:
# PCA on the per-gene difference between the first and the second block of n_points columns; 
# no centering or scaling. The argument is spelled scale. (its full name) instead of 
# relying on partial matching.
pca_ptag <- stats::prcomp(
    cnt_scale_ptag[, seq_len(n_points)]-cnt_scale_ptag[, n_points+seq_len(n_points)], 
    center=FALSE, 
    scale.=FALSE
)

In [None]:
# Scree plot: fraction of variance explained per principal component.
#
# pca_result  Object with an sdev element (e.g. the result of prcomp()).
#
# Returns a ggplot object; the x axis is limited to components 1 to 10.
screen_plot <- function(pca_result) {

    pc_variance <- pca_result$sdev^2

    # seq_along() keeps both columns the same length even when sdev is empty
    screen_data <- data.frame(
        pc=seq_along(pc_variance), 
        var_explained=pc_variance/sum(pc_variance)
    )

    p <- ggplot(screen_data, aes(x=pc, y=var_explained)) + 
        geom_line(size=1) + 
        geom_point(size=3, shape=21, fill="black", color="white") + 
        xlim(1, 10) + 
        xlab("Principal Component") + ylab("Variance Explained") + ggtitle("Scree Plot") + 
        theme(
            aspect.ratio=1, 
            legend.position="none"
        )

    return(p)

}

In [None]:
options(repr.plot.width=5, repr.plot.height=5)

screen_plot_ptag <- screen_plot(pca_ptag)
screen_plot_ptag

In [None]:
# Scatter plot of two principal components, filled by the ptpg flag.
#
# pca_result  Object with a score matrix x (columns PC1, PC2, ...).
# pc_1        Component shown on the x axis.
# pc_2        Component shown on the y axis; defaults to the component after pc_1.
#
# The ptpg data.frame is read from the calling environment and bound column-wise 
# to the scores, so its rows must be in the same order as the rows of pca_result$x.
# Returns a ggplot object.
pca_plot <- function(pca_result, pc_1=1, pc_2=pc_1+1) {

    axis_x <- paste0("PC", pc_1)
    axis_y <- paste0("PC", pc_2)

    scores <- cbind(as.data.frame(pca_result$x), ptpg)

    p <- ggplot(scores, aes_string(x=axis_x, y=axis_y, fill="ptpg")) + 
        geom_point(size=3, shape=21, alpha=0.3, color="white") + 
        ggtitle("PCA") + 
        scale_fill_manual(values=c("TRUE"="gray", "FALSE"="darkblue")) + 
        theme(aspect.ratio=1, legend.position="bottom") + 
        # Legend keys are drawn fully opaque
        guides(fill=guide_legend(override.aes=list(alpha=1)))

    return(p)

}

In [None]:
pca_plot_ptag <- lapply(1:10, function(i) {pca_plot(pca_ptag, pc_1=i)})

In [None]:
options(repr.plot.width=5*5, repr.plot.height=10)

wrap_plots(pca_plot_ptag, ncol=5, nrow=2)

# Cluster PTCG, PTPG, PTUG 

In [None]:
cnt_scale_ptcg <- cnt_scale_ptag[ptpg[!ptpg$ptpg, ]$gene, ]

dist_ptcg <- parallelDist::parDist(cnt_scale_ptcg, method=dist_method)
hclust_ptcg <- hclust(dist_ptcg, hclust_method)

In [None]:
cnt_scale_ptpg <- cnt_scale_ptag[ptpg[ptpg$ptpg, ]$gene, ]

dist_ptpg <- parallelDist::parDist(cnt_scale_ptpg, method=dist_method)
hclust_ptpg <- hclust(dist_ptpg, hclust_method)

In [None]:
k_ptcg <- 10
k_ptpg <- 30

In [None]:
options(repr.plot.width=10, repr.plot.height=5)

elbow_wcss_plot(cnt_scale_ptcg, hclust_ptcg, 100, k_ptcg, title="PTCG") + 
elbow_wcss_plot(cnt_scale_ptpg, hclust_ptpg, 100, k_ptpg, title="PTPG") + 
patchwork::plot_layout(ncol=2)

## Plot cluster genes 

In [None]:
# Line plots of scaled expression along pseudotime, one panel per cluster.
#
# cnt_scale       Matrix or data.frame with genes in rows; column names are expected in the 
#                 form "<prefix>:<condition>:<pseudotime>" (fields 2 and 3 are parsed).
# cluster         Cluster label per row of cnt_scale.
# cluster_filter  Optional vector of labels to plot; replaces the full set of labels.
# prototype       If TRUE, plot the mean profile per condition instead of one line per gene.
#
# Colors are taken from color$treatment in the calling environment.
# Returns a list of ggplot objects named "Cluster <label>".
cluster_plot <- function(cnt_scale, cluster, cluster_filter=NULL, prototype=FALSE) {
    
    cluster_label <- cluster
    cluster_label_unique <- sort(unique(cluster_label))
    
    if(!is.null(cluster_filter)) {cluster_label_unique <- cluster_filter}
    
    cnt_scale <- as.data.frame(cnt_scale)
    
    # Parse the column names once instead of splitting every melted row twice
    col_parts <- strsplit(colnames(cnt_scale), ":")
    col_condition <- vapply(col_parts, `[`, character(1), 2)
    col_pseudotime <- as.double(vapply(col_parts, `[`, character(1), 3))
    
    p_list <- list()
    for(cluster_label_i in cluster_label_unique) {
        
        data <- cnt_scale[which(cluster_label==cluster_label_i), , drop=FALSE]
        data <- reshape2::melt(rownames_to_column(data, var="gene"), id.vars="gene")
        
        # Look up condition and pseudotime of each melted row through its source column
        col_idx <- match(as.character(data$variable), colnames(cnt_scale))
        data$condition <- col_condition[col_idx]
        data$pseudotime <- col_pseudotime[col_idx]
        
        if(prototype) {
            
            # Mean profile per condition; ungroup so no grouping leaks into later steps
            data <- suppressMessages(
                data %>% 
                    dplyr::group_by(pseudotime, condition) %>% 
                    dplyr::summarise(value=mean(value)) %>% 
                    dplyr::ungroup()
            )
            p <- ggplot(data, aes(x=pseudotime, y=value, color=condition, group=condition)) + geom_line(size=3, alpha=1)
            
        } else {
            
            p <- ggplot(data, aes(x=pseudotime, y=value, color=condition, group=paste(condition, gene))) + geom_line(size=1, alpha=1)
            
        }
        
        # Symmetric y range around 0, rounded up to the next integer
        y_max <- ceiling(max(abs(data$value)))
        
        p_list[[paste("Cluster", cluster_label_i)]] <- p +  
            ylim(-y_max, y_max) + 
            geom_hline(yintercept=0, linetype="dashed") + 
            ggtitle(paste("Cluster", cluster_label_i)) + 
            scale_color_manual(values=unlist(color$treatment)) + 
            theme(legend.position="bottom")
        
    }
    
    return(p_list)
    
}

## Cluster genes PTCG 

In [None]:
cluster_genes_ptcg <- cluster_plot(cnt_scale_ptcg, cutree(hclust_ptcg, k_ptcg), prototype=FALSE)

In [None]:
options(repr.plot.width=5*5, repr.plot.height=ceiling(length(cluster_genes_ptcg)/5)*5)
wrap_plots(cluster_genes_ptcg, ncol=5, nrow=ceiling(length(cluster_genes_ptcg)/5))

## Cluster genes PTPG

In [None]:
cluster_genes_ptpg <- cluster_plot(cnt_scale_ptpg, cutree(hclust_ptpg, k_ptpg), prototype=FALSE)

In [None]:
options(repr.plot.width=10*5, repr.plot.height=ceiling(length(cluster_genes_ptpg)/10)*5)
wrap_plots(cluster_genes_ptpg, ncol=10, nrow=ceiling(length(cluster_genes_ptpg)/10))

# Cluster over representation analysis (ORA)

In [None]:
# Over-representation analysis (fora) of MSigDB gene sets for every cluster of a tree.
#
# hclust       Clustering tree; cutree() labels must carry gene names.
# k            Number of clusters to cut the tree into.
# universe     Background gene list passed to fora(); defaults to genes_ptag.
# category     MSigDB category passed to msigdbr().
# subcategory  MSigDB subcategory passed to msigdbr().
#
# Returns a list with one plot per cluster, ordered by cluster label.
ora_cluster <- function(hclust, k, universe=genes_ptag, category="H", subcategory=NULL) {
    
    # Gene sets as a named list: gene-set name -> gene symbols.
    # The data frame used to be passed as an extra positional argument, which bound to 
    # split()'s drop parameter; only x and f are passed now.
    # NOTE(review): gene symbols may repeat within a set if msigdbr returns several rows 
    # per symbol - confirm whether fora() needs them de-duplicated.
    gene_set <- msigdbr(species="Mus musculus", category=category, subcategory=subcategory)
    gene_set <- split(x=gene_set$gene_symbol, f=gene_set$gs_name)
    
    cluster_label <- cutree(hclust, k)
    cluster_label_unique <- sort(unique(cluster_label))
    
    # The result list is named plot_list so it no longer shadows the gsea_plot() function
    plot_list <- lapply(cluster_label_unique, function(i) {

        ora_result <- fora(

            pathways=gene_set, 
            genes=names(which(cluster_label==i)),
            universe=universe,
            minSize=5, 
            maxSize=Inf

        )

        # fora() results have no NES column; the gene-set size is stored under that name 
        # (presumably because gsea_plot(), defined outside this file, reads NES - confirm)
        ora_result$NES <- ora_result$size

        gsea_plot(ora_result, pval_thr=0.05, title="", pathway_suffix="GOBP|GOCC|GOMF|REACTOME|KEGG|HALLMARK|BIOCARTA|PID", pathway_filter=NULL, scale_size=5, top=15) + ggtitle(paste("Cluster", i))

    })
    
    return(plot_list)
    
}

## ORA PTCG 

In [None]:
ora_cluster_ptcg <- ora_cluster(hclust_ptcg, k_ptcg)

In [None]:
options(repr.plot.width=8*5, repr.plot.height=ceiling(length(ora_cluster_ptcg)/5)*5)
wrap_plots(ora_cluster_ptcg, ncol=5, nrow=ceiling(length(ora_cluster_ptcg)/5))

## ORA PTPG 

In [None]:
ora_cluster_ptpg <- ora_cluster(hclust_ptpg, k_ptpg)

In [None]:
options(repr.plot.width=8*5, repr.plot.height=ceiling(length(ora_cluster_ptpg)/5)*5)
wrap_plots(ora_cluster_ptpg, ncol=5, nrow=ceiling(length(ora_cluster_ptpg)/5))

# DEV

In [None]:
source("bin/tradeseq.R")

genes <- c("Stat1")

p <- lapply(genes, function(i) plot_smooth(tradeseq_eb[["fitgam"]], i, point=FALSE) + ggtitle(i) + theme(legend.position="bottom") + theme_global_set(size_select=1)) 

In [None]:
options(repr.plot.width=1*5, repr.plot.height=ceiling(length(p)/1)*5)
wrap_plots(p, ncol=1, nrow=ceiling(length(p)/1))