# TCGA expression vs splicing PCA

In [None]:
library(dplyr)
library(ggplot2)
library(ggpubr)
library(ggsci)
library(Rtsne)
library(GEDI)
library(patchwork)

In [None]:
# Seed R's random number generator once for the session. Later cells that draw
# random numbers (presumably the Rtsne calls) therefore depend on this seed and
# on the order in which the cells are executed.
set.seed(7)

### Functions

In [None]:
#' Combine PCA scores and t-SNE coordinates into one per-sample table
#'
#' @param pca A list-like object with a score matrix `x` whose row names are
#'   the sample identifiers (as returned by `prcomp(..., retx = TRUE)`).
#' @param tsne A list-like object with a coordinate matrix `Y` that has one
#'   row per row of `pca$x`, in the same order (as returned by `Rtsne()`).
#' @param mode Suffix appended to every embedding column name, e.g. "exp"
#'   gives `PC1_exp`, `tSNE1_exp`.
#' @param n_pcs Maximum number of leading principal components to keep
#'   (default 10). If `pca$x` has fewer columns, all of them are kept instead
#'   of failing with "subscript out of bounds".
#' @return A data.frame with a leading character column `submitter_id`,
#'   followed by the suffixed PC columns and the suffixed t-SNE columns. Row
#'   names are the default integer sequence.
get_embedding_table <- function(pca, tsne, mode = "exp", n_pcs = 10) {
    scores <- pca$x
    sample_ids <- rownames(scores)
    stopifnot(
        !is.null(sample_ids),
        nrow(tsne$Y) == nrow(scores)
    )

    # Small cohorts can yield fewer than `n_pcs` components; keep what exists.
    # drop = FALSE preserves the matrix shape when a single PC is kept.
    n_keep <- min(n_pcs, ncol(scores))
    pca_embed <- as.data.frame(scores[, seq_len(n_keep), drop = FALSE])

    # An unnamed matrix becomes columns V1, V2, ...; rename them to tSNE1, ...
    tsne_embed <- as.data.frame(tsne$Y)
    colnames(tsne_embed) <- sub("V", "tSNE", colnames(tsne_embed))

    gene_embed <- cbind(pca_embed, tsne_embed)
    colnames(gene_embed) <- paste0(colnames(gene_embed), "_", mode)

    # Expose the sample identifiers as a regular first column
    gene_embed <- data.frame(
        submitter_id = sample_ids,
        gene_embed,
        check.names = FALSE,
        stringsAsFactors = FALSE
    )
    rownames(gene_embed) <- NULL

    gene_embed
}

## Load data

In [None]:
# Restore the objects stored in the two input .RData files into the workspace,
# gene-expression inputs first, then the SE splicing inputs.
# NOTE(review): if both files store an object under the same name (for example
# `metadata`), the second load() silently replaces the first -- confirm that
# this is intended before relying on it in the gene-expression cells.
load(file = "../input/gedi.tcga.gene.expression.inputs.RData")
load(file = "../input/gedi.tcga.SE.splicing.inputs.RData")

In [None]:
# Keep only the rows of `metadata` whose `condition` is "tumor"
metadata <- dplyr::filter(metadata, condition == "tumor")

## Compute low dimensional representations

In [None]:
# Settings for the analyses that follow:
#   min_counts  - a feature is "expressed" in a sample when its count exceeds
#                 this value
#   min_samples - a feature is kept when it is expressed in more than this many
#                 samples
#   pseudocount - added to the counts before the log transform
#   npcs        - number of principal components requested from prcomp()
min_counts <- 3
min_samples <- 50
pseudocount <- 1
npcs <- 30

### Gene expression

#### On the whole dataset

In [None]:
# Whole-dataset gene-expression embedding: filter genes, log-transform and
# centre the counts, run PCA, run t-SNE on the PC scores, then build the
# embedding table. Each intermediate result is saved under ../input/.

# Filter low expressed genes: keep genes whose count exceeds `min_counts` in
# more than `min_samples` samples
he_genes <- rowSums(gn_cts > min_counts)
valid_genes <- he_genes > min_samples
gn_cts.filt <- gn_cts[valid_genes, , drop = FALSE]

# Preprocess data: log(count + pseudocount), transposed to a sample-by-gene
# matrix, then centred per gene (no variance scaling)
gn_cts.filt <- log(t(gn_cts.filt + pseudocount))
scld_gn_cts <- scale(gn_cts.filt, center = TRUE, scale = FALSE)

# Run PCA. The matrix is already centred above, so prcomp() is told not to
# centre or scale again. `rank.` and `scale.` are spelled out in full rather
# than relying on partial argument matching.
gn_pca <- prcomp(
    scld_gn_cts,
    retx = TRUE,
    center = FALSE,
    scale. = FALSE,
    rank. = npcs
)
save(gn_pca, file = "../input/tcga.gene.expression.prcomp.RData")

# Run tSNE on the PC scores (pca = FALSE: the input is already PCA output)
gn_tsne <- Rtsne(
    gn_pca$x,
    dims = 3,
    initial_dims = npcs,
    num_threads = 5,
    pca = FALSE,
    normalize = FALSE,
    check_duplicates = FALSE,
    perplexity = 100
)
save(gn_tsne, file = "../input/tcga.gene.expression.tsne.RData")

# Extract embeddings
gene_embed <- get_embedding_table(pca = gn_pca, tsne = gn_tsne, mode = "exp")
save(gene_embed, file = "../input/tcga.gene.expression.embeddings.RData")

#### By cancer type

In [None]:
# Per-cancer settings: the distinct cancer labels found in `metadata`, plus
# smaller values for the number of PCs and the minimum-sample threshold.
# These assignments replace the earlier values of `npcs` and `min_samples`
# for every cell executed afterwards.
cancers <- unique(metadata[["cancer"]])
npcs <- 20
min_samples <- 10

In [None]:
# Per-cancer gene-expression embeddings. For every cancer label the counts are
# subset to that cancer's samples, filtered, log-transformed and centred, then
# passed through PCA and t-SNE; each result is saved under ../input/by_cancer/.
# `gn_pca`, `gn_tsne` and `gene_embed` are reused (and overwritten) on every
# iteration because save() stores objects under their variable names.

# save() cannot create missing directories, so make sure the target exists
dir.create("../input/by_cancer", showWarnings = FALSE, recursive = TRUE)

for (ctype in cancers) {
    message("Analyzing ", ctype)

    cancer.metadata <- metadata %>%
        filter(cancer == ctype)

    # as.character() guards against factor IDs being used as integer codes;
    # drop = FALSE keeps a matrix even when a single sample is selected
    ctype_samples <- as.character(cancer.metadata$submitter_id)
    cgn_cts <- gn_cts[, ctype_samples, drop = FALSE]

    # Filter low expressed genes: count above `min_counts` in more than
    # `min_samples` samples of this cancer type
    he_genes <- rowSums(cgn_cts > min_counts)
    cgn_cts <- cgn_cts[he_genes > min_samples, , drop = FALSE]

    # Preprocess data: log(count + pseudocount) as a sample-by-gene matrix,
    # centred per gene without variance scaling
    cgn_cts <- log(t(cgn_cts + pseudocount))
    scld_gn_cts <- scale(cgn_cts, center = TRUE, scale = FALSE)

    # Run PCA (input already centred, so no centring/scaling inside prcomp)
    message("Running PCA...")
    gn_pca <- prcomp(
        scld_gn_cts,
        retx = TRUE,
        center = FALSE,
        scale. = FALSE,
        rank. = npcs
    )
    save(gn_pca,
         file = paste0("../input/by_cancer/", ctype,
                       ".gene.expression.prcomp.RData"))

    # Run tSNE with a perplexity of 10% of the sample count; the lower bound
    # of 1 avoids a perplexity of 0 for cohorts with fewer than 10 samples
    message("Running tSNE...")
    perp <- max(1, floor(nrow(gn_pca$x) * 0.10))
    gn_tsne <- Rtsne(
        gn_pca$x,
        dims = 3,
        initial_dims = npcs,
        num_threads = 5,
        pca = FALSE,
        normalize = FALSE,
        check_duplicates = FALSE,
        perplexity = perp
    )
    save(gn_tsne,
         file = paste0("../input/by_cancer/", ctype,
                       ".gene.expression.tsne.RData"))

    # Extract embeddings
    gene_embed <- get_embedding_table(pca = gn_pca, tsne = gn_tsne,
                                      mode = "exp")
    save(gene_embed,
         file = paste0("../input/by_cancer/", ctype,
                       ".gene.expression.embeddings.RData"))
}

### Alternative splicing

In [None]:
# (Re)load the SE splicing inputs. verbose = TRUE makes load() print the name
# of every object it restores, which shows what gets replaced in the workspace.
inputs.file <- "../input/gedi.tcga.SE.splicing.inputs.RData"
load(file = inputs.file, verbose = TRUE)

In [None]:
# Keep only the rows of `metadata` whose `condition` is "tumor"
metadata <- dplyr::filter(metadata, condition == "tumor")

#### On the whole dataset

In [None]:
# Whole-dataset splicing embedding: filter events, compute PSI, run PCA, run
# t-SNE on the PC scores, then build the embedding table. Each intermediate
# result is saved under ../input/.

# Use the whole-dataset settings explicitly. Without this, the values left
# behind by the per-cancer gene-expression cell (npcs = 20, min_samples = 10)
# would be picked up whenever the notebook is run top to bottom, making this
# analysis inconsistent with the whole-dataset gene-expression one.
npcs <- 30
min_samples <- 50

# Filter low expressed events: total count above `min_counts` in more than
# `min_samples` samples. The filtered matrices are stored under new names so
# the raw `T_ev_cts` / `A_ev_cts` stay available to later cells, mirroring how
# `gn_cts` is left untouched in the gene-expression analysis.
he_genes <- rowSums(T_ev_cts > min_counts)
valid_genes <- he_genes > min_samples
T_ev_cts.filt <- T_ev_cts[valid_genes, , drop = FALSE]
A_ev_cts.filt <- A_ev_cts[valid_genes, , drop = FALSE]

# Pre-process data: PSI = A / T, transposed to a sample-by-event matrix
# NOTE(review): any sample with a total count of 0 for a kept event yields
# NaN/Inf here, and prcomp() is expected to fail on non-finite input --
# confirm the inputs guarantee T > 0 or add explicit handling.
psi <- t(A_ev_cts.filt / T_ev_cts.filt)

# Run PCA (prcomp centres each event; no variance scaling)
se_pca <- prcomp(psi, retx = TRUE, center = TRUE, rank. = npcs)
save(se_pca, file = "../input/tcga.SE.splicing.prcomp.RData")

# Run tSNE on the PC scores (pca = FALSE: the input is already PCA output)
se_tsne <- Rtsne(
    se_pca$x,
    dims = 3,
    initial_dims = npcs,
    num_threads = 5,
    pca = FALSE,
    normalize = FALSE,
    check_duplicates = FALSE,
    perplexity = 100
)
save(se_tsne, file = "../input/tcga.SE.splicing.tsne.RData")

# Extract embeddings
spl_embed <- get_embedding_table(pca = se_pca, tsne = se_tsne, mode = "spl")
save(spl_embed, file = "../input/tcga.SE.splicing.embeddings.RData")

#### On each cancer type

In [None]:
# Per-cancer settings for the splicing analysis: the distinct cancer labels
# found in `metadata`, the number of PCs to compute and the minimum-sample
# threshold used when filtering events within one cancer type.
cancers <- unique(metadata[["cancer"]])
npcs <- 20
min_samples <- 10

In [None]:
# Per-cancer splicing embeddings. For every cancer label the event counts are
# subset to that cancer's samples, filtered, converted to PSI and centred, then
# passed through PCA and t-SNE; each result is saved under
# ../input/confounder_embeddings/.
# NOTE(review): the per-cancer gene-expression outputs are written under
# ../input/by_cancer/ instead -- confirm the different directory is intended.

# save() cannot create missing directories, so make sure the target exists
dir.create("../input/confounder_embeddings", showWarnings = FALSE,
           recursive = TRUE)

for (ctype in cancers) {
    message("Analyzing ", ctype)

    cancer.metadata <- metadata %>%
        filter(cancer == ctype)

    # as.character() guards against factor IDs being used as integer codes;
    # drop = FALSE keeps a matrix even when a single sample is selected
    ctype_samples <- as.character(cancer.metadata$submitter_id)
    cT_ev_cts <- T_ev_cts[, ctype_samples, drop = FALSE]
    cA_ev_cts <- A_ev_cts[, ctype_samples, drop = FALSE]

    # Filter low expressed events: total count above `min_counts` in more
    # than `min_samples` samples of this cancer type (samples are not filtered)
    he_genes <- rowSums(cT_ev_cts > min_counts)
    valid_genes <- he_genes > min_samples
    cT_ev_cts <- cT_ev_cts[valid_genes, , drop = FALSE]
    cA_ev_cts <- cA_ev_cts[valid_genes, , drop = FALSE]

    # Preprocess data: PSI = A / T as a sample-by-event matrix, centred per
    # event without variance scaling
    # NOTE(review): a total count of 0 gives NaN/Inf, which prcomp() is
    # expected to reject -- confirm the inputs guarantee T > 0.
    psi <- t(cA_ev_cts / cT_ev_cts)
    scld_spl_cts <- scale(psi, center = TRUE, scale = FALSE)

    # Run PCA (input already centred, so no centring/scaling inside prcomp)
    message("Running PCA...")
    spl_pca <- prcomp(
        scld_spl_cts,
        retx = TRUE,
        center = FALSE,
        scale. = FALSE,
        rank. = npcs
    )
    save(spl_pca,
         file = paste0("../input/confounder_embeddings/", ctype,
                       ".SE.splicing.prcomp.RData"))

    # Run tSNE with a perplexity of 10% of the sample count; the lower bound
    # of 1 avoids a perplexity of 0 for cohorts with fewer than 10 samples
    message("Running tSNE...")
    perp <- max(1, floor(nrow(spl_pca$x) * 0.10))
    spl_tsne <- Rtsne(
        spl_pca$x,
        dims = 3,
        initial_dims = npcs,
        num_threads = 5,
        pca = FALSE,
        normalize = FALSE,
        check_duplicates = FALSE,
        perplexity = perp
    )
    save(spl_tsne,
         file = paste0("../input/confounder_embeddings/", ctype,
                       ".SE.splicing.tsne.RData"))

    # Extract embeddings
    spl_embed <- get_embedding_table(pca = spl_pca, tsne = spl_tsne,
                                     mode = "spl")
    save(spl_embed,
         file = paste0("../input/confounder_embeddings/", ctype,
                       ".SE.splicing.embeddings.RData"))
}