# Feature score from contrast results
From Daniel 5/2023: If the goal is to identify features where we have the strongest significant differences between pairs of cell types, I wonder if we could do some sort of “nested” meta-analysis like this:

- Do a meta-analysis with the absolute value of the effect sizes (sign doesn’t matter) and the standard errors of the pairwise cell type differences within each SNP for a feature - this would give you the meta-analysis estimate of the average cell type difference in effect sizes between any pair of cell types for that SNP
- Do a second round of meta-analysis on the SNP-level meta-analyzed effect sizes and standard errors across all the SNPs in the gene - this will give you a meta-analysis estimate of the average difference between any pair of cell types, averaged over all the SNPs in the gene

That would ultimately yield a single, unsigned Z-score for each feature that could be used to rank the features

In [None]:
[global]
import os
import re
import pandas as pd
# Work directory & output directory
parameter: cwd = path('./')
# Number of substeps bundled into one task (passed as trunk_size by the steps below)
parameter: job_size = 1
# Memory request, used by steps that declare `mem = mem` in their task line
parameter: mem = '60G'
# Estimator handed to metafor::rma (REML = restricted maximum likelihood)
parameter: meta_method = "REML"
# Container image; an empty string leaves `entrypoint` empty as well
parameter: container = ''
# Command prefix derived from the container file name (image suffixes stripped)
parameter: entrypoint= ('micromamba run -a "" -n' + ' ' + re.sub(r'(_apptainer:latest|_docker:latest|\.sif)$', '', container.split('/')[-1])) if container else ""
# Text file with one analysis unit per line; blank lines and lines starting
# with '#' are skipped. The first whitespace-separated field of each line is
# used by the steps below as the path to a contrast result file.
parameter: analysis_unit = path
# Read the list through a context manager so the file handle is closed promptly
with open(analysis_unit) as unit_file:
    regions = [x.replace("\"","").strip().split() for x in unit_file if x.strip() and not x.strip().startswith('#')]

# NOTE(review): this rebinds `analysis_unit` after `regions` has been read;
# no later use is visible in this part of the file - confirm it is still needed.
analysis_unit = file_target(f"{cwd:a}/feature_score/posterior_contrast_files")



In [None]:
# List the contrast result files found directly under `cwd` and write the
# listing to `{cwd}/feature_score/posterior_contrast_files`.
# The section header and input line below are commented out, so this cell is
# presumably not part of the active workflow - confirm before relying on it.
# NOTE(review): the glob matches names ending in "posterior_contrast", while the
# R steps in this file strip a "_posterior_contrast.rds" suffix - verify the
# pattern still matches the files that are actually produced.
#[contrast_results]
#input: group_by = "all"
output: f"{cwd}/feature_score/posterior_contrast_files"
task: trunk_workers = 1, walltime = '2h', trunk_size = 1, mem = '16G', cores = 1, tags = f'{_output:bn}'
bash: expand ='${ }', stderr = f"{_output:n}.stderr", stdout = f"{_output:n}.stdout"
     ls ${cwd}/*posterior_contrast > ${_output:r}


In [2]:
# Compute feature scores from the "deviation" contrast results (no LD pruning).
# For every contrast file in the chunk, each `mean_contrast*deviation` column is
# meta-analysed across SNPs (absolute effect sizes, matching SEs) and the
# unsigned Z-score b / se is stored. One table per gene is written next to the
# chunk output, and all genes of the chunk are merged on `condition`.
# The section header is commented out: this variant is kept for reference only.
#[feature_score_meta_1]
# Number of contrast files processed per substep
parameter: per_chunk = '100'
# Fraction of SNP rows to keep (values below 1 trigger random downsampling)
parameter: downsample_ratio = '1'
parameter: meta_method = 'REML'
parameter: contrast_input = [path(x[0]) for x in regions]
input: contrast_input, group_by = per_chunk
output: f"{cwd}/feature_score_meta/cache/mash_posterior_contrast_featurescore{_index+1}.rds"
task: trunk_workers = 1, trunk_size = job_size, walltime = '24h',  mem = '20G', tags = f'{_output:bn}'

R: expand = "${ }",stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
    suppressMessages(library(data.table))
    suppressMessages(library(tidyverse))
    suppressMessages(library(metafor))
    out <- NULL
    set.seed(999)
    for (res in c(${_input:r,})) {
        # Gene identifier = file name without the fixed prefix and suffix
        gene <- basename(res) %>%
            stringr::str_split(., "norminal.cis_long_table.", simplify = TRUE) %>%
            .[, 2] %>%
            gsub("_posterior_contrast.rds", "", .)

        tmp_contrast_results <- readRDS(res)
        if (${downsample_ratio} < 1) {
            # Random downsampling of SNP rows (reproducible through set.seed above)
            sample_size <- round(nrow(tmp_contrast_results) * ${downsample_ratio})
            sample_rows <- sample(nrow(tmp_contrast_results), size = sample_size)
            tmp_contrast_results <- tmp_contrast_results[sample_rows, ]
        }

        effect_sizes <- tmp_contrast_results %>%
            select(matches("mean_contrast.*deviation")) %>%
            as.matrix()
        if (ncol(effect_sizes) == 0) {
            # No deviation contrasts in this file: skip it entirely. Previously the
            # merge below still ran and re-used the scores of the preceding gene.
            next
        }
        se_values <- tmp_contrast_results %>%
            select(matches("se_contrast.*deviation")) %>%
            as.matrix()

        # One meta-analysis per condition (column); the score is the unsigned Z.
        # Assumes the se columns are in the same order as the mean columns.
        z_by_condition <- vapply(seq_len(ncol(effect_sizes)), function(i) {
            meta_result <- rma(
                yi = abs(as.numeric(effect_sizes[, i])),
                sei = as.numeric(se_values[, i]),
                method = "${meta_method}"
            )
            as.numeric(meta_result$b / meta_result$se)
        }, numeric(1))

        condition_name <- colnames(effect_sizes) %>%
            gsub("mean_contrast_", "", .) %>%
            gsub("_deviation", "", .)
        feature_scores <- data.table(z_by_condition)
        setnames(feature_scores, gene)
        feature_scores[, condition := condition_name]
        saveRDS(feature_scores, str_c("${cwd}","/feature_score_meta/cache/",gsub(".rds","_featurescore.rds",basename(res))))

        # Accumulate one column per gene, keyed by condition
        if (is.null(out)) {
            out <- feature_scores
        } else {
            out <- merge(out, feature_scores, by = "condition", all = TRUE)
        }
    }
    saveRDS(out, "${_output}")

## feature_score_meta

In [2]:
# Compute feature scores from the "deviation" contrast results, optionally on
# an LD-pruned SNP set. For each contrast file: (1) when LD_prune is "TRUE" the
# SNP names are taken from the matching posterior file, PLINK --indep-pairwise
# selects the SNPs to keep, and the contrast table is restricted to them;
# (2) each `mean_contrast*deviation` column is meta-analysed across SNPs and the
# unsigned Z-score b / se is stored. A file that fails is reported and skipped.
[feature_score_meta_1]
parameter: plink_path = "/mnt/mfs/cluster/bin/plink1.9.10/plink"
# Prefix of the per-chromosome PLINK bfiles; the chromosome is appended
parameter: bfile_path = "/mnt/vast/hpc/csg/molecular_phenotype_calling/WashU_genotype/genotype_qc/MAP_Brain-xQTL_Gwas_geno_0.1_maf_0.0005.filtered."
# NOTE: not referenced by the script below (a per-gene SNP list under LDcache
# is used instead); kept so existing command lines continue to work
parameter: extract = "/home/rf2872/Work/Multivariate/MASH/From_SuSiE/2023.5_new/add_pQTL/LDcache/snp.list"
# Arguments of PLINK --indep-pairwise
parameter: window_size = '100'
parameter: step_size = '10'
parameter: r2_threshold = '0.2'
# Number of contrast files processed per substep
parameter: per_chunk = '100'
parameter: LD_prune = "TRUE"
# NOTE: not referenced by the script below
parameter: downsample_ratio = '1'
parameter: meta_method = 'REML'
# Directory (with trailing slash) for per-gene SNP lists and PLINK output
parameter: LDcache = "/home/rf2872/Work/Multivariate/MASH/From_SuSiE/2023.5_new/add_pQTL/LDcache/"
parameter: contrast_input = [path(x[0]) for x in regions]
input: contrast_input, group_by = per_chunk
output: f"{cwd}/feature_score_metaLD/cache/mash_posterior_contrast_featurescore{_index+1}.rds"
task: trunk_workers = 1, trunk_size = job_size, walltime = '24h',  mem = '20G', tags = f'{_output:bn}'

R: expand = "${ }",stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
    suppressMessages(library(data.table))
    suppressMessages(library(tidyverse))
    suppressMessages(library(metafor))
    plink_path <- "${plink_path}"
    bfile_path <- "${bfile_path}"
    indep_pairwise <- paste("${window_size}","${step_size}","${r2_threshold}")
    out <- NULL
    set.seed(999)
    for (res in c(${_input:r,})) {
        # Best effort per file: report the failing file and continue with the next
        tryCatch({
            # Chromosome = first dot-delimited field of the file name, "chr" removed
            chr <- res %>%
                basename() %>%
                str_extract(., paste0("\\.(.*?)\\.")) %>%
                str_replace_all("\\.", "")%>%
                str_replace_all("chr", "")
            gene <- basename(res) %>%
                stringr::str_split(., "norminal.cis_long_table.", simplify = TRUE) %>%
                .[, 2] %>%
                gsub("_posterior_contrast.rds", "", .)
            extract <- str_c("${LDcache}",gene,"_snp.list")
            output <- str_c("${LDcache}",gene,"_output")
            tmp_contrast_results <- readRDS(res)

            if ("${LD_prune}" == "TRUE") {
                tmp_contrast_results <- as.matrix(tmp_contrast_results)

                # The contrast table carries no SNP names; borrow them from the
                # posterior file (should become unnecessary with the new pipeline)
                posterior_mean <- res %>%
                    basename() %>%
                    gsub("_posterior_contrast", ".posterior", .) %>%
                    paste0("/mnt/vast/hpc/csg/rf2872/Work/Multivariate/MASH/From_SuSiE/2023.5_new/add_pQTL/MASH_6_celltypes_udr_miss/cache/", .) %>%
                    read_rds() %>%
                    .[["PosteriorMean"]]
                rownames(tmp_contrast_results) <- rownames(posterior_mean)

                # One SNP id per line, without header or row numbers
                write.table(rownames(tmp_contrast_results), extract,
                            quote = FALSE, row.names = FALSE, col.names = FALSE)

                # LD pruning with PLINK
                bfile <- str_c(bfile_path, chr)
                command <- paste(plink_path, "--bfile", bfile, "--extract", extract, "--indep-pairwise", indep_pairwise, "--out", output)
                plink_log <- system(command, intern = TRUE)
                plink_status <- attr(plink_log, "status")
                if (!is.null(plink_status) && plink_status != 0) {
                    stop("PLINK exited with status ", plink_status)
                }

                # Keep only the SNPs retained by PLINK. as.character() guards against
                # factor columns; drop = FALSE keeps a matrix when one SNP is left.
                ld_output <- read.table(str_c(output, ".prune.in"))
                tmp_contrast_results <- tmp_contrast_results[as.character(ld_output$V1), , drop = FALSE] %>%
                    as.data.table()
            }

            effect_sizes <- tmp_contrast_results %>%
                select(matches("mean_contrast.*deviation")) %>%
                as.matrix()
            # Files without deviation contrasts contribute nothing. Previously the
            # merge still ran and re-used the scores of the preceding gene.
            if (ncol(effect_sizes) > 0) {
                se_values <- tmp_contrast_results %>%
                    select(matches("se_contrast.*deviation")) %>%
                    as.matrix()

                # One meta-analysis per condition (column); score = unsigned Z.
                # Assumes the se columns are in the same order as the mean columns.
                z_by_condition <- vapply(seq_len(ncol(effect_sizes)), function(i) {
                    meta_result <- rma(
                        yi = abs(as.numeric(effect_sizes[, i])),
                        sei = as.numeric(se_values[, i]),
                        method = "${meta_method}"
                    )
                    as.numeric(meta_result$b / meta_result$se)
                }, numeric(1))

                condition_name <- colnames(effect_sizes) %>%
                    gsub("mean_contrast_", "", .) %>%
                    gsub("_deviation", "", .)
                feature_scores <- data.table(z_by_condition)
                setnames(feature_scores, gene)
                feature_scores[, condition := condition_name]
                saveRDS(feature_scores, str_c("${_output:d}","/",gsub(".rds","_featurescore.rds",basename(res))))

                # Accumulate one column per gene, keyed by condition
                if (is.null(out)) {
                    out <- feature_scores
                } else {
                    out <- merge(out, feature_scores, by = "condition", all = TRUE)
                }
            }
        }, error = function(e) {
            message("Skipping ", res, ": ", conditionMessage(e))
        })
    }
    saveRDS(out, "${_output}")

In [1]:
# Merge the per-chunk feature score tables produced by feature_score_meta_1.
# Chunks are joined on `condition` (one column per gene); the result is
# transposed so that each gene becomes a row, then written as CSV.
[feature_score_meta_2]
input: group_by = "all"
output: f"{cwd}/feature_score_metaLD/posterior_feature_score_sum.csv"
task: trunk_workers = 1, trunk_size = job_size, walltime = '24h',  mem = '10G', tags = f'{_output:bn}'

R: expand = "${ }",stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
    library(dplyr)
    library(tidyverse)
    out <- NULL

    # Quoted, comma-separated paths: safe for file names containing spaces
    for (chunk_file in c(${_input:r,})) {
        feature_scores <- readRDS(chunk_file)
        # A chunk in which no file had usable contrasts is stored as NULL
        if (is.null(feature_scores)) {
            next
        }
        if (is.null(out)) {
            out <- feature_scores
        } else {
            out <- merge(out, feature_scores, by = "condition", all = TRUE)
        }
    }
    if (is.null(out)) {
        stop("None of the input chunks contains feature scores", call. = FALSE)
    }
    out <- t(out)
    # write.csv always writes the header row; the former col.names argument was
    # ignored with a warning, so the file content is unchanged
    write.csv(out, "${_output}")

In [2]:
# compute feature score from contrast results
# Earlier two-stage variant working on one contrast file per substep. The
# section header below is commented out, so this cell is presumably not part
# of the active workflow.
# The saved object is a list with elements avg_diff, avg_snp_diff and
# feature_scores, or NULL when the file has no deviation contrast columns.
#[feature_score_1]
#regions = [x.replace("\"","").strip().split() for x in open(analysis_unit).readlines() if x.strip() and not x.strip().startswith('#')]
input: regions, group_by = 1
output: f"{cwd}/feature_score/{_input:bn}_posterior_contrast_featurescore.rds"
task: trunk_workers = 1, trunk_size = job_size, walltime = '24h',  mem = '20G', tags = f'{_output:bn}'  

R: expand = "${ }",stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout' 
    suppressMessages(library(data.table))
    suppressMessages(library(tidyverse))
    suppressMessages(library(metafor))

    # Gene identifier = input file name without the fixed prefix and suffix
    gene <- basename("${_input}") %>%
        stringr::str_split(., "norminal.cis_long_table.", simplify = TRUE) %>%
        .[, 2] %>%
        gsub("_posterior_contrast.rds", "", .)

    tmp_contrast_results <- readRDS("${_input}")
    if (tmp_contrast_results %>% select(matches("mean_contrast.*deviation")) %>% ncol() > 0) {
        feature_out<-list()
        effect_sizes <- tmp_contrast_results %>%
            select(matches("mean_contrast.*deviation")) %>%
            as.matrix()
        se_values <- tmp_contrast_results %>%
            select(matches("se_contrast.*deviation")) %>%
            as.matrix()

        # Initialize an empty data table to store the feature scores for each condition
        feature_scores <- data.table()
        avg_snp_diff <- data.table()
        avg_diff <- data.table()

        # Iterate over each condition
        # (assumes the se columns are in the same order as the mean columns)
        for (i in 1:ncol(effect_sizes)) {
            # Subset the effect sizes and standard errors for the current condition
            effect_sizes_condition <- effect_sizes[, i]
            se_values_condition <- se_values[, i]

            # Step 1: Meta-analysis of pairwise cell type differences within each SNP for a feature

            # Calculate absolute values of effect sizes
            absolute_effect_sizes <- abs(as.numeric(effect_sizes_condition))

            # Calculate standard errors (SE) of pairwise cell type differences
            pairwise_standard_errors <- as.numeric(se_values_condition)

            # Perform meta-analysis
            meta_result <- rma(yi = absolute_effect_sizes, sei = pairwise_standard_errors, method = "${meta_method}")

            # Get meta-analysis estimate of the average cell type difference
            average_difference <- meta_result$b
            avg_diff <- rbindlist(list(avg_diff, as.data.table(average_difference)))

            # Step 2: Second round of meta-analysis on SNP-level meta-analyzed effect size and standard error across all the SNPs in the gene

            # Calculate effect size and standard error across all SNPs in the gene
            # NOTE(review): this uses the signed mean of the effects and the root of
            # the summed squared SEs, unlike step 1 which uses absolute values -
            # confirm this is the intended summary.
            gene_effect_size <- mean(as.numeric(effect_sizes_condition))
            gene_standard_error <- sqrt(sum(as.numeric(se_values_condition)^2))

            # Perform meta-analysis
            # NOTE(review): rma receives a single estimate here (k = 1), so no
            # between-study variance can be estimated - confirm this is intended.
            meta_result_gene <- rma(yi = gene_effect_size, sei = gene_standard_error, method = "${meta_method}")

            # Get meta-analysis estimate of the average cell type difference in the average SNP
            average_snp_difference <- meta_result_gene$b
            avg_snp_diff <- rbindlist(list(avg_snp_diff, as.data.table(average_snp_difference)))

            # Calculate z-scores for each feature
            z_scores <- (average_difference - average_snp_difference) / meta_result_gene$se

            # Create a data table to store the feature scores for the current condition
            # NOTE(review): Feature is filled from rownames(effect_sizes); if row names
            # are present the single z-score is repeated once per SNP - verify.
            feature_scores_condition <- data.table(Feature = rownames(effect_sizes), ZScore = z_scores)

            # Append the feature scores for the current condition to the overall feature_scores data table
            feature_scores <- rbindlist(list(feature_scores, feature_scores_condition))
        }
        # NOTE(review): a single name is assigned to all three tables; this only
        # fits feature_scores when it ended up with exactly one column - verify.
        colnames(avg_diff)<-colnames(avg_snp_diff)<-colnames(feature_scores)<-gene
        condition_name<-colnames(effect_sizes)%>%gsub("mean_contrast_","",.)%>%gsub("_deviation","",.)
        avg_diff<-avg_diff[,condition := condition_name]
        avg_snp_diff<-avg_snp_diff[,condition := condition_name]
        feature_scores<-feature_scores[,condition := condition_name]
        feature_out$avg_diff<-avg_diff
        feature_out$avg_snp_diff<-avg_snp_diff
        feature_out$feature_scores<-feature_scores
    } else {
        # No deviation contrasts in this file: store NULL so the merge step skips it
        feature_out <- NULL
    }
    
    saveRDS(feature_out,"${_output}")

In [1]:
# Merge the per-gene results of feature_score_1. Each input holds a list with
# avg_diff, avg_snp_diff and feature_scores (or NULL); the three tables are
# merged across genes on `condition` and written together as one CSV.
# The section header is commented out: this variant is kept for reference only.
#[feature_score_2]
input: group_by = "all"
output: f"{cwd}/feature_score/posterior_feature_score_sum.csv"
task: trunk_workers = 1, trunk_size = job_size, walltime = '24h',  mem = '10G', tags = f'{_output:bn}'

R: expand = "${ }",stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
    library(dplyr)
    library(tidyverse)
    out <- list()

    # Quoted, comma-separated paths: safe for file names containing spaces
    for (score_file in c(${_input:r,})) {
        feature_scores <- readRDS(score_file)
        # Files without deviation contrasts were saved as NULL
        if (is.null(feature_scores)) {
            next
        }
        if (is.null(out$zscore)) {
            out$avg_diff <- feature_scores$avg_diff
            out$avg_snp_diff <- feature_scores$avg_snp_diff
            out$zscore <- feature_scores$feature_scores
        } else {
            # Each accumulator is merged with its own counterpart; previously all
            # three were merged with the z-score table
            out$avg_diff <- merge(out$avg_diff, feature_scores$avg_diff, by = "condition", all = TRUE)
            out$avg_snp_diff <- merge(out$avg_snp_diff, feature_scores$avg_snp_diff, by = "condition", all = TRUE)
            out$zscore <- merge(out$zscore, feature_scores$feature_scores, by = "condition", all = TRUE)
        }
    }
    write.csv(out, "${_output}")

In [None]:
# Compute feature scores from the pairwise ("_vs_") contrast results using the
# nested meta-analysis: for every cell type, step 1 meta-analyses the absolute
# pairwise differences involving that cell type within each SNP; step 2
# meta-analyses the SNP-level estimates across SNPs. The saved object is a
# one-column data frame (rows = cell types, column = gene) holding b / se of
# step 2, or NULL when there is nothing to analyse.
# The section header is commented out: this variant is kept for reference only.
#[feature_score_pw_1]
parameter: job_size = 100
# Keep only SNPs with at least one pairwise p-value below this threshold;
# the default of 1 disables the filter
parameter: significance_threshold = 1
input: regions
output: f"{cwd}/feature_score_{meta_method}/cache/{_input:bn}_posterior_contrast_featurescore.rds"
task: trunk_workers = 1, trunk_size = job_size, walltime = '24h',  mem = '60G', tags = f'{_output:bn}'

R: expand = "${ }",stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
    suppressMessages(library(data.table))
    suppressMessages(library(tidyverse))
    suppressMessages(library(metafor))

    gene <- basename("${_input}") %>%
        stringr::str_split(., "norminal.cis_long_table.", simplify = TRUE) %>%
        .[, 2] %>%
        gsub("_posterior_contrast.rds", "", .)

    tmp_contrast_results <- readRDS("${_input}")
    if (${significance_threshold} < 1) {
        # Reduce the computation to SNPs significant in at least one contrast.
        # Only rows are filtered: all columns must survive, because the mean and
        # se columns are needed below (the former code kept the p columns only).
        p_values <- tmp_contrast_results %>%
            select(matches("p_contrast_.*_vs_*")) %>%
            as.matrix()
        keep_rows <- rowSums(p_values < ${significance_threshold}, na.rm = TRUE) > 0
        tmp_contrast_results <- tmp_contrast_results[keep_rows, ]
    }

    effect_sizes <- tmp_contrast_results %>%
        select(matches("mean_contrast.*_vs_*")) %>%
        as.matrix()
    se_values <- tmp_contrast_results %>%
        select(matches("se_contrast.*_vs_*")) %>%
        as.matrix()

    if (ncol(effect_sizes) > 0 && nrow(effect_sizes) > 0) {
        conditions <- colnames(effect_sizes) %>%
            sub("mean_contrast_", "", .) %>%
            unique()
        # Every cell type that appears on either side of a "_vs_" contrast
        cells <- c(sub("_vs_.*", "", conditions), sub(".*_vs_", "", conditions)) %>% unique()

        z_by_cell <- vapply(cells, function(cell) {
            print(cell)
            # NOTE(review): grep matches the cell name as a pattern anywhere in the
            # column name, so a name contained in another name would also match
            cell.b <- effect_sizes[, grep(cell, colnames(effect_sizes)), drop = FALSE]
            cell.se <- se_values[, grep(cell, colnames(se_values)), drop = FALSE]

            # Step 1: one meta-analysis per SNP across the contrasts of this cell type
            snp_level <- lapply(seq_len(nrow(cell.b)), function(i) {
                meta_result <- rma(
                    yi = abs(as.numeric(cell.b[i, ])),
                    sei = as.numeric(cell.se[i, ]),
                    method = "${meta_method}"
                )
                c(b = as.numeric(meta_result$b), se = as.numeric(meta_result$se))
            })
            snp_level <- do.call(rbind, snp_level)

            # Step 2: meta-analysis of the SNP-level estimates across SNPs
            meta_result_con <- rma(yi = snp_level[, "b"], sei = snp_level[, "se"], method = "${meta_method}")
            as.numeric(meta_result_con$b / meta_result_con$se)
        }, numeric(1))

        df <- data.frame(z_by_cell, row.names = cells)
        colnames(df) <- gene
    } else {
        # No pairwise contrasts, or no SNP left after filtering
        df <- NULL
    }

    saveRDS(df, "${_output}")


In [1]:
# Merge the per-gene pairwise feature scores of feature_score_pw_1 into one
# table with one row per feature (gene) and one column per cell type.
# The section header is commented out: this variant is kept for reference only.
#[feature_score_pw_2]
input: group_by = "all"
output: f"{cwd}/feature_score_{meta_method}/posterior_feature_score_sum.csv"
task: trunk_workers = 1, trunk_size = job_size, walltime = '24h',  mem = '10G', tags = f'{_output:bn}'

R: expand = "${ }",stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
    library(dplyr)
    library(tidyverse)
    # Needed for as.data.table / rbindlist, which were called without being loaded
    library(data.table)

    # Each input is a one-column data frame (rows = cell types, column = gene),
    # or NULL for genes without pairwise contrasts. Transposing gives one row per
    # gene; the gene name is kept in the `feature` column.
    per_feature <- lapply(c(${_input:r,}), function(score_file) {
        feature_scores <- readRDS(score_file)
        if (is.null(feature_scores)) {
            return(NULL)
        }
        as.data.table(t(feature_scores), keep.rownames = "feature")
    })

    # Stack all genes (NULL entries are skipped). Previously the accumulator was
    # overwritten on every iteration, so only the last gene reached the CSV.
    # fill = TRUE tolerates genes that lack some cell types.
    out <- rbindlist(per_feature, use.names = TRUE, fill = TRUE)
    write.csv(out, "${_output}")

## feature_pval_pair

In [None]:
# Compute, for every pairwise ("_vs_") contrast of every gene in the chunk, the
# p-value of a meta-analysis across the LD-pruned SNPs (absolute effect sizes,
# matching standard errors). The pruned SNP lists are read from the
# "<gene>_output.prune.in" files under LDcache; this step does not run PLINK.
[feature_pval_pair_1]
# NOTE: the PLINK-related parameters below are not referenced by the script;
# they are kept so existing command lines continue to work
parameter: plink_path = "/mnt/mfs/cluster/bin/plink1.9.10/plink"
parameter: bfile_path = "/mnt/vast/hpc/csg/molecular_phenotype_calling/WashU_genotype/genotype_qc/MAP_Brain-xQTL_Gwas_geno_0.1_maf_0.0005.filtered."
parameter: extract = "/home/rf2872/Work/Multivariate/MASH/From_SuSiE/2023.5_new/add_pQTL/LDcache/snp.list"
parameter: window_size = '100'
parameter: step_size = '10'
parameter: r2_threshold = '0.2'
# Number of contrast files processed per substep
parameter: per_chunk = '100'
parameter: LD_prune = "TRUE"
parameter: downsample_ratio = '1'
parameter: meta_method = 'REML'
# SNPs whose standard error is not above this value are dropped before the meta-analysis
parameter: se_cutoff = '1E-03'
# Directory (with trailing slash) holding the per-gene PLINK pruning output
parameter: LDcache = "/home/rf2872/Work/Multivariate/MASH/From_SuSiE/2023.5_new/add_pQTL/LDcache/"
parameter: contrast_input = [path(x[0]) for x in regions]
input: contrast_input, group_by = per_chunk
output: f"{cwd}/feature_pval_pair/cache/mash_posterior_contrast_featurescore{_index+1}.rds"
task: trunk_workers = 1, trunk_size = job_size, walltime = '24h',  mem = mem, tags = f'{_output:bn}'


R: expand = "${ }",stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
    suppressMessages(library(data.table))
    suppressMessages(library(tidyverse))
    suppressMessages(library(metafor))
    out <- NULL
    set.seed(999)
    se_cutoff <- as.numeric("${se_cutoff}")
    for (res in c(${_input:r,})) {
        gene <- basename(res) %>%
            stringr::str_split(., "norminal.cis_long_table.", simplify = TRUE) %>%
            .[, 2] %>%
            gsub("_posterior_contrast.rds", "", .)
        print(gene)
        tmp_contrast_results <- readRDS(res)

        # The contrast table carries no SNP names; borrow them from the posterior
        # file (should become unnecessary with the new pipeline)
        posterior_mean <- res %>%
            basename() %>%
            gsub("_posterior_contrast", ".posterior", .) %>%
            paste0("/mnt/vast/hpc/csg/rf2872/Work/Multivariate/MASH/From_SuSiE/2023.5_new/add_pQTL/MASH_6_celltypes_udr_miss/cache/", .) %>%
            read_rds() %>%
            .[["PosteriorMean"]]
        rownames(tmp_contrast_results) <- rownames(posterior_mean)

        # Restrict to the SNPs kept by the earlier PLINK pruning;
        # as.character() guards against the id column being read as a factor
        ld_output <- read.table(str_c("${LDcache}",gene,"_output.prune.in"))
        tmp_contrast_results <- tmp_contrast_results[as.character(ld_output$V1), ] %>%
            as.data.table(keep.rownames = TRUE)

        effect_sizes <- tmp_contrast_results %>%
            select(matches("mean_contrast.*_vs_*")) %>%
            as.matrix()
        se_values <- tmp_contrast_results %>%
            select(matches("se_contrast.*_vs_*")) %>%
            as.matrix()

        conditions <- colnames(effect_sizes) %>%
            sub("mean_contrast_", "", .) %>%
            unique()
        # Every cell type that appears on either side of a "_vs_" contrast
        cells <- c(sub("_vs_.*", "", conditions), sub(".*_vs_", "", conditions)) %>% unique()

        # One block of rows per cell type, one row per contrast involving it
        df <- data.table()
        for (cell in cells) {
            # NOTE(review): grep matches the cell name as a pattern anywhere in the
            # column name, so a name contained in another name would also match
            cell.b <- effect_sizes[, grep(cell, colnames(effect_sizes)), drop = FALSE]
            cell.se <- se_values[, grep(cell, colnames(se_values)), drop = FALSE]

            # Assumes the se columns are in the same order as the mean columns
            pvals <- vapply(seq_len(ncol(cell.b)), function(i) {
                # Standard errors come from the se columns (they were previously
                # copied from the effect sizes by mistake)
                se_i <- as.numeric(cell.se[, i])
                keep <- which(se_i > se_cutoff)
                if (length(keep) == 0) {
                    # Nothing left to meta-analyse for this contrast
                    return(NA_real_)
                }
                meta_result <- rma(
                    yi = abs(as.numeric(cell.b[keep, i])),
                    sei = se_i[keep],
                    method = "${meta_method}"
                )
                as.numeric(meta_result$pval)
            }, numeric(1))

            feature_pvals <- data.table(pvals)
            setnames(feature_pvals, gene)
            feature_pvals[, condition := gsub("mean_contrast_", "", colnames(cell.b))]
            df <- rbindlist(list(df, feature_pvals))
        }
        if (nrow(df) == 0) {
            # No pairwise contrasts in this file
            next
        }
        # Per-gene table, written once after all cell types have been processed
        saveRDS(df, str_c("${_output:d}","/",gsub(".rds","_featurescore_pw.rds",basename(res))))

        # Every contrast is listed under both of its cell types. Merging on a
        # duplicated key multiplies the rows with each additional gene, so the
        # chunk-level table keeps one row per contrast.
        df_unique <- unique(df, by = "condition")
        if (is.null(out)) {
            out <- df_unique
        } else {
            out <- merge(out, df_unique, by = "condition", all = TRUE)
        }
    }
    saveRDS(out, "${_output}")

In [1]:
# Merge the per-chunk p-value tables produced by feature_pval_pair_1.
# Chunks are joined on `condition` (one column per gene); the result is
# transposed so that each gene becomes a row, then written as CSV.
[feature_pval_pair_2]
input: group_by = "all"
output: f"{cwd}/feature_pval_pair/posterior_feature_score_sum.csv"
task: trunk_workers = 1, trunk_size = job_size, walltime = '24h',  mem = '10G', tags = f'{_output:bn}'

R: expand = "${ }",stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
    library(dplyr)
    library(tidyverse)
    out <- NULL

    # Quoted, comma-separated paths: safe for file names containing spaces
    for (chunk_file in c(${_input:r,})) {
        feature_scores <- readRDS(chunk_file)
        # A chunk without any usable file is stored as NULL
        if (is.null(feature_scores)) {
            next
        }
        if (is.null(out)) {
            out <- feature_scores
        } else {
            out <- merge(out, feature_scores, by = "condition", all = TRUE)
        }
    }
    if (is.null(out)) {
        stop("None of the input chunks contains feature scores", call. = FALSE)
    }
    out <- t(out)
    # write.csv always writes the header row; the former col.names argument was
    # ignored with a warning, so the file content is unchanged
    write.csv(out, "${_output}")

## feature_score_finemap

In [None]:
# compute feature score from contrast results with eQTL and pQTL finemapped signals
[feature_score_finemap_1]
#regions = [x.replace("\"","").strip().split() for x in open(analysis_unit).readlines() if x.strip() and not x.strip().startswith('#')]
parameter: per_chunk = '100'
parameter: pfine_path = "/mnt/vast/hpc/csg/molecular_phenotype_calling/QTL_fine_mapping/pqtl.all_variants.tsv"
parameter: efine_path = "/mnt/vast/hpc/csg/hs3393/susie_eqtl/Result_new/tsv/"
parameter: efine_suffix = ".unisusie.fit.variant.tsv"
parameter: gene_ref_path = "/mnt/vast/hpc/csg/molecular_phenotype_calling/reference_data/Homo_sapiens.GRCh38.103.chr.reformatted.collapse_only.gene.region_list"
# Directory with the "<name>.posterior.rds" files; the row names of their PosteriorMean label the contrast rows
parameter: posterior_dir = "/mnt/vast/hpc/csg/rf2872/Work/Multivariate/MASH/From_SuSiE/2023.5_new/add_pQTL/MASH_6_celltypes_udr_miss/cache/"
parameter: contrast_input = [path(x[0]) for x in regions]
input: contrast_input, group_by = per_chunk
output: f"{cwd}/feature_score_finemap/cache/mash_posterior_contrast_featurescore{_index+1}.rds"
task: trunk_workers = 1, trunk_size = job_size, walltime = '24h',  mem = '40G', tags = f'{_output:bn}'


R: expand = "${ }",stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
        suppressMessages(library(data.table))
        suppressMessages(library(tidyverse))

        # Score one feature in one condition from its fine-mapped credible sets (CS).
        #   fine.file        fine-mapping table; columns cs_order, pip and variants are read
        #   contrast_results matrix of contrast statistics whose row names are variant ids
        #   con              condition name used in the "<stat>_contrast_<con>_deviation" columns
        # Returns |mean| / se of the most significant CS lead variant, or NA when no lead
        # variant is present in contrast_results or no usable statistic columns exist.
        score_from_cs <- function(fine.file, contrast_results, con) {
            CSs <- unique(fine.file$cs_order) %>% .[. != 0]
            max.cs.df <- NULL
            # there can be more than one CS per file: keep the top-PIP variant(s) of each
            for (cs in CSs) {
                tmp <- fine.file[fine.file$cs_order == cs, ]
                max.cs <- tmp[tmp$pip == max(tmp$pip), ]
                max.cs.df <- rbind(max.cs.df, max.cs)
            }
            # some CS lead variants are absent from the contrast table (cis vs tad windows),
            # so only the overlap can be scored
            snps <- intersect(rownames(contrast_results), max.cs.df$variants)
            if (length(snps) == 0) {
                return(NA_real_)
            }

            stat_names <- colnames(contrast_results)
            deviation_cols <- str_c(c("p", "mean", "se"), "_contrast_", con, "_deviation")
            if (deviation_cols[1] %in% stat_names) {
                cols <- deviation_cols
            } else {
                # with only 2 groups there is no deviation result, just one pairwise contrast;
                # ignore.case = TRUE keeps the matching rule of tidyselect::matches()
                p_vs <- grep("p_contrast.*_vs_*", stat_names, ignore.case = TRUE, value = TRUE)
                mean_vs <- grep("mean_contrast.*_vs_*", stat_names, ignore.case = TRUE, value = TRUE)
                se_vs <- grep("se_contrast.*_vs_*", stat_names, ignore.case = TRUE, value = TRUE)
                if (length(p_vs) != 1 || length(mean_vs) == 0 || length(se_vs) == 0) {
                    # previously this path left `score` undefined and the function crashed
                    warning("score_from_cs: no usable contrast columns for condition ", con, call. = FALSE)
                    return(NA_real_)
                }
                cols <- c(p_vs, mean_vs[1], se_vs[1])
            }

            p <- as.numeric(contrast_results[snps, cols[1]])
            z <- abs(as.numeric(contrast_results[snps, cols[2]])) / as.numeric(contrast_results[snps, cols[3]])
            if (all(is.na(p))) {
                return(NA_real_)
            }
            # most significant = smallest p value (the earlier code took the largest);
            # ties are all kept and the largest z among them is reported
            z_top <- z[which(p == min(p, na.rm = TRUE))]
            z_top <- z_top[!is.na(z_top)]
            if (length(z_top) == 0) {
                return(NA_real_)
            }
            max(z_top)
        }


        # read fine mapped pQTL file
        pfine.mapped.result <- fread("${pfine_path}")
        pfine.mapped.result$Gene <- stringr::str_split(pfine.mapped.result$molecular_trait_id, "_", simplify = TRUE)[, 2]
        gene.ref <- read.table("${gene_ref_path}")

        out <- data.table()
        for (res in c(${_input:r,})) {
            df <- data.frame()

            tmp_contrast_results <- readRDS(res) %>% as.matrix()

            # add the rownames for contrast result, should be deleted with new pipeline.
            # Only the row names of PosteriorMean are needed; the original-data and
            # PosteriorCov objects that used to be loaded here were never used.
            posterior_data <- res %>%
                basename() %>%
                gsub("_posterior_contrast", ".posterior", .) %>%
                file.path("${posterior_dir}", .) %>%
                read_rds()
            rownames(tmp_contrast_results) <- rownames(posterior_data$PosteriorMean)

            gene <- basename(res) %>%
                stringr::str_split(., "norminal.cis_long_table.", simplify = TRUE) %>%
                .[, 2] %>%
                gsub("_posterior_contrast.rds", "", .)

            ensemble <- unique(gene.ref[gene.ref$V5 == gene, ]$V4)

            # Without exactly one id the pattern would be just the suffix (matching every
            # file in the directory) or an invalid multi-element pattern, so skip eQTL then.
            if (length(ensemble) == 1) {
                efine.mapped.result <- list.files(path = "${efine_path}", pattern = paste0(ensemble, "${efine_suffix}"), full.names = TRUE)
            } else {
                efine.mapped.result <- character(0)
            }

            pfine.file <- pfine.mapped.result[pfine.mapped.result$Gene == gene & pfine.mapped.result$cs_order > 0, ]

            if (nrow(pfine.file) > 0) {
                message("Extracting signal from pQTL in ", gene)

                df[gene, "DLPFC_pQTL"] <- score_from_cs(fine.file = pfine.file, contrast_results = tmp_contrast_results, con = "DLPFC_pQTL")
            }


            if (length(efine.mapped.result) > 0) {
                print(gene)
                # condition name of every file, in the same order as efine.mapped.result
                fine.conditions <- basename(efine.mapped.result) %>%
                    sub("demo.", "", .) %>%
                    sub(paste0(".", ensemble, "${efine_suffix}"), "", .)

                # Files are addressed by position: looking the condition up again with
                # grep() could return several paths, which fread() cannot read.
                for (k in which(!fine.conditions %in% c("ALL", "End"))) {
                    con <- fine.conditions[k]
                    con.file <- fread(efine.mapped.result[k])
                    con.fine.file <- con.file[con.file$cs_order > 0, ]
                    if (nrow(con.fine.file) > 0) {
                        df[gene, con] <- score_from_cs(fine.file = con.fine.file, contrast_results = tmp_contrast_results, con = con)
                    }
                }
            }
            saveRDS(df, str_c("${cwd}","/feature_score_finemap/cache/",gsub(".rds","_featurescore.rds",basename(res))))
            out <- rbindlist(list(out, as.data.table(df, keep.rownames = TRUE)), use.names = TRUE, fill = TRUE)
        }
        saveRDS(out, "${_output}")

In [1]:
# merge the feature score from contrast results with eQTL and pQTL finemapped signals
[feature_score_finemap_2]
input: group_by = "all"
output: f"{cwd}/feature_score_finemap/posterior_feature_score_sum.csv"
task: trunk_workers = 1, trunk_size = job_size, walltime = '24h',  mem = '10G', tags = f'{_output:bn}'

R: expand = "${ }",stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
    suppressMessages(library(data.table))
    suppressMessages(library(tidyverse))

    # Per-chunk RDS files written by the previous step, passed as an R character
    # vector so that a path containing a space stays in one piece.
    chunk_files <- c(${_input:r,})

    # Load every chunk; a chunk whose saved object is NULL is dropped.
    chunk_tables <- lapply(chunk_files, function(chunk_file) {
        feature_scores <- readRDS(chunk_file)
        if (is.null(feature_scores)) {
            return(NULL)
        }
        as.data.table(feature_scores, keep.rownames = TRUE)
    })
    chunk_tables <- Filter(Negate(is.null), chunk_tables)

    # Stack all chunks at once; columns missing from a chunk are filled with NA.
    # (The earlier `is.null(out)` branch could never run because `out` started
    # as an empty data.table.)
    out <- rbindlist(chunk_tables, use.names = TRUE, fill = TRUE)
    write.csv(out, "${_output}")

## feature_score_nsig

In [12]:
# Launch the feature_score_nsig workflow in the background (nohup ... &) on the
# *posterior_contrast.rds files under MASH_6_celltypes_udr_miss/contrast_merge;
# stdout and stderr go to mash_contrast_score_nsig.log.
cd ~/Work/Multivariate/MASH/From_SuSiE/2023.5_new/ep_MiGA/
nohup sos run /home/rf2872/codes/xqtl-pipeline/pipeline/contrast_feature_score.ipynb feature_score_nsig    \
--analysis_unit   <(ls ~/Work/Multivariate/MASH/From_SuSiE/2023.5_new/ep_MiGA/MASH_6_celltypes_udr_miss/contrast_merge/*posterior_contrast.rds) \
--cwd MASH_6_celltypes_udr_miss/contrast_merge \
-s force -J 200 -q csg -c ~/test/csg.yml  &> mash_contrast_score_nsig.log &  

[1]+  Done                    nohup sos run /home/rf2872/codes/xqtl-pipeline/pipeline/contrast_feature_score.ipynb feature_score_nsig --analysis_unit <(ls ~/Work/Multivariate/MASH/From_SuSiE/2023.5_new/ep_MiGA/MASH_6_celltypes_udr_miss/contrast_merge/*posterior_contrast.rds) --cwd MASH_6_celltypes_udr_miss/contrast_merge -s force -J 200 -q csg -c ~/test/csg.yml &> mash_contrast_score_nsig.log
[1] 45008


In [13]:
# Same background launch of feature_score_nsig, here on the *posterior_contrast.rds
# files under MASH_6_celltypes_udr_miss/contrast_merge_non. This run uses `-s build`
# where the contrast_merge run uses `-s force`, and logs to
# mash_contrast_score_non_nsig.log.
cd ~/Work/Multivariate/MASH/From_SuSiE/2023.5_new/ep_MiGA/
nohup sos run /home/rf2872/codes/xqtl-pipeline/pipeline/contrast_feature_score.ipynb feature_score_nsig    \
--analysis_unit   <(ls ~/Work/Multivariate/MASH/From_SuSiE/2023.5_new/ep_MiGA/MASH_6_celltypes_udr_miss/contrast_merge_non/*posterior_contrast.rds) \
--cwd MASH_6_celltypes_udr_miss/contrast_merge_non \
-s build -J 200 -q csg -c ~/test/csg.yml  &> mash_contrast_score_non_nsig.log &  

[1]+  Done                    nohup sos run /home/rf2872/codes/xqtl-pipeline/pipeline/contrast_feature_score.ipynb feature_score_nsig --analysis_unit <(ls ~/Work/Multivariate/MASH/From_SuSiE/2023.5_new/ep_MiGA/MASH_6_celltypes_udr_miss/contrast_merge/*posterior_contrast.rds) --cwd MASH_6_celltypes_udr_miss/contrast_merge -s force -J 200 -q csg -c ~/test/csg.yml &> mash_contrast_score_nsig.log
[1] 48608


In [5]:
# compute feature score from contrast results
[feature_score_nsig_1]
#regions = [x.replace("\"","").strip().split() for x in open(analysis_unit).readlines() if x.strip() and not x.strip().startswith('#')]
parameter: per_chunk = '100'
# SNPs with a deviation-contrast p value below this cutoff are counted as significant
parameter: p_cut=0.00001
parameter: contrast_input = [path(x[0]) for x in regions]
input: contrast_input, group_by = per_chunk
output: f"{cwd}/feature_score_nsig/cache/mash_posterior_contrast_featurescore_nsig{_index+1}.rds"
task: trunk_workers = 1, trunk_size = job_size, walltime = '24h',  mem = '20G', tags = f'{_output:bn}'

R: expand = "${ }",stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
    suppressMessages(library(data.table))
    suppressMessages(library(tidyverse))
    suppressMessages(library(metafor))
    p_cut <- as.numeric(${p_cut})
    out <- NULL
    set.seed(999)
    for (res in c(${_input:r,})) {
        # Best effort per file: a file that fails is reported and skipped so the
        # remaining files of the chunk are still processed.
        tryCatch({
            gene <- basename(res) %>%
                stringr::str_split(., "norminal.cis_long_table.", simplify = TRUE) %>%
                .[, 2] %>%
                gsub("_posterior_contrast.rds", "", .)
            print(gene)

            # p values of the per-condition deviation contrasts, one column per condition;
            # as.data.frame() lets select() work whether the RDS holds a matrix or a data frame
            tmp_contrast_results <- readRDS(res) %>%
                as.data.frame() %>%
                select(matches("p_contrast_.*deviation")) %>%
                as.matrix()

            # One row per condition. n_snp counts the SNPs with a non-missing p value;
            # it used to be the column sum of the p values themselves, which made the
            # ratio below meaningless.
            feature.out <- data.table(
                n_sig_snp = colSums(tmp_contrast_results < p_cut, na.rm = TRUE),
                n_snp = colSums(!is.na(tmp_contrast_results)),
                condition = colnames(tmp_contrast_results) %>% gsub("p_contrast_", "", .) %>% gsub("_deviation", "", .)
            )
            # fraction of significant SNPs, stored in a column named after the gene
            feature.out[, (gene) := n_sig_snp / n_snp]
            saveRDS(feature.out, paste0("${_output:d}","/", gsub(".rds", "_n_sig_ratio.rds", basename(res))))

            # accumulate a condition-by-gene table across the files of this chunk
            gene_ratio <- feature.out %>%
                as.data.frame() %>%
                .[, c(gene, "condition")]
            if (is.null(out)) {
                out <- gene_ratio
            } else {
                out <- merge(out, gene_ratio, by = "condition", all = TRUE)
            }
        }, error = function(e) {
            warning("feature_score_nsig_1: skipping ", res, ": ", conditionMessage(e), call. = FALSE, immediate. = TRUE)
        })
    }
    saveRDS(out, "${_output}")

In [1]:
# merge the feature score from contrast results
[feature_score_nsig_2]
input: group_by = "all"
output: f"{cwd}/feature_score_nsig/posterior_feature_score_sum.csv"
task: trunk_workers = 1, trunk_size = job_size, walltime = '24h',  mem = '10G', tags = f'{_output:bn}'

R: expand = "${ }",stderr = f'{_output:n}.stderr', stdout = f'{_output:n}.stdout'
    library(dplyr)
    library(tidyverse)

    # Per-chunk RDS files written by the previous step. The paths are passed as
    # an R character vector so that a path containing a space stays in one piece.
    chunk_files <- c(${_input:r,})

    # Outer-join all chunk tables on their shared "condition" column.
    out <- NULL
    for (chunk_file in chunk_files) {
        feature_scores <- readRDS(chunk_file)
        # a chunk in which every file failed was saved as NULL; skip it
        if (is.null(feature_scores)) {
            next
        }
        if (is.null(out)) {
            out <- feature_scores
        } else {
            out <- merge(out, feature_scores, by = "condition", all = TRUE)
        }
    }
    if (is.null(out)) {
        stop("feature_score_nsig_2: every input chunk was NULL, nothing to merge", call. = FALSE)
    }

    # Transpose so that every column of the merged table becomes one row of the csv.
    out <- t(out)
    # write.csv() always writes a header and ignores `col.names` (with a warning),
    # so the argument is not passed; the file content is unchanged.
    write.csv(out, "${_output}")