Skip to content

Commit

Permalink
Merge pull request #56 from mumichae/export_counts
Browse files Browse the repository at this point in the history
Export counts
  • Loading branch information
mumichae committed Jun 1, 2020
2 parents f15dd9c + e380c1e commit b08cc8d
Show file tree
Hide file tree
Showing 13 changed files with 103 additions and 61 deletions.
16 changes: 9 additions & 7 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,10 +58,12 @@ before_script:
- cd $PROJECT_DIR
- drop demo

script: true
# - cd $PROJECT_DIR
# - snakemake -n
# - snakemake aberrantExpression --cores 2
# - snakemake aberrantSplicing --cores 2
# - snakemake mae --cores 2
# - snakemake --cores 2
script:
- cd $PROJECT_DIR
- snakemake -n
- snakemake aberrantExpression --cores 2
- snakemake aberrantSplicing --cores 2
- snakemake mae --cores 2
- snakemake --cores 2
- snakemake exportCounts --cores 2

6 changes: 4 additions & 2 deletions drop/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,10 @@ def demo():
setFiles()

# download data
module_dir = str(pathlib.Path(drop.__file__).parent)
subprocess.run(["bash", f"{module_dir}/download_data.sh"], stderr=subprocess.PIPE)
logger.info("download data")
download_script = str(pathlib.Path(drop.__file__).parent / "download_data.sh")
response = subprocess.run(["bash", download_script], stderr=subprocess.STDOUT)
response.check_returncode()

# fix config file
with open("config.yaml", "r") as f:
Expand Down
42 changes: 42 additions & 0 deletions drop/configHelper.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import wbuild
import pathlib
from snakemake.logging import logger
from snakemake.io import expand
import warnings
warnings.filterwarnings("ignore", 'This pattern has match groups')

Expand Down Expand Up @@ -110,6 +111,19 @@ def setDefaults(self, config, method=None):
setKey(config, None, "genomeAssembly", "hg19")
setKey(config, None, "scanBamParam", "null")

# export settings
setKey(config, None, "exportCounts", dict(), verbose=VERBOSE)
gene_annotations = list(config["geneAnnotation"].keys())
setKey(config, ["exportCounts"], "geneAnnotation", gene_annotations, verbose=VERBOSE)
setKey(config, ["exportCounts"], "excludeGroups", list(), verbose=VERBOSE)

# check consistency of gene annotations
anno_incomp = set(config["exportCounts"]["geneAnnotations"]) - set(gene_annotations)
if len(anno_incomp) > 0:
message = f"{anno_incomp} are not valid annotation version in 'geneAnnotation'"
message += "but required in 'exportCounts'.\n Please make sure they match."
raise ValueError(message)

if self.method is None:
tmp_dir = submodules.getTmpDir()
else:
Expand Down Expand Up @@ -162,6 +176,8 @@ def setDefaults(self, config, method=None):

def setKey(self, dict_, sub, key, default, verbose=False):
if sub is not None:
if not isinstance(sub, list):
raise TypeError(f"{sub} is not of type list")
for x in sub:
dict_ = dict_[x]
if key not in dict_ or dict_[key] is None:
Expand Down Expand Up @@ -369,6 +385,32 @@ def getMaeAll(self):

def getGeneAnnotationFile(self, annotation):
return self.config["geneAnnotation"][annotation]

def getExportGroups(self, modules=["aberrantExpression", "aberrantSplicing"]):
groups = [] # get all active groups
for module in modules:
groups.extend(self.config[module]["groups"])
exclude = self.config["exportCounts"]["excludeGroups"]
return set(groups) - set(exclude)

def getExportCountFiles(self, prefix):

count_type_map = {"geneCounts":"aberrantExpression",
"splitCounts":"aberrantSplicing",
"spliceSiteOverlapCounts":"aberrantSplicing"}
if prefix not in count_type_map.keys():
raise ValueError(f"{prefix} not a valid file type for exported counts")

datasets = self.getExportGroups([count_type_map[prefix]])
annotations = self.config["exportCounts"]["geneAnnotations"]

pattern = self.getProcResultsDir()
if prefix == "geneCounts":
pattern += f"/exported_counts/{{dataset}}/{prefix}_{{dataset}}--{{annotation}}.tsv.gz"
return expand(pattern, annotation=annotations, dataset=datasets)
else:
pattern += f"/exported_counts/{{dataset}}/{prefix}_{{dataset}}.tsv.gz"
return expand(pattern, dataset=datasets)



Expand Down
4 changes: 2 additions & 2 deletions drop/download_data.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,8 @@
set -e

# get data
wget -Nc "https://i12g-gagneurweb.informatik.tu-muenchen.de/public/paper/drop_analysis/resource.tar.gz"
resource_url="https://www.cmm.in.tum.de/public/paper/drop_analysis/resource.tar.gz"
wget -Nc $resource_url
tar -zxvf resource.tar.gz
rm -rf Data
mv resource Data
Expand All @@ -11,7 +12,6 @@ mv resource Data
cd Data
python fix_sample_anno.py
gunzip chr21.fa.gz
samtools faidx chr21.fa

# copy config
cp config_relative.yaml ../config.yaml
2 changes: 1 addition & 1 deletion drop/installRPackages.R
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ for (i in 1:nrow(packages)) {
pckg_name = tail(unlist(strsplit(packages[i,1], split = "/")), n = 1)

if (pckg_name %in% installed) {
message(paste(pckg_name, "already installed"))
#message(paste(pckg_name, "already installed"))
} else {
if (packages[i,2] == TRUE) {
INSTALL <- BiocManager::install
Expand Down
7 changes: 4 additions & 3 deletions drop/modules/helpers/add_HPO_cols.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@
add_HPO_cols <- function(RES, sample_id_col = 'sampleID', gene_name_col = 'hgncSymbol'){
require(data.table)

hpo_dt <- fread('https://i12g-gagneurweb.in.tum.de/public/paper/drop_analysis/resource/hpo_genes.tsv.gz')
filename <- 'https://www.cmm.in.tum.de/public/paper/drop_analysis/resource/hpo_genes.tsv.gz'
hpo_dt <- fread(filename)

# change column names
setnames(RES, old = c(sample_id_col, gene_name_col), new = c('sampleID', 'hgncSymbol'))
Expand All @@ -14,7 +15,7 @@ add_HPO_cols <- function(RES, sample_id_col = 'sampleID', gene_name_col = 'hgncS

if(nrow(f2) > 0){
f3 <- merge(f2, sa[,.(RNA_ID, HPO_TERMS)], by.x = 'sampleID', by.y = 'RNA_ID')
f3[, HPO_match := HPO_id %in% unlist(strsplit(HPO_TERMS, split = ',')), by = 1:nrow(f3)]
f3[, HPO_match := HPO_id %in% unlist(strsplit(HPO_TERMS, split = ','))]
f3 <- f3[HPO_match == TRUE]
if(nrow(f3) > 0){
f4 <- f3[, .(HPO_id_overlap = paste(HPO_id, collapse = ', '),
Expand All @@ -26,4 +27,4 @@ add_HPO_cols <- function(RES, sample_id_col = 'sampleID', gene_name_col = 'hgncS

setnames(RES, old = c('sampleID', 'hgncSymbol'), new = c(sample_id_col, gene_name_col))
return(RES)
}
}
4 changes: 2 additions & 2 deletions drop/requirementsR.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,5 @@ tidyr FALSE
magrittr FALSE
devtools FALSE
BSgenome.Hsapiens.UCSC.hg19 TRUE
MafDb.gnomAD.r2.1.hs37d5 TRUE
MafDb.gnomAD.r2.1.GRCh38 TRUE
#MafDb.gnomAD.r2.1.hs37d5 TRUE
#MafDb.gnomAD.r2.1.GRCh38 TRUE
23 changes: 2 additions & 21 deletions drop/setupDrop.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,25 +24,6 @@ def installRPackages():
script = pathlib.Path(drop.__file__).parent / "installRPackages.R"
requirements = pathlib.Path(drop.__file__).parent / 'requirementsR.txt'

#packages = [x.strip().split("#")[0] for x in open(requirements, 'r')]
#packages = [x for x in packages if x != '']

#for package in packages:
# logger.info(f"check {package}")
call = subprocess.Popen(
["Rscript", script, requirements],
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT
)
response = subprocess.run(["Rscript", script, requirements], stderr=subprocess.STDOUT)
response.check_returncode()

# check output for errors
stdout, stderr = call.communicate()
if stderr:
print(stderr)
exit(1)
stdout = stdout.decode()
ep = re.compile("Execution halted|^ERROR", re.M)
if ep.search(stdout):
print(stdout)
exit(1)

41 changes: 21 additions & 20 deletions drop/template/Scripts/Pipeline/SampleAnnotation.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,13 @@
#' wb:
#' params:
#' - tmpdir: '`sm drop.getTmpDir()`'
#' - export_dir: '`sm parser.getProcResultsDir() + "/exported_counts"`'
#' - groups: '`sm parser.getExportGroups()`'
#' input:
#' - sampleAnnotation: '`sm config["sampleAnnotation"]`'
#' output:
#' - done: '`sm parser.getProcDataDir() + "/sample_anno/sample_anno.done"`'
#' - export: '`sm touch(parser.getProcResultsDir() + "/exported_counts/sample_anno.done")`'
#' - done: '`sm touch(parser.getProcDataDir() + "/sample_anno/sample_anno.done")`'
#' output:
#' html_document:
#' code_folding: hide
Expand Down Expand Up @@ -77,7 +80,7 @@ unique(sa[,.(RNA_ID, DROP_GROUP)])$DROP_GROUP %>% strsplit(',') %>% unlist %>%
#+echo=F
if(!is.null(sa$HPO_TERMS)){
sa2 <- sa[, .SD[1], by = RNA_ID]
hpo_dt <- fread('https://i12g-gagneurweb.in.tum.de/public/paper/drop_analysis/resource/hpo_genes.tsv.gz')
hpo_dt <- fread('https://www.cmm.in.tum.de/public/paper/drop_analysis/resource/hpo_genes.tsv.gz')

sapply(1:nrow(sa2), function(i){
hpos <- strsplit(sa2[i, HPO_TERMS], split = ',') %>% unlist
Expand All @@ -91,23 +94,21 @@ if(!is.null(sa$HPO_TERMS)){
na = NA, sep = "\t", row.names = F, quote = F)
}

if(snakemake@config$exportCounts == TRUE){
sa[, DROP_GROUP := gsub(' ', '', DROP_GROUP)]
if(!is.null(sa$SEX)) sa[, SEX := tolower(SEX)]

list_groups <- strsplit(sa$DROP_GROUP, split = ',')
drop_groups <- list_groups %>% unlist %>% unique

for(dataset in drop_groups){
path <- file.path(snakemake@config$root, 'processed_data/exported_counts', dataset)
dir.create(path)
cols <- intersect(colnames(sa),
c('RNA_ID', 'DNA_ID', 'PATIENT_ID', 'PAIRED_END', 'STRAND',
'TISSUE', 'SEX', 'AFFECTED', 'ICD_10'))
fwrite(sa[sapply(list_groups, function(x) dataset %in% x), cols, with = F],
file = file.path(path, paste0('sampleAnnotation_', dataset, '.tsv')),
quote = FALSE, row.names = FALSE, sep = '\t')
}
sa[, DROP_GROUP := gsub(' ', '', DROP_GROUP)]
if(!is.null(sa$SEX)) sa[, SEX := tolower(SEX)]

list_groups <- strsplit(sa$DROP_GROUP, split = ',')
drop_groups <- snakemake@params$groups # list_groups %>% unlist %>% unique

# export processed sample annotation
for(dataset in drop_groups){
cols <- intersect(colnames(sa),
c('RNA_ID', 'DNA_ID', 'PATIENT_ID', 'PAIRED_END', 'STRAND',
'TISSUE', 'SEX', 'AFFECTED', 'ICD_10'))
sa_sub <- sa[sapply(list_groups, function(x) dataset %in% x), cols, with = F]
path <- file.path(snakemake@params$export_dir, dataset)
dir.create(path)
filename <- file.path(path, paste0('sampleAnnotation_', dataset, '.tsv'))
fwrite(sa_sub, file = filename, quote = FALSE, row.names = FALSE, sep = '\t')
}

file.create(snakemake@output$done) %>% invisible
7 changes: 7 additions & 0 deletions drop/template/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,13 @@ rule mae:
rule sampleQC:
input: MAE(drop.getTmpDir() + "/sampleQC.done")

rule exportCounts:
input:
AE(parser.getExportCountFiles("geneCounts")),
AS(parser.getExportCountFiles("splitCounts")),
AS(parser.getExportCountFiles("spliceSiteOverlapCounts")),
parser.getProcResultsDir() + "/exported_counts/sample_anno.done"

rule dependencyGraph:
input:
MAE(config["htmlOutputPath"] + "/MAE_rulegraph.svg"),
Expand Down
8 changes: 7 additions & 1 deletion drop/template/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,13 @@ geneAnnotation:

genomeAssembly: hg19 # either hg19 or hg38
scanBamParam: null # or a path to an Rds file containing a scanBamParam object
exportCounts: false
exportCounts:
# specify which gene annotations to include and which
# groups to exclude when exporting counts
geneAnnotations:
- v29
excludeGroups:
- group1

aberrantExpression:
groups:
Expand Down

0 comments on commit b08cc8d

Please sign in to comment.