Skip to content

Commit

Permalink
Export counts meta (#144)
Browse files Browse the repository at this point in the history
* refactored writing export count meta files in separate python script rather than SampleAnnotation script

* include python scripts in non-module directories upon installation

Co-authored-by: mumichae <mi.mueller@tum.de>
  • Loading branch information
mumichae and mumichae committed Jan 14, 2021
1 parent 7572ba2 commit 44bc135
Show file tree
Hide file tree
Showing 8 changed files with 143 additions and 92 deletions.
3 changes: 2 additions & 1 deletion drop/config/DropConfig.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,8 @@ def setDefaults(self, config_dict):
setKey = utils.setKey
setKey(config_dict, None, "fileRegex", r".*\.(R|md)")
setKey(config_dict, None, "genomeAssembly", "hg19")
setKey(config_dict, None, "hpoFile", None)
hpo_url = 'https://www.cmm.in.tum.de/public/paper/drop_analysis/resource/hpo_genes.tsv.gz'
setKey(config_dict, None, "hpoFile", hpo_url)

# set submodule dictionaries
setKey(config_dict, None, "aberrantExpression", dict())
Expand Down
48 changes: 30 additions & 18 deletions drop/config/ExportCounts.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,15 @@ def __init__(self, dict_, outputRoot, sampleAnnotation, geneAnnotations, genomeA
"""
self.CONFIG_KEYS = ["geneAnnotations", "excludeGroups"]
self.config_dict = self.setDefaults(dict_, geneAnnotations)
self.outputRoot = outputRoot
self.outputRoot = outputRoot / "exported_counts"
self.sa = sampleAnnotation
self.genomeAssembly = genomeAssembly
self.geneAnnotations = self.get("geneAnnotations")
self.modules = {
"aberrantExpression": aberrantExpression,
"aberrantSplicing": aberrantSplicing
}
self.pattern = self.outputRoot / "exported_counts" / "{dataset}--{genomeAssembly}--{annotation}"
self.pattern = self.outputRoot / "{dataset}--{genomeAssembly}--{annotation}"

def setDefaults(self, config_dict, gene_annotations):
utils.setKey(config_dict, None, "geneAnnotations", gene_annotations)
Expand All @@ -49,7 +50,6 @@ def get(self, key):
def getFilePattern(self, str_=True, expandStr=False):
pattern = self.pattern
if expandStr:
str_=True
pattern = pattern.__str__().replace("{", "{{").replace("}", "}}")
return utils.returnPath(pattern, str_=str_)

Expand All @@ -67,23 +67,35 @@ def getExportGroups(self, modules=None):
for module in modules:
groups.extend(self.modules[module].groups)
export_groups = set(groups) - set(self.get("excludeGroups"))
return export_groups
return sorted(list(export_groups))

def getExportCountFiles(self, prefix, expandPattern=None, **kwargs):
def getFiles(self, filename, datasets=None, **kwargs):
"""
Determine export count files.
:param prefix: name of file
:return: list of files to
Determine files for export count groups.
:param filename: name of file
:return: list of export files
"""
if prefix not in self.COUNT_TYPE_MAP.keys():
raise ValueError(f"{prefix} not a valid file type for exported counts")
if datasets is None:
datasets = self.getExportGroups()
file_pattern = str(self.pattern / f"{filename}")
return expand(
file_pattern,
dataset=datasets,
annotation=self.geneAnnotations,
genomeAssembly=self.genomeAssembly,
**kwargs
)

datasets = self.getExportGroups([self.COUNT_TYPE_MAP[prefix]])
if expandPattern is None:
file_pattern = str(self.pattern / f"{prefix}.tsv.gz")
else:
file_pattern = str(self.pattern / f"{expandPattern}.tsv.gz")
count_files = expand(file_pattern, annotation=self.get("geneAnnotations"),
dataset=datasets, genomeAssembly=self.genomeAssembly, **kwargs)
return count_files
def getExportCountFiles(self, count_type, suffix="tsv.gz", expandPattern=None, **kwargs):
"""
Determine export count files.
:param count_type: count type for mapping the submodule
:param suffix: file type suffix (without dot)
:return: list of export count files
"""
if count_type not in self.COUNT_TYPE_MAP.keys():
raise ValueError(f"'{count_type}' not a valid file type for exported counts")
datasets = self.getExportGroups([self.COUNT_TYPE_MAP[count_type]])
expandPattern = count_type if expandPattern is None else expandPattern
return self.getFiles(f"{expandPattern}.{suffix}", datasets, **kwargs)

5 changes: 4 additions & 1 deletion drop/config/SampleAnnotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,10 @@ def createSampleFileMapping(self):

def createGroupIds(self, group_key="DROP_GROUP", file_type=None, sep=','):
"""
Create a mapping of DROP groups to lists of sample IDs
:param group_key: name of group column in sample annotation
:param file_type: name of file column e.g. "RNA_BAM_FILE", "DNA_VCF_FILE"
:param sep: separator of multiple groups in group column
:return: mapping of drop group and ID
"""
if not file_type:
file_type = "RNA_BAM_FILE"
Expand Down
68 changes: 4 additions & 64 deletions drop/template/Scripts/Pipeline/SampleAnnotation.R
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,11 @@
#' log:
#' - snakemake: '`sm str(tmp_dir / "SampleAnnotation.Rds")`'
#' params:
#' - export_dir: '`sm cfg.getProcessedResultsDir() + "/exported_counts"`'
#' - groups: '`sm cfg.exportCounts.getExportGroups()`'
#' - hpoFile: '`sm cfg.get("hpoFile")`'
#' input:
#' - sampleAnnotation: '`sm config["sampleAnnotation"]`'
#' - sampleAnnotation: '`sm sa.file`'
#' output:
#' - export: '`sm touch(cfg.getProcessedResultsDir() + "/exported_counts/sample_anno.done")`'
#' - done: '`sm touch(cfg.getProcessedDataDir() + "/sample_anno/sample_anno.done")`'
#' - hpoOverlap: '`sm touch(cfg.getProcessedDataDir() + "/sample_anno/genes_overlapping_HPO_terms.tsv")`'
#' output:
#' html_document:
#' code_folding: hide
Expand Down Expand Up @@ -80,8 +77,7 @@ unique(sa[,.(RNA_ID, DROP_GROUP)])$DROP_GROUP %>% strsplit(',') %>% unlist %>%

# Obtain genes that overlap with HPO terms
#+echo=F
if(!is.null(sa$HPO_TERMS)){
if(!all(is.na(sa$HPO_TERMS)) & ! all(sa$HPO_TERMS == '')){
if(!is.null(sa$HPO_TERMS) & !all(is.na(sa$HPO_TERMS)) & ! all(sa$HPO_TERMS == '')){
sa2 <- sa[, .SD[1], by = RNA_ID]

filename <- ifelse(is.null(snakemake@params$hpo_file),
Expand All @@ -96,62 +92,6 @@ if(!is.null(sa$HPO_TERMS)){
}) %>% invisible() # don't print result
sa2 <- sa2[, .(RNA_ID, HPO_matching_genes)]

fwrite(sa2,
file.path(snakemake@config$root,
'processed_data/sample_anno/genes_overlapping_HPO_terms.tsv'),
fwrite(sa2, snakemake@output$hpoOverlap,
na = NA, sep = "\t", row.names = F, quote = F)
}
}
sa[, DROP_GROUP := gsub(' ', '', DROP_GROUP)]
if(!is.null(sa$ICD_10))
sa[, ICD_10 := toupper(ICD_10)]

list_groups <- strsplit(sa$DROP_GROUP, split = ',')
drop_groups <- snakemake@params$groups # list_groups %>% unlist %>% unique

genomeAssembly <- snakemake@config$genomeAssembly
geneAnnotation <- snakemake@config$exportCounts$geneAnnotations

sa[STRAND == 'no', STRAND_SPECIFIC := FALSE]
sa[STRAND %in% c('yes', 'reverse'), STRAND_SPECIFIC := TRUE]
sa[, STRAND := NULL]

# export processed sample annotation and DESCRIPTION file
for(dataset in drop_groups){
cols <- intersect(c('RNA_ID', 'INDIVIDUAL_ID', 'TISSUE', 'SEX', 'AFFECTED',
'ICD_10', 'PAIRED_END', 'STRAND_SPECIFIC'),
colnames(sa))
sa_sub <- sa[sapply(list_groups, function(x) dataset %in% x), cols, with = F]
path <- file.path(snakemake@params$export_dir, paste(dataset, genomeAssembly, geneAnnotation, sep = '--'))
dir.create(path)
fwrite(sa_sub, file = file.path(path, 'sampleAnnotation.tsv'),
quote = FALSE, row.names = FALSE, sep = '\t')

# DESCRIPTION file
v0 <- 'Title: # Add a title'
v1 <- paste0('Number of samples: ', nrow(sa_sub))
v2 <- ifelse(!is.null(sa_sub$TISSUE),
ifelse(uniqueN(sa_sub$TISSUE) > 1,
'More than 1 tissue in dataset is not allowed!',
paste0('Tissue: ', unique(sa_sub$TISSUE))),
'No tissue information')
v3 <- 'Organism: Homo sapiens'
v4 <- paste0('Genome assembly: ', genomeAssembly)
v5 <- paste0('Gene annotation: ', geneAnnotation)
v6 <- ifelse(!is.null(sa_sub$ICD_10),
paste0('Disease (ICD-10: N): ',
paste(unite(data.table(table(sa_sub$ICD_10)), col = 'aux', 'V1', 'N', sep = ': ')$aux, collapse = ', ' )),
'No disease information')
v7 <- ifelse(uniqueN(sa_sub$STRAND_SPECIFIC) > 1,
'All samples should be either strand- or non-strand-specific!',
paste0('Strand specific: ', unique(sa_sub$STRAND_SPECIFIC)))
v8 <- ifelse(uniqueN(sa_sub$PAIRED_END) > 1,
'All samples should be either single end or paired end!',
paste0('Paired end: ', unique(sa_sub$PAIRED_END)))
v9 <- 'Cite as: RNA-Seq count tables were taken from: # add your citation(s)'
v10 <- 'Dataset contact: # Use format Name Last_Name, <email address>'
v11 <- 'Comments: # add any comments, if needed, otherwise remove'

writeLines(c(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11), file.path(path, 'DESCRIPTION.txt'))
}

90 changes: 90 additions & 0 deletions drop/template/Scripts/Pipeline/exportCountsMeta.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
import itertools

cfg = snakemake.params.dropConfig
exportCounts = cfg.exportCounts
sampleAnnotation = snakemake.params.sampleAnnotation

# adapt sample annotation data frame
sa = sampleAnnotation.sa.copy()
sa["STRAND_SPECIFIC"] = sa["STRAND"] != "no"
if "ICD_10" in sa:
sa["ICD_10"] = sa["ICD_10"].str.upper()

sa_cols = frozenset(['RNA_ID', 'INDIVIDUAL_ID', 'TISSUE', 'SEX', 'AFFECTED', 'ICD_10', 'PAIRED_END',
'STRAND_SPECIFIC']).intersection(sa.columns)
sa_cols = list(sa_cols)


# Getters for DESCRIPTION file
def get_tissue_info(sa):
if "TISSUE" not in sa or sa["TISSUE"].is_null():
return 'No tissue information'
uniq_tissues = sa.TISSUE.unique()
if len(uniq_tissues) > 1:
return "More than 1 tissue in dataset is not allowed!"
return "".join(uniq_tissues)


def get_disease_info(sa):
if "ICD_10" in sa:
# unite(data.table(table(sa_sub$ICD_10)), col = 'aux', 'V1', 'N', sep = ': ')$aux, collapse = ', ' )
table = sa.ICD_10.value_counts()
return "\n" + "\n".join([f"\t{d}: {c}" for d, c in zip(table.index, table)])
return "No disease information"


def get_strand(sa):
if sa.STRAND_SPECIFIC.nunique() > 1:
return "All samples should be either strand- or non-strand-specific!"
else:
return sa.STRAND_SPECIFIC.unique()[0]


def get_pairedEnd(sa):
if sa.PAIRED_END.nunique() > 1:
return 'All samples should be either single end or paired end!',
return sa.PAIRED_END.unique()[0]


desc = """Title: # Add a title
Number of samples: {}
Tissue: {}
Organism: {}
Genome assembly: {}
Gene annotation: {}
Disease (ICD-10: N): {}
Strand specific: {}
Paired end: {}
Cite as: RNA-Seq count tables were taken from # add your citation(s)
Dataset contact: # Use format Name Last_Name, <email address>
Comments: # add any comments, if needed, otherwise remove
"""

group_with_anno = itertools.product(exportCounts.getExportGroups(), exportCounts.geneAnnotations)
sa_files = snakemake.output.sampleAnnotations
desc_files = snakemake.output.descriptions

for ga, sa_file, desc_file in zip(group_with_anno, sa_files, desc_files):
group, annotation = ga # unpack

# subset by group
rna_ids = sampleAnnotation.rnaIDs[group]
dna_ids = sampleAnnotation.dnaIDs[group]
sa_sub = sa.loc[(sa["RNA_ID"].isin(rna_ids)) & (sa["DNA_ID"].isin(dna_ids))]

# save sample annotation subset
sa_sub.to_csv(sa_file, sep='\t', index=False, columns=sa_cols)

# save DESCRIPTION file
with open(desc_file, "w") as f:
desc_output = desc.format(
len(sa_sub), # number samples
get_tissue_info(sa_sub), # tissue
"Homo sapiens", # organism
exportCounts.genomeAssembly,
annotation,
get_disease_info(sa_sub), # disease
get_strand(sa_sub), # strand specific
get_pairedEnd(sa_sub) # paired end
)
f.write(desc_output)
10 changes: 8 additions & 2 deletions drop/template/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -33,14 +33,20 @@ rule all:
"""

rule sampleAnnotation:
input: cfg.getProcessedDataDir() + "/sample_anno/sample_anno.done"
input: rules.Pipeline_SampleAnnotation_R.output

rule exportCounts:
input:
cfg.exportCounts.getExportCountFiles("geneCounts"),
cfg.exportCounts.getExportCountFiles("splicingCounts", expandPattern="k_{type}_counts", type=["j", "theta"]),
cfg.exportCounts.getExportCountFiles("splicingCounts", expandPattern="n_{type}_counts", type=["psi5", "psi3", "theta"]),
cfg.getProcessedResultsDir() + "/exported_counts/sample_anno.done"
output:
sampleAnnotations = cfg.exportCounts.getFiles("sampleAnnotation.tsv"),
descriptions = cfg.exportCounts.getFiles("DESCRIPTION.txt"),
params:
dropConfig = cfg,
sampleAnnotation = sa
script: "Scripts/Pipeline/exportCountsMeta.py"

rule dependencyGraph:
input:
Expand Down
5 changes: 2 additions & 3 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,8 @@

extra_files = []
for (path, directories, filenames) in os.walk('drop/'):
directories[:] = [d for d in directories if not (d.startswith('.') or d == 'Data')]
filenames[:] = [f for f in filenames if
not (f.startswith('.') or f.endswith('.Rproj') or f.endswith('.py'))]
directories[:] = [d for d in directories if not d.startswith('.')]
filenames[:] = [f for f in filenames if not f.startswith('.') and not f.endswith('.Rproj')]
for filename in filenames:
extra_files.append(os.path.join('..', path, filename))

Expand Down
6 changes: 3 additions & 3 deletions tests/config/test_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,9 @@ def test_DropConfigPaths(demo_dir, dropConfig):
@pytest.mark.parametrize(
"modules,groups",
[
("aberrantExpression", {"outrider"}),
("aberrantSplicing", {"fraser"}),
(["aberrantExpression", "aberrantSplicing"], {"outrider", "fraser"})
("aberrantExpression", ["outrider"]),
("aberrantSplicing", ["fraser"]),
(["aberrantExpression", "aberrantSplicing"], ["fraser", "outrider"])
]
)
def test_cfgExportGroups(dropConfig, modules, groups):
Expand Down

0 comments on commit 44bc135

Please sign in to comment.