Skip to content

Commit

Permalink
Merge pull request #70 from vyepez88/documentation
Browse files Browse the repository at this point in the history
Documentation
  • Loading branch information
vyepez88 committed Jun 12, 2020
2 parents 22adccd + 8b2ee73 commit c2c4d3c
Show file tree
Hide file tree
Showing 5 changed files with 56 additions and 25 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ before_script:
- cd $PROJECT_DIR
- drop demo

script:
script:
- cd $PROJECT_DIR
- snakemake -n
- snakemake aberrantExpression --cores 2
Expand Down
18 changes: 6 additions & 12 deletions drop/configHelper.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def setDefaults(self, config, method=None):
setKey(config, ["aberrantExpression"], "groups", None, verbose=VERBOSE)
setKey(config, ["aberrantExpression"], "padjCutoff", .05, verbose=VERBOSE)
setKey(config, ["aberrantExpression"], "zScoreCutoff", 0, verbose=VERBOSE)
setKey(config, ["aberrantSplicing"], "maxTestedDimensionProportion", 3, verbose=VERBOSE)
setKey(config, ["aberrantExpression"], "maxTestedDimensionProportion", 3, verbose=VERBOSE)

# aberrant splicing
if self.method == "AS" or self.method is None:
Expand Down Expand Up @@ -276,7 +276,8 @@ def subsetGroups(self, ids_by_group, subset_groups, warn=30, error=10):
for group in subset_groups:
if len(subset[group]) < error:
message = f'Too few IDs in DROP_GROUP {group}'
message += ', please ensure that it has at least {error} IDs'
message += f', please ensure that it has at least {error} IDs'
message += f', groups: {subset[group]}'
raise ValueError(message)
elif len(subset[group]) < warn:
logger.info(f'WARNING: Less than {warn} IDs in DROP_GROUP {group}')
Expand Down Expand Up @@ -403,15 +404,8 @@ def getExportCountFiles(self, prefix):

datasets = self.getExportGroups([count_type_map[prefix]])
annotations = self.config["exportCounts"]["geneAnnotations"]
genomeAssembly = self.config["genomeAssembly"]

pattern = self.getProcResultsDir()
if prefix == "geneCounts":
pattern += f"/exported_counts/{{dataset}}/{prefix}_{{dataset}}--{{annotation}}.tsv.gz"
return expand(pattern, annotation=annotations, dataset=datasets)
else:
pattern += f"/exported_counts/{{dataset}}/{prefix}_{{dataset}}.tsv.gz"
return expand(pattern, dataset=datasets)


pattern = self.getProcResultsDir() + f"/exported_counts/{{dataset}}--{{genomeAssembly}}--{{annotation}}/{prefix}.tsv.gz"
return expand(pattern, annotation=annotations, dataset=datasets, genomeAssembly=genomeAssembly)


2 changes: 1 addition & 1 deletion drop/modules/aberrant-expression-pipeline
57 changes: 47 additions & 10 deletions drop/template/Scripts/Pipeline/SampleAnnotation.R
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ saveRDS(snakemake, file.path(snakemake@params$tmpdir, "sample_anno_overview.snak
suppressPackageStartupMessages({
library(data.table)
library(magrittr)
library(tidyr)
})

sa <- fread(snakemake@input$sampleAnnotation)
Expand Down Expand Up @@ -63,7 +64,7 @@ if('DNA_VCF_FILE' %in% colnames(sa)){
if(any(sa$aux1 == F)){
print('The following VCF files do not exist: ')
DT::datatable(sa[aux1 == F])
}
}
}

#' Check for RNA_IDs with more than one RNA_BAM_FILE
Expand All @@ -90,25 +91,61 @@ if(!is.null(sa$HPO_TERMS)){
sa2 <- sa2[, .(RNA_ID, HPO_matching_genes)]

fwrite(sa2,
file.path(snakemake@config$root, 'processed_data/sample_anno/genes_overlapping_HPO_terms.tsv'),
file.path(snakemake@config$root,
'processed_data/sample_anno/genes_overlapping_HPO_terms.tsv'),
na = NA, sep = "\t", row.names = F, quote = F)
}

sa[, DROP_GROUP := gsub(' ', '', DROP_GROUP)]
if(!is.null(sa$SEX)) sa[, SEX := tolower(SEX)]
if(!is.null(sa$ICD_10))
sa[, ICD_10 := toupper(ICD_10)]

list_groups <- strsplit(sa$DROP_GROUP, split = ',')
drop_groups <- snakemake@params$groups # list_groups %>% unlist %>% unique

# export processed sample annotation
genomeAssembly <- snakemake@config$genomeAssembly
geneAnnotation <- snakemake@config$exportCounts$geneAnnotations

sa[STRAND == 'no', STRAND_SPECIFIC := FALSE]
sa[STRAND %in% c('yes', 'reverse'), STRAND_SPECIFIC := TRUE]
sa[, STRAND := NULL]

# export processed sample annotation and DESCRIPTION file
for(dataset in drop_groups){
cols <- intersect(colnames(sa),
c('RNA_ID', 'DNA_ID', 'PATIENT_ID', 'PAIRED_END', 'STRAND',
'TISSUE', 'SEX', 'AFFECTED', 'ICD_10'))
cols <- intersect(c('RNA_ID', 'INDIVIDUAL_ID', 'TISSUE', 'SEX', 'AFFECTED',
'ICD_10', 'PAIRED_END', 'STRAND_SPECIFIC'),
colnames(sa))
sa_sub <- sa[sapply(list_groups, function(x) dataset %in% x), cols, with = F]
path <- file.path(snakemake@params$export_dir, dataset)
path <- file.path(snakemake@params$export_dir, paste(dataset, genomeAssembly, geneAnnotation, sep = '--'))
dir.create(path)
filename <- file.path(path, paste0('sampleAnnotation_', dataset, '.tsv'))
fwrite(sa_sub, file = filename, quote = FALSE, row.names = FALSE, sep = '\t')
fwrite(sa_sub, file = file.path(path, 'sampleAnnotation.tsv'),
quote = FALSE, row.names = FALSE, sep = '\t')

# DESCRIPTION file
v0 <- 'Title: # Add a title'
v1 <- paste0('Number of samples: ', nrow(sa_sub))
v2 <- ifelse(!is.null(sa_sub$TISSUE),
ifelse(uniqueN(sa_sub$TISSUE) > 1,
'More than 1 tissue in dataset is not allowed!',
paste0('Tissue: ', unique(sa_sub$TISSUE))),
'No tissue information')
v3 <- 'Organism: Homo sapiens'
v4 <- paste0('Genome assembly: ', genomeAssembly)
v5 <- paste0('Gene annotation: ', geneAnnotation)
v6 <- ifelse(!is.null(sa_sub$ICD_10),
paste0('Disease (ICD-10: N): ',
paste(unite(data.table(table(sa_sub$ICD_10)), col = 'aux', 'V1', 'N', sep = ': ')$aux, collapse = ', ' )),
'No disease information')
v7 <- ifelse(uniqueN(sa_sub$STRAND_SPECIFIC) > 1,
'All samples should be either strand- or non-strand-specific!',
paste0('Strand specific: ', unique(sa_sub$STRAND_SPECIFIC)))
v8 <- ifelse(uniqueN(sa_sub$PAIRED_END) > 1,
'All samples should be either single end or paired end!',
paste0('Paired end: ', unique(sa_sub$PAIRED_END)))
v9 <- 'Cite as: RNA-Seq count tables were taken from: # add your citation(s)'
v10 <- 'Dataset contact: # Use format Name Last_Name, <email address>'
v11 <- 'Comments: # add any comments, if needed, otherwise remove'

writeLines(c(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11), file.path(path, 'DESCRIPTION.txt'))
}

0 comments on commit c2c4d3c

Please sign in to comment.