diff --git a/README.md b/README.md index 34686c8..0ec26f3 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ The workflow, configured to run Hisat2 and Stringtie, can then be launched by ru $ ./submit -From here, the user may follow our documentation to modify the software options as well as point to their own input datasets. +From here, the user may follow our documentation to modify the software options as well as point to their own input datasets. Note that there are no test reference genome indices available for STAR, because they are too large to upload to github. @@ -96,8 +96,15 @@ file and gene annotation in GTF/GFF3 format, the following commands can be used $ tophat2 -G GRCh38.gencode.v24.annotation.gff3 --transcriptome-index=transcriptome_data/GRCh38 GRCh38 $ tar czf GRCh38.transcriptome_data.tar.gz transcriptome_data/ +### If the user would like to use STAR: + # cd into the "star_index" directory within "reference". Place all genome files here + $ cd reference/star_index + $ STAR-2.5.2b/bin/Linux_x86_64_static/STAR --runMode genomeGenerate --runThreadN 4 --genomeDir ./ --genomeFastaFiles ./GRCh38.fa --sjdbGTFfile ./GRCh38.gencode.v24.annotation.gff3 +#### Generate Tab delimited list of splice sites using gene model GTF file as input (Python DefaultDictionary Module necessary) + + $ python hisat2_extract_splice_sites.py GRCh38-gencode.v24.annotation.gtf > GRCh38.Splice_Sites.txt ## Workflow Configuration @@ -128,6 +135,19 @@ $REF_PREFIX.transcriptome_data.tar.gz $REF_PREFIX.gff3 +#### If the user would like to use, STAR, the following files must be present in the _reference/star_index_ directory: + +chrLength.txt +chrNameLength.txt +chrName.txt +chrStart.txt +Genome +genomeParameters.txt +Log.out +SA +SAindex +*Splice_Sites.txt + ### User Input Datasets OSG-GEM supports the processing of multiple input datasets into a single Gene Expression Matrix(GEM). The user diff --git a/osg-gem.conf.example b/osg-gem.conf.example index 7351f12..e14df28 100644 --- a/osg-gem.conf.example +++ b/osg-gem.conf.example @@ -43,6 +43,9 @@ tophat2 = False # process using Hisat2 hisat2 = True +# process using STAR +star = False + # process using Cufflinks cufflinks = False diff --git a/reference/star_index/temp b/reference/star_index/temp new file mode 100644 index 0000000..e69de29 diff --git a/task-files/gem-merge/Log_merge.py b/task-files/gem-merge/Log_merge.py index a15aa09..8795cb1 100644 --- a/task-files/gem-merge/Log_merge.py +++ b/task-files/gem-merge/Log_merge.py @@ -1,3 +1,4 @@ + ## William Poehlman ## ## LogParse.py v1.1 ## ## wpoehlm@clemson.edu ## @@ -18,6 +19,7 @@ def ParseLogs(trimlist, maplist, software, sample, layout): maplist --> a list of hisat2 or tophat2 log files software --> read mapping software that generated log files. 'hisat' or 'tophat' sample --> A sample number that is present at the beginning of each filename + layout --> Sequencing layout. 'single' or 'paired' output: A dictionary, where the sample ID number is the key, and the following statistics are stored in the list value: @@ -34,10 +36,14 @@ def ParseLogs(trimlist, maplist, software, sample, layout): ((left mapped reads + right mapped reads)/2)/(cleaned input pairs) + The overall Alignment rate for STAR is calculated as follows: + (uniquely mapped reads + multiple alignments) / (cleaned input pairs) + Logfile names must be formatted as follows: --align_summary.txt (only for tophat) --fastq-out.txt (for hisat) + --Log.final.out -unique file identifier>-fastq-trimmomatic.txt for example: @@ -45,7 +51,7 @@ def ParseLogs(trimlist, maplist, software, sample, layout): 1-000000-000000.fastq-out.txt 1-000000-000000.fastq-trimmomatic.txt - NOTE: do not mix log files for tophat and hisat + NOTE: do not mix log files for tophat/hisat/star """ triminput = [] @@ -134,6 +140,91 @@ def ParseLogs(trimlist, maplist, software, sample, layout): concordant.append(concordantpairs) mapped.append(mappedsingles) + elif software == 'star' and layout == 'paired': + multimapped = [] + uniqmapped = [] + for file in trimlist: + basename = os.path.splitext(file)[0] + dataset = basename.split('-')[0] + if dataset == sample: + for line in open(file): + if 'Input Read Pairs' in line: + inp = int(line.split(' ')[3]) + triminput.append(inp) + out = int(line.split(' ')[6]) + trimoutput.append(out) + for file in maplist: + basename = os.path.splitext(file)[0] + dataset = basename.split('-')[0] + if dataset == sample: + count = 0 + for line in open(file): + count +=1 + if count == 9: + uniq = int(line.split('|')[1].strip()) + uniqmapped.append(uniq) + if count == 24: + multi = int(line.split('|')[1].strip()) + multimapped.append(multi) + + + concordant = sum(uniqmapped) + mult = sum(multimapped) + mapped = concordant + mult + inputraw = sum(triminput) + inputclean = sum(trimoutput) + float1 = float(mapped) + float2 = float(inputclean) + rate = float1/float2 + + report[sample] = [inputraw, inputclean, concordant, rate*100] + + return report + + if software == 'star' and layout == 'single': + multimapped = [] + uniqmapped = [] + for file in trimlist: + basename = os.path.splitext(file)[0] + dataset = basename.split('-')[0] + if dataset == sample: + for line in open(file): + if 'Input Reads' in line: + inp = int(line.split(' ')[2]) + triminput.append(inp) + out = int(line.split(' ')[4]) + trimoutput.append(out) + for file in maplist: + basename = os.path.splitext(file)[0] + dataset = basename.split('-')[0] + if dataset == sample: + count = 0 + + for line in open(file): + count +=1 + if count == 9: + uniq = int(line.split('|')[1].strip()) + uniqmapped.append(uniq) + if count == 24: + multi = int(line.split('|')[1].strip()) + multimapped.append(multi) + + concordant = sum(uniqmapped) + mult = sum(multimapped) + mapped = concordant + mult + inputraw = sum(triminput) + inputclean = sum(trimoutput) + + float1 = float(mapped) + float2 = float(inputclean) + rate = float1/float2 + + report[sample] = [inputraw, inputclean, concordant, rate*100] + + return report + + + if software == 'tophat2' and layout == 'paired': inputraw = sum(triminput) @@ -252,11 +343,12 @@ def main(): #locate input log files and append to lists #tophat alignment summary - maploglist = [file for file in glob.glob(('*summary.txt'))] - - if len(maploglist) == 0: - #If there are no tophat alignment summaries, look for hisat standard output instead + if software == 'tophat2': + maploglist = [file for file in glob.glob(('*summary.txt'))] + elif software == 'hisat2': maploglist = [file for file in glob.glob('*fastq-out.txt')] + elif software == 'star': + maploglist = [file for file in glob.glob('*Log.final.out')] trimloglist = [file for file in glob.glob('*-trimmomatic.txt')] if len(maploglist) != len(trimloglist): diff --git a/task-files/star/2.5.2b.zip b/task-files/star/2.5.2b.zip new file mode 100644 index 0000000..3428d14 Binary files /dev/null and b/task-files/star/2.5.2b.zip differ diff --git a/task-files/star/fasta_adapter.txt b/task-files/star/fasta_adapter.txt new file mode 100644 index 0000000..dc20587 --- /dev/null +++ b/task-files/star/fasta_adapter.txt @@ -0,0 +1,130 @@ +>TRUSEQUNIVERSAL ADAPTER +AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT +>TRUSEQADAPTER, INDEX 1 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG +>TRUSEQADAPTER, INDEX 2 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGATGTATCTCGTATGCCGTCTTCTGCTTG +>TRUSEQADAPTER, INDEX 3 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACTTAGGCATCTCGTATGCCGTCTTCTGCTTG +>TRUSEQADAPTER, INDEX 4 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACTGACCAATCTCGTATGCCGTCTTCTGCTTG +>TRUSEQADAPTER, INDEX 5 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACACAGTGATCTCGTATGCCGTCTTCTGCTTG +>TRUSEQADAPTER, INDEX 6 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACGCCAATATCTCGTATGCCGTCTTCTGCTTG +>TRUSEQADAPTER, INDEX 7 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACCAGATCATCTCGTATGCCGTCTTCTGCTTG +>TRUSEQADAPTER, INDEX 8 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTTGAATCTCGTATGCCGTCTTCTGCTTG +>TRUSEQADAPTER, INDEX 9 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACGATCAGATCTCGTATGCCGTCTTCTGCTTG +>TRUSEQADAPTER, INDEX 10 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACTAGCTTATCTCGTATGCCGTCTTCTGCTTG +>TRUSEQADAPTER, INDEX 11 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACGGCTACATCTCGTATGCCGTCTTCTGCTTG +>TRUSEQADAPTER, INDEX 12 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACCTTGTAATCTCGTATGCCGTCTTCTGCTTG +>TRUSEQADAPTER, INDEX 13 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGTCAACAATCTCGTATGCCGTCTTCTGCTTG +>TRUSEQADAPTER, INDEX 14 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGTTCCGTATCTCGTATGCCGTCTTCTGCTTG +>TRUSEQADAPTER, INDEX 15 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACATGTCAGAATCTCGTATGCCGTCTTCTGCTTG +>TRUSEQADAPTER, INDEX 16 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACCCGTCCCGATCTCGTATGCCGTCTTCTGCTTG +>TRUSEQADAPTER, INDEX 18 4 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTCCGCACATCTCGTATGCCGTCTTCTGCTTG +>TRUSEQADAPTER, INDEX 19 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTGAAACGATCTCGTATGCCGTCTTCTGCTTG +>TRUSEQADAPTER, INDEX 20 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTGGCCTTATCTCGTATGCCGTCTTCTGCTTG +>TRUSEQADAPTER, INDEX 21 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTTTCGGAATCTCGTATGCCGTCTTCTGCTTG +>TRUSEQADAPTER, INDEX 22 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGTACGTAATCTCGTATGCCGTCTTCTGCTTG +>TRUSEQADAPTER, INDEX 23 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACGAGTGGATATCTCGTATGCCGTCTTCTGCTTG +>TRUSEQADAPTER, INDEX 25 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTGATATATCTCGTATGCCGTCTTCTGCTTG +>TRUSEQADAPTER, INDEX 27 +GATCGGAAGAGCACACGTCTGAACTCCAGTCACATTCCTTTATCTCGTATGCCGTCTTCTGCTTG +>PE ADAPTER A +GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG +>PE ADAPTER B +ACACTCTTTCCCTACACGACGCTCTTCCGATCT +>PE PCR PRIMER 1.0 +AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT +>PE PCR PRIMER 2.0 +CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT +>PE READ 1 SEQUENCING PRIMER +ACACTCTTTCCCTACACGACGCTCTTCCGATCT +>PE READ 2 SEQUENCING PRIMER +CGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT +>NEXTERA_CIRCULARIZED_DUPLICATE_JUNCTION_ADAPTER +CTGTCTCTTATACACATCTAGATGTGTATAAGAGACAG +>NEXTERA_CIRCULARIZED_SINGLE_JUNCTION_ADAPTER +CTGTCTCTTATACACATCT +>NEXTERA_CIRCULARIZED_SINGLE_JUNCTION_ADAPTER_REVERSE_COMPLEMENT +AGATGTGTATAAGAGACAG +>NEXTERA_READ_1_EXTERNAL_ADAPTER +ATCGGAAGAGCACACGTCTGAACTCCAGTCAC +>NEXTERA_READ_2_EXTERNAL_ADAPTER +GATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT +>NEXTERA_CIRCULARIZED_DUPLICATE_JUNCTION_ADAPTER_REVERSE_COMPLEMENT +CTGTCTCTTATACACATCTAGATGTGTATAAGAGACAG +>NEXTERA_READ_1_EXTERNAL_ADAPTER_REVERSE_COMPLEMENT +GTGACTGGAGTTCAGACGTGTGCTCTTCCGAT +>NEXTERA_READ_2_EXTERNAL_ADAPTER_REVERSE_COMPLEMENT +ACACTCTTTCCCTACACGACGCTCTTCCGATC +>GH_TBH_HINDIII_LEFT +GTTGTAAAACGACGGCCAGTGAATTGTAATACGACTCACTATAGGGCGAATTCGAGCTCGGTACCCGGGGATCCTCTAGAGTCGACCTGCAGGCATGCA +>GH_TBH_HINDIII_RGHT +AGCTTGAGTATTCTATAGTGTCACCTAAATAGCTTGGCGTAATCATGGTCATAGCTGTTTCCTGTGTGAAATTGTTATCCGCTCACAATTCCACACAAC +>GH_TBR_HINDIII_LEFT +CTGAGCCTTTCGTTTTATTTGACCATGTTGGTATGATTTAAATTAATACGACTCACTATAGGGTCAGTGCGGCCGCGACTTCAAGTCACGTGAATTCCA +>GH_TBR_HINDIII_RIGHT +AGCTTCGGATCCCACTGTGGTGGATGCATTTTAACCCATCACATATACCTGCCGTTCACTATTATTTAGTGAAATGAGATATTATGATATTTTCTGAAT +>GH_TBB_BAMHI_LEFT +GGGTAACGCCAGGGTTTTCCCAGTCACGACGTTGTAAAACGACGGCCAGTGAATTGTAATACGACTCACTATAGGGCGAATTCGAGCTCGGTACCCGGG +>GH_TBB_BAMHI_RGHT +GATCCTCTAGAGTCGACCTGCAGGCATGCAAGCTTGAGTATTCTATAGTGTCACCTAAATAGCTTGGCGTAATCATGGTCATAGCTGTTTCCTGTGTGA +>PCC2FOS_ECO27I_LEFT +CGTTGTAAAACGACGGCCAGTGAATTGTAATACGACTCACTATAGGGCGAATTCGAGCTCGGTACCCGGGGATCCCACGTACAACGACACCTAGACCAC +>PCC2FOS_ECO27I_RGHT +GTGTTCCTAGGCTGTTTCCTGGTGGGATCCTCTAGAGTCGACCTGCAGGCATGCAAGCTTGAGTATTCTATAGTCTCACCTAAATAGCTTGGCGTAATC +>GH_TBH_HINDIII_LEFT_RC +TGCATGCCTGCAGGTCGACTCTAGAGGATCCCCGGGTACCGAGCTCGAATTCGCCCTATAGTGAGTCGTATTACAATTCACTGGCCGTCGTTTTACAAC +>GH_TBH_HINDIII_RGHT_RC +GTTGTGTGGAATTGTGAGCGGATAACAATTTCACACAGGAAACAGCTATGACCATGATTACGCCAAGCTATTTAGGTGACACTATAGAATACTCAAGCT +>GH_TBR_HINDIII_LEFT_RC +TGGAATTCACGTGACTTGAAGTCGCGGCCGCACTGACCCTATAGTGAGTCGTATTAATTTAAATCATACCAACATGGTCAAATAAAACGAAAGGCTCAG +>GH_TBR_HINDIII_RIGHT_RC +ATTCAGAAAATATCATAATATCTCATTTCACTAAATAATAGTGAACGGCAGGTATATGTGATGGGTTAAAATGCATCCACCACAGTGGGATCCGAAGCT +>GH_TBB_BAMHI_LEFT_RC +CCCGGGTACCGAGCTCGAATTCGCCCTATAGTGAGTCGTATTACAATTCACTGGCCGTCGTTTTACAACGTCGTGACTGGGAAAACCCTGGCGTTACCC +>GH_TBB_BAMHI_RGHT_RC +TCACACAGGAAACAGCTATGACCATGATTACGCCAAGCTATTTAGGTGACACTATAGAATACTCAAGCTTGCATGCCTGCAGGTCGACTCTAGAGGATC +>PCC2FOS_ECO27I_LEFT_RC +GTGGTCTAGGTGTCGTTGTACGTGGGATCCCCGGGTACCGAGCTCGAATTCGCCCTATAGTGAGTCGTATTACAATTCACTGGCCGTCGTTTTACAACG +>PCC2FOS_ECO27I_RGHT_RC +GATTACGCCAAGCTATTTAGGTGAGACTATAGAATACTCAAGCTTGCATGCCTGCAGGTCGACTCTAGAGGATCCCACCAGGAAACAGCCTAGGAACAC +>CDJA +CTGTCTCTTATACACATCTAGATGTGTATAAGAGACAG +>CSJA +CTGTCTCTTATACACATCT +>CSJARC +AGATGTGTATAAGAGACAG +>R1EA +GATCGGAAGAGCACACGTCTGAACTCCAGTCAC +>R2EA +GATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT +>PREFIXPE/1 +TACACTCTTTCCCTACACGACGCTCTTCCGATCT +>PREFIXPE/2 +GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT +>INDEX_PRIMER_SEQUENCE +CAAGCAGAAGACGGCATACGAGATATTGGCGTGACTGGAGTTCAGACGTGTGCTCTTCCGATC +>NEBNEXT_UNIVERSAL_FOR_ILLLUMINA +AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATC +>NEBNEXT_ADAPTER_ILLUMINA +GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTCTTTCCCTACACGA diff --git a/task-files/star/trimmomatic-0.36.jar b/task-files/star/trimmomatic-0.36.jar new file mode 100644 index 0000000..64c3104 Binary files /dev/null and b/task-files/star/trimmomatic-0.36.jar differ diff --git a/tools/AutoADAG.pyc b/tools/AutoADAG.pyc index b98c7b2..b8edf6a 100644 Binary files a/tools/AutoADAG.pyc and b/tools/AutoADAG.pyc differ diff --git a/tools/StringTie b/tools/StringTie index 74ae16a..26c839d 100755 --- a/tools/StringTie +++ b/tools/StringTie @@ -45,3 +45,4 @@ python ST_ttab_parse.py t_data.ctab > $BASE_NAME-merged_counts.fpkm mv t_data.ctab $BASE_NAME-t_data.ctab mv e_data.ctab $BASE_NAME-e_data.ctab mv i_data.ctab $BASE_NAME-i_data.ctab + diff --git a/tools/dax-level-1 b/tools/dax-level-1 index 3df4f57..80ec330 100755 --- a/tools/dax-level-1 +++ b/tools/dax-level-1 @@ -26,32 +26,40 @@ if len(r) != 1: sys.exit(1) # validate the settings in the config file -if not conf.getboolean("config", "hisat2") and not conf.getboolean("config", "tophat2"): - print("Please enable either hisat2 or tophat2 in the config file") +if not conf.getboolean("config", "hisat2") and not conf.getboolean("config", "tophat2") and not conf.getboolean("config", "star"): + print("Please enable either hisat2 or tophat2 or star in the config file") sys.exit(1) -if conf.getboolean("config", "hisat2") and conf.getboolean("config", "tophat2"): +if conf.getboolean("config", "hisat2") and conf.getboolean("config", "tophat2") and conf.getboolean("config", "star"): + print("Please enable only one of hisat2 or tophat2 or star in the config file") + sys.exit(1) +if conf.getboolean("config", "hisat2") and conf.getboolean("config", "tophat2") and not conf.getboolean("config", "star"): print("Please enable only one of hisat2 and tophat2 in the config file") sys.exit(1) +if conf.getboolean("config", "hisat2") and not conf.getboolean("config", "tophat2") and conf.getboolean("config", "star"): + print("Please enable only one of hisat2 and star in the config file") + sys.exit(1) +if not conf.getboolean("config", "hisat2") and conf.getboolean("config", "tophat2") and conf.getboolean("config", "star"): + print("Please enable only one of tophat2 and star in the config file") + sys.exit(1) if not conf.getboolean("config", "stringtie") and not conf.getboolean("config", "cufflinks"): print("Please enable either stringtie or cufflinks in the config file") sys.exit(1) if conf.getboolean("config", "stringtie") and conf.getboolean("config", "cufflinks"): print("Please enable only one of stringtie and cufflinks in the config file") sys.exit(1) -if conf.getboolean("config", "paired") and conf.getboolean("config", "single"): - print("Please enable only paired OR single end reads") - sys.exit(1) -if not conf.getboolean("config", "paired") and not conf.getboolean("config", "single"): - print("Please enable either single or paired end reads") - sys.exit(1) # making sure the user specified a reference set print("\nAdding reference files ...") count = 0 -for fname in os.listdir(base_dir + "/reference/"): - if re.search("^" + conf.get("reference", "reference_prefix"), fname): - print(" " + fname) - count += 1 + +for fname in os.listdir(base_dir + "/reference/star_index/"): +# if re.search("^" + conf.get("reference", "reference_prefix"), fname): +# print(" " + fname) +# count += 1 +# if re.search("^" + conf.get("reference","reference_path"), fname): + print(" " + fname) + count += 1 + if count == 0: print("ERROR: Unable to find reference files in the reference/ directory" + " with the specified prefix: " + conf.get("reference", "reference_prefix")) @@ -124,14 +132,14 @@ if conf.getboolean("config", "paired"): #dl.addProfile(Profile("hints", "execution.site", "local")) dl.addProfile(Profile("dagman", "CATEGORY", "sradownload")) dax.addJob(dl) - + # set up split jobs split1 = Job(name="prepare-inputs") split1.uses(forward_file, link=Link.INPUT) split1.addArguments(base_dir, forward_file, data_dir + "/" + str(input_id), str(input_id) + "-forward") split1.addProfile(Profile("hints", "execution.site", "local")) dax.addJob(split1) - + split2 = Job(name="prepare-inputs") split2.uses(reverse_file, link=Link.INPUT) split2.addArguments(base_dir, reverse_file , data_dir + "/" + str(input_id), str(input_id) + "-reverse") @@ -182,7 +190,6 @@ elif conf.getboolean("config", "single"): prepare_jobs.append(split1) - # generate sub workflow j2 = Job(name="dax-level-2") diff --git a/tools/dax-level-2 b/tools/dax-level-2 index 9ab9f84..f08f520 100755 --- a/tools/dax-level-2 +++ b/tools/dax-level-2 @@ -55,21 +55,38 @@ def add_ref_files(dax, job, ref_prefix): """ add a set of reference input files """ - global ref_files - if ref_prefix not in ref_files: - ref_files[ref_prefix] = {} - # add to DAX-level replica catalog - for fname in os.listdir(base_dir + "/reference/"): - if (not re.search("^" + ref_prefix, fname)) or (re.search("\.gff3$", fname)): - # skip non-matching files - continue - if fname not in ref_files[ref_prefix]: - ref_files[ref_prefix][fname] = File(fname) - ref_files[ref_prefix][fname].addPFN( \ - PFN("file://" + base_dir + "/reference/" + fname, "local")) - dax.addFile(ref_files[ref_prefix][fname]) - for f in ref_files[ref_prefix]: - job.uses(f, link=Link.INPUT) + + + if conf.getboolean("config", "star"): + global ref_files + # add to DAX-level replica catalog + for fname in os.listdir(base_dir + "/reference/star_index/"): + if fname not in ref_files: + ref_files[fname] = File(fname) + ref_files[fname].addPFN( \ + PFN("file://" + base_dir + "/reference/star_index/" + fname, "local")) + dax.addFile(ref_files[fname]) + for f in ref_files: + job.uses(f, link=Link.INPUT) + + + else: + global ref_files + if ref_prefix not in ref_files: + ref_files[ref_prefix] = {} + # add to DAX-level replica catalog + for fname in os.listdir(base_dir + "/reference/"): + if (not re.search("^" + ref_prefix, fname)) or (re.search("\.gff3$", fname)): + # skip non-matching files + continue + if fname not in ref_files[ref_prefix]: + ref_files[ref_prefix][fname] = File(fname) + ref_files[ref_prefix][fname].addPFN( \ + PFN("file://" + base_dir + "/reference/" + fname, "local")) + dax.addFile(ref_files[ref_prefix][fname]) + for f in ref_files[ref_prefix]: + job.uses(f, link=Link.INPUT) + def add_gff3_file(dax, job): @@ -154,9 +171,40 @@ if conf.getboolean("config", "paired"): dax.addJob(j) return f1 + + + def star(task_files, dax, base_name, part, common_part, forward_file, reverse_file): + # Add job + j = Job(name="star") + j.addArguments(getpass.getuser(), \ + run_id, \ + conf.get("reference", "reference_prefix"), \ + base_name, \ + part, \ + common_part, "paired") + add_task_files(task_files, dax, j, "star") + add_ref_files(dax, j, conf.get("reference", "reference_prefix")) + j.uses(forward_file, link=Link.INPUT) + j.uses(reverse_file, link=Link.INPUT) + # output files + f1 = File(base_name + "-" + common_part + "-accepted_hits.bam") + j.uses(f1, link=Link.OUTPUT, transfer=False) + f3 = File(base_name + "-" + common_part + "-out.txt") + log_files.append(f3) + l = File(base_name + "-" + common_part + "-Log.final.out") + j.uses(l, link=Link.OUTPUT, transfer=False) + log_files.append(l) + j.uses(f3, link=Link.OUTPUT, transfer=False) + f4 = File(base_name + "-" + common_part + "-trimmomatic.txt") + j.uses(f4, link=Link.OUTPUT, transfer=False) + log_files.append(f4) + dax.addJob(j) + + return f1 + elif conf.getboolean("config", "single"): - + def hisat2(task_files, dax, base_name, part, common_part, forward_file): @@ -213,6 +261,36 @@ elif conf.getboolean("config", "single"): dax.addJob(j) return f1 + + + def star(task_files, dax, base_name, part, common_part, forward_file): + # Add job + j = Job(name="star") + j.addArguments(getpass.getuser(), \ + run_id, \ + conf.get("reference", "reference_prefix"), \ + base_name, \ + part, \ + common_part, "single") + add_task_files(task_files, dax, j, "star") + add_ref_files(dax, j, conf.get("reference", "reference_prefix")) + j.uses(forward_file, link=Link.INPUT) + # output files + f1 = File(base_name + "-" + common_part + "-accepted_hits.bam") + j.uses(f1, link=Link.OUTPUT, transfer=False) + f3 = File(base_name + "-" + common_part + "-out.txt") + log_files.append(f3) + j.uses(f3, link=Link.OUTPUT, transfer=False) + l = File(base_name + "-" + common_part + "-Log.final.out") + j.uses(l, link=Link.OUTPUT, transfer=False) + log_files.append(l) + f4 = File(base_name + "-" + common_part + "-trimmomatic.txt") + log_files.append(f4) + j.uses(f4, link=Link.OUTPUT, transfer=False) + dax.addJob(j) + + return f1 + def merge(dax, base_name, in_list): @@ -233,7 +311,7 @@ def merge(dax, base_name, in_list): def cuff(task_files, dax, base_name, merged): - + global fpkm_files # Add job @@ -256,7 +334,7 @@ def cuff(task_files, dax, base_name, merged): def stringtie(task_files, dax, base_name, merged): - + global fpkm_files # Add job @@ -315,18 +393,18 @@ def main(): for base_name in os.listdir(data_dir): parts = [] - + for part in os.listdir(data_dir + "/" + base_name): my_data_dir = run_dir + "/data/" + base_name + "/" + part - + accepted_hits = [] for in_name in os.listdir(my_data_dir): - + # only need the forward file if not "forward" in in_name: continue - + # use stash urls for the data so we can bypass and grab it directly from # the jobs (OSG Connect uses stash) if os.path.exists("/stash2"): @@ -336,40 +414,46 @@ def main(): else: base_url = "file://" + run_dir + "/data/" + base_name + "/" + part site_name = "local" - + common_part = in_name common_part = re.sub(".*-forward\-", "", common_part) common_part = re.sub("\.gz", "", common_part) - + for_file = File(base_name + "-forward-" + common_part) for_file.addPFN(PFN(base_url + "/" + base_name + "-forward-" + common_part, site_name)) dax.addFile(for_file) - + if conf.getboolean("config", "paired"): rev_file = File(base_name + "-reverse-" + common_part) rev_file.addPFN(PFN(base_url + "/" + base_name + "-reverse-" + common_part, site_name)) dax.addFile(rev_file) - + if conf.getboolean("config", "hisat2") and conf.getboolean("config", "paired"): out_file = hisat2(task_files, dax, base_name, part, common_part, for_file, rev_file) - + elif conf.getboolean("config", "hisat2") and conf.getboolean("config", "single"): out_file = hisat2(task_files, dax, base_name, part, common_part, for_file) elif conf.getboolean("config", "tophat2") and conf.getboolean("config", "paired"): out_file = tophat(task_files, dax, base_name, part, common_part, for_file, rev_file) - + elif conf.getboolean("config", "tophat2") and conf.getboolean("config", "single"): out_file = tophat(task_files, dax, base_name, part, common_part, for_file) - + + elif conf.getboolean("config", "star") and conf.getboolean("config", "paired"): + out_file = star(task_files, dax, base_name, part, common_part, for_file, rev_file) + + elif conf.getboolean("config", "star") and conf.getboolean("config", "single"): + out_file = star(task_files, dax, base_name, part, common_part, for_file) + accepted_hits.append(out_file) - + # merge merged = merge(dax, base_name + "-" + part, accepted_hits) parts.append(part) - - # second level merge + + # second level merge accepted_hits = [] for part in parts: f = File(base_name + "-" + part + "-merged.bam") @@ -390,6 +474,8 @@ def main(): j.addArguments("hisat2") elif conf.getboolean("config", "tophat2"): j.addArguments("tophat2") + elif conf.getboolean("config", "star"): + j.addArguments("star") if conf.getboolean("config", "paired"): j.addArguments("paired") elif conf.getboolean("config", "single"): @@ -407,7 +493,7 @@ def main(): j.uses(f, link=Link.OUTPUT, transfer=True) j.uses(g, link=Link.OUTPUT, transfer=True) dax.addJob(j) - + # Write the DAX f = open(run_dir + "/workflow/level-2.dax", "w") dax.writeXML(f) @@ -415,4 +501,3 @@ def main(): main() - diff --git a/tools/star b/tools/star new file mode 100755 index 0000000..e9bef69 --- /dev/null +++ b/tools/star @@ -0,0 +1,77 @@ +#!/bin/bash + +# module init required when running on non-OSG resources, and has to sourced +# before set -e as sometimes it exits non-0 when a module environment is +# already set up +. /cvmfs/oasis.opensciencegrid.org/osg/sw/module-init.sh + +set -e + +module load samtools/1.3.1 +module load java/8u25 + +set -v +set -o pipefail + +CONNECT_USER=$1 +RUN_ID=$2 +REF_PATH=$3 +BASE_NAME=$4 +PART=$5 +COMMON_NAME=$6 +LAYOUT=$7 + +unzip 2.5.2b.zip + +if [ $LAYOUT = "paired" ] ; then + + + java -Xmx512m \ + -jar trimmomatic-0.36.jar PE -threads 1 -phred33 \ + $BASE_NAME-forward-$COMMON_NAME $BASE_NAME-reverse-$COMMON_NAME \ + left_result.paired_trim.forward_$COMMON_NAME.fastq /dev/null \ + right_result.paired_trim.reverse_$COMMON_NAME.fastq /dev/null \ + ILLUMINACLIP:fasta_adapter.txt:2:40:15 LEADING:3 TRAILING:6 SLIDINGWINDOW:4:15 MINLEN:50 \ + 2>&1 | tee $BASE_NAME-$COMMON_NAME-trimmomatic.txt + + # empty outputs? + LEFT_SIZE=`ls -l left_result.paired_trim.forward_$COMMON_NAME.fastq | cut -d ' ' -f 5` + RIGHT_SIZE=`ls -l right_result.paired_trim.reverse_$COMMON_NAME.fastq | cut -d ' ' -f 5` + if [ $LEFT_SIZE -lt 50 -o $RIGHT_SIZE -lt 50 ]; then + echo "Warning: trimmomatic output was 0 sequences" + touch $BASE_NAME-$COMMON_NAME-accepted_hits.bam + exit 0 + fi + + + STAR-2.5.2b/bin/Linux_x86_64_static/STAR --genomeDir . --sjdbFileChrStartEnd ./*.Splice_Sites.txt --runThreadN 1 \ + --outSAMtype BAM SortedByCoordinate \ + --readFilesIn left_result.paired_trim.forward_$COMMON_NAME.fastq right_result.paired_trim.reverse_$COMMON_NAME.fastq | tee $BASE_NAME-$COMMON_NAME-out.txt + +elif [ $LAYOUT = "single" ] ; then + + + java -Xmx512m \ + -jar trimmomatic-0.36.jar SE -threads 1 -phred33 \ + $BASE_NAME-forward-$COMMON_NAME \ + left_result.paired_trim.forward_$COMMON_NAME.fastq \ + ILLUMINACLIP:fasta_adapter.txt:2:40:15 LEADING:3 TRAILING:6 SLIDINGWINDOW:4:15 MINLEN:50 \ + 2>&1 | tee $BASE_NAME-$COMMON_NAME-trimmomatic.txt + + # empty outputs? + LEFT_SIZE=`ls -l left_result.paired_trim.forward_$COMMON_NAME.fastq | cut -d ' ' -f 5` + if [ $LEFT_SIZE -lt 50 ] ; then + echo "Warning: trimmomatic output was 0 sequences" + touch $BASE_NAME-$COMMON_NAME-accepted_hits.bam + exit 0 + fi + + + STAR-2.5.2b/bin/Linux_x86_64_static/STAR --genomeDir . --sjdbFileChrStartEnd ./*.Splice_Sites.txt --runThreadN 1 \ + --outSAMtype BAM SortedByCoordinate \ + --readFilesIn left_result.paired_trim.forward_$COMMON_NAME.fastq | tee $BASE_NAME-$COMMON_NAME-out.txt +fi + +mv Aligned.sortedByCoord.out.bam $BASE_NAME-$COMMON_NAME-accepted_hits.bam + +mv Log.final.out $BASE_NAME-$COMMON_NAME-Log.final.out