Permalink
Browse files

Added support for STAR read alignment software

  • Loading branch information...
1 parent d02f8ab commit 8c8833408863afd1931713e78b020a1910e06988 @wpoehlm wpoehlm committed May 4, 2017
View
@@ -65,7 +65,7 @@ The workflow, configured to run Hisat2 and Stringtie, can then be launched by ru
$ ./submit
-From here, the user may follow our documentation to modify the software options as well as point to their own input datasets.
+From here, the user may follow our documentation to modify the software options as well as point to their own input datasets. Note that no test reference genome indices are provided for STAR, because they are too large to upload to GitHub.
@@ -96,8 +96,15 @@ file and gene annotation in GTF/GFF3 format, the following commands can be used
$ tophat2 -G GRCh38.gencode.v24.annotation.gff3 --transcriptome-index=transcriptome_data/GRCh38 GRCh38
$ tar czf GRCh38.transcriptome_data.tar.gz transcriptome_data/
+### If the user would like to use STAR:
+ # cd into the "star_index" directory within "reference". Place all genome files here
+ $ cd reference/star_index
+ $ STAR-2.5.2b/bin/Linux_x86_64_static/STAR --runMode genomeGenerate --runThreadN 4 --genomeDir ./ --genomeFastaFiles ./GRCh38.fa --sjdbGTFfile ./GRCh38.gencode.v24.annotation.gff3
+#### Generate a tab-delimited list of splice sites using the gene model GTF file as input (requires a Python version that provides `collections.defaultdict`)
+
+ $ python hisat2_extract_splice_sites.py GRCh38-gencode.v24.annotation.gtf > GRCh38.Splice_Sites.txt
## Workflow Configuration
@@ -128,6 +135,19 @@ $REF_PREFIX.transcriptome_data.tar.gz
$REF_PREFIX.gff3
+#### If the user would like to use STAR, the following files must be present in the _reference/star_index_ directory:
+
+chrLength.txt
+chrNameLength.txt
+chrName.txt
+chrStart.txt
+Genome
+genomeParameters.txt
+Log.out
+SA
+SAindex
+*Splice_Sites.txt
+
### User Input Datasets
OSG-GEM supports the processing of multiple input datasets into a single Gene Expression Matrix(GEM). The user
View
@@ -43,6 +43,9 @@ tophat2 = False
# process using Hisat2
hisat2 = True
+# process using STAR
+star = False
+
# process using Cufflinks
cufflinks = False
No changes.
@@ -1,3 +1,4 @@
+
## William Poehlman ##
## LogParse.py v1.1 ##
## wpoehlm@clemson.edu ##
@@ -18,6 +19,7 @@ def ParseLogs(trimlist, maplist, software, sample, layout):
maplist --> a list of hisat2, tophat2 or STAR log files
software --> read mapping software that generated log files. 'hisat2', 'tophat2' or 'star'
sample --> A sample number that is present at the beginning of each filename
+ layout --> Sequencing layout. 'single' or 'paired'
output:
A dictionary, where the sample ID number is the key, and the following statistics are stored in the list value:
@@ -34,18 +36,22 @@ def ParseLogs(trimlist, maplist, software, sample, layout):
((left mapped reads + right mapped reads)/2)/(cleaned input pairs)
+ The overall Alignment rate for STAR is calculated as follows:
+ (uniquely mapped reads + multiple alignments) / (cleaned input pairs)
+
Logfile names must be formatted as follows:
<sample number>-<unique file identifier>-align_summary.txt (only for tophat)
<sample number>-<unique file identifier>-fastq-out.txt (for hisat)
+ <sample number>-<unique file identifier>-Log.final.out (for star)
<sample number>-<unique file identifier>-fastq-trimmomatic.txt
for example:
2-000000-000000.fastq-align_summary.txt
1-000000-000000.fastq-out.txt
1-000000-000000.fastq-trimmomatic.txt
- NOTE: do not mix log files for tophat and hisat
+ NOTE: do not mix log files for tophat/hisat/star
"""
triminput = []
@@ -134,6 +140,91 @@ def ParseLogs(trimlist, maplist, software, sample, layout):
concordant.append(concordantpairs)
mapped.append(mappedsingles)
+ elif software == 'star' and layout == 'paired':
+ multimapped = []
+ uniqmapped = []
+ for file in trimlist:
+ basename = os.path.splitext(file)[0]
+ dataset = basename.split('-')[0]
+ if dataset == sample:
+ for line in open(file):
+ if 'Input Read Pairs' in line:
+ inp = int(line.split(' ')[3])
+ triminput.append(inp)
+ out = int(line.split(' ')[6])
+ trimoutput.append(out)
+ for file in maplist:
+ basename = os.path.splitext(file)[0]
+ dataset = basename.split('-')[0]
+ if dataset == sample:
+ count = 0
+ for line in open(file):
+ count +=1
+ if count == 9:
+ uniq = int(line.split('|')[1].strip())
+ uniqmapped.append(uniq)
+ if count == 24:
+ multi = int(line.split('|')[1].strip())
+ multimapped.append(multi)
+
+
+ concordant = sum(uniqmapped)
+ mult = sum(multimapped)
+ mapped = concordant + mult
+ inputraw = sum(triminput)
+ inputclean = sum(trimoutput)
+ float1 = float(mapped)
+ float2 = float(inputclean)
+ rate = float1/float2
+
+ report[sample] = [inputraw, inputclean, concordant, rate*100]
+
+ return report
+
+ if software == 'star' and layout == 'single':
+ multimapped = []
+ uniqmapped = []
+ for file in trimlist:
+ basename = os.path.splitext(file)[0]
+ dataset = basename.split('-')[0]
+ if dataset == sample:
+ for line in open(file):
+ if 'Input Reads' in line:
+ inp = int(line.split(' ')[2])
+ triminput.append(inp)
+ out = int(line.split(' ')[4])
+ trimoutput.append(out)
+ for file in maplist:
+ basename = os.path.splitext(file)[0]
+ dataset = basename.split('-')[0]
+ if dataset == sample:
+ count = 0
+
+ for line in open(file):
+ count +=1
+ if count == 9:
+ uniq = int(line.split('|')[1].strip())
+ uniqmapped.append(uniq)
+ if count == 24:
+ multi = int(line.split('|')[1].strip())
+ multimapped.append(multi)
+
+ concordant = sum(uniqmapped)
+ mult = sum(multimapped)
+ mapped = concordant + mult
+ inputraw = sum(triminput)
+ inputclean = sum(trimoutput)
+
+ float1 = float(mapped)
+ float2 = float(inputclean)
+ rate = float1/float2
+
+ report[sample] = [inputraw, inputclean, concordant, rate*100]
+
+ return report
+
+
+
if software == 'tophat2' and layout == 'paired':
inputraw = sum(triminput)
@@ -252,11 +343,12 @@ def main():
#locate input log files and append to lists
#tophat alignment summary
- maploglist = [file for file in glob.glob(('*summary.txt'))]
-
- if len(maploglist) == 0:
- #If there are no tophat alignment summaries, look for hisat standard output instead
+ if software == 'tophat2':
+ maploglist = [file for file in glob.glob(('*summary.txt'))]
+ elif software == 'hisat2':
maploglist = [file for file in glob.glob('*fastq-out.txt')]
+ elif software == 'star':
+ maploglist = [file for file in glob.glob('*Log.final.out')]
trimloglist = [file for file in glob.glob('*-trimmomatic.txt')]
if len(maploglist) != len(trimloglist):
View
Binary file not shown.
@@ -0,0 +1,130 @@
+>TRUSEQUNIVERSAL ADAPTER
+AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT
+>TRUSEQADAPTER, INDEX 1
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACATCACGATCTCGTATGCCGTCTTCTGCTTG
+>TRUSEQADAPTER, INDEX 2
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGATGTATCTCGTATGCCGTCTTCTGCTTG
+>TRUSEQADAPTER, INDEX 3
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACTTAGGCATCTCGTATGCCGTCTTCTGCTTG
+>TRUSEQADAPTER, INDEX 4
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACTGACCAATCTCGTATGCCGTCTTCTGCTTG
+>TRUSEQADAPTER, INDEX 5
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACACAGTGATCTCGTATGCCGTCTTCTGCTTG
+>TRUSEQADAPTER, INDEX 6
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACGCCAATATCTCGTATGCCGTCTTCTGCTTG
+>TRUSEQADAPTER, INDEX 7
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACCAGATCATCTCGTATGCCGTCTTCTGCTTG
+>TRUSEQADAPTER, INDEX 8
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTTGAATCTCGTATGCCGTCTTCTGCTTG
+>TRUSEQADAPTER, INDEX 9
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACGATCAGATCTCGTATGCCGTCTTCTGCTTG
+>TRUSEQADAPTER, INDEX 10
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACTAGCTTATCTCGTATGCCGTCTTCTGCTTG
+>TRUSEQADAPTER, INDEX 11
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACGGCTACATCTCGTATGCCGTCTTCTGCTTG
+>TRUSEQADAPTER, INDEX 12
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACCTTGTAATCTCGTATGCCGTCTTCTGCTTG
+>TRUSEQADAPTER, INDEX 13
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGTCAACAATCTCGTATGCCGTCTTCTGCTTG
+>TRUSEQADAPTER, INDEX 14
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACAGTTCCGTATCTCGTATGCCGTCTTCTGCTTG
+>TRUSEQADAPTER, INDEX 15
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACATGTCAGAATCTCGTATGCCGTCTTCTGCTTG
+>TRUSEQADAPTER, INDEX 16
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACCCGTCCCGATCTCGTATGCCGTCTTCTGCTTG
+>TRUSEQADAPTER, INDEX 18 4
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTCCGCACATCTCGTATGCCGTCTTCTGCTTG
+>TRUSEQADAPTER, INDEX 19
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTGAAACGATCTCGTATGCCGTCTTCTGCTTG
+>TRUSEQADAPTER, INDEX 20
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTGGCCTTATCTCGTATGCCGTCTTCTGCTTG
+>TRUSEQADAPTER, INDEX 21
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACGTTTCGGAATCTCGTATGCCGTCTTCTGCTTG
+>TRUSEQADAPTER, INDEX 22
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACCGTACGTAATCTCGTATGCCGTCTTCTGCTTG
+>TRUSEQADAPTER, INDEX 23
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACGAGTGGATATCTCGTATGCCGTCTTCTGCTTG
+>TRUSEQADAPTER, INDEX 25
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTGATATATCTCGTATGCCGTCTTCTGCTTG
+>TRUSEQADAPTER, INDEX 27
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACATTCCTTTATCTCGTATGCCGTCTTCTGCTTG
+>PE ADAPTER A
+GATCGGAAGAGCGGTTCAGCAGGAATGCCGAG
+>PE ADAPTER B
+ACACTCTTTCCCTACACGACGCTCTTCCGATCT
+>PE PCR PRIMER 1.0
+AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATCT
+>PE PCR PRIMER 2.0
+CAAGCAGAAGACGGCATACGAGATCGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT
+>PE READ 1 SEQUENCING PRIMER
+ACACTCTTTCCCTACACGACGCTCTTCCGATCT
+>PE READ 2 SEQUENCING PRIMER
+CGGTCTCGGCATTCCTGCTGAACCGCTCTTCCGATCT
+>NEXTERA_CIRCULARIZED_DUPLICATE_JUNCTION_ADAPTER
+CTGTCTCTTATACACATCTAGATGTGTATAAGAGACAG
+>NEXTERA_CIRCULARIZED_SINGLE_JUNCTION_ADAPTER
+CTGTCTCTTATACACATCT
+>NEXTERA_CIRCULARIZED_SINGLE_JUNCTION_ADAPTER_REVERSE_COMPLEMENT
+AGATGTGTATAAGAGACAG
+>NEXTERA_READ_1_EXTERNAL_ADAPTER
+ATCGGAAGAGCACACGTCTGAACTCCAGTCAC
+>NEXTERA_READ_2_EXTERNAL_ADAPTER
+GATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT
+>NEXTERA_CIRCULARIZED_DUPLICATE_JUNCTION_ADAPTER_REVERSE_COMPLEMENT
+CTGTCTCTTATACACATCTAGATGTGTATAAGAGACAG
+>NEXTERA_READ_1_EXTERNAL_ADAPTER_REVERSE_COMPLEMENT
+GTGACTGGAGTTCAGACGTGTGCTCTTCCGAT
+>NEXTERA_READ_2_EXTERNAL_ADAPTER_REVERSE_COMPLEMENT
+ACACTCTTTCCCTACACGACGCTCTTCCGATC
+>GH_TBH_HINDIII_LEFT
+GTTGTAAAACGACGGCCAGTGAATTGTAATACGACTCACTATAGGGCGAATTCGAGCTCGGTACCCGGGGATCCTCTAGAGTCGACCTGCAGGCATGCA
+>GH_TBH_HINDIII_RGHT
+AGCTTGAGTATTCTATAGTGTCACCTAAATAGCTTGGCGTAATCATGGTCATAGCTGTTTCCTGTGTGAAATTGTTATCCGCTCACAATTCCACACAAC
+>GH_TBR_HINDIII_LEFT
+CTGAGCCTTTCGTTTTATTTGACCATGTTGGTATGATTTAAATTAATACGACTCACTATAGGGTCAGTGCGGCCGCGACTTCAAGTCACGTGAATTCCA
+>GH_TBR_HINDIII_RIGHT
+AGCTTCGGATCCCACTGTGGTGGATGCATTTTAACCCATCACATATACCTGCCGTTCACTATTATTTAGTGAAATGAGATATTATGATATTTTCTGAAT
+>GH_TBB_BAMHI_LEFT
+GGGTAACGCCAGGGTTTTCCCAGTCACGACGTTGTAAAACGACGGCCAGTGAATTGTAATACGACTCACTATAGGGCGAATTCGAGCTCGGTACCCGGG
+>GH_TBB_BAMHI_RGHT
+GATCCTCTAGAGTCGACCTGCAGGCATGCAAGCTTGAGTATTCTATAGTGTCACCTAAATAGCTTGGCGTAATCATGGTCATAGCTGTTTCCTGTGTGA
+>PCC2FOS_ECO27I_LEFT
+CGTTGTAAAACGACGGCCAGTGAATTGTAATACGACTCACTATAGGGCGAATTCGAGCTCGGTACCCGGGGATCCCACGTACAACGACACCTAGACCAC
+>PCC2FOS_ECO27I_RGHT
+GTGTTCCTAGGCTGTTTCCTGGTGGGATCCTCTAGAGTCGACCTGCAGGCATGCAAGCTTGAGTATTCTATAGTCTCACCTAAATAGCTTGGCGTAATC
+>GH_TBH_HINDIII_LEFT_RC
+TGCATGCCTGCAGGTCGACTCTAGAGGATCCCCGGGTACCGAGCTCGAATTCGCCCTATAGTGAGTCGTATTACAATTCACTGGCCGTCGTTTTACAAC
+>GH_TBH_HINDIII_RGHT_RC
+GTTGTGTGGAATTGTGAGCGGATAACAATTTCACACAGGAAACAGCTATGACCATGATTACGCCAAGCTATTTAGGTGACACTATAGAATACTCAAGCT
+>GH_TBR_HINDIII_LEFT_RC
+TGGAATTCACGTGACTTGAAGTCGCGGCCGCACTGACCCTATAGTGAGTCGTATTAATTTAAATCATACCAACATGGTCAAATAAAACGAAAGGCTCAG
+>GH_TBR_HINDIII_RIGHT_RC
+ATTCAGAAAATATCATAATATCTCATTTCACTAAATAATAGTGAACGGCAGGTATATGTGATGGGTTAAAATGCATCCACCACAGTGGGATCCGAAGCT
+>GH_TBB_BAMHI_LEFT_RC
+CCCGGGTACCGAGCTCGAATTCGCCCTATAGTGAGTCGTATTACAATTCACTGGCCGTCGTTTTACAACGTCGTGACTGGGAAAACCCTGGCGTTACCC
+>GH_TBB_BAMHI_RGHT_RC
+TCACACAGGAAACAGCTATGACCATGATTACGCCAAGCTATTTAGGTGACACTATAGAATACTCAAGCTTGCATGCCTGCAGGTCGACTCTAGAGGATC
+>PCC2FOS_ECO27I_LEFT_RC
+GTGGTCTAGGTGTCGTTGTACGTGGGATCCCCGGGTACCGAGCTCGAATTCGCCCTATAGTGAGTCGTATTACAATTCACTGGCCGTCGTTTTACAACG
+>PCC2FOS_ECO27I_RGHT_RC
+GATTACGCCAAGCTATTTAGGTGAGACTATAGAATACTCAAGCTTGCATGCCTGCAGGTCGACTCTAGAGGATCCCACCAGGAAACAGCCTAGGAACAC
+>CDJA
+CTGTCTCTTATACACATCTAGATGTGTATAAGAGACAG
+>CSJA
+CTGTCTCTTATACACATCT
+>CSJARC
+AGATGTGTATAAGAGACAG
+>R1EA
+GATCGGAAGAGCACACGTCTGAACTCCAGTCAC
+>R2EA
+GATCGGAAGAGCGTCGTGTAGGGAAAGAGTGT
+>PREFIXPE/1
+TACACTCTTTCCCTACACGACGCTCTTCCGATCT
+>PREFIXPE/2
+GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT
+>INDEX_PRIMER_SEQUENCE
+CAAGCAGAAGACGGCATACGAGATATTGGCGTGACTGGAGTTCAGACGTGTGCTCTTCCGATC
+>NEBNEXT_UNIVERSAL_FOR_ILLLUMINA
+AATGATACGGCGACCACCGAGATCTACACTCTTTCCCTACACGACGCTCTTCCGATC
+>NEBNEXT_ADAPTER_ILLUMINA
+GATCGGAAGAGCACACGTCTGAACTCCAGTCACACTCTTTCCCTACACGA
Binary file not shown.
View
Binary file not shown.
View
@@ -45,3 +45,4 @@ python ST_ttab_parse.py t_data.ctab > $BASE_NAME-merged_counts.fpkm
mv t_data.ctab $BASE_NAME-t_data.ctab
mv e_data.ctab $BASE_NAME-e_data.ctab
mv i_data.ctab $BASE_NAME-i_data.ctab
+
Oops, something went wrong.

0 comments on commit 8c88334

Please sign in to comment.