Modified rsem-prepare-reference so that not adding poly(A) tails beco…

…mes default
deweylab · Nov 5, 2014 · b32e364 · b32e364
1 parent 6ee9d4c
commit b32e364
Show file tree

Hide file tree

Showing 3 changed files with 38 additions and 25 deletions.
diff --git a/README.md b/README.md
@@ -175,9 +175,7 @@ of 'rsem-calculate-expression'. However, if you have run
 indices for your aligner, you can use 'rsem-tbam2gbam' to convert your
 transcript coordinate BAM alignments file into a genomic coordinate
 BAM alignments file without the need to run the whole RSEM
-pipeline. Please note that 'rsem-prepare-reference' will convert all
-'N' into 'G' by default for 'reference_name.idx.fa'. If you do not
-want this to happen, please use '--no-ntog' option.
+pipeline.
 
 Usage:
 

diff --git a/cnt_file_description.txt b/cnt_file_description.txt
@@ -0,0 +1,15 @@
+# '#' marks the start of comments (till the end of the line) 
+# *.cnt file contains alignment statistics based purely on the alignment results obtained from aligners
+N0 N1 N2 N_tot   #  N0, number of unalignable reads; N1, number of alignable reads; N2, number of filtered reads due to too many alignments; N_tot = N0 + N1 + N2
+nUnique nMulti nUncertain   # nUnique, number of reads aligned uniquely to a gene; nMulti, number of reads aligned to multiple genes; nUnique + nMulti = N1; 
+                            # nUncertain, number of reads aligned to multiple locations in the given reference sequences, which include isoform-level multi-mapping reads
+nHits read_type             # nHits, number of total alignments. 
+                            # read_type: 0, single-end read, no quality score; 1, single-end read, with quality score; 2, paired-end read, no quality score; 3, paired-end read, with quality score
+
+# The next section counts reads by the number of alignments they have. Each line contains two values separated by a TAB character. The first value is number of alignments. 'Inf' refers to reads filtered due to too many alignments. The second value is the number of reads that contain such many alignments
+
+0                             N0
+...
+number_of_alignments          number_of_reads_with_that_many_alignments
+...                           
+Inf                           N2
diff --git a/rsem-prepare-reference b/rsem-prepare-reference
@@ -15,10 +15,10 @@ my $status;
 
 my $gtfF = "";
 my $mappingF = "";
-my $polyAChoice = 0; # 0, default, pad polyA tails for all isoforms; 1, --no-polyA; 2, --no-polyA-subset
-my $no_polyA = 0; # for option --no-polyA 
-my $subsetFile = "";
+my $polyAChoice = 1; # 0, --polyA, add polyA tails for all isoforms; 1, default, no polyA tails; 2, --no-polyA-subset
+my $polyA = 0; # for option --polyA, default off 
 my $polyALen = 125;
+my $subsetFile = "";
 my $bowtie = 0;
 my $bowtie_path = "";
 my $bowtie2 = 0;
@@ -31,9 +31,9 @@ my $alleleMappingF = "";
 GetOptions("gtf=s" => \$gtfF,
 	   "transcript-to-gene-map=s" => \$mappingF,
 	   "allele-to-gene-map=s" => \$alleleMappingF,
-	   "no-polyA" => \$no_polyA, 
-	   "no-polyA-subset=s" => \$subsetFile,
+	   "polyA" => \$polyA, 
 	   "polyA-length=i" => \$polyALen,
+	   "no-polyA-subset=s" => \$subsetFile,
 	   "bowtie" => \$bowtie,
 	   "bowtie-path=s" => \$bowtie_path,
 	   "bowtie2" => \$bowtie2,
@@ -42,7 +42,6 @@ GetOptions("gtf=s" => \$gtfF,
 	   "h|help" => \$help) or pod2usage(-exitval => 2, -verbose => 2);
 
 pod2usage(-verbose => 2) if ($help == 1);
-pod2usage(-msg => "Set --no-polyA & --no-polyA-subset at the same time!", -exitval => 2, -verbose => 2) if ($no_polyA == 1 && $subsetFile ne '');
 pod2usage(-msg => "--transcript-to-gene-map and --allele-to-gene-map are mutually exclusive!", -exitval => 2, -verbose => 2) if (($mappingF ne "") && ($alleleMappingF ne ""));
 pod2usage(-msg => "--gtf and --allele-to-gene-map are mutually exclusive!", -exitval => 2, -verbose => 2) if (($gtfF ne "") && ($alleleMappingF ne ""));
 pod2usage(-msg => "Invalid number of arguments!", -exitval => 2, -verbose => 2) if (scalar(@ARGV) != 2);
@@ -66,8 +65,9 @@ if ($size == 1 && (-d $list[0])) {
 
 pod2usage(-msg => "reference_fasta_file(s) is empty! Please check if you provide the correct folder name or file suffixes!", -exitval => 2, -verbose => 2) if ($size <= 0);
 
-if ($no_polyA) { $polyAChoice = 1 }
-elsif ($subsetFile ne "") { $polyAChoice = 2; }
+if ($polyA) { 
+    $polyAChoice = ($subsetFile ne "") ? 2 : 0;
+}
 
 if ($bowtie_path ne "") { $bowtie_path .= "/"; }
 if ($bowtie2_path ne "") { $bowtie2_path .= "/"; }
@@ -178,18 +178,18 @@ This option is designed for quantifying allele-specific expression. It is only v
 
 (Default: off) 
 
-=item B<--no-polyA>
+=item B<--polyA>
 
-Do not add poly(A) tails to the end of reference isoforms. (Default: adding poly(A) tails to all transcripts)
-
-=item B<--no-polyA-subset> <file>
-
-Add poly(A) tails to all transcripts except those listed in <file>. <file> is a file containing a list of transcript_ids. (Default: add poly(A) tails to all transcripts. This option cannot be used with '--no-polyA'.)
+Add poly(A) tails to the end of all reference isoforms. The length of poly(A) tail added is specified by '--polyA-length' option. STAR aligner users may not want to use this option. (Default: do not add poly(A) tail to any of the isoforms)
 
 =item B<--polyA-length> <int>
 
 The length of the poly(A) tails to be added. (Default: 125)
 
+=item B<--no-polyA-subset> <file>
+
+Only meaningful if '--polyA' is specified. Do not add poly(A) tails to those transcripts listed in <file>. <file> is a file containing a list of transcript_ids. (Default: off)
+
 =item B<--bowtie>
 
 Build Bowtie indices. (Default: off)
@@ -228,11 +228,11 @@ This program will generate 'reference_name.grp', 'reference_name.ti', 'reference
 
 B<'reference_name.transcripts.fa'> contains the extracted reference transcripts in Multi-FASTA format. Poly(A) tails are not added and it may contain lower case bases in its sequences if the corresponding genomic regions are soft-masked.
 
-B<'reference_name.idx.fa' and 'reference_name.n2g.idx.fa'> are used by aligners to build their own indices. Poly(A) tails are added unless '--no-polyA' is set. In addition, all sequence bases are converted into upper case. The only difference between 'reference_name.idx.fa' and 'reference_name.n2g.idx.fa' is that 'reference_name.n2g.idx.fa' in addition converts all 'N' characters to 'G' characters. This conversion is in particular desired for aligners (e.g. Bowtie) that do not allow reads to overlap with 'N' characters in the reference sequences. Otherwise, 'reference_name.idx.fa' should be used to build the aligner's index files. RSEM uses 'reference_name.idx.fa' to build Bowtie 2 indices and 'reference_name.n2g.idx.fa' to build Bowtie indices. For visualizing the transcript-coordinate-based BAM files generated by RSEM in IGV, 'reference_name.idx.fa' should be imported as a "genome" (see Visualization section in README.md for details). 
+B<'reference_name.idx.fa' and 'reference_name.n2g.idx.fa'> are used by aligners to build their own indices. In these two files, all sequence bases are converted into upper case. In addition, poly(A) tails are added if '--polyA' option is set. The only difference between 'reference_name.idx.fa' and 'reference_name.n2g.idx.fa' is that 'reference_name.n2g.idx.fa' in addition converts all 'N' characters to 'G' characters. This conversion is in particular desired for aligners (e.g. Bowtie) that do not allow reads to overlap with 'N' characters in the reference sequences. Otherwise, 'reference_name.idx.fa' should be used to build the aligner's index files. RSEM uses 'reference_name.idx.fa' to build Bowtie 2 indices and 'reference_name.n2g.idx.fa' to build Bowtie indices. For visualizing the transcript-coordinate-based BAM files generated by RSEM in IGV, 'reference_name.idx.fa' should be imported as a "genome" (see Visualization section in README.md for details). 
  
 =head1 EXAMPLES
 
-1) Suppose we have mouse RNA-Seq data and want to use the UCSC mm9 version of the mouse genome. We have downloaded the UCSC Genes transcript annotations in GTF format (as mm9.gtf) using the Table Browser and the knownIsoforms.txt file for mm9 from the UCSC Downloads. We also have all chromosome files for mm9 in the directory '/data/mm9'.  We want to put the generated reference files under '/ref' with name 'mouse_125'. We'll add poly(A) tails with length 125. Please note that GTF files generated from UCSC's Table Browser do not contain isoform-gene relationship information.  For the UCSC Genes annotation, this information can be obtained from the knownIsoforms.txt file.  Suppose we want to build Bowtie indices and Bowtie executables are found in '/sw/bowtie'.
+1) Suppose we have mouse RNA-Seq data and want to use the UCSC mm9 version of the mouse genome. We have downloaded the UCSC Genes transcript annotations in GTF format (as mm9.gtf) using the Table Browser and the knownIsoforms.txt file for mm9 from the UCSC Downloads. We also have all chromosome files for mm9 in the directory '/data/mm9'.  We want to put the generated reference files under '/ref' with name 'mouse_0'. We do not add any poly(A) tails. Please note that GTF files generated from UCSC's Table Browser do not contain isoform-gene relationship information.  For the UCSC Genes annotation, this information can be obtained from the knownIsoforms.txt file.  Suppose we want to build Bowtie indices and Bowtie executables are found in '/sw/bowtie'.
 
 There are two ways to write the command:
 
@@ -241,7 +241,7 @@ There are two ways to write the command:
                         --bowtie \
                         --bowtie-path /sw/bowtie \                  
                         /data/mm9/chr1.fa,/data/mm9/chr2.fa,...,/data/mm9/chrM.fa \
-                        /ref/mouse_125
+                        /ref/mouse_0
 
 OR
 
@@ -250,7 +250,7 @@ OR
                         --bowtie \
                         --bowtie-path /sw/bowtie \
                         /data/mm9 \
-                        /ref/mouse_125
+                        /ref/mouse_0
 
 2) Suppose we also want to build Bowtie 2 indices in the above example and Bowtie 2 executables are found in '/sw/bowtie2', the command will be:
 
@@ -261,13 +261,13 @@ OR
                         --bowtie2 \
                         --bowtie2-path /sw/bowtie2 \
                         /data/mm9 \
-                        /ref/mouse_125
+                        /ref/mouse_0
 
-3) Suppose we only have transcripts from EST tags in 'mm9.fasta'. In addition, we also have isoform-gene information in 'mapping.txt'. We do not want to add any poly(A) tails. The reference_name will be set to 'mouse_0'. In addition, we do not want to build Bowtie/Bowtie 2 indices, and will use an alternative aligner to align reads against either 'mouse_0.idx.fa' or 'mouse_0.idx.n2g.fa':
+3) Suppose we only have transcripts from EST tags stored in 'mm9.fasta' and isoform-gene information stored in 'mapping.txt'. We want to add 125bp long poly(A) tails to all transcripts. The reference_name is set as 'mouse_125'. In addition, we do not want to build Bowtie/Bowtie 2 indices, and will use an alternative aligner to align reads against either 'mouse_125.idx.fa' or 'mouse_125.idx.n2g.fa':
 
  rsem-prepare-reference --transcript-to-gene-map mapping.txt \
-                        --no-polyA \
+                        --polyA
                         mm9.fasta \
-                        mouse_0
+                        mouse_125
 
 =cut