Permalink
Browse files

rsem v1.1.13, speed up EM by only updating model parameters for first…

… 10 iterations, skip reads with its (or at least one of its mates', for paired-end reads) length < 25bp
  • Loading branch information...
1 parent 5b86789 commit 86e650e9577999a7ba00ab454d1f6bf674b0ea70 @bli25wisc bli25wisc committed Nov 6, 2011
Showing with 58 additions and 31 deletions.
  1. +4 −2 EM.cpp
  2. +3 −0 LenDist.h
  3. +1 −1 NoiseProfile.h
  4. +13 −8 PairedEndModel.h
  5. +15 −10 PairedEndQModel.h
  6. +1 −0 PairedEndRead.h
  7. +1 −0 PairedEndReadQ.h
  8. +6 −3 SingleModel.h
  9. +7 −4 SingleQModel.h
  10. +2 −0 SingleRead.h
  11. +2 −0 SingleReadQ.h
  12. +1 −1 buildReadIndex.cpp
  13. +2 −2 rsem-calculate-expression
View
@@ -394,9 +394,11 @@ void release(ReadReader<ReadType> **readers, HitContainer<HitType> **hitvs, doub
delete[] mhps;
}
+int tmp_n;
+
inline bool doesUpdateModel(int ROUND) {
- //return false; // never update, for debugging only
- return ROUND <= 20 || ROUND % 100 == 0;
+ // return ROUND <= 20 || ROUND % 100 == 0;
+ return ROUND <= 10;
}
//Including initialize, algorithm and results saving
View
@@ -3,6 +3,7 @@
#include<cstdio>
#include<cstring>
+#include<cstdlib>
#include<cassert>
#include<algorithm>
@@ -189,6 +190,8 @@ void LenDist::finish() {
sum += pdf[i];
}
+ if (sum <= EPSILON) { fprintf(stderr, "No valid read to estimate the length distribution!\n"); exit(-1); }
+
for (int i = 1; i <= span; i++) {
pdf[i] = pdf[i] / sum;
cdf[i] = cdf[i - 1] + pdf[i];
View
@@ -81,7 +81,7 @@ void NoiseProfile::finish() {
logp = 0.0;
sum = 0.0;
for (int i = 0; i < NCODES; i++) sum += (p[i] + c[i]);
- if (sum <= 0.0) return;
+ if (sum <= EPSILON) return;
for (int i = 0; i < NCODES; i++) {
p[i] = (p[i] + c[i]) / sum;
if (c[i] > 0.0) { logp += c[i] * log(p[i]); }
View
@@ -236,14 +236,19 @@ void PairedEndModel::estimateFromReads(const char* readFN) {
while (reader.next(read)) {
SingleRead mate1 = read.getMate1();
SingleRead mate2 = read.getMate2();
-
- mld->update(mate1.getReadLength(), 1.0);
- mld->update(mate2.getReadLength(), 1.0);
-
- if (i == 0) {
- npro->updateC(mate1.getReadSeq());
- npro->updateC(mate2.getReadSeq());
- }
+
+ if (!read.isLowQuality()) {
+ mld->update(mate1.getReadLength(), 1.0);
+ mld->update(mate2.getReadLength(), 1.0);
+
+ if (i == 0) {
+ npro->updateC(mate1.getReadSeq());
+ npro->updateC(mate2.getReadSeq());
+ }
+ }
+ else if (verbose && (mate1.getReadLength() < OLEN || mate2.getReadLength() < OLEN)) {
+ printf("Warning: Read %s is ignored due to at least one of the mates' length < %d!\n", read.getName().c_str(), OLEN);
+ }
++cnt;
if (verbose && cnt % 1000000 == 0) { printf("%d READS PROCESSED\n", cnt); }
View
@@ -243,16 +243,21 @@ void PairedEndQModel::estimateFromReads(const char* readFN) {
SingleReadQ mate1 = read.getMate1();
SingleReadQ mate2 = read.getMate2();
- mld->update(mate1.getReadLength(), 1.0);
- mld->update(mate2.getReadLength(), 1.0);
-
- qd->update(mate1.getQScore());
- qd->update(mate2.getQScore());
-
- if (i == 0) {
- nqpro->updateC(mate1.getReadSeq(), mate1.getQScore());
- nqpro->updateC(mate2.getReadSeq(), mate2.getQScore());
- }
+ if (!read.isLowQuality()) {
+ mld->update(mate1.getReadLength(), 1.0);
+ mld->update(mate2.getReadLength(), 1.0);
+
+ qd->update(mate1.getQScore());
+ qd->update(mate2.getQScore());
+
+ if (i == 0) {
+ nqpro->updateC(mate1.getReadSeq(), mate1.getQScore());
+ nqpro->updateC(mate2.getReadSeq(), mate2.getQScore());
+ }
+ }
+ else if (verbose && (mate1.getReadLength() < OLEN || mate2.getReadLength() < OLEN)) {
+ printf("Warning: Read %s is ignored due to at least one of the mates' length < %d!\n", read.getName().c_str(), OLEN);
+ }
++cnt;
if (verbose && cnt % 1000000 == 0) { printf("%d READS PROCESSED\n", cnt); }
View
@@ -61,6 +61,7 @@ void PairedEndRead::write(int argc, std::ostream *argv[]) {
void PairedEndRead::calc_lq() {
low_quality = mate1.isLowQuality() && mate2.isLowQuality();
+ if (mate1.getReadLength() < OLEN || mate2.getReadLength() < OLEN) low_quality = true;
}
#endif
View
@@ -61,6 +61,7 @@ void PairedEndReadQ::write(int argc, std::ostream* argv[]) {
void PairedEndReadQ::calc_lq() {
low_quality = mate1.isLowQuality() && mate2.isLowQuality();
+ if (mate1.getReadLength() < OLEN || mate2.getReadLength() < OLEN) low_quality = true;
}
#endif /* PAIREDENDREADQ_H_ */
View
@@ -273,8 +273,11 @@ void SingleModel::estimateFromReads(const char* readFN) {
int cnt = 0;
while (reader.next(read)) {
- mld != NULL ? mld->update(read.getReadLength(), 1.0) : gld->update(read.getReadLength(), 1.0);
- if (i == 0) { npro->updateC(read.getReadSeq()); }
+ if (!read.isLowQuality()) {
+ mld != NULL ? mld->update(read.getReadLength(), 1.0) : gld->update(read.getReadLength(), 1.0);
+ if (i == 0) { npro->updateC(read.getReadSeq()); }
+ }
+ else if (verbose && read.getReadLength() < OLEN) { printf("Warning: Read %s is ignored due to read length < %d!\n", read.getName().c_str(), OLEN); }
++cnt;
if (verbose && cnt % 1000000 == 0) { printf("%d READS PROCESSED\n", cnt); }
@@ -441,7 +444,7 @@ void SingleModel::finishSimulation() {
void SingleModel::calcMW() {
double probF, probR;
-
+
assert(seedLen >= OLEN && (mld == NULL ? gld->getMinL() : mld->getMinL()) >= seedLen);
memset(mw, 0, sizeof(double) * (M + 1));
View
@@ -283,9 +283,12 @@ void SingleQModel::estimateFromReads(const char* readFN) {
int cnt = 0;
while (reader.next(read)) {
- mld != NULL ? mld->update(read.getReadLength(), 1.0) : gld->update(read.getReadLength(), 1.0);
- qd->update(read.getQScore());
- if (i == 0) { nqpro->updateC(read.getReadSeq(), read.getQScore()); }
+ if (!read.isLowQuality()) {
+ mld != NULL ? mld->update(read.getReadLength(), 1.0) : gld->update(read.getReadLength(), 1.0);
+ qd->update(read.getQScore());
+ if (i == 0) { nqpro->updateC(read.getReadSeq(), read.getQScore()); }
+ }
+ else if (verbose && read.getReadLength() < OLEN) { printf("Warning: Read %s is ignored due to read length < %d!\n", read.getName().c_str(), OLEN); }
++cnt;
if (verbose && cnt % 1000000 == 0) { printf("%d READS PROCESSED\n", cnt); }
@@ -339,7 +342,7 @@ void SingleQModel::read(const char* inpF) {
ori->read(fi);
gld->read(fi);
- fscanf(fi, "%d", &val);
+ assert(fscanf(fi, "%d", &val) == 1);
if (val > 0) {
if (mld == NULL) mld = new LenDist();
mld->read(fi);
View
@@ -61,6 +61,8 @@ void SingleRead::calc_lq() {
int numA = 0, numT = 0, numAO = 0, numTO = 0; // numAO : number of A in overlap seed region
int threshold_1, threshold_2;
+ if (len < OLEN) { low_quality = true; return; }
+
threshold_1 = int(0.9 * len - 1.5 * sqrt(len * 1.0) + 0.5);
threshold_2 = (OLEN - 1) / 2 + 1;
for (int i = 0; i < len; i++) {
View
@@ -67,6 +67,8 @@ void SingleReadQ::calc_lq() {
int numA = 0, numT = 0, numAO = 0, numTO = 0; // numAO : number of A in overlap seed region
int threshold_1, threshold_2;
+ if (len < OLEN) { low_quality = true; return; }
+
threshold_1 = int(0.9 * len - 1.5 * sqrt(len * 1.0) + 0.5);
threshold_2 = (OLEN - 1) / 2 + 1;
for (int i = 0; i < len; i++) {
View
@@ -53,7 +53,7 @@ void buildIndex(char* readF, int gap, bool hasQ) {
}
++nReads;
- if (verbose && nReads % 1000000 == 0) { printf("FIN %lld\n", nReads); }
+ if (verbose && nReads % 1000000 == 0) { printf("FIN %lld\n", (long long)nReads); }
} while (success);
fout.seekp(startPos);
@@ -110,6 +110,7 @@ pod2usage(-msg => "Min fragment length should be at least 1!", -exitval => 2, -v
pod2usage(-msg => "Min fragment length should be smaller or equal to max fragment length!", -exitval => 2, -verbose => 2) if ($minL > $maxL);
pod2usage(-msg => "The memory allocated for calculating credibility intervals should be at least 1 MB!\n", -exitval => 2, -verbose => 2) if ($NMB < 1);
pod2usage(-msg => "Number of threads should be at least 1!\n", -exitval => 2, -verbose => 2) if ($nThreads < 1);
+pod2usage(-msg => "Seed length should be at least 25!\n", -exitval => 2, -verbose => 2) if ($L < 25);
if ($strand_specific) { $probF = 1.0; }
@@ -484,7 +485,7 @@ Calculate 95% credibility intervals and posterior mean estimates. (Default: off
=item B<--seed-length> <int>
-Seed length used by the read aligner. Providing the correct value for this parameter is important for RSEM's accuracy if the data are single-end reads. If RSEM runs Bowtie, it uses this value for Bowtie's seed length parameter. The minimum value is 25. (Default: 25)
+Seed length used by the read aligner. Providing the correct value for this parameter is important for RSEM's accuracy if the data are single-end reads. If RSEM runs Bowtie, it uses this value for Bowtie's seed length parameter. The minimum value is 25. Any read with its or at least one of its mates' (for paired-end reads) length less than 25 will be ignored. (Default: 25)
=item B<--tag> <string>
@@ -694,4 +695,3 @@ Assume the path to the bowtie executables is in the user's PATH environment vari
=cut
-# LocalWords: usr

0 comments on commit 86e650e

Please sign in to comment.