Permalink
Browse files

Merge branch 'master' into strand

  • Loading branch information...
yang-yangfeng committed Apr 22, 2018
2 parents 6e50005 + cacbb3b commit 69f0d8a2a40b0fdef514ea1e5539e76e1988fa26
@@ -13,6 +13,7 @@ include(TestHelper)
set (regtools_VERSION_MAJOR 0)
set (regtools_VERSION_MINOR 5)
set (regtools_VERSION_PATCH 0)
configure_file (
"${PROJECT_SOURCE_DIR}/src/version.h.in"
"${PROJECT_BINARY_DIR}/version.h"
@@ -113,6 +113,7 @@ void GtfParser::add_exon_to_transcript_map(Gtf gtf1) {
string transcript_id = parse_attribute(attributes, "transcript_id");
string gene_name = parse_attribute(attributes, "gene_name");
//create a BED6 object
//NOTE: the 'name' column is simply going to be 'exon'
BED exon = BED(gtf1.seqname, gtf1.start,
gtf1.end, gtf1.feature,
gtf1.score, gtf1.strand);
@@ -150,26 +150,46 @@ bool JunctionsAnnotator::overlap_ps(const vector<BED>& exons,
known_junction = true;
}
else {
// YY NOTE: if we have not yet reached the junction on this transcript,
if(!junction_start) {
// then check if we have with this exon (i.e. the
// exon end is past or at the junction start)
if(exons[i].end >= junction.start) {
junction_start = true;
}
}
if(junction_start) {
// YY NOTE: if the exon lies completely within the junction,
// count it as skipped
if(exons[i].start > junction.start &&
exons[i].end < junction.end) {
junction.exons_skipped.insert(exons[i].name);
exons[i].end < junction.end &&
i > 0 && i < (exons.size()-1)) {
string exon_coords = common::num_to_str(exons[i].start) + "-" + common::num_to_str(exons[i].end);
junction.exons_skipped.insert(exon_coords);
}
if(exons[i].start > junction.start) {
junction.donors_skipped.insert(exons[i].start);
// YY NOTE: if the exon ends after the junction starts
// (and junction_start == true), then
// count the donor as skipped
// YY NOTE: maybe it should be junction.end-1? since
// if the "donor" and "acceptor" are already
// contiguous in the DNA it would get counted
// as "skipped"
if(exons[i].end > junction.start &&
exons[i].end < junction.end &&
i < (exons.size()-1)) {
junction.donors_skipped.insert(exons[i].end);
}
if(exons[i].end < junction.end) {
junction.acceptors_skipped.insert(exons[i].end);
// YY NOTE: if the exon starts before the junction ends
// (and junction_start == true), then
// count the acceptor as skipped
if(exons[i].start < junction.end &&
exons[i].start > junction.start &&
i > 0) {
junction.acceptors_skipped.insert(exons[i].start);
}
if(exons[i].end == junction.start) {
junction.known_donor = true;
}
//TODO - check for last exon
if(exons[i].start == junction.end) {
junction.known_acceptor = true;
}
@@ -180,6 +200,29 @@ bool JunctionsAnnotator::overlap_ps(const vector<BED>& exons,
return (junction.anchor != "N");
}
//YY NOTES
/*
So based on my understanding of the bin search, we should be looking at all the
possible transcripts that might overlap with a junction. That is to say, it
would seem that any discrepancy in acceptors/exons/donors skipped would come
from a problem with the actual counting.
The overlap function will be run on each transcript, and you can see from the
if block how it judges an acceptor/exon/donor as skipped. That logic appears
correct to me (see notes above). Doesn't immediately seem like a problem
with the actual recording of acceptors/exons/donors skipped (but probably
warrants a closer look).
Then the final place the count could get messed up is in the actual
printing to the tsv. The way this works is the skipped elements are
held in sets. Acceptors/donors skipped are held in sets based on their
coordinates while exons skipped are held in sets based on their names.
The number of elements skipped is simply the size of the set, so we only
count unique elements. Also don't see a problem immediately with how this works.
*/
//Find overlap between transcript and junction on the negative strand,
//function returns true if either the acceptor or the donor is known
bool JunctionsAnnotator::overlap_ns(const vector<BED> & exons,
@@ -215,21 +258,32 @@ bool JunctionsAnnotator::overlap_ns(const vector<BED> & exons,
}
if(junction_start) {
if(exons[i].start > junction.start &&
exons[i].end < junction.end) {
junction.exons_skipped.insert(exons[i].name);
exons[i].end < junction.end &&
i > 0 && i < (exons.size()-1)) {
string exon_coords = common::num_to_str(exons[i].start) + "-" + common::num_to_str(exons[i].end);
junction.exons_skipped.insert(exon_coords);
}
if(exons[i].start > junction.start) {
junction.donors_skipped.insert(exons[i].start);
}
if(exons[i].end < junction.end) {
// YY NOTE: if the exon ends after the junction starts
// (and junction_start == true), then
// count the donor as skipped
if(exons[i].end > junction.start &&
exons[i].end < junction.end &&
i < (exons.size()-1)) {
junction.acceptors_skipped.insert(exons[i].end);
}
if(exons[i].start == junction.end) {
junction.known_donor = true;
// YY NOTE: if the exon starts before the junction ends
// (and junction_start == true), then
// count the acceptor as skipped
if(exons[i].start < junction.end &&
exons[i].start > junction.start) {
junction.donors_skipped.insert(exons[i].start);
}
if(exons[i].end == junction.start) {
junction.known_acceptor = true;
}
if(exons[i].start == junction.end) {
junction.known_donor = true;
}
}
}
}
@@ -1,2 +1,2 @@
chrom start end name score strand splice_site acceptors_skipped exons_skipped donors_skipped anchor known_donor known_acceptor known_junction genes transcripts variant_info
22 93668 97252 JUNC00000001 5 + GT-AG 2 1 2 NDA 1 1 0 EP300 ENST00000263253 22:94626-94627
22 93668 97252 JUNC00000001 5 + GT-AG 1 1 1 NDA 1 1 0 EP300 ENST00000263253 22:94626-94627
@@ -1,42 +1,42 @@
chrom start end name score strand splice_site acceptors_skipped exons_skipped donors_skipped anchor known_donor known_acceptor known_junction genes transcripts
22 14103 38192 JUNC00300575 38 + GT-AG 0 0 1 DA 1 1 1 EP300 ENST00000263253
22 14103 38192 JUNC00300575 38 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 38307 38693 JUNC00300576 2 + GT-AG 0 0 0 N 0 0 0 NA NA
22 38826 46869 JUNC00300577 152 + GT-AG 0 0 1 DA 1 1 1 EP300 ENST00000263253
22 47045 48492 JUNC00300578 236 + GT-AG 0 0 1 DA 1 1 1 EP300 ENST00000263253
22 48753 50895 JUNC00300579 299 + GT-AG 0 0 1 DA 1 1 1 EP300 ENST00000263253
22 51008 52393 JUNC00300580 280 + GT-AG 0 0 1 DA 1 1 1 EP300 ENST00000263253
22 51008 56818 JUNC00300581 1 + GT-AG 2 1 2 NDA 1 1 0 EP300 ENST00000263253
22 52638 56818 JUNC00300582 302 + GT-AG 0 0 1 DA 1 1 1 EP300 ENST00000263253
22 52642 56818 JUNC00300583 2 + GT-AG 0 0 1 A 0 1 0 EP300 ENST00000263253
22 56911 58658 JUNC00300584 340 + GT-AG 0 0 1 DA 1 1 1 EP300 ENST00000263253
22 58795 61145 JUNC00300585 608 + GT-AG 0 0 1 DA 1 1 1 EP300 ENST00000263253
22 61262 62053 JUNC00300586 854 + GT-AG 0 0 1 DA 1 1 1 EP300 ENST00000263253
22 61262 62075 JUNC00300587 6 + GT-AG 1 0 1 D 1 0 0 EP300 ENST00000263253
22 61262 67744 JUNC00300588 1 + GT-AG 2 1 2 NDA 1 1 0 EP300 ENST00000263253
22 62227 67744 JUNC00300589 1016 + GT-AG 0 0 1 DA 1 1 1 EP300 ENST00000263253
22 62227 68842 JUNC00300590 28 + GT-AG 2 1 2 NDA 1 1 0 EP300 ENST00000263253
22 67821 68842 JUNC00300591 1096 + GT-AG 0 0 1 DA 1 1 1 EP300 ENST00000263253
22 68951 70043 JUNC00300592 971 + GT-AG 0 0 1 DA 1 1 1 EP300 ENST00000263253
22 70180 70766 JUNC00300593 223 + GT-AG 0 0 1 DA 1 1 1 EP300 ENST00000263253
22 71203 72838 JUNC00300594 286 + GT-AG 0 0 1 DA 1 1 1 EP300 ENST00000263253
22 73017 73211 JUNC00300595 298 + GT-AG 0 0 1 DA 1 1 1 EP300 ENST00000263253
22 73355 76000 JUNC00300596 217 + GT-AG 0 0 1 DA 1 1 1 EP300 ENST00000263253
22 76118 78174 JUNC00300597 448 + GT-AG 0 0 1 DA 1 1 1 EP300 ENST00000263253
22 78413 79417 JUNC00300598 498 + GT-AG 0 0 1 DA 1 1 1 EP300 ENST00000263253
22 79505 81647 JUNC00300599 408 + GT-AG 0 0 1 DA 1 1 1 EP300 ENST00000263253
22 79505 83728 JUNC00300600 1 + GT-AG 2 1 2 NDA 1 1 0 EP300 ENST00000263253
22 81727 83728 JUNC00300601 348 + GT-AG 0 0 1 DA 1 1 1 EP300 ENST00000263253
22 83784 85058 JUNC00300602 334 + GT-AG 0 0 1 DA 1 1 1 EP300 ENST00000263253
22 85135 87604 JUNC00300603 429 + GT-AG 0 0 1 DA 1 1 1 EP300 ENST00000263253
22 87671 89454 JUNC00300604 574 + GT-AG 0 0 1 DA 1 1 1 EP300 ENST00000263253
22 89604 89726 JUNC00300605 572 + GT-AG 0 0 1 DA 1 1 1 EP300 ENST00000263253
22 89872 90508 JUNC00300606 772 + GT-AG 0 0 1 DA 1 1 1 EP300 ENST00000263253
22 90621 91411 JUNC00300607 655 + GT-AG 0 0 1 DA 1 1 1 EP300 ENST00000263253
22 90621 91093 JUNC00300608 3 + GT-AG 1 0 0 D 1 0 0 EP300 ENST00000263253
22 91576 93504 JUNC00300609 387 + GT-AG 0 0 1 DA 1 1 1 EP300 ENST00000263253
22 93668 94628 JUNC00300610 216 + GT-AG 0 0 1 DA 1 1 1 EP300 ENST00000263253
22 94789 97252 JUNC00300611 571 + GT-AG 0 0 1 DA 1 1 1 EP300 ENST00000263253
22 97533 97778 JUNC00300612 548 + GT-AG 0 0 1 DA 1 1 1 EP300 ENST00000263253
22 90586 104069 JUNC00300613 10 - GG-AG 1 0 1 A 0 1 0 RP1-85F18.6 ENST00000415054
22 90586 104068 JUNC00300614 100 - GT-AG 1 0 0 DA 1 1 1 RP1-85F18.6 ENST00000415054
22 90585 104068 JUNC00300614 10 - GT-GG 1 0 1 D 1 0 0 RP1-85F18.6 ENST00000415054
22 38826 46869 JUNC00300577 152 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 47045 48492 JUNC00300578 236 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 48753 50895 JUNC00300579 299 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 51008 52393 JUNC00300580 280 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 51008 56818 JUNC00300581 1 + GT-AG 1 1 1 NDA 1 1 0 EP300 ENST00000263253
22 52638 56818 JUNC00300582 302 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 52642 56818 JUNC00300583 2 + GT-AG 0 0 0 A 0 1 0 EP300 ENST00000263253
22 56911 58658 JUNC00300584 340 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 58795 61145 JUNC00300585 608 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 61262 62053 JUNC00300586 854 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 61262 62075 JUNC00300587 6 + GT-AG 1 0 0 D 1 0 0 EP300 ENST00000263253
22 61262 67744 JUNC00300588 1 + GT-AG 1 1 1 NDA 1 1 0 EP300 ENST00000263253
22 62227 67744 JUNC00300589 1016 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 62227 68842 JUNC00300590 28 + GT-AG 1 1 1 NDA 1 1 0 EP300 ENST00000263253
22 67821 68842 JUNC00300591 1096 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 68951 70043 JUNC00300592 971 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 70180 70766 JUNC00300593 223 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 71203 72838 JUNC00300594 286 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 73017 73211 JUNC00300595 298 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 73355 76000 JUNC00300596 217 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 76118 78174 JUNC00300597 448 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 78413 79417 JUNC00300598 498 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 79505 81647 JUNC00300599 408 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 79505 83728 JUNC00300600 1 + GT-AG 1 1 1 NDA 1 1 0 EP300 ENST00000263253
22 81727 83728 JUNC00300601 348 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 83784 85058 JUNC00300602 334 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 85135 87604 JUNC00300603 429 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 87671 89454 JUNC00300604 574 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 89604 89726 JUNC00300605 572 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 89872 90508 JUNC00300606 772 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 90621 91411 JUNC00300607 655 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 90621 91093 JUNC00300608 3 + GT-AG 0 0 0 D 1 0 0 EP300 ENST00000263253
22 91576 93504 JUNC00300609 387 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 93668 94628 JUNC00300610 216 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 94789 97252 JUNC00300611 571 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 97533 97778 JUNC00300612 548 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 90586 104069 JUNC00300613 10 - GG-AG 0 0 1 A 0 1 0 RP1-85F18.6 ENST00000415054
22 90586 104068 JUNC00300614 100 - GT-AG 0 0 0 DA 1 1 1 RP1-85F18.6 ENST00000415054
22 90585 104068 JUNC00300614 10 - GT-GG 1 0 0 D 1 0 0 RP1-85F18.6 ENST00000415054
@@ -32,8 +32,8 @@
class TestAnnotate(IntegrationTest, unittest.TestCase):
def test_junctions_annotate(self):
junctions = self.inputFiles("bed/test_hcc1395_junctions.bed")[0]
gtf = self.inputFiles("gtf/test_ensemble_chr22.gtf")[0]
fasta = self.inputFiles("fa/test_chr22.fa")[0]
gtf = self.inputFiles("gtf/test_ensemble_chr22.gtf")[0]
output_file = self.tempFile("observed-annotate.out")
expected_file = self.inputFiles("junctions-annotate/expected-annotate.out")[0]
params = ["junctions", "annotate", "-o", output_file, junctions, fasta, gtf]

0 comments on commit 69f0d8a

Please sign in to comment.