Skip to content

Commit

Permalink
Merge pull request #81 from griffithlab/gene_id
Browse files Browse the repository at this point in the history
add gene id as column output by junctions annotate
  • Loading branch information
gatoravi committed Apr 17, 2020
2 parents 53d991b + 5becbbd commit 9221a19
Show file tree
Hide file tree
Showing 9 changed files with 97 additions and 78 deletions.
27 changes: 15 additions & 12 deletions src/gtf/gtf_parser.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,16 +22,13 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */

#include <sstream>
#include <cassert>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <set>
#include <stdexcept>
#include <vector>
#include <algorithm>
#include "common.h"
#include "bedFile.h"
#include "gtf_parser.h"
#include "lineFileUtilities.h"

Expand Down Expand Up @@ -112,14 +109,15 @@ void GtfParser::add_exon_to_transcript_map(Gtf gtf1) {
Tokenize(gtf1.attributes, attributes, ';');
string transcript_id = parse_attribute(attributes, "transcript_id");
string gene_name = parse_attribute(attributes, "gene_name");
string gene_id = parse_attribute(attributes, "gene_id");
//create a BED6 object
//NOTE: the 'name' column is simply going to be 'exon'
BED exon = BED(gtf1.seqname, gtf1.start,
gtf1.end, gtf1.feature,
gtf1.score, gtf1.strand);
if(transcript_id != string("NA")) {
transcript_map_[transcript_id].exons.push_back(exon);
set_transcript_gene(transcript_id, gene_name);
set_transcript_gene(transcript_id, gene_name, gene_id);
}
}

Expand Down Expand Up @@ -244,12 +242,14 @@ void GtfParser::set_gtffile(string filename) {
gtffile_ = filename;
}

//Get the gene ID using the trancript ID
string GtfParser::get_gene_from_transcript(string transcript_id) {
//Get the gene name and gene ID using the trancript ID
vector<string> GtfParser::get_gene_from_transcript(string transcript_id) {
if(transcript_to_gene_.count(transcript_id)) {
return transcript_to_gene_[transcript_id];
} else {
return "NA";
string arr[] = {"NA, NA"};
vector<string> NA (arr, arr + sizeof(arr)/sizeof(string));
return NA;
}
}

Expand All @@ -262,11 +262,14 @@ void GtfParser::load() {
//print_transcripts();
}

//Set the gene ID for a trancript ID
inline void GtfParser::set_transcript_gene(string transcript_id, string gene_id) {
//Set the gene name and gene ID for a trancript ID
inline void GtfParser::set_transcript_gene(string transcript_id, string gene_name, string gene_id) {
//check if key already exists
if(transcript_to_gene_.count(transcript_id) == 0)
transcript_to_gene_[transcript_id] = gene_id;
if(transcript_to_gene_.count(transcript_id) == 0){
string arr[] = {gene_name, gene_id};
vector<string> gene (arr, arr + sizeof(arr)/sizeof(string));
transcript_to_gene_[transcript_id] = gene;
}
}

//Assignment operator
Expand Down
9 changes: 5 additions & 4 deletions src/gtf/gtf_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ DEALINGS IN THE SOFTWARE. */
#ifndef GTF_PARSER_H_
#define GTF_PARSER_H_

#include <string>
#include <fstream>
#include <iostream>
#include <map>
Expand Down Expand Up @@ -112,8 +113,8 @@ class GtfParser {
ifstream gtf_fh_;
//Are exons within transcripts sorted
bool transcripts_sorted_;
//Jump from transcript-id to gene-id
map<string, string> transcript_to_gene_;
//Jump from transcript-id to {gene name, gene id}
map<string, vector<string> > transcript_to_gene_;
//Store transcripts as a vector of exon BEDs
//keyed by transcript_id
map<string, Transcript> transcript_map_;
Expand Down Expand Up @@ -177,9 +178,9 @@ class GtfParser {
//The return value is a vector of BEDs
const vector<BED> & get_exons_from_transcript(string transcript_id);
//Get the gene ID using the trancript ID
string get_gene_from_transcript(string transcript_id);
vector<string> get_gene_from_transcript(string transcript_id);
//Set the gene ID for a trancript ID
void set_transcript_gene(string transcript_id, string gene_id);
void set_transcript_gene(string transcript_id, string gene_name, string gene_id);
//Load all the necessary objects into memory
void load();
//Assignment operator
Expand Down
20 changes: 15 additions & 5 deletions src/junctions/junctions_annotator.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ DEALINGS IN THE SOFTWARE. */
#ifndef JUNCTIONS_ANNOTATOR_H_
#define JUNCTIONS_ANNOTATOR_H_

#include <string>
#include <sstream>
#include <iostream>
#include <iterator>
#include "bedFile.h"
Expand All @@ -41,7 +43,7 @@ struct AnnotatedJunction : BED {
set<string> transcripts_overlap;
//set of genes that
//the junction overlaps
set<string> genes_overlap;
set< vector<string> > genes_overlap;
//set of exons that the junction
//overlaps
set<string> exons_skipped;
Expand Down Expand Up @@ -72,7 +74,7 @@ struct AnnotatedJunction : BED {
"\t" << "exons_skipped" << "\t" << "donors_skipped" <<
"\t" << "anchor" <<
"\t" << "known_donor" << "\t" << "known_acceptor" << "\t" << "known_junction" <<
"\t" << "genes" << "\t" << "transcripts";
"\t" << "gene_names" << "\t" << "gene_ids" << "\t" << "transcripts";
if(variant_info_exists) {
out << "\t" << "variant_info";
}
Expand All @@ -90,13 +92,21 @@ struct AnnotatedJunction : BED {
//See if any genes overlap the junction
if(genes_overlap.size()) {
out << "\t";
for(set<string>::iterator it = genes_overlap.begin(); it != genes_overlap.end(); ++it) {
for(set< vector<string> >::iterator it = genes_overlap.begin(); it != genes_overlap.end(); ++it) {
if(it != genes_overlap.begin())
out << ",";
out << *it;
//print gene name
out << (*it)[0];
}
out << "\t";
for(set< vector<string> >::iterator it = genes_overlap.begin(); it != genes_overlap.end(); ++it) {
if(it != genes_overlap.begin())
out << ",";
//print gene name
out << (*it)[1];
}
} else {
out << "\t" << "NA";
out << "\t" << "NA" << "\t" << "NA";
}
//See if any transcripts overlap the junction
if(transcripts_overlap.size()) {
Expand Down
12 changes: 6 additions & 6 deletions src/variants/variants_annotator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -483,25 +483,25 @@ AnnotatedVariant VariantsAnnotator::annotate_record_with_transcripts() {
//Use a AnnotatedVariant object to hold the result
get_variant_overlaps_spliceregion(exons, variant);
if(variant.annotation != "non_splice_region") {
string gene_id = gtf_.get_gene_from_transcript(transcripts[i]);
string gene_name = gtf_.get_gene_from_transcript(transcripts[i])[0];
//Use sign to encode intronic/exonic
string annotation = variant.annotation;
string dist_str = variant.score;
//Add gene only once for multiple transcripts of the same gene.
if(overlapping_transcripts != "NA") {
//Check if this gene is new
if(unique_genes.find(gene_id) == unique_genes.end()) {
overlapping_genes += "," + gene_id;
unique_genes.insert(gene_id);
if(unique_genes.find(gene_name) == unique_genes.end()) {
overlapping_genes += "," + gene_name;
unique_genes.insert(gene_name);
}
overlapping_distances += "," + dist_str;
overlapping_transcripts += "," + transcripts[i];
annotations += "," + annotation;
} else {
overlapping_genes = gene_id;
overlapping_genes = gene_name;
overlapping_distances = dist_str;
overlapping_transcripts = transcripts[i];
unique_genes.insert(gene_id);
unique_genes.insert(gene_name);
annotations = annotation;
}
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
chrom start end name score strand splice_site acceptors_skipped exons_skipped donors_skipped anchor known_donor known_acceptor known_junction genes transcripts variant_info
22 93668 97252 JUNC00000001 5 + GT-AG 1 1 1 NDA 1 1 0 EP300 ENST00000263253 22:94626-94627
chrom start end name score strand splice_site acceptors_skipped exons_skipped donors_skipped anchor known_donor known_acceptor known_junction gene_names gene_ids transcripts variant_info
22 93668 97252 JUNC00000001 5 + GT-AG 1 1 1 NDA 1 1 0 EP300 ENSG00000100393 ENST00000263253 22:94626-94627
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
chrom start end name score strand splice_site acceptors_skipped exons_skipped donors_skipped anchor known_donor known_acceptor known_junction genes transcripts variant_info
22 93668 97252 JUNC00000001 5 - CT-AC 0 0 0 N 0 0 0 NA NA 22:94626-94627
chrom start end name score strand splice_site acceptors_skipped exons_skipped donors_skipped anchor known_donor known_acceptor known_junction gene_names gene_ids transcripts variant_info
22 93668 97252 JUNC00000001 5 - CT-AC 0 0 0 N 0 0 0 NA NA NA 22:94626-94627
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
chrom start end name score strand splice_site acceptors_skipped exons_skipped donors_skipped anchor known_donor known_acceptor known_junction genes transcripts
22 93668 97252 JUNC00000001 5 + GT-AG 2 1 2 NDA 1 1 0 EP300 ENST00000263253
chrom start end name score strand splice_site acceptors_skipped exons_skipped donors_skipped anchor known_donor known_acceptor known_junction gene_names gene_ids transcripts
22 93668 97252 JUNC00000001 5 + GT-AG 2 1 2 NDA 1 1 0 EP300 ENSG00000100393 ENST00000263253
Original file line number Diff line number Diff line change
@@ -1,42 +1,42 @@
chrom start end name score strand splice_site acceptors_skipped exons_skipped donors_skipped anchor known_donor known_acceptor known_junction genes transcripts
22 14103 38192 JUNC00300575 38 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 38307 38693 JUNC00300576 2 + GT-AG 0 0 0 N 0 0 0 NA NA
22 38826 46869 JUNC00300577 152 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 47045 48492 JUNC00300578 236 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 48753 50895 JUNC00300579 299 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 51008 52393 JUNC00300580 280 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 51008 56818 JUNC00300581 1 + GT-AG 1 1 1 NDA 1 1 0 EP300 ENST00000263253
22 52638 56818 JUNC00300582 302 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 52642 56818 JUNC00300583 2 + GT-AG 0 0 0 A 0 1 0 EP300 ENST00000263253
22 56911 58658 JUNC00300584 340 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 58795 61145 JUNC00300585 608 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 61262 62053 JUNC00300586 854 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 61262 62075 JUNC00300587 6 + GT-AG 1 0 0 D 1 0 0 EP300 ENST00000263253
22 61262 67744 JUNC00300588 1 + GT-AG 1 1 1 NDA 1 1 0 EP300 ENST00000263253
22 62227 67744 JUNC00300589 1016 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 62227 68842 JUNC00300590 28 + GT-AG 1 1 1 NDA 1 1 0 EP300 ENST00000263253
22 67821 68842 JUNC00300591 1096 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 68951 70043 JUNC00300592 971 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 70180 70766 JUNC00300593 223 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 71203 72838 JUNC00300594 286 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 73017 73211 JUNC00300595 298 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 73355 76000 JUNC00300596 217 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 76118 78174 JUNC00300597 448 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 78413 79417 JUNC00300598 498 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 79505 81647 JUNC00300599 408 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 79505 83728 JUNC00300600 1 + GT-AG 1 1 1 NDA 1 1 0 EP300 ENST00000263253
22 81727 83728 JUNC00300601 348 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 83784 85058 JUNC00300602 334 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 85135 87604 JUNC00300603 429 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 87671 89454 JUNC00300604 574 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 89604 89726 JUNC00300605 572 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 89872 90508 JUNC00300606 772 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 90621 91411 JUNC00300607 655 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 90621 91093 JUNC00300608 3 + GT-AG 0 0 0 D 1 0 0 EP300 ENST00000263253
22 91576 93504 JUNC00300609 387 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 93668 94628 JUNC00300610 216 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 94789 97252 JUNC00300611 571 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 97533 97778 JUNC00300612 548 + GT-AG 0 0 0 DA 1 1 1 EP300 ENST00000263253
22 90586 104069 JUNC00300613 10 - GG-AG 0 0 1 A 0 1 0 RP1-85F18.6 ENST00000415054
22 90586 104068 JUNC00300614 100 - GT-AG 0 0 0 DA 1 1 1 RP1-85F18.6 ENST00000415054
22 90585 104068 JUNC00300614 10 - GT-GG 1 0 0 D 1 0 0 RP1-85F18.6 ENST00000415054
chrom start end name score strand splice_site acceptors_skipped exons_skipped donors_skipped anchor known_donor known_acceptor known_junction gene_names gene_ids transcripts
22 14103 38192 JUNC00300575 38 + GT-AG 0 0 0 DA 1 1 1 EP300 ENSG00000100393 ENST00000263253
22 38307 38693 JUNC00300576 2 + GT-AG 0 0 0 N 0 0 0 NA NA NA
22 38826 46869 JUNC00300577 152 + GT-AG 0 0 0 DA 1 1 1 EP300 ENSG00000100393 ENST00000263253
22 47045 48492 JUNC00300578 236 + GT-AG 0 0 0 DA 1 1 1 EP300 ENSG00000100393 ENST00000263253
22 48753 50895 JUNC00300579 299 + GT-AG 0 0 0 DA 1 1 1 EP300 ENSG00000100393 ENST00000263253
22 51008 52393 JUNC00300580 280 + GT-AG 0 0 0 DA 1 1 1 EP300 ENSG00000100393 ENST00000263253
22 51008 56818 JUNC00300581 1 + GT-AG 1 1 1 NDA 1 1 0 EP300 ENSG00000100393 ENST00000263253
22 52638 56818 JUNC00300582 302 + GT-AG 0 0 0 DA 1 1 1 EP300 ENSG00000100393 ENST00000263253
22 52642 56818 JUNC00300583 2 + GT-AG 0 0 0 A 0 1 0 EP300 ENSG00000100393 ENST00000263253
22 56911 58658 JUNC00300584 340 + GT-AG 0 0 0 DA 1 1 1 EP300 ENSG00000100393 ENST00000263253
22 58795 61145 JUNC00300585 608 + GT-AG 0 0 0 DA 1 1 1 EP300 ENSG00000100393 ENST00000263253
22 61262 62053 JUNC00300586 854 + GT-AG 0 0 0 DA 1 1 1 EP300 ENSG00000100393 ENST00000263253
22 61262 62075 JUNC00300587 6 + GT-AG 1 0 0 D 1 0 0 EP300 ENSG00000100393 ENST00000263253
22 61262 67744 JUNC00300588 1 + GT-AG 1 1 1 NDA 1 1 0 EP300 ENSG00000100393 ENST00000263253
22 62227 67744 JUNC00300589 1016 + GT-AG 0 0 0 DA 1 1 1 EP300 ENSG00000100393 ENST00000263253
22 62227 68842 JUNC00300590 28 + GT-AG 1 1 1 NDA 1 1 0 EP300 ENSG00000100393 ENST00000263253
22 67821 68842 JUNC00300591 1096 + GT-AG 0 0 0 DA 1 1 1 EP300 ENSG00000100393 ENST00000263253
22 68951 70043 JUNC00300592 971 + GT-AG 0 0 0 DA 1 1 1 EP300 ENSG00000100393 ENST00000263253
22 70180 70766 JUNC00300593 223 + GT-AG 0 0 0 DA 1 1 1 EP300 ENSG00000100393 ENST00000263253
22 71203 72838 JUNC00300594 286 + GT-AG 0 0 0 DA 1 1 1 EP300 ENSG00000100393 ENST00000263253
22 73017 73211 JUNC00300595 298 + GT-AG 0 0 0 DA 1 1 1 EP300 ENSG00000100393 ENST00000263253
22 73355 76000 JUNC00300596 217 + GT-AG 0 0 0 DA 1 1 1 EP300 ENSG00000100393 ENST00000263253
22 76118 78174 JUNC00300597 448 + GT-AG 0 0 0 DA 1 1 1 EP300 ENSG00000100393 ENST00000263253
22 78413 79417 JUNC00300598 498 + GT-AG 0 0 0 DA 1 1 1 EP300 ENSG00000100393 ENST00000263253
22 79505 81647 JUNC00300599 408 + GT-AG 0 0 0 DA 1 1 1 EP300 ENSG00000100393 ENST00000263253
22 79505 83728 JUNC00300600 1 + GT-AG 1 1 1 NDA 1 1 0 EP300 ENSG00000100393 ENST00000263253
22 81727 83728 JUNC00300601 348 + GT-AG 0 0 0 DA 1 1 1 EP300 ENSG00000100393 ENST00000263253
22 83784 85058 JUNC00300602 334 + GT-AG 0 0 0 DA 1 1 1 EP300 ENSG00000100393 ENST00000263253
22 85135 87604 JUNC00300603 429 + GT-AG 0 0 0 DA 1 1 1 EP300 ENSG00000100393 ENST00000263253
22 87671 89454 JUNC00300604 574 + GT-AG 0 0 0 DA 1 1 1 EP300 ENSG00000100393 ENST00000263253
22 89604 89726 JUNC00300605 572 + GT-AG 0 0 0 DA 1 1 1 EP300 ENSG00000100393 ENST00000263253
22 89872 90508 JUNC00300606 772 + GT-AG 0 0 0 DA 1 1 1 EP300 ENSG00000100393 ENST00000263253
22 90621 91411 JUNC00300607 655 + GT-AG 0 0 0 DA 1 1 1 EP300 ENSG00000100393 ENST00000263253
22 90621 91093 JUNC00300608 3 + GT-AG 0 0 0 D 1 0 0 EP300 ENSG00000100393 ENST00000263253
22 91576 93504 JUNC00300609 387 + GT-AG 0 0 0 DA 1 1 1 EP300 ENSG00000100393 ENST00000263253
22 93668 94628 JUNC00300610 216 + GT-AG 0 0 0 DA 1 1 1 EP300 ENSG00000100393 ENST00000263253
22 94789 97252 JUNC00300611 571 + GT-AG 0 0 0 DA 1 1 1 EP300 ENSG00000100393 ENST00000263253
22 97533 97778 JUNC00300612 548 + GT-AG 0 0 0 DA 1 1 1 EP300 ENSG00000100393 ENST00000263253
22 90586 104069 JUNC00300613 10 - GG-AG 0 0 1 A 0 1 0 RP1-85F18.6 ENSG00000232754 ENST00000415054
22 90586 104068 JUNC00300614 100 - GT-AG 0 0 0 DA 1 1 1 RP1-85F18.6 ENSG00000232754 ENST00000415054
22 90585 104068 JUNC00300614 10 - GT-GG 1 0 0 D 1 0 0 RP1-85F18.6 ENSG00000232754 ENST00000415054
11 changes: 8 additions & 3 deletions tests/lib/gtf/test_gtf_parser.cc
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE. */

#include <string>
#include <gtest/gtest.h>
#include <sstream>
#include <stdexcept>
Expand Down Expand Up @@ -102,9 +103,13 @@ TEST_F(GtfParserTest, AddExonToTranscriptTest) {
test_gtf1.is_exon = true;
gp1.add_exon_to_transcript_map(test_gtf1);
EXPECT_EQ("EP300",
gp1.get_gene_from_transcript("ENST00000263253"));
EXPECT_EQ("NA",
gp1.get_gene_from_transcript("ENSTfake"));
gp1.get_gene_from_transcript("ENST00000263253")[0]);
EXPECT_EQ("ENSG00000100393",
gp1.get_gene_from_transcript("ENST00000263253")[1]);
EXPECT_EQ("NA, NA",
gp1.get_gene_from_transcript("ENSTfake")[0]);
EXPECT_EQ("NA, NA",
gp1.get_gene_from_transcript("ENSTfake")[0]);
gp1.annotate_transcript_with_bins();
EXPECT_EQ(37359u,
gp1.bin_from_transcript("ENST00000263253"));
Expand Down

0 comments on commit 9221a19

Please sign in to comment.