diff --git a/src/cis-splice-effects/CMakeLists.txt b/src/cis-splice-effects/CMakeLists.txt index 501f2ec..aca3693 100644 --- a/src/cis-splice-effects/CMakeLists.txt +++ b/src/cis-splice-effects/CMakeLists.txt @@ -12,5 +12,6 @@ include_directories(../gtf/ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__STDC_LIMIT_MACROS") add_library(cis-splice-effects + cis_splice_effects_associator.cc cis_splice_effects_identifier.cc cis_splice_effects_main.cc) diff --git a/src/cis-splice-effects/cis_splice_effects_associator.cc b/src/cis-splice-effects/cis_splice_effects_associator.cc new file mode 100644 index 0000000..d42aea8 --- /dev/null +++ b/src/cis-splice-effects/cis_splice_effects_associator.cc @@ -0,0 +1,274 @@ +/* cis_splice_effects_associator.cc -- 'cis-splice-effects associate' methods + + Copyright (c) 2015, The Griffith Lab + + Author: Avinash Ramu + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ + +#include +#include +#include "common.h" +#include "cis_splice_effects_associator.h" +#include "junctions_annotator.h" +#include "junctions_extractor.h" +#include "variants_annotator.h" + +//Usage for this tool +void CisSpliceEffectsAssociator::usage(ostream& out) { + out << "Usage:" + << "\t\t" << "regtools cis-splice-effects associate [options] variants.vcf junctions.bed ref.fa annotations.gtf" << endl; + out << "Options:" << endl; + out << "\t\t" << "-o STR\tOutput file containing the aberrant splice junctions with annotations. [STDOUT]" << endl; + out << "\t\t" << "-v STR\tOutput file containing variants annotated as splice relevant (VCF format)." << endl; + out << "\t\t" << "-j STR\tOutput file containing the aberrant junctions in BED12 format." << endl; + out << "\t\t" << "-s INT\tStrand specificity of RNA library preparation \n" + << "\t\t\t " << "(0 = unstranded, 1 = first-strand/RF, 2, = second-strand/FR). [1]" << endl; + out << "\t\t" << "-a INT\tMinimum anchor length. Junctions which satisfy a minimum \n" + << "\t\t\t " << "anchor length on both sides are reported. [8]" << endl; + out << "\t\t" << "-m INT\tMinimum intron length. [70]" << endl; + out << "\t\t" << "-M INT\tMaximum intron length. [500000]" << endl; + out << "\t\t" << "-w INT\tWindow size in b.p to identify splicing events in.\n" + << "\t\t\t " << "The tool identifies events in variant.start +/- w basepairs.\n" + << "\t\t\t " << "Default behaviour is to look at the window between previous and next exons." << endl; + out << "\t\t" << "-e INT\tMaximum distance from the start/end of an exon \n" + << "\t\t\t " << "to annotate a variant as relevant to splicing, the variant \n" + << "\t\t\t " << "is in exonic space, i.e a coding variant. [3]" << endl; + out << "\t\t" << "-i INT\tMaximum distance from the start/end of an exon \n" + << "\t\t\t " << "to annotate a variant as relevant to splicing, the variant \n" + << "\t\t\t " << "is in intronic space. [2]" << endl; + out << "\t\t" << "-I\tAnnotate variants in intronic space within a transcript(not to be used with -i)." << endl; + out << "\t\t" << "-E\tAnnotate variants in exonic space within a transcript(not to be used with -e)." << endl; + out << "\t\t" << "-S\tDon't skip single exon transcripts." << endl; + out << endl; +} + +//Return stream to write output to +void CisSpliceEffectsAssociator::close_ostream() { + if(ofs_.is_open()) + ofs_.close(); +} + +//Return stream to write output to +//If output file is not empty, attempt to open +//If output file is empty, set to cout +void CisSpliceEffectsAssociator::set_ostream() { + if(output_file_ == "NA") { + common::copy_stream(cout, ofs_); + } else { + ofs_.open(output_file_.c_str()); + if(!ofs_.is_open()) + throw runtime_error("Unable to open " + + output_file_); + } + if(output_junctions_bed_ != "NA") { + ofs_junctions_bed_.open(output_junctions_bed_.c_str()); + if(!ofs_junctions_bed_.is_open()) + throw runtime_error("Unable to open " + + output_junctions_bed_); + } +} + +//Do QC on files +void CisSpliceEffectsAssociator::file_qc() { + if(vcf_ == "NA" || bed_ == "NA" || + ref_ == "NA" || gtf_ == "NA") { + usage(std::cout); + throw runtime_error("Error parsing inputs!(2)\n\n"); + } + if(!common::file_exists(vcf_) || !common::file_exists(bed_) || + !common::file_exists(ref_) || !common::file_exists(gtf_)) { + throw runtime_error("Please make sure input files exist.\n\n"); + } +} + +//Parse command line options +void CisSpliceEffectsAssociator::parse_options(int argc, char* argv[]) { + optind = 1; //Reset before parsing again. + stringstream help_ss; + char c; + while((c = getopt(argc, argv, "o:w:v:j:e:Ei:ISha:m:M:")) != -1) { + switch(c) { + case 'o': + output_file_ = string(optarg); + break; + case 'w': + window_size_ = atoi(optarg); + break; + case 'v': + annotated_variant_file_ = string(optarg); + break; + case 'j': + output_junctions_bed_ = string(optarg); + break; + case 'i': + intronic_min_distance_ = atoi(optarg); + break; + case 'e': + exonic_min_distance_ = atoi(optarg); + break; + case 'I': + all_intronic_space_ = true; + break; + case 'E': + all_exonic_space_ = true; + break; + case 'S': + skip_single_exon_genes_ = false; + break; + case 'h': + usage(help_ss); + throw common::cmdline_help_exception(help_ss.str()); + default: + usage(std::cerr); + throw runtime_error("Error parsing inputs!(1)\n\n"); + } + } + if(argc - optind >= 4) { + vcf_ = string(argv[optind++]); + bed_ = string(argv[optind++]); + ref_ = string(argv[optind++]); + gtf_ = string(argv[optind++]); + } + if(optind < argc || + vcf_ == "NA" || + bed_ == "NA" || + ref_ == "NA" || + gtf_ == "NA"){ + usage(std::cerr); + throw runtime_error("Error parsing inputs!(2)\n\n"); + } + file_qc(); + cerr << "Variant file: " << vcf_ << endl; + cerr << "Junctions BED file: " << bed_ << endl; + cerr << "Reference fasta file: " << ref_ << endl; + cerr << "Annotation file: " << gtf_ << endl; + if(window_size_ != 0) { + cerr << "Window size: " << window_size_ << endl; + } + if(output_file_ != "NA") + cerr << "Output file: " << output_file_ << endl; + if(output_junctions_bed_ != "NA") + cerr << "Output junctions BED file: " << output_junctions_bed_ << endl; + if(annotated_variant_file_ != "NA") { + cerr << "Annotated variants file: " << annotated_variant_file_ << endl; + write_annotated_variants_ = true; + } + cerr << endl; +} + +//Call the junctions annotator; duplication of identify +void CisSpliceEffectsAssociator::annotate_junctions(const GtfParser& gp1) { + JunctionsAnnotator ja1(ref_, gp1); + ja1.set_gtf_parser(gp1); + set_ostream(); + //Annotate the junctions in the set and write to file + AnnotatedJunction::print_header(ofs_, true); + int i = 0; + //This is ugly, waiting to start using C++11/14 + for (set::iterator j1 = unique_junctions_.begin(); j1 != unique_junctions_.end(); j1++) { + Junction j = *j1; + AnnotatedJunction line(j); + ja1.get_splice_site(line); + ja1.annotate_junction_with_gtf(line); + line.name = j.name = get_junction_name(++i); + if(output_junctions_bed_ != "NA") { + j.print(ofs_junctions_bed_); + } + line.variant_info = variant_set_to_string(junction_to_variant_[j]); + line.print(ofs_, true); + } + close_ostream(); +} + +//get name for the junction; duplication of identify +string CisSpliceEffectsAssociator::get_junction_name(int i) { + stringstream name_ss; + name_ss << "JUNC" << setfill('0') << setw(8) << i; + return name_ss.str(); +} + +//parse BED into vector of junctions +vector CisSpliceEffectsAssociator::parse_BED_to_junctions(){ + vector junctions; + JunctionsAnnotator anno(bed_); + AnnotatedJunction line; + Junction junc; + line.reset(); + anno.open_junctions(); + while(anno.get_single_junction(line)) { + junc.chrom = common::str_to_num(line.fields[0]); + junc.start = common::str_to_num(line.fields[1]); + junc.end = common::str_to_num(line.fields[2]); + junc.name = line.fields[3]; + junc.read_count = common::str_to_num(line.fields[4]); + junc.strand = line.fields[5]; + junc.thick_start = common::str_to_num(line.fields[6]); + junc.thick_end = common::str_to_num(line.fields[7]); + junc.color = line.fields[8]; + junc.nblocks = common::str_to_num(line.fields[9]); + junc.has_left_min_anchor = true; + junc.has_right_min_anchor = true; + junctions.push_back(junc); // are we making a copy or just adding a pointer? + line.reset(); + } + anno.close_junctions(); +} + +//The workhorse +void CisSpliceEffectsAssociator::associate() { + //GTF parser object + GtfParser gp1(gtf_); + gp1.load(); + //variant annotator + VariantsAnnotator va(vcf_, gp1, annotated_variant_file_, intronic_min_distance_, exonic_min_distance_, all_intronic_space_, all_exonic_space_, skip_single_exon_genes_); + va.open_vcf_in(); + if(write_annotated_variants_) + va.open_vcf_out(); + cerr << endl; + //Get junctions from BED + vector junctions = parse_BED_to_junctions(); + //Annotate each variant and pay attention to splicing related ones + while(va.read_next_record()) { + AnnotatedVariant v1 = va.annotate_record_with_transcripts(); + if(v1.annotation != non_splice_region_annotation_string) { + string region_start = window_size_ ? common::num_to_str(v1.start - window_size_) : + common::num_to_str(v1.cis_effect_start); + string region_end = window_size_ ? common::num_to_str(v1.end + window_size_) : + common::num_to_str(v1.cis_effect_end); + string variant_region = v1.chrom + ":" + region_start + "-" + region_end; + cerr << "Variant " << v1; + cerr << "Variant region is " << variant_region << endl; + cerr << endl; + if(write_annotated_variants_) + va.write_annotation_output(v1); + //Add all the junctions to the unique set + for (size_t i = 0; i < junctions.size(); i++) { + //Allow partial overlap - either junction start or end is within window + if((junctions[i].start >= v1.cis_effect_start && junctions[i].start <= v1.cis_effect_end) || + (junctions[i].end <= v1.cis_effect_end && junctions[i].end >= v1.cis_effect_start)) { + unique_junctions_.insert(junctions[i]); + //add to the map of junctions to variants + junction_to_variant_[junctions[i]].insert(v1); + } + } + } + } + annotate_junctions(gp1); +} diff --git a/src/cis-splice-effects/cis_splice_effects_associator.h b/src/cis-splice-effects/cis_splice_effects_associator.h new file mode 100644 index 0000000..c0aa519 --- /dev/null +++ b/src/cis-splice-effects/cis_splice_effects_associator.h @@ -0,0 +1,143 @@ +/* cis_splice_effects_associator.h -- 'cis-splice-effects associate' + + Copyright (c) 2015, The Griffith Lab + + Author: Avinash Ramu + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. */ +#ifndef CIS_SPLICE_EFFECTS_ASSOCIATOR_ +#define CIS_SPLICE_EFFECTS_ASSOCIATOR_ + +#include "junctions_annotator.h" +#include "junctions_extractor.h" +#include "variants_annotator.h" + +//Workhorse for "cis-splice-effects associate" +class CisSpliceEffectsAssociator { + private: + //Object that annotates the variants + VariantsAnnotator va; + //Object that annotates the junctions + JunctionsAnnotator ja; + //VCF file with variants + string vcf_; + //RNAseq alignments + string bed_; + //Reference sequence FASTA + string ref_; + //GTF file with annotations + string gtf_; + //File to write output to + string output_file_; + //Aberrant splice junctions in BED12 format + string output_junctions_bed_; + //File to write output to + string annotated_variant_file_; + //Flag to indicate whether to write output vcf + bool write_annotated_variants_; + //Window size to look in + //Looks at variant.pos +/- window_size + uint32_t window_size_; + //output stream to output annotated junctions file + ofstream ofs_; + //output stream to output BED12 junctions file + ofstream ofs_junctions_bed_; + //Unique set of junctions near splicing variants + set unique_junctions_; + //Store the variants that mark a junction + map > junction_to_variant_; + //Minimum distance of a variant from + //edge of an exon(intronic) to be considered + //a splicing variant + uint32_t intronic_min_distance_; + //Minimum distance of a variant from + //edge of an exon(exonic) to be considered + //a splicing variant + uint32_t exonic_min_distance_; + //Flag set by the -I option + bool all_intronic_space_; + //Flag set by the -E option + bool all_exonic_space_; + //Option to skip single exon genes + bool skip_single_exon_genes_; + //strandness of data + int strandness_; + //Minimum anchor length for junctions + //Junctions need atleast this many bp overlap + // on both ends. + uint32_t min_anchor_length_; + //Minimum length of an intron, i.e min junction width + uint32_t min_intron_length_; + //Maximum length of an intron, i.e max junction width + uint32_t max_intron_length_; + public: + //Constructor + CisSpliceEffectsAssociator() : vcf_("NA"), output_file_("NA"), + output_junctions_bed_("NA"), + annotated_variant_file_("NA"), + write_annotated_variants_(false), + window_size_(0), + intronic_min_distance_(2), + exonic_min_distance_(3), + all_intronic_space_(false), + all_exonic_space_(false), + skip_single_exon_genes_(true), + strandness_(1), + min_anchor_length_(8), + min_intron_length_(70), + max_intron_length_(500000) {} + //Destructor + ~CisSpliceEffectsAssociator() { + if(ofs_.is_open()) { + ofs_.close(); + } + if(ofs_junctions_bed_.is_open()) { + ofs_junctions_bed_.close(); + } + } + //Parse command line arguments + void parse_options(int argc, char* argv[]); + //parse BED into vector of junctions + vector parse_BED_to_junctions(); + //Check if files exist + void file_qc(); + //Identify cis splicing effects + void associate(); + //Usage for this tool + void usage(ostream &out); + //Set ofstream object to appropriate value + //This could write to output_file_ or to std::cout + void set_ostream(); + //Close ofstream object + void close_ostream(); + //Get the window size + uint32_t window_size() { return window_size_; } + //Get the annotated variants file(VCF) + string annotated_variant_file() { return annotated_variant_file_; } + //Get the file with splice junctions(BED) + string output_file() { return output_file_; } + //Get the Input VCF + string vcf() { return vcf_; } + //Call the junctions annotator + void annotate_junctions(const GtfParser& gtf_p1); + //Get junction name given an index + string get_junction_name(int i); +}; + +#endif diff --git a/src/cis-splice-effects/cis_splice_effects_identifier.cc b/src/cis-splice-effects/cis_splice_effects_identifier.cc index 30c9f6a..137fd4f 100644 --- a/src/cis-splice-effects/cis_splice_effects_identifier.cc +++ b/src/cis-splice-effects/cis_splice_effects_identifier.cc @@ -33,8 +33,7 @@ DEALINGS IN THE SOFTWARE. */ //Usage for this tool void CisSpliceEffectsIdentifier::usage(ostream& out) { out << "Usage:" - << "\t\t" << "regtools cis-splice-effects identify [options] variants.vcf" - << "\t\t " << "alignments.bam ref.fa annotations.gtf" << endl; + << "\t\t" << "regtools cis-splice-effects identify [options] variants.vcf alignments.bam ref.fa annotations.gtf" << endl; out << "Options:" << endl; out << "\t\t" << "-o STR\tOutput file containing the aberrant splice junctions with annotations. [STDOUT]" << endl; out << "\t\t" << "-v STR\tOutput file containing variants annotated as splice relevant (VCF format)." << endl; diff --git a/src/cis-splice-effects/cis_splice_effects_main.cc b/src/cis-splice-effects/cis_splice_effects_main.cc index 125ac4d..c36ca8a 100644 --- a/src/cis-splice-effects/cis_splice_effects_main.cc +++ b/src/cis-splice-effects/cis_splice_effects_main.cc @@ -27,6 +27,7 @@ DEALINGS IN THE SOFTWARE. */ #include #include "common.h" #include "cis_splice_effects_identifier.h" +#include "cis_splice_effects_associator.h" using namespace std; @@ -49,10 +50,30 @@ int cis_splice_effects_identify(int argc, char* argv[]) { return 0; } +//Main command for associate; pretty much a duplicate +int cis_splice_effects_associate(int argc, char* argv[]) { + CisSpliceEffectsAssociator csea1; + try { + csea1.parse_options(argc, argv); + csea1.associate(); + } catch(const common::cmdline_help_exception& e) { + cerr << e.what() << endl; + return 0; + } catch (const std::runtime_error &e) { + cerr << e.what() << endl; + return 1; + } catch (const std::logic_error &e) { + cerr << e.what() << endl; + return 1; + } + return 0; +} + //Usage for cis-splice-effects subcommands int cis_splice_effects_usage(ostream &out = cout) { out << "Usage:\t\t" << "regtools cis-splice-effects [options]" << endl; out << "Command:\t" << "identify\t\tIdentify cis splicing effects." << endl; + out << "\t\tassociate\tAssociate extracted junctions with variants" << endl; out << endl; return 0; } @@ -65,5 +86,8 @@ int cis_splice_effects_main(int argc, char* argv[]) { if(string(argv[1]) == "identify") { return cis_splice_effects_identify(argc - 1, argv + 1); } + if(string(argv[1]) == "associate") { + return cis_splice_effects_associate(argc - 1, argv + 1); + } return cis_splice_effects_usage(std::cout); } diff --git a/src/junctions/junctions_annotator.h b/src/junctions/junctions_annotator.h index fe35b97..f4419ba 100644 --- a/src/junctions/junctions_annotator.h +++ b/src/junctions/junctions_annotator.h @@ -201,13 +201,18 @@ class JunctionsAnnotator { , skip_single_exon_genes_(true) , output_file_("NA") {} - //Default constructor JunctionsAnnotator(string ref1, GtfParser gp1) : ref_(ref1) , skip_single_exon_genes_(true) , gtf_(gp1) , output_file_("NA") {} + JunctionsAnnotator(string bedFile) + : ref_("NA") + , skip_single_exon_genes_(true) + , output_file_("NA") + , junctions_(bedFile) + {} //Get the GTF file string gtf_file(); //Get ostream object to write output to diff --git a/src/junctions/junctions_extractor.h b/src/junctions/junctions_extractor.h index 03f30b8..2754d8e 100644 --- a/src/junctions/junctions_extractor.h +++ b/src/junctions/junctions_extractor.h @@ -158,7 +158,6 @@ class JunctionsExtractor { public: //Default constructor JunctionsExtractor() { - //cerr << "default constructor called" << endl; min_anchor_length_ = 8; min_intron_length_ = 70; max_intron_length_ = 500000;