Skip to content

Commit

Permalink
Merge 69f0d8a into cacbb3b
Browse files Browse the repository at this point in the history
  • Loading branch information
yang-yangfeng committed Apr 22, 2018
2 parents cacbb3b + 69f0d8a commit 1745555
Show file tree
Hide file tree
Showing 21 changed files with 311 additions and 72 deletions.
Binary file added .DS_Store
Binary file not shown.
5 changes: 3 additions & 2 deletions CMakeLists.txt
Expand Up @@ -11,8 +11,9 @@ include(TestHelper)

#versioning stuff
set (regtools_VERSION_MAJOR 0)
set (regtools_VERSION_MINOR 4)
set (regtools_VERSION_PATCH 2)
set (regtools_VERSION_MINOR 5)
set (regtools_VERSION_PATCH 0)

configure_file (
"${PROJECT_SOURCE_DIR}/src/version.h.in"
"${PROJECT_BINARY_DIR}/version.h"
Expand Down
Binary file added src/.DS_Store
Binary file not shown.
2 changes: 1 addition & 1 deletion src/cis-ase/cis_ase_identifier.cc
Expand Up @@ -56,7 +56,7 @@ void CisAseIdentifier::usage(ostream& out) {
<< "regtools cis-ase identify [options] somatic_variants.vcf polymorphism.vcf"
<< " tumor_dna_alignments.bam tumor_rna_alignments.bam ref.fa annotations.gtf" << endl;
out << "Options:" << endl;
out << "\t" << "-o STR Output file containing the ase variants in VCF format. [STDOUT]" << endl;
out << "\t\t" << "-o STR Output file containing the ase variants in VCF format. [STDOUT]" << endl;
out << "\t\t" << "-d INT Minimum total read-depth for a somatic/ASE variant. [10]" << endl;
out << "\t\t" << "-w INT Window around a somatic variant to look for transcripts. ASE variants "
<< "will be in these transcripts[1000]" << endl;
Expand Down
39 changes: 22 additions & 17 deletions src/cis-splice-effects/cis_splice_effects_identifier.cc
Expand Up @@ -32,22 +32,24 @@ DEALINGS IN THE SOFTWARE. */

//Usage for this tool
void CisSpliceEffectsIdentifier::usage(ostream& out) {
out << "Usage:\t\t"
<< "regtools cis-splice-effects identify [options] variants.vcf"
<< " alignments.bam ref.fa annotations.gtf" << endl;
out << "Usage:"
<< "\t\t" << "regtools cis-splice-effects identify [options] variants.vcf"
<< "\t\t " << "alignments.bam ref.fa annotations.gtf" << endl;
out << "Options:" << endl;
out << "\t" << "-o STR Output file containing the aberrant splice junctions with annotations. [STDOUT]" << endl;
out << "\t\t" << "-v STR Output file containing variants annotated as splice relevant (VCF format)." << endl;
out << "\t\t" << "-w INT\tWindow size in b.p to identify splicing events in. "
<< "\t\t\t" << "The tool identifies events in variant.start +/- w basepairs."
<< "\t\t\t" << "Default behaviour is to look at the window between previous and next exons." << endl;
out << "\t\t" << "-j STR Output file containing the aberrant junctions in BED12 format." << endl;
out << "\t\t" << "-e INT\tMaximum distance from the start/end of an exon "
"\t\t\tto annotate a variant as relevant to splicing, the variant "
"\t\t\tis in exonic space, i.e a coding variant. [3]" << endl;
out << "\t\t" << "-i INT\tMaximum distance from the start/end of an exon "
"\t\t\tto annotate a variant as relevant to splicing, the variant "
"\t\t\tis in intronic space. [2]" << endl;
out << "\t\t" << "-o STR\tOutput file containing the aberrant splice junctions with annotations. [STDOUT]" << endl;
out << "\t\t" << "-v STR\tOutput file containing variants annotated as splice relevant (VCF format)." << endl;
out << "\t\t" << "-j STR\tOutput file containing the aberrant junctions in BED12 format." << endl;
out << "\t\t" << "-s INT\tStrand specificity of RNA library preparation \n"
<< "\t\t\t " << "(0 = unstranded, 1 = first-strand/RF, 2, = second-strand/FR). [1]" << endl;
out << "\t\t" << "-w INT\tWindow size in b.p to identify splicing events in.\n"
<< "\t\t\t " << "The tool identifies events in variant.start +/- w basepairs.\n"
<< "\t\t\t " << "Default behaviour is to look at the window between previous and next exons." << endl;
out << "\t\t" << "-e INT\tMaximum distance from the start/end of an exon \n"
<< "\t\t\t " << "to annotate a variant as relevant to splicing, the variant \n"
<< "\t\t\t " << "is in exonic space, i.e a coding variant. [3]" << endl;
out << "\t\t" << "-i INT\tMaximum distance from the start/end of an exon \n"
<< "\t\t\t " << "to annotate a variant as relevant to splicing, the variant \n"
<< "\t\t\t " << "is in intronic space. [2]" << endl;
out << "\t\t" << "-I\tAnnotate variants in intronic space within a transcript(not to be used with -i)." << endl;
out << "\t\t" << "-E\tAnnotate variants in exonic space within a transcript(not to be used with -e)." << endl;
out << "\t\t" << "-S\tDon't skip single exon transcripts." << endl;
Expand Down Expand Up @@ -98,7 +100,7 @@ void CisSpliceEffectsIdentifier::parse_options(int argc, char* argv[]) {
optind = 1; //Reset before parsing again.
stringstream help_ss;
char c;
while((c = getopt(argc, argv, "o:w:v:j:e:Ei:ISh")) != -1) {
while((c = getopt(argc, argv, "o:w:v:j:e:Ei:IShs:")) != -1) {
switch(c) {
case 'o':
output_file_ = string(optarg);
Expand Down Expand Up @@ -130,6 +132,9 @@ void CisSpliceEffectsIdentifier::parse_options(int argc, char* argv[]) {
case 'h':
usage(help_ss);
throw common::cmdline_help_exception(help_ss.str());
case 's':
strandness_ = atoi(optarg);
break;
default:
usage(std::cerr);
throw runtime_error("Error parsing inputs!(1)\n\n");
Expand Down Expand Up @@ -225,7 +230,7 @@ void CisSpliceEffectsIdentifier::identify() {
if(write_annotated_variants_)
va.write_annotation_output(v1);
//Extract junctions near this variant
JunctionsExtractor je1(bam_, variant_region);
JunctionsExtractor je1(bam_, variant_region, strandness_);
je1.identify_junctions_from_BAM();
vector<Junction> junctions = je1.get_all_junctions();
//Add all the junctions to the unique set
Expand Down
5 changes: 4 additions & 1 deletion src/cis-splice-effects/cis_splice_effects_identifier.h
Expand Up @@ -78,6 +78,8 @@ class CisSpliceEffectsIdentifier {
bool all_exonic_space_;
//Option to skip single exon genes
bool skip_single_exon_genes_;
//strandness of data
int strandness_;
public:
//Constructor
CisSpliceEffectsIdentifier() : vcf_("NA"), output_file_("NA"),
Expand All @@ -89,7 +91,8 @@ class CisSpliceEffectsIdentifier {
exonic_min_distance_(3),
all_intronic_space_(false),
all_exonic_space_(false),
skip_single_exon_genes_(true) {}
skip_single_exon_genes_(true),
strandness_(1) {}
//Destructor
~CisSpliceEffectsIdentifier() {
if(ofs_.is_open()) {
Expand Down
1 change: 0 additions & 1 deletion src/cis-splice-effects/cis_splice_effects_main.cc
Expand Up @@ -66,5 +66,4 @@ int cis_splice_effects_main(int argc, char* argv[]) {
return cis_splice_effects_identify(argc - 1, argv + 1);
}
return cis_splice_effects_usage(std::cout);
return 0;
}
2 changes: 1 addition & 1 deletion src/junctions/junctions_annotator.cc
Expand Up @@ -429,7 +429,7 @@ int JunctionsAnnotator::parse_options(int argc, char *argv[]) {
int JunctionsAnnotator::usage(ostream& out) {
out << "Usage:\t\t" << "regtools junctions annotate [options] junctions.bed ref.fa annotations.gtf" << endl;
out << "Options:\t" << "-E include single exon genes" << endl;
out << "\t\t" << "-o Output file" << endl;
out << "\t\t" << "-o FILE\tThe file to write output to. [STDOUT]" << endl;
out << endl;
return 0;
}
80 changes: 63 additions & 17 deletions src/junctions/junctions_extractor.cc
Expand Up @@ -41,7 +41,7 @@ int JunctionsExtractor::parse_options(int argc, char *argv[]) {
optind = 1; //Reset before parsing again.
int c;
stringstream help_ss;
while((c = getopt(argc, argv, "ha:i:I:o:r:")) != -1) {
while((c = getopt(argc, argv, "ha:i:I:o:r:s:")) != -1) {
switch(c) {
case 'a':
min_anchor_length_ = atoi(optarg);
Expand All @@ -61,6 +61,9 @@ int JunctionsExtractor::parse_options(int argc, char *argv[]) {
case 'h':
usage(help_ss);
throw common::cmdline_help_exception(help_ss.str());
case 's':
strandness_ = atoi(optarg);
break;
case '?':
default:
usage();
Expand All @@ -85,15 +88,18 @@ int JunctionsExtractor::parse_options(int argc, char *argv[]) {

//Usage statement for this tool
int JunctionsExtractor::usage(ostream& out) {
out << "Usage:\t\t" << "regtools junctions extract [options] indexed_alignments.bam" << endl;
out << "Usage:"
<< "\t\t" << "regtools junctions extract [options] indexed_alignments.bam" << endl;
out << "Options:" << endl;
out << "\t" << "-a INT\tMinimum anchor length. Junctions which satisfy a minimum "
"anchor length on both sides are reported. [8]" << endl;
out << "\t\t" << "-a INT\tMinimum anchor length. Junctions which satisfy a minimum \n"
<< "\t\t\t " << "anchor length on both sides are reported. [8]" << endl;
out << "\t\t" << "-i INT\tMinimum intron length. [70]" << endl;
out << "\t\t" << "-I INT\tMaximum intron length. [500000]" << endl;
out << "\t\t" << "-o FILE\tThe file to write output to. [STDOUT]" << endl;
out << "\t\t" << "-r STR\tThe region to identify junctions "
"in \"chr:start-end\" format. Entire BAM by default." << endl;
out << "\t\t" << "-r STR\tThe region to identify junctions \n"
<< "\t\t\t " << "in \"chr:start-end\" format. Entire BAM by default." << endl;
out << "\t\t" << "-s INT\tStrand specificity of RNA library preparation \n"
<< "\t\t\t " << "(0 = unstranded, 1 = first-strand/RF, 2, = second-strand/FR). [1]" << endl;
out << endl;
return 0;
}
Expand Down Expand Up @@ -138,7 +144,16 @@ int JunctionsExtractor::add_junction(Junction j1) {
string start, end;
s1 << j1.start; start = s1.str();
s1 << j1.end; end = s1.str();
string key = j1.chrom + string(":") + start + "-" + end + ":" + j1.strand;
//since ?,+,- sort differently on different systems
string strand_proxy;
if(j1.strand == "+") {
strand_proxy = "0";
} else if(j1.strand == "-") {
strand_proxy = "1";
} else {
strand_proxy = "2";
}
string key = j1.chrom + string(":") + start + "-" + end + ":" + strand_proxy;

//Check if new junction
if(!junctions_.count(key)) {
Expand Down Expand Up @@ -204,17 +219,57 @@ void JunctionsExtractor::print_all_junctions(ostream& out) {
}

//Get the strand from the XS aux tag
void JunctionsExtractor::set_junction_strand(bam1_t *aln, Junction& j1) {
void JunctionsExtractor::set_junction_strand_XS(bam1_t *aln, Junction& j1) {
uint8_t *p = bam_aux_get(aln, "XS");
if(p != NULL) {
char strand = bam_aux2A(p);
strand ? j1.strand = string(1, strand) : j1.strand = string(1, '?');
//cerr <<"XS strand is " << strand << endl;
} else {
//cerr <<"XS strand is NULL" << endl;
j1.strand = string(1, '?');
return;
}
}

//Get the strand from the bitwise flag
void JunctionsExtractor::set_junction_strand_flag(bam1_t *aln, Junction& j1) {
uint32_t flag = (aln->core).flag;
int reversed = (flag >> 4) % 2;
int mate_reversed = (flag >> 5) % 2;
int first_in_pair = (flag >> 6) % 2;
int second_in_pair = (flag >> 7) % 2;
// strandness_ is 0 for unstranded, 1 for RF, and 2 for FR
int bool_strandness = strandness_ - 1;
int first_strand = !bool_strandness ^ first_in_pair ^ reversed;
int second_strand = !bool_strandness ^ second_in_pair ^ mate_reversed;
char strand;
if (first_strand){
strand = '+';
} else {
strand = '-';
}
//cerr << "flag is " << flag << endl;
// if strand inferences from first and second in pair don't agree, we've got a problem
if (first_strand == second_strand){
j1.strand = string(1, strand);
} else {
j1.strand = string(1, '?');
}
//cerr <<"flag strand is " << j1.strand << endl;
return;
}

//Get the strand
void JunctionsExtractor::set_junction_strand(bam1_t *aln, Junction& j1) {
// if unstranded data
if (strandness_ > 0){
return set_junction_strand_flag(aln, j1);
} else {
return set_junction_strand_XS(aln, j1);
}
}

//Parse junctions from the read and store in junction map
int JunctionsExtractor::parse_alignment_into_junctions(bam_hdr_t *header, bam1_t *aln) {
int n_cigar = aln->core.n_cigar;
Expand All @@ -226,15 +281,6 @@ int JunctionsExtractor::parse_alignment_into_junctions(bam_hdr_t *header, bam1_t
string chr(header->target_name[chr_id]);
uint32_t *cigar = bam_get_cigar(aln);

/*
//Skip duplicates
int flag = aln->core.flag;
if(flag & 1024) {
cerr << "Skipping read_pos " << read_pos << " flag " << flag << endl;
return 0;
}
*/

Junction j1;
j1.chrom = chr;
j1.start = read_pos; //maintain start pos of junction
Expand Down
37 changes: 26 additions & 11 deletions src/junctions/junctions_extractor.h
Expand Up @@ -100,18 +100,26 @@ struct Junction : BED {
static inline bool compare_junctions(const Junction &j1,
const Junction &j2) {
//Different chromosome
if(j1.chrom < j2.chrom)
if(j1.chrom < j2.chrom){
return true;
if(j1.chrom > j2.chrom)
}
if(j1.chrom > j2.chrom){
return false;
}
//Same chromosome
if(j1.thick_start == j2.thick_start) {
if(j1.thick_end < j2.thick_end)
return true;
else
return false;
if(j1.thick_start < j2.thick_start) {
return true;
}
if(j1.thick_start > j2.thick_start) {
return false;
}
return j1.thick_start < j2.thick_start;
if(j1.thick_end < j2.thick_end) {
return true;
}
if(j1.thick_end > j2.thick_end) {
return false;
}
return j1.name < j2.name;
}

//Sort a vector of junctions
Expand All @@ -134,7 +142,7 @@ class JunctionsExtractor {
//Maximum length of an intron, i.e max junction width
uint32_t max_intron_length_;
//Map to store the junctions
//The key is "chr:start-end"
//The key is "chr:start-end:strand"
//The value is an object of type Junction(see above)
map<string, Junction> junctions_;
//Maintain a sorted list of junctions
Expand All @@ -145,19 +153,22 @@ class JunctionsExtractor {
string output_file_;
//Region to identify junctions, in "chr:start-end" format
string region_;
//strandness of data; 0 = unstranded, 1 = RF, 2 = FR
int strandness_;
public:
//Default constructor
JunctionsExtractor() {
//cerr << "default constructor called" << endl;
min_anchor_length_ = 8;
min_intron_length_ = 70;
max_intron_length_ = 500000;
junctions_sorted_ = false;
strandness_ = 1;
bam_ = "NA";
output_file_ = "NA";
region_ = ".";
}
//Default constructor
JunctionsExtractor(string bam1, string region1) : bam_(bam1), region_(region1) {
JunctionsExtractor(string bam1, string region1, int strandness1) : bam_(bam1), region_(region1), strandness_(strandness1) {
min_anchor_length_ = 8;
min_intron_length_ = 70;
max_intron_length_ = 500000;
Expand Down Expand Up @@ -193,6 +204,10 @@ class JunctionsExtractor {
//Add a junction to the junctions map
int add_junction(Junction j1);
//Get the strand from the XS aux tag
void set_junction_strand_XS(bam1_t *aln, Junction& j1);
//Get the strand from bitwise flag
void set_junction_strand_flag(bam1_t *aln, Junction& j1);
//Get the strand
void set_junction_strand(bam1_t *aln, Junction& j1);
};

Expand Down
6 changes: 3 additions & 3 deletions src/regtools.cc
Expand Up @@ -42,9 +42,9 @@ void version() {

//Regtools usage
int usage() {
cerr << "Usage:\t\t" << "regtools <command> [options]" << endl;
cerr << "Command:\t" << "junctions\t\tTools that operate on feature junctions."
<< "\t\t\t\t\t(eg. exon-exon junctions from RNA-seq.)" << endl;
cerr << "Usage:"
<< "\t\t" << "regtools <command> [options]" << endl;
cerr << "Command:\t" << "junctions\t\tTools that operate on feature junctions (e.g. exon-exon junctions from RNA-seq)." << endl;
cerr << "\t\t" << "cis-ase\t\t\tTools related to allele specific expression in cis." << endl;
cerr << "\t\t" << "cis-splice-effects\tTools related to splicing effects of variants." << endl;
cerr << "\t\t" << "variants\t\tTools that operate on variants." << endl;
Expand Down
15 changes: 8 additions & 7 deletions src/variants/variants_annotator.cc
Expand Up @@ -33,15 +33,16 @@ DEALINGS IN THE SOFTWARE. */
//Usage statement for this tool
int VariantsAnnotator::usage(ostream& out) {
out << "Usage:\t\t" << "regtools variants annotate [options] variants.vcf annotations.gtf" << endl;
out << "\t\t" << "-e INT\tMaximum distance from the start/end of an exon "
"\t\t\tto annotate a variant as relevant to splicing, the variant "
"\t\t\tis in exonic space, i.e a coding variant. [3]" << endl;
out << "\t\t" << "-i INT\tMaximum distance from the start/end of an exon "
"\t\t\tto annotate a variant as relevant to splicing, the variant "
"\t\t\tis in intronic space. [2]" << endl;
out << "Options:" << endl;
out << "\t\t" << "-o FILE\tThe file to write output to. [STDOUT]" << endl;
out << "\t\t" << "-e INT\tMaximum distance from the start/end of an exon \n"
<< "\t\t\t " << "to annotate a variant as relevant to splicing, the variant \n"
<< "\t\t\t " << "is in exonic space, i.e a coding variant. [3]" << endl;
out << "\t\t" << "-i INT\tMaximum distance from the start/end of an exon \n"
<< "\t\t\t " << "to annotate a variant as relevant to splicing, the variant \n"
<< "\t\t\t " << "is in intronic space. [2]" << endl;
out << "\t\t" << "-I\tAnnotate variants in intronic space within a transcript(not to be used with -i)." << endl;
out << "\t\t" << "-E\tAnnotate variants in exonic space within a transcript(not to be used with -e)." << endl;
out << "\t\t" << "-o\tFile to write output to. [STDOUT]" << endl;
out << "\t\t" << "-S\tDon't skip single exon transcripts." << endl;
out << endl;
return 0;
Expand Down
1 change: 1 addition & 0 deletions src/variants/variants_annotator.h
Expand Up @@ -148,6 +148,7 @@ class VariantsAnnotator {
vcf_record_(NULL) {
vcf_record_ = bcf_init();
intronic_min_distance_ = intronic_min_distance;
cerr << "exonic_min_distance_ is " << exonic_min_distance_ << endl;
exonic_min_distance_ = exonic_min_distance;
all_intronic_space_ = all_intronic_space;
all_exonic_space_ = all_exonic_space;
Expand Down
@@ -0,0 +1,2 @@
chrom start end name score strand splice_site acceptors_skipped exons_skipped donors_skipped anchor known_donor known_acceptor known_junction genes transcripts variant_info
22 93668 97252 JUNC00000001 5 - CT-AC 0 0 0 N 0 0 0 NA NA 22:94626-94627

0 comments on commit 1745555

Please sign in to comment.