Skip to content

Commit

Permalink
New version of novel exon detection script
Browse files Browse the repository at this point in the history
  • Loading branch information
tjakobi committed Jan 25, 2019
1 parent 97df07e commit 2eaf6e4
Showing 1 changed file with 87 additions and 21 deletions.
108 changes: 87 additions & 21 deletions scripts/detect_new_exons_from_fuchs_data.py
Expand Up @@ -18,6 +18,7 @@
import argparse import argparse
import os import os
import sys import sys
import pybedtools




def check_input_files(input_file_list): def check_input_files(input_file_list):
Expand All @@ -32,7 +33,6 @@ def check_input_files(input_file_list):




def parse_bed_12_file(input_file, annotation, local_dict, min_coverage, allowed_wobble): def parse_bed_12_file(input_file, annotation, local_dict, min_coverage, allowed_wobble):

with open(input_file) as fp: with open(input_file) as fp:


for line in fp: for line in fp:
Expand All @@ -51,12 +51,12 @@ def parse_bed_12_file(input_file, annotation, local_dict, min_coverage, allowed_
coverage = float(location_string[2]) coverage = float(location_string[2])


if coverage >= min_coverage: if coverage >= min_coverage:
for wobble in range(-1*allowed_wobble, allowed_wobble): for wobble in range(-1 * allowed_wobble, allowed_wobble):


if columns[0] + "_" + str(int(columns[1])+wobble) in annotation: if columns[0] + "_" + str(int(columns[1]) + wobble) in annotation:
start = 1 start = 1


if columns[0] + "_" + str(int(columns[2])+wobble) in annotation: if columns[0] + "_" + str(int(columns[2]) + wobble) in annotation:
stop = 1 stop = 1


if start == 0 and stop == 0: if start == 0 and stop == 0:
Expand All @@ -69,10 +69,11 @@ def parse_bed_12_file(input_file, annotation, local_dict, min_coverage, allowed_
return local_dict return local_dict




def parse_bed_6_file(input_file, annotation, local_dict, min_coverage, allowed_wobble): def parse_bed_6_file(input_file, annotation, local_dict, allowed_wobble):

with open(input_file) as fp: with open(input_file) as fp:


tmp_dict = {}

for line in fp: for line in fp:


if line.startswith("#"): if line.startswith("#"):
Expand All @@ -85,20 +86,22 @@ def parse_bed_6_file(input_file, annotation, local_dict, min_coverage, allowed_w
start = 0 start = 0
stop = 0 stop = 0


for wobble in range(-1*allowed_wobble, allowed_wobble): for wobble in range(-1 * allowed_wobble, allowed_wobble):


if columns[0] + "_" + str(int(columns[1])+wobble) in annotation: if columns[0] + "_" + str(int(columns[1]) + wobble) in annotation:
start = 1 start = 1


if columns[0] + "_" + str(int(columns[2])+wobble) in annotation: if columns[0] + "_" + str(int(columns[2]) + wobble) in annotation:
stop = 1 stop = 1


if start == 0 and stop == 0: if start == 0 and stop == 0:
location = columns[0] + "\t" + str(columns[1]) + "\t" + str(columns[2]) location = columns[0] + "\t" + str(columns[1]) + "\t" + str(columns[2])
if location not in local_dict: if location not in local_dict:
local_dict[location] = 1 local_dict[location] = 1
else: tmp_dict[location] = location
elif location in local_dict and location not in tmp_dict:
local_dict[location] += 1 local_dict[location] += 1
tmp_dict[location] = location


return local_dict return local_dict


Expand Down Expand Up @@ -134,14 +137,74 @@ def parse_gtf_file(input_file):
if int(columns[4]) - int(columns[3]) == 0: if int(columns[4]) - int(columns[3]) == 0:
continue continue


start_key = str(columns[0])+"_"+str(columns[3]) start_key = str(columns[0]) + "_" + str(columns[3])
stop_key = str(columns[0])+"_"+str(columns[4]) stop_key = str(columns[0]) + "_" + str(columns[4])


annotation[start_key] = 1 annotation[start_key] = 1
annotation[stop_key] = 1 annotation[stop_key] = 1


return annotation return annotation



def print_gtf(bed_obj, output_file):

counter = 1

for line in str(bed_obj).splitlines():

tmp = line.split('\t')
chromosome = tmp[0].rstrip()
start = tmp[3].rstrip()
stop = tmp[4].rstrip()

print(chromosome + "\t"
"circtools" + "\t" +
"gene" + "\t" +
start + "\t" +
stop + "\t" +
".\t" +
".\t" +
".\t" +
"gene_id \"circtools_gene_" + str(counter) + "\"; " +
"gene_name \"circtools_gene_" + str(counter) + "\"; " +
"gene_source \"circtools\"; " +
"gene_biotype \"circRNA\"",
file=output_file
)

print(chromosome + "\t"
"circtools" + "\t" +
"transcript" + "\t" +
start + "\t" +
stop + "\t" +
".\t" +
".\t" +
".\t" +
"gene_id \"circtools_gene_" + str(counter) + "\"; " +
"gene_name \"circtools_gene_" + str(counter) + "\"; " +
"transcript_id \"circtools_transcript_" + str(counter) + "\"; " +
"gene_source \"circtools\"; " +
"gene_biotype \"circRNA\"",
file=output_file
)

print(chromosome + "\t"
"circtools" + "\t" +
"exon" + "\t" +
start + "\t" +
stop + "\t" +
".\t" +
".\t" +
".\t" +
"gene_id \"circtools_gene_" + str(counter) + "\"; " +
"gene_name \"circtools_gene_" + str(counter) + "\"; " +
"transcript_id \"circtools_transcript_" + str(counter) + "\"; " +
"gene_source \"circtools\"; " +
"gene_biotype \"circRNA\"",
file=output_file
)
counter += 1

# main script starts here # main script starts here




Expand Down Expand Up @@ -213,12 +276,10 @@ def parse_gtf_file(input_file):
default="bed12" default="bed12"
) )



args = parser.parse_args() args = parser.parse_args()


check_input_files(args.bed_files) check_input_files(args.bed_files)



if len(args.bed_files) != len(args.assignment): if len(args.bed_files) != len(args.assignment):
print("Differing counts for BED files and group assignment, exiting.") print("Differing counts for BED files and group assignment, exiting.")
exit(-1) exit(-1)
Expand All @@ -227,7 +288,6 @@ def parse_gtf_file(input_file):
print("Threshold > number of provided sources files.") print("Threshold > number of provided sources files.")
exit(-1) exit(-1)



global_dict = {} global_dict = {}


assignment_dict = {} assignment_dict = {}
Expand Down Expand Up @@ -259,7 +319,6 @@ def parse_gtf_file(input_file):
global_dict[args.assignment[file]] = parse_bed_6_file(args.bed_files[file], global_dict[args.assignment[file]] = parse_bed_6_file(args.bed_files[file],
gtf_input, gtf_input,
global_dict[args.assignment[file]], global_dict[args.assignment[file]],
args.min_coverage,
args.wobble args.wobble
) )
else: else:
Expand All @@ -268,25 +327,32 @@ def parse_gtf_file(input_file):
global_dict[args.assignment[file]], global_dict[args.assignment[file]],
args.min_coverage, args.min_coverage,
args.wobble args.wobble
) )
# remove non-stringent exons # remove non-stringent exons
for sample in global_dict: for sample in global_dict:
final_dict[sample] = {} final_dict[sample] = {}
for key in global_dict[sample]: for key in global_dict[sample]:
if global_dict[sample][key] >= args.threshold: if global_dict[sample][key] >= args.threshold:
final_dict[sample][key] = global_dict[sample][key] final_dict[sample][key] = global_dict[sample][key]


for sample in final_dict:


file = open("sample_"+str(sample)+".gtf", "w") reference_annotation = pybedtools.BedTool(args.base_exon_file)

for sample in final_dict:


file = ""
for key in final_dict[sample]: for key in final_dict[sample]:
entry = key.split('\t') entry = key.split('\t')


sep = "\t" sep = "\t"


if int(entry[2]) - int(entry[1]) <= args.max_length: if int(entry[2]) - int(entry[1]) <= args.max_length:
file += (sep.join([entry[0], "circtools", "exon", entry[1], entry[2], ".", ".", ".", "."]) + "\n")

sample_bed = pybedtools.BedTool(file, from_string=True)
return_bed = sample_bed.intersect(reference_annotation, v=True)

print_gtf(return_bed, open("sample_"+str(sample)+".gtf", "w"))



file.write(sep.join([entry[0], "circtools", "exon", entry[1], entry[2], ".", ".", ".", "."])+"\n")


file.close()

0 comments on commit 2eaf6e4

Please sign in to comment.