Permalink
Browse files

New version of novel exon detection script

  • Loading branch information...
tjakobi committed Jan 25, 2019
1 parent 97df07e commit 2eaf6e4df8b1510733019e7c1c4a4ae31ff6ff24
Showing with 87 additions and 21 deletions.
  1. +87 −21 scripts/detect_new_exons_from_fuchs_data.py
@@ -18,6 +18,7 @@
import argparse
import os
import sys
import pybedtools


def check_input_files(input_file_list):
@@ -32,7 +33,6 @@ def check_input_files(input_file_list):


def parse_bed_12_file(input_file, annotation, local_dict, min_coverage, allowed_wobble):

with open(input_file) as fp:

for line in fp:
@@ -51,12 +51,12 @@ def parse_bed_12_file(input_file, annotation, local_dict, min_coverage, allowed_
coverage = float(location_string[2])

if coverage >= min_coverage:
for wobble in range(-1*allowed_wobble, allowed_wobble):
for wobble in range(-1 * allowed_wobble, allowed_wobble):

if columns[0] + "_" + str(int(columns[1])+wobble) in annotation:
if columns[0] + "_" + str(int(columns[1]) + wobble) in annotation:
start = 1

if columns[0] + "_" + str(int(columns[2])+wobble) in annotation:
if columns[0] + "_" + str(int(columns[2]) + wobble) in annotation:
stop = 1

if start == 0 and stop == 0:
@@ -69,10 +69,11 @@ def parse_bed_12_file(input_file, annotation, local_dict, min_coverage, allowed_
return local_dict


def parse_bed_6_file(input_file, annotation, local_dict, min_coverage, allowed_wobble):

def parse_bed_6_file(input_file, annotation, local_dict, allowed_wobble):
with open(input_file) as fp:

tmp_dict = {}

for line in fp:

if line.startswith("#"):
@@ -85,20 +86,22 @@ def parse_bed_6_file(input_file, annotation, local_dict, min_coverage, allowed_w
start = 0
stop = 0

for wobble in range(-1*allowed_wobble, allowed_wobble):
for wobble in range(-1 * allowed_wobble, allowed_wobble):

if columns[0] + "_" + str(int(columns[1])+wobble) in annotation:
if columns[0] + "_" + str(int(columns[1]) + wobble) in annotation:
start = 1

if columns[0] + "_" + str(int(columns[2])+wobble) in annotation:
if columns[0] + "_" + str(int(columns[2]) + wobble) in annotation:
stop = 1

if start == 0 and stop == 0:
location = columns[0] + "\t" + str(columns[1]) + "\t" + str(columns[2])
if location not in local_dict:
local_dict[location] = 1
else:
tmp_dict[location] = location
elif location in local_dict and location not in tmp_dict:
local_dict[location] += 1
tmp_dict[location] = location

return local_dict

@@ -134,14 +137,74 @@ def parse_gtf_file(input_file):
if int(columns[4]) - int(columns[3]) == 0:
continue

start_key = str(columns[0])+"_"+str(columns[3])
stop_key = str(columns[0])+"_"+str(columns[4])
start_key = str(columns[0]) + "_" + str(columns[3])
stop_key = str(columns[0]) + "_" + str(columns[4])

annotation[start_key] = 1
annotation[stop_key] = 1

return annotation


def print_gtf(bed_obj, output_file):

counter = 1

for line in str(bed_obj).splitlines():

tmp = line.split('\t')
chromosome = tmp[0].rstrip()
start = tmp[3].rstrip()
stop = tmp[4].rstrip()

print(chromosome + "\t"
"circtools" + "\t" +
"gene" + "\t" +
start + "\t" +
stop + "\t" +
".\t" +
".\t" +
".\t" +
"gene_id \"circtools_gene_" + str(counter) + "\"; " +
"gene_name \"circtools_gene_" + str(counter) + "\"; " +
"gene_source \"circtools\"; " +
"gene_biotype \"circRNA\"",
file=output_file
)

print(chromosome + "\t"
"circtools" + "\t" +
"transcript" + "\t" +
start + "\t" +
stop + "\t" +
".\t" +
".\t" +
".\t" +
"gene_id \"circtools_gene_" + str(counter) + "\"; " +
"gene_name \"circtools_gene_" + str(counter) + "\"; " +
"transcript_id \"circtools_transcript_" + str(counter) + "\"; " +
"gene_source \"circtools\"; " +
"gene_biotype \"circRNA\"",
file=output_file
)

print(chromosome + "\t"
"circtools" + "\t" +
"exon" + "\t" +
start + "\t" +
stop + "\t" +
".\t" +
".\t" +
".\t" +
"gene_id \"circtools_gene_" + str(counter) + "\"; " +
"gene_name \"circtools_gene_" + str(counter) + "\"; " +
"transcript_id \"circtools_transcript_" + str(counter) + "\"; " +
"gene_source \"circtools\"; " +
"gene_biotype \"circRNA\"",
file=output_file
)
counter += 1

# main script starts here


@@ -213,12 +276,10 @@ def parse_gtf_file(input_file):
default="bed12"
)


args = parser.parse_args()

check_input_files(args.bed_files)


if len(args.bed_files) != len(args.assignment):
print("Differing counts for BED files and group assignment, exiting.")
exit(-1)
@@ -227,7 +288,6 @@ def parse_gtf_file(input_file):
print("Threshold > number of provided sources files.")
exit(-1)


global_dict = {}

assignment_dict = {}
@@ -259,7 +319,6 @@ def parse_gtf_file(input_file):
global_dict[args.assignment[file]] = parse_bed_6_file(args.bed_files[file],
gtf_input,
global_dict[args.assignment[file]],
args.min_coverage,
args.wobble
)
else:
@@ -268,25 +327,32 @@ def parse_gtf_file(input_file):
global_dict[args.assignment[file]],
args.min_coverage,
args.wobble
)
)
# remove non-stringent exons
for sample in global_dict:
final_dict[sample] = {}
for key in global_dict[sample]:
if global_dict[sample][key] >= args.threshold:
final_dict[sample][key] = global_dict[sample][key]

for sample in final_dict:

file = open("sample_"+str(sample)+".gtf", "w")
reference_annotation = pybedtools.BedTool(args.base_exon_file)

for sample in final_dict:

file = ""
for key in final_dict[sample]:
entry = key.split('\t')

sep = "\t"

if int(entry[2]) - int(entry[1]) <= args.max_length:
file += (sep.join([entry[0], "circtools", "exon", entry[1], entry[2], ".", ".", ".", "."]) + "\n")

sample_bed = pybedtools.BedTool(file, from_string=True)
return_bed = sample_bed.intersect(reference_annotation, v=True)

print_gtf(return_bed, open("sample_"+str(sample)+".gtf", "w"))


file.write(sep.join([entry[0], "circtools", "exon", entry[1], entry[2], ".", ".", ".", "."])+"\n")

file.close()

0 comments on commit 2eaf6e4

Please sign in to comment.