New version of novel exon detection script

dieterich-lab · Jan 25, 2019 · 2eaf6e4 · 2eaf6e4
1 parent 97df07e
commit 2eaf6e4
Showing 1 changed file with 87 additions and 21 deletions.
diff --git a/scripts/detect_new_exons_from_fuchs_data.py b/scripts/detect_new_exons_from_fuchs_data.py
@@ -18,6 +18,7 @@
 import argparse
 import os
 import sys
+import pybedtools
 
 
 def check_input_files(input_file_list):
@@ -32,7 +33,6 @@ def check_input_files(input_file_list):
 
 
 def parse_bed_12_file(input_file, annotation, local_dict, min_coverage, allowed_wobble):
-
     with open(input_file) as fp:
 
         for line in fp:
@@ -51,12 +51,12 @@ def parse_bed_12_file(input_file, annotation, local_dict, min_coverage, allowed_
             coverage = float(location_string[2])
 
             if coverage >= min_coverage:
-                for wobble in range(-1*allowed_wobble, allowed_wobble):
+                for wobble in range(-1 * allowed_wobble, allowed_wobble):
 
-                    if columns[0] + "_" + str(int(columns[1])+wobble) in annotation:
+                    if columns[0] + "_" + str(int(columns[1]) + wobble) in annotation:
                         start = 1
 
-                    if columns[0] + "_" + str(int(columns[2])+wobble) in annotation:
+                    if columns[0] + "_" + str(int(columns[2]) + wobble) in annotation:
                         stop = 1
 
                 if start == 0 and stop == 0:
@@ -69,10 +69,11 @@ def parse_bed_12_file(input_file, annotation, local_dict, min_coverage, allowed_
     return local_dict
 
 
-def parse_bed_6_file(input_file, annotation, local_dict, min_coverage, allowed_wobble):
+def parse_bed_6_file(input_file, annotation, local_dict, allowed_wobble):
-
     with open(input_file) as fp:
 
+        tmp_dict = {}
+
         for line in fp:
 
             if line.startswith("#"):
@@ -85,20 +86,22 @@ def parse_bed_6_file(input_file, annotation, local_dict, min_coverage, allowed_w
             start = 0
             stop = 0
 
-            for wobble in range(-1*allowed_wobble, allowed_wobble):
+            for wobble in range(-1 * allowed_wobble, allowed_wobble):
 
-                if columns[0] + "_" + str(int(columns[1])+wobble) in annotation:
+                if columns[0] + "_" + str(int(columns[1]) + wobble) in annotation:
                     start = 1
 
-                if columns[0] + "_" + str(int(columns[2])+wobble) in annotation:
+                if columns[0] + "_" + str(int(columns[2]) + wobble) in annotation:
                     stop = 1
 
             if start == 0 and stop == 0:
                 location = columns[0] + "\t" + str(columns[1]) + "\t" + str(columns[2])
                 if location not in local_dict:
                     local_dict[location] = 1
-                else:
+                    tmp_dict[location] = location
+                elif location in local_dict and location not in tmp_dict:
                     local_dict[location] += 1
+                    tmp_dict[location] = location
 
     return local_dict
 
@@ -134,14 +137,74 @@ def parse_gtf_file(input_file):
                 if int(columns[4]) - int(columns[3]) == 0:
                     continue
 
-                start_key = str(columns[0])+"_"+str(columns[3])
+                start_key = str(columns[0]) + "_" + str(columns[3])
-                stop_key = str(columns[0])+"_"+str(columns[4])
+                stop_key = str(columns[0]) + "_" + str(columns[4])
 
                 annotation[start_key] = 1
                 annotation[stop_key] = 1
 
     return annotation
 
+
+def print_gtf(bed_obj, output_file):
+
+    counter = 1
+
+    for line in str(bed_obj).splitlines():
+
+        tmp = line.split('\t')
+        chromosome = tmp[0].rstrip()
+        start = tmp[3].rstrip()
+        stop = tmp[4].rstrip()
+
+        print(chromosome + "\t"
+                           "circtools" + "\t" +
+              "gene" + "\t" +
+              start + "\t" +
+              stop + "\t" +
+              ".\t" +
+              ".\t" +
+              ".\t" +
+              "gene_id \"circtools_gene_" + str(counter) + "\"; " +
+              "gene_name \"circtools_gene_" + str(counter) + "\"; " +
+              "gene_source \"circtools\"; " +
+              "gene_biotype \"circRNA\"",
+              file=output_file
+              )
+
+        print(chromosome + "\t"
+                           "circtools" + "\t" +
+              "transcript" + "\t" +
+              start + "\t" +
+              stop + "\t" +
+              ".\t" +
+              ".\t" +
+              ".\t" +
+              "gene_id \"circtools_gene_" + str(counter) + "\"; " +
+              "gene_name \"circtools_gene_" + str(counter) + "\"; " +
+              "transcript_id \"circtools_transcript_" + str(counter) + "\"; " +
+              "gene_source \"circtools\"; " +
+              "gene_biotype \"circRNA\"",
+              file=output_file
+              )
+
+        print(chromosome + "\t"
+                           "circtools" + "\t" +
+              "exon" + "\t" +
+              start + "\t" +
+              stop + "\t" +
+              ".\t" +
+              ".\t" +
+              ".\t" +
+              "gene_id \"circtools_gene_" + str(counter) + "\"; " +
+              "gene_name \"circtools_gene_" + str(counter) + "\"; " +
+              "transcript_id \"circtools_transcript_" + str(counter) + "\"; " +
+              "gene_source \"circtools\"; " +
+              "gene_biotype \"circRNA\"",
+              file=output_file
+              )
+        counter += 1
+
 # main script starts here
 
 
@@ -213,12 +276,10 @@ def parse_gtf_file(input_file):
                    default="bed12"
                    )
 
-
 args = parser.parse_args()
 
 check_input_files(args.bed_files)
 
-
 if len(args.bed_files) != len(args.assignment):
     print("Differing counts for BED files and group assignment, exiting.")
     exit(-1)
@@ -227,7 +288,6 @@ def parse_gtf_file(input_file):
     print("Threshold > number of provided sources files.")
     exit(-1)
 
-
 global_dict = {}
 
 assignment_dict = {}
@@ -259,7 +319,6 @@ def parse_gtf_file(input_file):
         global_dict[args.assignment[file]] = parse_bed_6_file(args.bed_files[file],
                                                               gtf_input,
                                                               global_dict[args.assignment[file]],
-                                                              args.min_coverage,
                                                               args.wobble
                                                               )
     else:
@@ -268,25 +327,32 @@ def parse_gtf_file(input_file):
                                                                global_dict[args.assignment[file]],
                                                                args.min_coverage,
                                                                args.wobble
-                                                            )
+                                                               )
 # remove non-stringent exons
 for sample in global_dict:
     final_dict[sample] = {}
     for key in global_dict[sample]:
         if global_dict[sample][key] >= args.threshold:
             final_dict[sample][key] = global_dict[sample][key]
 
-for sample in final_dict:
 
-    file = open("sample_"+str(sample)+".gtf", "w")
+reference_annotation = pybedtools.BedTool(args.base_exon_file)
+
+for sample in final_dict:
 
+    file = ""
     for key in final_dict[sample]:
         entry = key.split('\t')
 
         sep = "\t"
 
         if int(entry[2]) - int(entry[1]) <= args.max_length:
+            file += (sep.join([entry[0], "circtools", "exon", entry[1], entry[2], ".", ".", ".", "."]) + "\n")
+
+    sample_bed = pybedtools.BedTool(file, from_string=True)
+    return_bed = sample_bed.intersect(reference_annotation, v=True)
+
+    print_gtf(return_bed, open("sample_"+str(sample)+".gtf", "w"))
+
 
-            file.write(sep.join([entry[0], "circtools", "exon", entry[1], entry[2], ".", ".", ".", "."])+"\n")
 
-    file.close()