In [1]:
import sys
import re
import numpy as np
import pandas as pd
from Bio import SeqIO
from Bio import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqUtils import GC
import seaborn as sns
import matplotlib.pyplot as plt
import statistics
import operator
from tqdm.notebook import tqdm
from functions import *
import random
from Bio import Restriction
import copy
from itertools import combinations
from itertools import permutations
from itertools import product
import urllib.request as urllib2
from dna_features_viewer import BiopythonTranslator
from Bio.SeqFeature import SeqFeature, FeatureLocation

In [2]:
# Build the E. coli codon usage tables used later for recoding.
# Step 1: collect every CDS of the reference FASTA, keyed by the gene name found
# between "gene=" and the next "]" in the record description.
all_name_cds_ecoli = {}
for record in SeqIO.parse("../ref_files_m13/GCF_000005845.2_ASM584v2_cds_from_genomic.fna", "fasta"):
    text_after_gene_tag = str(record.description).split("gene=")[1]
    gene_label = text_after_gene_tag.split("]",1)[0]
    # A later record with the same gene label overwrites the earlier one.
    all_name_cds_ecoli[gene_label] = str(record.seq)

# Step 2: codon table computed from all CDS sequences.
# NOTE(review): 11 is presumably the NCBI translation table id (bacterial) -
# confirm against functions.create_codon_table.
codon_table_ecoli = create_codon_table(list(all_name_cds_ecoli.values()), 11)

# Step 3: reversed view of the whole-genome codon usage (passed to
# recode_perc_change as its codon_table argument further down).
rev_ecoli_codon_usage = reverse_codon_table(codon_table_ecoli)

In [45]:
def recode_perc_change(perc,cds,codon_table):
    """Recode about ``perc`` percent of the codons of ``cds`` with synonymous codons.

    Parameters
    ----------
    perc : number
        Per-codon chance (0-100) of being recoded; a codon is selected when
        ``random.randrange(100) < perc``.
    cds : str
        Nucleotide sequence, read codon by codon from index 0 (case-insensitive).
        Trailing bases that do not form a complete codon are dropped.
    codon_table : dict
        ``{amino_acid: {codon: weight}}``. The replacement is drawn among the
        other codons of every entry that contains the current codon, weighted
        by their ``weight``.

    Returns
    -------
    str
        The recoded sequence, upper-case.

    Notes
    -----
    "ATG", "TGG" and the codon that ends exactly at ``len(cds)`` (expected to be
    the stop codon) are never changed. A codon that has no synonymous
    alternative with a positive weight in ``codon_table`` is kept as is.
    The global ``random`` generator is re-seeded with the codon index at every
    position, so the result is reproducible and the same codon indices are
    selected in every sequence.
    """
    recoded_codons = []
    for pos in range(len(cds)//3):
        codon_in_cds = cds[pos*3:pos*3+3].upper()
        #set seed at each position in the CDS
        random.seed(pos)
        # Same draw as pick_this_pos(perc); always performed first so the random
        # stream is consumed exactly as before.
        selected = random.randrange(100) < perc
        is_last_codon = pos*3+3 == len(cds)
        if not selected or codon_in_cds in ("ATG","TGG") or is_last_codon:
            recoded_codons.append(codon_in_cds)
            continue

        # Gather the synonymous alternatives from the entry (or entries) that
        # actually contain this codon.
        other_codons = {}
        for codon_freq in codon_table.values():
            if codon_in_cds in codon_freq:
                for codon,freq in codon_freq.items():
                    if codon != codon_in_cds:
                        other_codons[codon] = freq

        if not other_codons or sum(other_codons.values()) <= 0:
            # Unknown codon, single-codon amino acid or only zero-weight
            # alternatives: nothing to draw from, keep the original codon.
            recoded_codons.append(codon_in_cds)
            continue

        # random.choices works with relative weights, so no normalisation is
        # needed (the former normalisation used the wrong table entry).
        replacement_codon = random.choices(population=list(other_codons.keys()),
                                           weights=list(other_codons.values()),
                                           k=1)[0]
        recoded_codons.append(replacement_codon.upper())
    return "".join(recoded_codons)


def merge_contiguous_regions(list_of_tuples):
    """Merge regions that overlap or share a border into single regions.

    Parameters
    ----------
    list_of_tuples : list of (start, end)
        Regions to merge. They do not need to be sorted.

    Returns
    -------
    list of (start, end) tuples
        The merged regions, sorted by start. Two regions are merged when the
        start of one is lower than or equal to the end of the other, so regions
        that merely touch (e.g. ``(0, 5)`` and ``(5, 10)``) are merged too.
        An empty input gives an empty list.
    """
    merge_list = []
    for region in sorted(list_of_tuples, key=lambda tup: (tup[0], tup[1])):
        start, end = region[0], region[1]
        if merge_list and start <= merge_list[-1][1]:
            # Overlaps or touches the region being built: extend it. max() keeps
            # the widest end when the new region is nested in the previous one.
            merged_start, merged_end = merge_list[-1]
            merge_list[-1] = (merged_start, max(merged_end, end))
        else:
            merge_list.append((start, end))
    return merge_list
def pick_this_pos(percent):
    """Draw an integer in [0, 100) and tell whether it falls below ``percent``.

    Uses the global ``random`` generator (one ``randrange`` call per invocation).
    """
    drawn_value = random.randrange(100)
    return drawn_value < percent

def same_ORF(interval, ref_pos_gene):
    """Return True when two positions are in the same reading frame.

    The two positions are in frame when their distance
    (``ref_pos_gene - interval``) is a multiple of 3.
    """
    frame_offset = (ref_pos_gene - interval) % 3
    return not frame_offset

# def overlap_situation(cds_parts,CDSs):
#     no_overlap_to_recode = {}
#     for gene_name, parts in cds_parts.items():
#         no_overlap_to_recode[gene_name] = {}
#         for gene_seq, interval in parts.items():
#             for m,(comp_gene_name,pos) in enumerate(CDSs.items()):
#                 if interval[0] < interval[1]:#gene II case: gene overlaping with the last position and the first position of the file.
#                     if pos[0] < interval[1] and interval[0] < pos[1]: #if overlap between region and compared region
#                         if gene_name == comp_gene_name: #if we are looking at interval in same gene
#                                 no_overlap_to_recode[gene_name][gene_seq] = interval
#                             else: #if overlaps, but not in the same gene:
#                                 if same_ORF(pos[1],pos[m+1][0]): #if the overlap is in the same ORF with the next gene
                                    
#                         else: #if comp_gene_name != gene_name, meaning an interval (seq to recode) overlaps with an actual CDSs in the construct other than himself
#                             if same_ORF(interval[1],pos[0]):#if this genome section is in the same orf compared to the others, append it. Otherwise, exclude it.
#                                 print(interval,pos)
#                                 no_overlap_to_recode[gene_name][gene_seq] = (interval[1],pos[1])
#                 else:
#                     print(gene_name)
#     return no_overlap_to_recode
                    
def find_forbidden_regions(path_to_gb_file):
    """Read a GenBank file and extract the regions that must not be recoded.

    A feature is "forbidden" when the text of its qualifiers contains "prom",
    "rbs", "term" or "origin" (promoters, RBS, terminators, replication origin).

    Parameters
    ----------
    path_to_gb_file : str
        Path of the annotated GenBank file.

    Returns
    -------
    forbidden_regions : list of (start, end)
        Coordinates of every forbidden feature, in file order. A feature with a
        compound ("join") location contributes one tuple per part.
    CDSs : dict
        ``{label: (start, end)}`` for every CDS feature, where ``label`` is the
        first value of the feature's "label" qualifier. For a CDS with a
        compound location (e.g. a gene spanning the end and the beginning of a
        circular sequence) ``start`` is the start of the first part and ``end``
        the end of the last part, so ``start`` can be greater than ``end``.
    dna_seq : Bio.Seq.Seq
        Sequence of the last record of the file.

    Raises
    ------
    ValueError
        If the file does not contain any record.
    KeyError
        If a CDS feature has no "label" qualifier.
    """
    forbidden_keywords = ("prom", "rbs", "term", "origin")
    forbidden_regions = []
    CDSs = {}
    dna_seq = None
    # The context manager closes the file once every record has been consumed.
    with open(path_to_gb_file, "r") as gb_handle:
        for gb_record in SeqIO.parse(gb_handle, "genbank"):
            dna_seq = gb_record.seq
            for feat in gb_record.features:
                if feat.location is None:
                    # Feature whose location could not be parsed: no coordinates to use.
                    continue
                # Simple locations expose a single part, compound ones several;
                # int() turns Biopython positions (exact or fuzzy) into plain ints.
                location_parts = feat.location.parts
                qualifiers_text = str(feat.qualifiers)
                if any(keyword in qualifiers_text for keyword in forbidden_keywords):
                    for part in location_parts:
                        forbidden_regions.append((int(part.start), int(part.end)))
                if feat.type == "CDS":
                    start = int(location_parts[0].start)
                    end = int(location_parts[-1].end)
                    CDSs[feat.qualifiers["label"][0]] = (start, end)
    if dna_seq is None:
        raise ValueError("no GenBank record found in %s" % path_to_gb_file)

    return forbidden_regions,CDSs,dna_seq
    
def get_changing_cds_parts(coor,mforbiden_regions,dna_seq):
    """Return the stretches of a CDS that are not covered by a forbidden region.

    Parameters
    ----------
    coor : (start, end)
        Absolute coordinates of the CDS in ``dna_seq`` (``start < end``).
    mforbiden_regions : list of (start, end)
        Forbidden regions (promoters, terminators, RBS, origin...), ideally
        already merged. They are sorted here, and overlapping regions are
        tolerated.
    dna_seq : sliceable sequence (Bio.Seq.Seq or str)
        Full sequence the coordinates refer to.

    Returns
    -------
    dict
        ``{sequence: (start, end)}`` for every non-empty stretch of the CDS that
        lies between forbidden regions. Always a dict: it is empty when the
        whole CDS is forbidden, and holds the whole CDS when no forbidden
        region touches it. Because the key is the sequence itself, two
        stretches with the exact same sequence share a single entry.
    """
    cds_start, cds_end = coor[0], coor[1]
    cds_parts = {}
    # First position of the CDS that is not known to be forbidden yet.
    free_from = cds_start
    for region in sorted(mforbiden_regions, key=lambda tup: (tup[0], tup[1])):
        region_start, region_end = region[0], region[1]
        if region_end < cds_start:
            # Forbidden region entirely before the CDS.
            continue
        if region_start >= cds_end:
            # Regions are sorted: this one and all the following ones start
            # after the CDS.
            break
        if region_start > free_from:
            cds_parts[str(dna_seq[free_from:region_start])] = (free_from, region_start)
        free_from = max(free_from, region_end)
    if free_from < cds_end:
        # Tail of the CDS located after the last forbidden region.
        cds_parts[str(dna_seq[free_from:cds_end])] = (free_from, cds_end)
    return cds_parts

def find_start_of_gene(last_pos,CDSs):
    """Find the first gene of ``CDSs`` whose coordinates contain ``last_pos``.

    ``CDSs`` maps gene names to ``(start, end)`` and is scanned in insertion
    order; bounds are inclusive. A gene with ``start >= end`` is treated as
    spanning the end and the beginning of the file: it contains every position
    located at or after its start, or at or before its end.

    Returns ``(gene_name, (start, end))`` for the first match, or ``None`` when
    no gene contains the position.
    """
    for gene_name, pos in CDSs.items():
        if pos[0] < pos[1]:
            contains_position = int(pos[0]) <= last_pos <= int(pos[1])
        else:
            # Gene overlapping the last and the first coordinate of the file.
            first_bound, second_bound = int(pos[0]), int(pos[1])
            contains_position = (last_pos <= min(first_bound, second_bound)
                                 or last_pos >= max(first_bound, second_bound))
        if contains_position:
            return (gene_name, pos)
    return None

# main: load the annotated construct, then list the recodable stretches of each CDS.
forbiden_regions, CDSs, dna_seq = find_forbidden_regions("../ref_files_m13/m13mp18-Kan_annotated.gb")

# Forbidden regions that touch or overlap are fused before being used.
merge_forbiden_regions = merge_contiguous_regions(forbiden_regions)
max_len_dna = len(str(dna_seq))

# cds_parts: {gene_name: {sequence: (start, end)}} for every gene of the construct.
cds_parts = {}
for gene_name, coor in CDSs.items():
    gene = dna_seq[coor[0]:coor[1]]
    if not coor[0] < coor[1]:
        # Gene spanning the end and the beginning of the file: process the part
        # up to the end of the sequence and the part from position 0 separately.
        cds_parts_end = get_changing_cds_parts((coor[0], len(dna_seq)), merge_forbiden_regions, dna_seq=dna_seq)
        cds_parts_start = get_changing_cds_parts((0, coor[1]), merge_forbiden_regions, dna_seq=dna_seq)
        cds_parts[gene_name] = {**cds_parts_end, **cds_parts_start}
        continue
    cds_parts[gene_name] = get_changing_cds_parts(coor, merge_forbiden_regions, dna_seq=dna_seq)
        
#find the recodable genome portions that overlap each other and exclude the out-of-frame overlaps!
#Result of this section: parts_to_recode, a list of one-entry dicts {(start,end): sequence}.
#NOTE(review): no_overlap_to_recode and last_pos are assigned below but never read in this cell.
no_overlap_to_recode = {}
all_locations = [interval for gene_name, parts in cds_parts.items() for gene_seq, interval in parts.items()]
uniq_all_locations_sorted = sorted(list(set(all_locations)), key=lambda tup: (tup[0],tup[1])) #unique intervals, sorted by start then by end.
#print(uniq_all_locations_sorted)
parts_to_recode = []
wrong_frame_lastpos = 0 #end of the last region found to overlap another region in a different frame; later regions start at or after it.
for i,positions in enumerate(uniq_all_locations_sorted):
    #print(positions)
    append = True #set to False when the end of this region is covered by an out-of-frame overlap.
    if i+1 < len(uniq_all_locations_sorted):
        new_coors = {}
        next_coors = uniq_all_locations_sorted[i+1]
        #print("next_coors : ",next_coors)
        while next_coors[0] <= positions[1] and positions[0] <= next_coors[1]: #while there are overlapping regions.
            #print(find_start_of_gene(int(positions[1]),CDSs)[0],find_start_of_gene(int(positions[1]),CDSs)[1][0],positions[1],next_coors[0])
            #if next_coors[0] > or < find_start_of_gene(int(positions[1]),CDSs), the condition still works :)
            #compare the start of the gene containing positions[1] with the start of the next region:
            if same_ORF(find_start_of_gene(int(positions[1]),CDSs)[1][0],next_coors[0]):
                if positions[1] == next_coors[1]: #should be always true, but just for safety:
                    new_coors[(max([wrong_frame_lastpos,positions[0]]),positions[1])] = str(dna_seq)[max([wrong_frame_lastpos,positions[0]]):positions[1]] #not appended directly, other CDSs might overlap in a different ORF
                else:
                    print("two overlaping CDSs in the same ORF have different STOP codSon position. Please revise annotations")
                    print("the program exit in an unexcepted way.")
                    break
            else: #different frame: only the stretch located before the overlap can be kept.
                if positions[0] < next_coors[0]: # we do not want to add empty sequences in dictionary:
                    parts_to_recode.append({(max([wrong_frame_lastpos,positions[0]]),next_coors[0]) : str(dna_seq[max([wrong_frame_lastpos,positions[0]]):next_coors[0]]) })
                if positions[1] < next_coors[1]: #the whole end of this region overlaps the other ORF, so nothing more is appended for it.
                    append = False
                    wrong_frame_lastpos = positions[1]
                    break
                else:
                    last_pos = next_coors[1]
                wrong_frame_lastpos = positions[1]
            #print(parts_to_recode)
            #NOTE(review): list.index(value, start) is called here with (next_coors[0], next_coors[1]),
            #i.e. it looks for an integer inside a list of tuples. That appears to always raise ValueError,
            #so the bare except below ends the while loop after a single pass. index(next_coors) was
            #probably intended - confirm before changing, the downstream results depend on it.
            try: #changing position to the next element of the list:
                next_coors = uniq_all_locations_sorted[uniq_all_locations_sorted.index(next_coors[0],next_coors[1])+1]
            except: #if we are at the end of the list
                break

    #NOTE(review): new_coors is only assigned inside the branch above or at the end of a previous
    #iteration; with a single region and a fresh kernel it would be undefined here.
    if append: #if something to append in the list of dicts:
        if new_coors:
            parts_to_recode.append(new_coors)
        else: #if no specified coordinates, meaning no overlap in frame, simply add the given region:
            parts_to_recode.append({(max([wrong_frame_lastpos,positions[0]]),positions[1]) : str(dna_seq[max([wrong_frame_lastpos,positions[0]]):positions[1]]) })
    new_coors = {}
    #print(parts_to_recode)
    
# Drop the regions that are entirely contained in another region. Regions were
# derived gene by gene, so when two genes overlap the same stretch of genome can
# be listed twice (once on its own, once inside a larger region).
all_pos = [tuple(list(dic)[0]) for dic in parts_to_recode]
removed_items = []  # never populated; kept so the name stays defined
contained_positions = [
    inner
    for outer in all_pos
    for inner in all_pos
    if outer != inner and outer[0] <= inner[0] and outer[1] >= inner[1]
]
# A dict is kept only when none of the contained positions is one of its keys.
parts_to_recode = [
    dic for dic in parts_to_recode
    if not any(inner in dic for inner in contained_positions)
]
#print("je suis le vrai dico: ", parts_to_recode)

in_frame_parts_to_recode = []
#Trim every region of parts_to_recode so that it starts and ends on a codon boundary of its reference gene.
#The reference gene is the first gene of CDSs that contains the end position of the region.
#Result: in_frame_parts_to_recode, a list of one-entry dicts {(start,end): sequence}.
#######if there are genes on different strands, this code is not correct#######

for recode_regions in parts_to_recode:
    init_pos,end_pos = (next(iter(recode_regions))[0],next(iter(recode_regions))[1]) #next(iter(recode_regions)) is the first and only key for each dict (aka the position of each region).
    #NOTE(review): find_start_of_gene returns None when no gene contains end_pos; the [1] below would then fail.
    start_ref_gene = find_start_of_gene(end_pos,CDSs)[1][0]
    end_ref_gene = find_start_of_gene(end_pos,CDSs)[1][1]
    #note: this if was really difficult to write. I am NOT sure it is the most general answer to find the correct frame for each region.
    if start_ref_gene > end_ref_gene and init_pos < end_ref_gene: #exceptional case: the reference gene wraps around the end of the file and the region lies in its part located at the beginning of the file.
        len_dna_seq = len(dna_seq)
        init_pos_to_zero = (3-(init_pos)%3)%3   #number of bases to add to init_pos to be in frame with position 0.
        end_pos_to_zero = (end_pos)%3
        start_ref_gene_end_fileinframe = (3-(len_dna_seq-start_ref_gene)%3)%3 #is the end of the file in frame with the start of the gene that spans the end and the beginning of the file? NOTE(review): this value is overwritten by the next line.
        start_ref_gene_end_fileinframe = (len_dna_seq-start_ref_gene)%3
        inframe_init_pos = init_pos + init_pos_to_zero + start_ref_gene_end_fileinframe
        inframe_end_pos = end_pos - start_ref_gene_end_fileinframe - end_pos_to_zero  #NOT GOOD (author's note: this formula is known to be unreliable)
        in_frame_parts_to_recode.append({(inframe_init_pos,inframe_end_pos) : str(dna_seq)[inframe_init_pos:inframe_end_pos]})
        
    else:  #general case: move the start forward and the end backward to the nearest positions that are a multiple of 3 away from start_ref_gene.
        inframe_init_pos = init_pos+(3-(init_pos-start_ref_gene)%3)%3
        inframe_end_pos = end_pos-(3-(start_ref_gene-end_pos)%3)%3
        in_frame_parts_to_recode.append({(inframe_init_pos,inframe_end_pos) : str(dna_seq)[inframe_init_pos:inframe_end_pos]})
print(in_frame_parts_to_recode)

# Sanity check: every in-frame region must have a length that is a multiple of 3.
# The offending coordinates and sequence are printed (the previous version
# printed names that were not defined at this point and raised a NameError
# instead of reporting the problem).
for dics in in_frame_parts_to_recode:
    for region_coords, region_seq in dics.items():
        if len(region_seq)%3 != 0:
            print("REGIONS IN FRAME ARE NOT ALL %3, SUCH AS: ")
            print(region_coords,region_seq)
            print("PLEASE FIX THIS ISSUE, IT WILL OTHERWISE CAUSE MAJOR PROBLEM!")

# Recode each in-frame region with a 25% chance per codon; the coordinates are
# kept as key so every recoded sequence can be placed back in the genome.
recoded_in_frame_parts = []
for dics in in_frame_parts_to_recode:
    region_coords = next(iter(dics.keys()))
    recoded_seq = recode_perc_change(25, next(iter(dics.values())), rev_ecoli_codon_usage)
    recoded_in_frame_parts.append({region_coords: recoded_seq})
# Stitch the recoded regions back between the untouched stretches of the genome.
# recoded_genome_parts alternates wild-type and recoded pieces, in genome order;
# "".join(recoded_genome_parts) is the full recoded sequence.
# NOTE(review): this completes a loop that was left unfinished (the cell ended on
# an incomplete `if` and could not be parsed) - confirm it matches the intent.
coor_min = 0  # placeholders from the original draft, not used below
coor_max = 0
recoded_genome_parts = []
genome_str = str(dna_seq)
genome_cursor = 0  # first genome position not yet written to recoded_genome_parts
for dics in sorted(recoded_in_frame_parts, key=lambda dic: next(iter(dic.keys()))):
    position = next(iter(dics.keys()))
    rec_seq = next(iter(dics.values()))
    overlaps_previous = position[0] < genome_cursor
    length_changed = len(rec_seq) != position[1]-position[0]
    if overlaps_previous or length_changed:
        # Region overlapping an already written one, inverted/out-of-range
        # coordinates, or a recoded sequence whose length differs from its
        # region: keep the wild-type sequence there so coordinates stay valid.
        continue
    recoded_genome_parts.append(genome_str[genome_cursor:position[0]])
    recoded_genome_parts.append(rec_seq)
    genome_cursor = position[1]
# Wild-type tail after the last recoded region.
recoded_genome_parts.append(genome_str[genome_cursor:])
assert sum(len(part) for part in recoded_genome_parts) == len(genome_str), "recoded genome length differs from the original"
    
    
              
### GTG used as a start codon: never recode it!
### antisense genes: handle with care!

        
        
        
        #delta_inframe_init_pos = (find_start_of_gene(init_pos,CDSs)[1][0] - init_pos)%3 #delta inframe is the adjustement to make each start of each region in frame 
#TODO: a condition is needed to recover the reading frames for gene pII.

    
    

[{(0, 378): 'AATGCTACTACTATTAGTAGAATTGATGCCACCTTTTCAGCTCGCGCCCCAAATGAAAATATAGCTAAACAGGTTATTGACCATTTGCGAAATGTATCTAATGGTCAAACTAAATCTACTCGTTCGCAGAATTGGGAATCAACTGTTATATGGAATGAAACTTCCAGACACCGTACTTTAGTTGCATATTTAAAACATGTTGAGCTACAGCATTATATTCAGCAATTAAGCTCTAAGCCATCCGCAAAAATGACCTCTTATCAAAAGGAGCAATTAAAGGTACTCTCTAATCCTGACCTGTTGGAGTTTGCTTCCGGTCTGGTTCGCTTTGAAGCTCGAATTAAAACGCGATATTTGAAGTCTTTCGGGCTTCCTCTT'}, {(426, 474): 'AAAGACCTGATTTTTGATTTATGGTCATTCTCGTTTTCTGAACTGTTT'}, {(495, 768): 'ATGAATATTTATGACGATTCCGCAGTATTGGACGCTATCCAGTCTAAACATTTTACTATTACCCCCTCTGGCAAAACTTCTTTTGCAAAAGCCTCTCGCTATTTTGGTTTTTATCGTCGTCTGGTAAACGAGGGTTATGATAGTGTTGCTCTTACTATGCCTCGTAATTCCTTTTGGCGTTATGTATCTGCATTAGTTGAATGTGGTATTCCTAAATCTCAACTGATGAATCTTTCTACCTGTAATAATGTTGTTCCGTTAGTTCGTTTTATT'}, {(816, 822): 'CTTAAA'}, {(842, 1085): 'ATGATTAAAGTTGAAATTAAACCATCTCAAGCCCAATTTACTACTCGTTCTGGTGTTTCTCGTCAGGGCAAGCCTTATTCACTGAATGAGCAGCTTTGTTACGTTGATTTGGGTAATGAATATCCGGTTCTTGTCAAGATTACTCTTGATGAAGGTCAGCCAGCCTATGCGCCTGGTCTGTACACCGTTCATCTGTCCTCTTTCAAAGT

In [40]:
# Spot check of recode_perc_change: recode one sequence at 25% with the E. coli codon usage.
# The sequence appears to be the (495, 768) region printed by the cell above.
# NOTE(review): this cell's execution count (40) is lower than that of the cell defining
# recode_perc_change (45); on a fresh kernel the defining cell must be run first.
recode_perc_change(25,"ATGAATATTTATGACGATTCCGCAGTATTGGACGCTATCCAGTCTAAACATTTTACTATTACCCCCTCTGGCAAAACTTCTTTTGCAAAAGCCTCTCGCTATTTTGGTTTTTATCGTCGTCTGGTAAACGAGGGTTATGATAGTGTTGCTCTTACTATGCCTCGTAATTCCTTTTGGCGTTATGTATCTGCATTAGTTGAATGTGGTATTCCTAAATCTCAACTGATGAATCTTTCTACCTGTAATAATGTTGTTCCGTTAGTTCGTTTTATT",rev_ecoli_codon_usage)

'ATGAACATATATGACGATTCCGCAGTATTGGACGCTATCCAGTCAAAACATTTTACAATTACCCCGAGCGGCAAAACTTCTTTTGCGAAAGCCAGCCGGTATTTTGGTTTTTATCGTCGTCTGGTAAACGAAGGTTATGACAGTGTTGCCCTTACTATGCCTCGCAACTCCTTCTGGCGTTATGTATCTGCATTAGTTGAGTGCGGTATTCCCAAATCCCAACTGATGAATCTTAGTACTTGTAATAACGTTGTTCCGTTAGTGCGTTTCATT'

In [None]:
#draft of many things!
        
        #append something
        
                
            # else:
            #     new_coors = {}
            #     wrong_frame = True
            #     wrong_frame_lastpos = next_coors[1]
            #     if next_coors[1] < positions[1]: #in the case the next cds (next_coors) is not in frame and shorter than the cds it is compared with:
            #         #appending the part before next_coors[0]:
            #         if not cds_smaller:
            #             parts_to_recode.append({str(dna_seq[positions[0]:next_coors[0]]):(positions[0],next_coors[0])})
            #             last_start = next_coors[0]
            #             last_end = next_coors[1]
            #             cds_smaller = True
            #             try:
            #                 next_coors = all_locations[all_locations.index(next_coors[0],next_coors[1])+1]
            #             except: #if we are at the end of the list, need to append the recoded parts not overlaping with the little cds:                                                                        
            #                 #appending the last part of the recodable cds, excluding the cds not in frame:
            #                 new_coors[dna_seq[next_coors[1]:positions[1]]] = (next_coors[1],positions[1])
            #                 parts_to_recode.append({str(dna_seq[next_coors[1]:positions[1]]):(next_coors[1],positions[1])})
            #         else: #if multiple smaller CDSs
            #             parts_to_recode.append({str(dna_seq[last_start:next_coors[0]]):(positions[0],next_coors[0])})
            #             last_start = next_coors[0]
            #             try:
            #                 next_coors = all_locations[all_locations.index(next_coors[0],next_coors[1])+1]
            #             except: #if we are at the end of the list, need to append the recoded parts not overlaping with the little cds:                                                                        
            #                 #appending the last part of the recodable cds, excluding the cds not in frame:
            #                 new_coors[dna_seq[next_coors[1]:positions[1]]] = (next_coors[1],positions[1])
            #             break
            #     else:
            #         in_frame_startpos = last_end+((3-last_end-positions[0])%3)
            #         in_frame_end_pos = ###ACHANGER POUR LE IN FRAME ET REMPLACER LE NEXT_COORD[0] PLUS BAS!
            #         if cds_smaller and in_frame_startpos =< next_coors[0]+3: #these two might overlap! Get at least 1 codon !
            #             in_frame_pos = (3-last_end-positions[0])%3
            #             parts_to_recode.append({str(dna_seq[in_frame_pos:next_coors[0]]):(positions[0],next_coors[0])})
            #         break
            # try:
            #     next_coors = all_locations[all_locations.index(next_coors[0],next_coors[1])+1]
            # except: #if we are at the end of the list
            #     break
                
    # elif wrong_frame: #do not append in dict if overlaping region not in frame
    #     if cds_smaller:
    #         parts_to_recode.append({str(dna_seq[next_coors[1]:positions[1]]):(next_coors[1],positions[1])})
    #     continue
    # elif new_coors:
    #     parts_to_recode.append(new_coors)
    # elif not new_coors:
    #     parts_to_recode.append({str(dna_seq[positions[0]:positions[1]]):(positions[0],positions[1])})
    # print(parts_to_recode)
    
# CDSs_no_overlaps = {}
# whose_overlaps = {}   #gene key overlaps with gene value.
# for i, (gene_name, coors) in enumerate(CDSs.items()):
#     CDSs_no_overlaps[gene_name] = {}
#     if i+1 < len(CDSs):
#         next_coors = list(CDSs.values())[i+1]
#         new_coors = {}
#         while next_coors[0] <= coors[1] and coors[0] <= next_coors[1] : # if overlap between 2 cdss
#             whose_overlaps[list(CDSs.keys())[i+1]] = gene_name
#             if gene_name in whose_overlaps: #meaning gene key starts not at the begining of itself, rather after the known end of previously fixed overlapé
#                 max_start_of_gene = max([CDSs[gene][1] for gene in whose_overlaps if gene == gene_name])
#             else:
#                 max_start_of_gene = coors[0]
#             if same_ORF(coors[1],next_coors[0]): #if overlaping cds are in frame, it is possible to recode
#                 new_coors[str(dna_seq[max_start_of_gene:next_coors[0]])] = (max_start_of_gene,coors[1])
#             else: #if there is just one cds not in frame, even if there is other overlaping cds, we cannot recode this part
#                 new_coors[str(dna_seq[max_start_of_gene:next_coors[0]])] = (max_start_of_gene,next_coors[0])
#                 break
#             next_coors = list(CDSs.values())[list(CDSs.values()).index((next_coors[0],next_coors[1]))+1]
#             print(next_coors,gene_name)
#         if not new_coors:
#             CDSs_no_overlaps[gene_name] = {str(dna_seq[coors[0]:coors[1]]):(coors[0],coors[1])}
            
# print(CDSs_no_overlaps)

            
            



# cds_part_no_prob_overlap = overlap_situation(cds_parts,CDSs)
# print(cds_part_no_prob_overlap)
# #print("j suis quoi???",cds_part_no_prob_overlap)

# recoded_genome_parts = {}
# for gene_name, parts in cds_part_no_prob_overlap.items():
#     recoded_genome_parts[gene_name] = {}
#     for part,coor in parts.items():
#         #Search for orf coordinates and recoded the cds:
#         start_part = coor[0]
#         end_part = coor[1]
#         start_codon = start_part+(start_part%3)
#         end_codon = end_part-(end_part%3)
#         if start_part < end_part:
#             in_frame_part = dna_seq[start_codon:end_codon]
#             recoded_part = str(recode_perc_change(50,in_frame_part,rev_ecoli_codon_usage))
#             if recoded_part != "":
#                 recoded_genome_parts[gene_name][recoded_part] = (start_codon,end_codon)
#         else:
#             print("minou")
#             in_frame_part_end = dna_seq[start_codon:max_len_dna]
#             in_frame_part_start = dna_seq[0:end_codon]
#             recoded_part_end = str(recode_perc_change(50,in_frame_part_end,rev_ecoli_codon_usage))
#             recoded_part_start = str(recode_perc_change(50,in_frame_part_start,rev_ecoli_codon_usage))
#             if recoded_part != "":
#                 recoded_genome_parts[gene_name][in_frame_part_end] = (start_codon,max_len_dna)
#                 recoded_genome_parts[gene_name][recoded_part_start] = (0,end_codon)       
# print(recoded_genome_parts)


#construct the final m13 vector:
# all_recoded_start_coor = [str(list(recoded_parts.values())).split(",")[0][2:] for recoded_parts in recoded_genome_parts.values()]
# all_recoded_start_coor = [(recoded_parts.values()) for recoded_parts in recoded_genome_parts.values()]
# print(all_recoded_start_coor)
# all_recoded_end_coor = [coor[1] for seq,coor in parts.items() for recoded_parts in recoded_genome_parts.values()]
# flat_recoded_cds = {coor:seq for seq,coor in partsATGA.items() for parts in recoded_genome_parts.values()}
# print(flat_recoded_cds)

# max_len_dna = len(str(dna_seq))[1,2,3,4,5,6,7,8].index(2)+1
# m13_WTZ = ""
# last_pos = 0
# for i in range(max_len_dna):
#     if i in all_recoded_start_coor:
#         m13_WTZ += dna_seq[last_pos:i]
#         last_pos = i
#     elif i in all_recoded_end_coor:
#         m13_WTZ += dna_seq[last_pos:i]
#         last_pos = i
#     else:
#         continue
# print(len(m13_WTZ))
        
        
#     print("old",cds_parts, "\n")    
#     #le gene en chevauche-t-il un autre? Si oui, couper la partie correspondante dans la liste de gene
#     cds_parts = overlap_situation(coor,cds_parts) 
#     print("new",cds_parts,"\n")
#     print(old_cds_parts==cds_parts)    
#     cds_recoded = {}
#     traceback = []
        
                 
        
        
        #when forbiden_regions[0] < coor[0]
    #find the reading frame in each part of changeable CDSs
    
    #change the reading frame in cds
    
    #merge all substring into a new file