Here, I take the insertion sequences (ISs) that inserted into a gene and examine the orientation of that gene, to see whether there is a bias in the orientation of the genes into which the IS inserted. The answer seems to be: no. 
I also check whether the IS inserts in the same orientation as the coding sequence (CDS) it interrupts. Apparently there is nothing to see there either. 

In [1]:
from Bio import SeqIO
from Bio.SeqFeature import SeqFeature, FeatureLocation

In [2]:
# Directory holding the reference genome files (REL606.gbk is read from here).
# The trailing slash matters: file names are appended by plain string concatenation.
ref = "C:/Users/Flora/Desktop/M2 Systèmes Complexes/Stage/Bioinfo/ref/"
# Directory holding the curated clone files that extract_positions scans for IS insertions.
# The trailing slash matters here too, for the same reason.
data = "C:/Users/Flora/Desktop/M2 Systèmes Complexes/Stage/Bioinfo/LTEE-clone-curated/"

In [3]:
# Parse the GenBank file of the ancestor into a single Biopython record;
# its annotated features are scanned by Gene_orientation to find interrupted CDSs.
record = SeqIO.read(ref+'REL606.gbk', 'genbank') #loading ancestor reference file

In [5]:
def extract_positions(IS_name, data_dir=None):
    '''Generates a dictionary listing the positions of insertions of a given IS that occurred at
    some point in at least one of the clones of one of the populations.

    Only one entry is kept per position, even if the insertion occurred in different populations:
    a later occurrence at the same position overwrites the earlier one (including its orientation).

    Parameters
    ----------
    IS_name : str
        Name of the insertion sequence (e.g. 'IS150'). It must appear as a whole
        whitespace-separated field of the line, so 'IS1' does not match 'IS150'.
    data_dir : str, optional
        Directory containing the .gd files. Defaults to the notebook-level ``data`` directory.

    Returns
    -------
    dict
        {position: [position, orientation]}, with both values kept as the strings read from
        the file (5th and 7th whitespace-separated fields of the matching line).
    '''

    from os import listdir #package to manage contents of a directory
    from os.path import isfile, join

    if data_dir is None:
        data_dir = data # fall back to the directory defined at the top of the notebook

    # Listing the file names of the .gd files. Names of 30 characters or more are skipped:
    # this is the workaround from the original analysis for unexpected entries in the directory.
    file_names = [f for f in listdir(data_dir) if isfile(join(data_dir, f)) and len(f) < 30]

    insertion_positions = {} # Creating a dictionary to hold the positions of the insertions
    for file_name in file_names: # for each file
        # 'with' guarantees the file is closed again; join() also works without a trailing slash
        with open(join(data_dir, file_name), "r") as gd_file:
            for line in gd_file: # we look at all the lines of the file
                fields = line.split()
                if 'MOB' in fields and IS_name in fields: #if the line corresponds to a mutation linked to the IS
                    # we add an entry for this IS position to our dictionary and save the orientation as well
                    # /!\ this will erase the previous insertion if there was already an insertion at this position
                    insertion_positions[fields[4]] = [fields[4], fields[6]]

    return insertion_positions
    

In [27]:
def Gene_orientation(IS_name):
    '''Looks at the orientation of genes into which an IS has inserted. Also looks at whether the IS
    inserted on the same strand as the gene.

    Parameters
    ----------
    IS_name : str
        Name of the insertion sequence, passed on to extract_positions (e.g. 'IS150').

    Returns
    -------
    list
        Strand of every interrupted CDS (one entry per insertion/CDS pair, so an insertion
        falling into overlapping CDSs contributes several entries).

    Also prints the proportion of those pairs where the IS and the CDS have the same orientation
    (nan when no insertion falls inside a CDS).
    '''

    positions_IS = extract_positions(IS_name) # generating a dictionary of the format {insertion_position: [position, orientation]}

    # Collecting the coordinates of the coding sequences once, instead of re-reading every feature
    # for each insertion. Coordinates are taken from the location object itself rather than from
    # its string representation, so fuzzy or unstranded locations do not break the parsing.
    cds_intervals = []
    for feature in record.features[1:]: # first feature skipped as in the original analysis (presumably the 'source' feature)
        if feature.type == 'CDS' and 'join' not in str(feature.location):
            #looking at coding sequences and excluding problematic (multi-part) locations
            location = feature.location
            cds_intervals.append((int(location.start), int(location.end), location.strand))

    orientations = [] #initializing list to contain orientations
    common_orientation = 0 #initializing count of common orientations

    for insertion_position, insertion in positions_IS.items(): #for each insertion
        IS_position = int(insertion_position) #getting the position as an integer

        for start_position, end_position, orientation in cds_intervals:
            # Biopython starts are 0-based and ends exclusive, whereas positions read from the
            # .gd files are taken to be 1-based: the CDS therefore covers start+1 .. end.
            if start_position < IS_position <= end_position: #if the IS interrupted that feature
                orientations.append(orientation) #we save the orientation of the CDS in our list
                if int(insertion[1]) == orientation: #if IS and CDS have the same orientation
                    common_orientation += 1 #we add +1 to the count

    # Guarding against a division by zero when no insertion landed inside a CDS
    if orientations:
        proportion = common_orientation/len(orientations)
    else:
        proportion = float('nan')
    print('Proportion of common orientations: ', proportion)
    return orientations


In [28]:
# Strands of the CDSs interrupted by IS150; the call also prints the proportion of shared IS/CDS orientations.
orientations_IS150 = Gene_orientation('IS150')

Proportion of common orientations:  0.42081447963800905


In [22]:
# Fraction of the CDSs interrupted by IS150 that lie on the -1 strand.
orientations_IS150.count(-1)/len(orientations_IS150)

0.5294117647058824

In [29]:
# Strands of the CDSs interrupted by IS1; the call also prints the proportion of shared IS/CDS orientations.
orientations_IS1 = Gene_orientation('IS1')

Proportion of common orientations:  0.4406779661016949


In [19]:
# Fraction of the CDSs interrupted by IS1 that lie on the -1 strand.
orientations_IS1.count(-1)/len(orientations_IS1)

0.4406779661016949

In [30]:
# Strands of the CDSs interrupted by IS186; the call also prints the proportion of shared IS/CDS orientations.
orientations_IS186 = Gene_orientation('IS186')

Proportion of common orientations:  0.5333333333333333


In [21]:
# Fraction of the CDSs interrupted by IS186 that lie on the -1 strand.
orientations_IS186.count(-1)/len(orientations_IS186)

0.4666666666666667