In [2]:
import os
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd

In [50]:
#load data
block = pd.read_table("inversion_block_info_all.txt", sep='\t')
BW25113 = pd.read_csv("Ecoli_BW25113_final_exp0707.csv")

#sort block files based on taxa
BW25113_block = block[block.taxa=='NZ_CP009273'].sort_values(by='start').reset_index(drop=True)

#fixing rows that were not parsed properly
# NOTE(review): the [4:] slice presumes the first four unique `taxa` values are the
# correctly parsed ones and every later value marks a mis-parsed row -- confirm
# against the input file.
fix = block.taxa.unique()[4:]
# Collect the affected rows in one pd.concat call (DataFrame.append was removed in
# pandas 2.0, and growing a frame row group by row group is quadratic).
df = pd.concat([block.loc[block.taxa==i] for i in fix])

# In these rows `block` holds two pieces joined around the text 'NZ':
# the part before it is kept as the block name, the part after it as `block2`.
df['block2'] = df['block'].apply(lambda x : x.split('NZ')[1])
df['block'] = df['block'].apply(lambda x : x.split('NZ')[0])

# Drop the old `inversion` column and relabel the remaining columns so that each
# value ends up under its intended name (`block2` becomes `taxa`).
df = df.drop(columns=['inversion'])
df.columns = ['block','start', 'end', 'rev_comp', 'inversion', 'taxa']
df = df[['block', 'taxa', 'start', 'end','rev_comp', 'inversion']]

# Put back the 'NZ' prefix that the split removed.
df['taxa'] = df['taxa'].apply(lambda x : 'NZ'+x)

#add fixed rows into the block files
BW25113_block = pd.concat([BW25113_block, df.loc[df['taxa']=="NZ_CP009273"]])

# `start` of the repaired rows is not numeric yet; convert before sorting.
BW25113_block['start'] = pd.to_numeric(BW25113_block['start'])
BW25113_block = BW25113_block.sort_values(by='start').reset_index(drop=True)

In [4]:
#function to match by start
def match_block_start(gene_start, block_file):
    """Return the block whose start is the largest one strictly below ``gene_start``.

    Parameters
    ----------
    gene_start : number
        Start coordinate of the gene.
    block_file : pandas.DataFrame
        Block table with at least the columns ``start`` and ``block``.

    Returns
    -------
    The ``block`` value of the matching row, or ``None`` when no block starts
    before ``gene_start``. The comparison is strict, so a gene that starts
    exactly on a block start is not matched to that block.
    """
    ordered = block_file.sort_values(by='start').reset_index(drop=True)
    # Because the table is sorted by start, the qualifying starts form a prefix
    # of it, so a position in this list is also a row label in `ordered`.
    preceding = [s for s in ordered.start if s < gene_start]
    if not preceding:
        return None
    return ordered.block[np.argmax(preceding)]

#function to match by gene end 
def match_block_end(gene_end, block_file):
    """Return the block with the smallest end strictly above ``gene_end``.

    Parameters
    ----------
    gene_end : number
        End coordinate of the gene.
    block_file : pandas.DataFrame
        Block table with at least the columns ``start``, ``end`` and ``block``.

    Returns
    -------
    The ``block`` value of the matching row. When no block ends after
    ``gene_end`` the block with the smallest start is returned (the last row
    once the table is sorted by descending start). ``None`` is returned for an
    empty block table.

    Notes
    -----
    The row is looked up by its own label. The previous version used the
    position inside the filtered list of ends as a row label of the full table,
    which picks the wrong block as soon as a block that ends before
    ``gene_end`` sits between two candidates (e.g. a block nested in another).
    """
    block_file = block_file.sort_values(by='start', ascending=False).reset_index(drop=True)
    if block_file.empty:
        return None
    candidates = block_file[block_file['end'] > gene_end]
    if candidates.empty:
        return block_file['block'].iloc[-1]
    # idxmin gives the label of the first minimal end, i.e. the same tie-break
    # as np.argmin over the rows in descending-start order.
    return block_file['block'][candidates['end'].idxmin()]

## Block table used to check neighbouring blocks

In [5]:
# 1-based rank of every block in the current row order of the table.
BW25113_block['block_order'] = np.arange(len(BW25113_block)) + 1

In [6]:
#number of rows check
BW25113_block.shape[0]

1148

# BW25113

In [7]:
##find difference between blocks
# Gap between each block's start and the end of the block in the row before it;
# the first row has no predecessor and gets 0.
blk_start = BW25113_block.start
blk_end = BW25113_block.end
diff = [0] + [blk_start[k + 1] - blk_end[k] for k in range(len(blk_start) - 1)]

BW25113_block['diff'] = diff
# A negative gap means the current block starts inside the previous block.
BW25113_block['overlaps'] = BW25113_block['diff'] < 0

n_overlapping = sum(BW25113_block['overlaps'])
print('There are', len(BW25113_block), 'blocks and', n_overlapping, 'are overlapping blocks')

There are 1148 blocks and 5 are overlapping blocks


## Overlapping genes

In [8]:
# Toy input that can be swapped in to try the sweep by hand:
#   block = ['a','b','c','d']; start = [1,6,10,21]; end = [20,9,12,25]

block = BW25113_block['block']
start = BW25113_block['start']
end = BW25113_block['end']

N = len(block)

# One (position, block name, kind) event per block boundary.
tally = []
for idx in range(N):
    tally += [(start[idx], block[idx], "start"), (end[idx], block[idx], "end")]

# Order events by position only; the sort is stable, so events sharing a
# position keep the order in which they were added above.
tally.sort(key=lambda event: event[0])

# `stack` holds the blocks that are currently open. Whenever a block opens while
# another is still open, the comma-joined names of all open blocks (in opening
# order) are recorded as one key of `groups`.
groups = {}
stack = {}
for position, name, action in tally:
    if action == "start":
        stack[name] = True
        if len(stack) > 1:
            groups[','.join(stack)] = True
    elif action == "end":
        del stack[name]

In [9]:
# One row per recorded overlap group; the first two names of each group are
# split out as the earlier-opened block (Bigblock) and the next one (Smallblock1).
BW25113_overlaps = pd.DataFrame({'Overlapping_blocks': list(groups)})
blocks = BW25113_overlaps['Overlapping_blocks'].str.split(',',n=2, expand=True)
BW25113_overlaps['Bigblock'] = blocks[0]
BW25113_overlaps['Smallblock1'] = blocks[1]

# Attach coordinates and inversion flag of both blocks; the second merge makes
# pandas suffix the shared column names with _x (Bigblock) and _y (Smallblock1).
block_coords = BW25113_block[['block','start','end','inversion']]
for key in ('Bigblock', 'Smallblock1'):
    BW25113_overlaps = pd.merge(BW25113_overlaps, block_coords, left_on=key, right_on='block', how='left')

BW25113_overlaps['inversion_conflict'] = BW25113_overlaps['inversion_x'] != BW25113_overlaps['inversion_y']
# The second block counts as "inside" when it ends before the first one does.
BW25113_overlaps['inside_block'] = BW25113_overlaps['end_y'] < BW25113_overlaps['end_x']

conflicting = BW25113_overlaps[BW25113_overlaps['inversion_conflict']]
print('there are', sum(BW25113_overlaps['inversion_conflict']), 'out of', len(BW25113_overlaps),'overlaps that have inconsistent inversion')
print(sum(conflicting['inside_block']),'inconsitencies are inside blocks')
BW25113_overlaps.head()

there are 1 out of 5 overlaps that have inconsistent inversion
0 inconsitencies are inside blocks


Unnamed: 0,Overlapping_blocks,Bigblock,Smallblock1,block_x,start_x,end_x,inversion_x,block_y,start_y,end_y,inversion_y,inversion_conflict,inside_block
0,"Block56,Block57",Block56,Block57,Block56,220668,221215,0.0,Block57,220669,223414,0.0,False,False
1,"Block187,Block17",Block187,Block17,Block187,748625,753863,1.0,Block17,751912,751934,1.0,False,True
2,"Block501,Block97",Block501,Block97,Block501,1933474,1940335,1.0,Block97,1938486,1938513,1.0,False,True
3,"Block913,Block96",Block913,Block96,Block913,3585984,3597071,1.0,Block96,3591755,3591787,1.0,False,True
4,"Block1105,Block1106",Block1105,Block1106,Block1105,4460195,4460227,1.0,Block1106,4460214,4462437,0.0,True,False


In [10]:
# Partial overlaps only (second block not inside the first): the shared region
# runs from the start of the second block to the end of the first block.
overlapping_region = (
    BW25113_overlaps
    .loc[~BW25113_overlaps['inside_block'],
         ['Overlapping_blocks', 'start_y', 'end_x', 'inversion_conflict']]
    .rename(columns={'Overlapping_blocks': 'block', 'start_y': 'start', 'end_x': 'end'})
    .reset_index(drop=True)
)
overlapping_region

Unnamed: 0,block,start,end,inversion_conflict
0,"Block56,Block57",220669,221215,False
1,"Block1105,Block1106",4460214,4460227,True


In [11]:
def overlapping_gene2(start, end, block_file):
    """Classify genes relative to the region ``start``..``end``.

    ``block_file`` is a table with the columns ``gbk_start``, ``gbk_end`` and
    ``Locus_tag``; rows are read by the labels ``0 .. len-1``.

    Returns a list of three lists of locus tags:

    1. genes that begin before ``start`` and end after ``start``;
    2. genes that begin after ``start`` and end before ``end``;
    3. genes that begin strictly between ``start`` and ``end``.

    NOTE(review): list 3 does not look at the gene end, so every gene in list 2
    is also in list 3 -- confirm that this is intended.
    """
    spans_start, contained, begins_inside = [], [], []
    for row in range(len(block_file)):
        gene_start = block_file['gbk_start'][row]
        gene_end = block_file['gbk_end'][row]
        if gene_start < start and gene_end > start:
            spans_start.append(block_file['Locus_tag'][row])
        if gene_start > start:
            if gene_end < end:
                contained.append(block_file['Locus_tag'][row])
            if gene_start < end:
                begins_inside.append(block_file['Locus_tag'][row])
    return [spans_start, contained, begins_inside]

In [12]:
BW25113_regions = {}
# Gene classification for every partially overlapping region, keyed by the
# comma-joined block names of that region.
regions = {
    name: overlapping_gene2(region_start, region_end, BW25113)
    for name, region_start, region_end in zip(overlapping_region['block'],
                                              overlapping_region['start'],
                                              overlapping_region['end'])
}
regions

{'Block56,Block57': [['b0197'], [], ['b0198']],
 'Block1105,Block1106': [['b4237'], [], []]}

## inside block genes check

### no inversion conflict

In [13]:
# Nested overlaps whose two blocks agree on inversion; the region of interest
# is the inner (second) block itself.
nested_consistent = BW25113_overlaps['inside_block'] & ~BW25113_overlaps['inversion_conflict']
insideblock_region = (
    BW25113_overlaps
    .loc[nested_consistent, ['Overlapping_blocks', 'start_y', 'end_y', 'inversion_conflict']]
    .rename(columns={'Overlapping_blocks': 'block', 'start_y': 'start', 'end_y': 'end'})
    .reset_index(drop=True)
)

In [14]:
# Gene classification for every nested block without inversion conflict.
inside_regions = {
    name: overlapping_gene2(region_start, region_end, BW25113)
    for name, region_start, region_end in zip(insideblock_region['block'],
                                              insideblock_region['start'],
                                              insideblock_region['end'])
}
pd.DataFrame(inside_regions)

Unnamed: 0,"Block187,Block17","Block501,Block97","Block913,Block96"
0,[b0718],[b1854],[b3453]
1,[],[],[]
2,[],[],[]


In [15]:
# First rows of the nested, conflict-free regions.
insideblock_region.head(5)

Unnamed: 0,block,start,end,inversion_conflict
0,"Block187,Block17",751912,751934,False
1,"Block501,Block97",1938486,1938513,False
2,"Block913,Block96",3591755,3591787,False


In [16]:
# Number of genes per category (see overlapping_gene2) for each nested region,
# then the totals over all regions.
genes = list(inside_regions.values())
overlap_start = [len(per_region[0]) for per_region in genes]
overlap = [len(per_region[1]) for per_region in genes]
overlap_end = [len(per_region[2]) for per_region in genes]

print(sum(overlap_start), sum(overlap), sum(overlap_end))
    

3 0 0


### With inversion conflict

In [17]:
# Nested overlaps whose two blocks disagree on inversion; the region of interest
# is the inner (second) block itself.
nested_conflicting = BW25113_overlaps['inside_block'] & BW25113_overlaps['inversion_conflict']
block_region = (
    BW25113_overlaps
    .loc[nested_conflicting, ['Overlapping_blocks', 'start_y', 'end_y', 'inversion_conflict']]
    .rename(columns={'Overlapping_blocks': 'block', 'start_y': 'start', 'end_y': 'end'})
    .reset_index(drop=True)
)

In [18]:
# Gene classification for every nested block with an inversion conflict.
block_regions = {
    name: overlapping_gene2(region_start, region_end, BW25113)
    for name, region_start, region_end in zip(block_region['block'],
                                              block_region['start'],
                                              block_region['end'])
}
pd.DataFrame(block_regions)

In [19]:
# First rows of the nested regions that have an inversion conflict.
block_region.head(5)

Unnamed: 0,block,start,end,inversion_conflict


In [20]:
# Number of genes per category (see overlapping_gene2) for each conflicting
# nested region, then the totals over all regions.
genes = list(block_regions.values())
overlap_start = [len(per_region[0]) for per_region in genes]
overlap = [len(per_region[1]) for per_region in genes]
overlap_end = [len(per_region[2]) for per_region in genes]

print(sum(overlap_start), sum(overlap), sum(overlap_end))
    

0 0 0


### Omit blocks that lie inside another block

In [21]:
# A block is "inside" when its end does not exceed the end of every block in the
# rows before it. Comparing against the running maximum of the earlier ends is
# equivalent to re-checking all of them and avoids the quadratic rescan.
block_ends = []
inside_block = []
furthest_end = None
for i in range(len(BW25113_block) - 1):
    current_end = BW25113_block.end[i]
    block_ends.append(current_end)
    if furthest_end is None or current_end > furthest_end:
        furthest_end = current_end
    # `not (a > b)` rather than `a <= b`: equal ends also count as inside.
    if not (BW25113_block.end[i + 1] > furthest_end):
        inside_block.append(BW25113_block.block[i + 1])

print('There are',len(inside_block), 'inside blocks')

There are 3 inside blocks


In [22]:
##View inside block df 
# Right merge keeps exactly the blocks listed in `inside_block`, in list order.
inside_block_df = BW25113_block.merge(pd.DataFrame({'block': inside_block}), how='right', on='block')
n_inverted = sum(inside_block_df['inversion'])
print('there are', len(inside_block_df), 'inside blocks and', n_inverted, 'are inverted')
BW25113_inside_inverted = inside_block_df[inside_block_df['inversion']==1]
BW25113_inside_inverted

there are 3 inside blocks and 3.0 are inverted


Unnamed: 0,block,taxa,start,end,rev_comp,inversion,block_order,diff,overlaps
0,Block17,NZ_CP009273,751912,751934,1,1.0,185,-1951,True
1,Block97,NZ_CP009273,1938486,1938513,1,1.0,500,-1849,True
2,Block96,NZ_CP009273,3591755,3591787,1,1.0,913,-5316,True


In [23]:
# Final block table: every block except those found to lie inside another one.
# The row labels are not reset, so they keep gaps where blocks were removed.
is_inside = BW25113_block['block'].isin(inside_block)
BW25113_block_final = BW25113_block[~is_inside]

In [24]:
# Overlapping blocks that remain after removing the inside blocks.
remaining_overlaps = BW25113_block_final[BW25113_block_final['overlaps']==1]
print('there are', sum(BW25113_block_final['overlaps']), 'overlapping blocks and', 
      sum(remaining_overlaps['inversion']), 'are inverted')

there are 2 overlapping blocks and 0.0 are inverted


In [25]:
# Rows of the final block table that are flagged as overlapping.
BW25113_overlapped = BW25113_block_final.loc[BW25113_block_final['overlaps'] == 1]
BW25113_overlapped

Unnamed: 0,block,taxa,start,end,rev_comp,inversion,block_order,diff,overlaps
55,Block57,NZ_CP009273,220669,223414,0,0.0,56,-546,True
1105,Block1106,NZ_CP009273,4460214,4462437,0,0.0,1106,-13,True


## BW expression data 

In [51]:
#matching blocks by start
# One entry per gene, in row order of the expression table.
BW25113_block_by_start = list(
    map(lambda gene_start: match_block_start(gene_start, BW25113_block_final), BW25113['gbk_start'])
)



In [52]:
# Compare the largest block end with the largest gene end.
# NOTE(review): gbk_end appears to still be text at this point (the printed value
# carries a '>' prefix), so the second max() compares strings rather than numbers.
print(max(BW25113_block_final['end']),max(BW25113['gbk_end']))

4631461 >4087350


In [54]:
#matching block by end

# Strip the '>' prefix that some gbk_end values carry and make the column numeric.
# Going through astype(str) keeps this cell re-runnable: the previous element-wise
# `i.replace(...)` failed once the column had already been converted to numbers.
# NOTE(review): overlapping_gene2 is called in earlier cells and compares gbk_end
# with integers; on a fresh top-to-bottom run the column is still text there --
# consider moving this cleaning next to the data load.
BW25113['gbk_end'] = pd.to_numeric(
    BW25113['gbk_end'].astype(str).str.replace('>', '', regex=False)
)

BW25113_block_by_end = [match_block_end(i, BW25113_block_final) for i in BW25113.gbk_end]

In [55]:
#add new columns and make it a new dataframe
# .copy() makes this a genuinely separate frame; a plain assignment would only
# create a second name for BW25113 and the new columns would be added to it too.
BW25113_new = BW25113.copy()
BW25113_new['block_by_start'] = BW25113_block_by_start
BW25113_new['block_by_end'] = BW25113_block_by_end

#check if the start and end identifies the same block
BW25113_new['Single_block'] = BW25113_new['block_by_start'] == BW25113_new['block_by_end']
print('BW25113 has', len(BW25113_new), 'rows and', sum(BW25113_new.Single_block), 'are single blocks')

BW25113 has 4077 rows and 3041 are single blocks


In [56]:
#non single blocks - check for neighbouring 
BW25113_non_sb = BW25113_new[BW25113_new.Single_block==False]
print('There are',len(BW25113_non_sb), 'spitted genes')

# Only the order and the overlap flag of the matched blocks are needed, so they
# are selected and renamed by name before merging. This replaces dropping the
# other columns with a positional axis argument (removed in pandas 2.0) and
# renaming by writing into `columns.values`, which depended on column position
# and mutates the index in place.
block_lookup = BW25113_block_final[['block', 'block_order', 'overlaps']]

#merging by the gene start 
start_lookup = block_lookup.rename(columns={'block_order': 'start_order',
                                            'overlaps': 'start_block_overlap'})
BW25113_non_sb = pd.merge(BW25113_non_sb, start_lookup, how='left', left_on='block_by_start', right_on='block')
BW25113_non_sb = BW25113_non_sb.drop(columns=['Single_block', 'block'])

#merging by the gene end 
end_lookup = block_lookup.rename(columns={'block_order': 'end_order',
                                          'overlaps': 'end_block_overlap'})
BW25113_non_sb = pd.merge(BW25113_non_sb, end_lookup, how='left', left_on='block_by_end', right_on='block')
BW25113_non_sb = BW25113_non_sb.drop(columns=['block'])

There are 1036 spitted genes


In [57]:
#check continuity, neighbour and overlapping blocks 
n_split = len(BW25113_non_sb)
start_rank = BW25113_non_sb['start_order']
end_rank = BW25113_non_sb['end_order']

# Continuous: the block matched by the gene start comes before the one matched by the end.
BW25113_non_sb['check_continuity'] = start_rank.lt(end_rank)
print('There are', n_split, 'spitted genes,', sum(BW25113_non_sb['check_continuity']), 'are continous.') 
# Neighbouring: the two matched blocks are directly adjacent in block order.
BW25113_non_sb['check_neighbour'] = start_rank.add(1).eq(end_rank)
print('There are', n_split, 'spitted genes,', sum(BW25113_non_sb['check_neighbour']), 'are neighbouring blocks') 

There are 1036 spitted genes, 1036 are continous.
There are 1036 spitted genes, 953 are neighbouring blocks


In [58]:
# Attach the end of the block matched by the gene start and the start of the
# block matched by the gene end (both taken from the full block table).
start_block_end = BW25113_block[['block','end']]
end_block_start = BW25113_block[['block','start']]
BW25113_non_sb = (
    BW25113_non_sb
    .merge(start_block_end, left_on='block_by_start', right_on='block', how='left')
    .merge(end_block_start, left_on='block_by_end', right_on='block', how='left')
)

In [59]:
# Distance from the end of the start-matched block to the start of the end-matched block.
BW25113_non_sb['space_between_block'] = BW25113_non_sb['start'].sub(BW25113_non_sb['end'])

In [60]:
# Summary statistics of the strictly positive gaps only.
has_gap = BW25113_non_sb['space_between_block'] > 0
BW25113_non_sb.loc[has_gap, 'space_between_block'].describe()

count      1036.000000
mean      17729.616795
std       34128.479955
min           1.000000
25%         378.500000
50%        3070.000000
75%       11926.000000
max      114522.000000
Name: space_between_block, dtype: float64

## Single-block genes: merge with the block table

In [62]:
# Genes whose start and end were matched to the same block, joined with that
# block's row from the block table.
is_single_block = BW25113_new['Single_block']==True
BW25113_sb = BW25113_new[is_single_block].merge(
    BW25113_block, how='left', left_on='block_by_start', right_on='block'
)
print('There are', len(BW25113_sb), 'single block genes')

There are 3041 single block genes
