In [2]:
import os
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd

In [3]:
#load data
block = pd.read_table("inversion_block_info_all.txt", sep='\t',header=None)
ATCC = pd.read_csv("Ecoli_ATCC_25922_final_RawCounts.csv")
BW25113 = pd.read_csv("Ecoli_BW25113_final_RawCounts.csv")
K12DH = pd.read_csv("Ecoli_K12_DH10B_final_RawCounts.csv")
K12MG = pd.read_csv("Ecoli_K12_MG1655_final_RawCounts.csv")

In [4]:
block.columns = ['blockname','taxa','start','end','rev_comp','inversion']

In [5]:
block['block'] = block['blockname'].apply(lambda x:x.split('.')[0])

In [6]:
block = block[['block', 'taxa','start','end','rev_comp','inversion']]

In [7]:
block['taxa'].value_counts()

NC_010473      1148
NZ_CP009273    1148
U00096000      1148
CP009072       1148
Name: taxa, dtype: int64

In [8]:
#sort block files based on taxa
ATCC_block = block[block.taxa=='CP009072'].sort_values(by='start').reset_index(drop=True)
BW25113_block = block[block.taxa=='NZ_CP009273'].sort_values(by='start').reset_index(drop=True)
K12DH_block = block[block.taxa=='NC_010473'].sort_values(by='start').reset_index(drop=True)
K12MG_block = block[block.taxa=='U00096000'].sort_values(by='start').reset_index(drop=True)

# #fixing rows that were not parse properly
# fix = block.taxa.unique()[4:]
# df = pd.DataFrame({})
# for i in fix:
#     df = df.append(block.loc[block.taxa==i])

# df['block2'] = df['block'].apply(lambda x : x.split('NZ')[1])
# df['block'] = df['block'].apply(lambda x : x.split('NZ')[0])

# df = df.drop(['inversion'],1)
# df.columns = ['block','start', 'end', 'rev_comp', 'inversion', 'taxa']
# df = df[['block', 'taxa', 'start', 'end','rev_comp', 'inversion']]

# df['taxa'] = df['taxa'].apply(lambda x : 'NZ'+x)

# #add fixed rows into the block files
# ATCC_block = ATCC_block.append(df.loc[df['taxa']=="NZ_CP009072"])
# BW25113_block = BW25113_block.append(df.loc[df['taxa']=="NZ_CP009273"])

ATCC_block['start'] = pd.to_numeric(ATCC_block['start'])
ATCC_block = ATCC_block.sort_values(by='start').reset_index(drop=True)

BW25113_block['start'] = pd.to_numeric(BW25113_block['start'])
BW25113_block = BW25113_block.sort_values(by='start').reset_index(drop=True)

In [9]:
#function to match by start
def match_block_start(gene_start, block_file): 
    block_file = block_file.sort_values(by='start').reset_index(drop=True)
    starts =[]
    for i in block_file.start:
        if i < gene_start: 
            starts.append(i)
    if len(starts) == 0:
        return None
    else:
        return(block_file.block[np.argmax(starts)])

#function to match by gene end 
def match_block_end(gene_end, block_file):
    block_file = block_file.sort_values(by='start', ascending=False).reset_index(drop=True)
    ends =[]
    for j in block_file.end:
        if j > gene_end: 
            ends.append(j)
    if len(ends)==0:
        return(block_file.block[len(block_file)-1])
    else:
        return(block_file.block[np.argmin(ends)])

## files to check neighbouring blocks

In [10]:
ATCC_block['block_order'] = np.arange(1,len(ATCC_block)+1,1)
BW25113_block['block_order'] = np.arange(1,len(BW25113_block)+1,1)
K12DH_block['block_order'] = np.arange(1,len(K12DH_block)+1,1)
K12MG_block['block_order'] = np.arange(1,len(K12MG_block)+1,1)

In [11]:
#number of rows check
len(ATCC_block)==len(BW25113_block)==len(K12DH_block)==len(K12MG_block)

True

# ATCC

### Omitted blocks that are inside another block 

In [13]:
block_ends=[]
inside_block=[]
for i in range(len(ATCC_block)-1): 
    block_ends.append(ATCC_block.end[i])
    if not(all(ATCC_block.end[i+1] > x for x in block_ends)):
           inside_block.append(ATCC_block.block[i+1])

In [14]:
##View inside block df 
inside_block_df = pd.merge(ATCC_block, pd.DataFrame({'block':inside_block}), how='right', on='block')
print('there are', len(inside_block_df), 'inside blocks and', sum(inside_block_df['inversion']), 'are inverted')
ATCC_inside_inverted = inside_block_df[inside_block_df['inversion']==1]

there are 60 inside blocks and 22 are inverted


In [15]:
ATCC_block_final = ATCC_block[~ATCC_block['block'].isin(inside_block)]

In [16]:
len(ATCC_block)

1148

In [17]:
ATCC_block_final.head()

Unnamed: 0,block,taxa,start,end,rev_comp,inversion,block_order
0,Block611,CP009072,0,19237,0,0,1
1,Block612,CP009072,19639,25321,0,0,2
2,Block613,CP009072,25328,25469,0,0,3
3,Block614,CP009072,25805,30879,0,0,4
4,Block615,CP009072,30896,41163,0,0,5


In [18]:
ATCC_block_final['block'].nunique()

1088

## ATCC expression data                  

In [19]:
ATCC['gbk_end'] = ATCC['gbk_end'].apply(lambda x:x.replace('>',''))
ATCC['gbk_end'] = pd.to_numeric(ATCC['gbk_end'])

In [20]:
#matching blocks by start
ATCC_block_by_start = [match_block_start(i, ATCC_block_final) for i in ATCC['gbk_start']]

In [21]:
#matching block by end
ATCC_block_by_end = [match_block_end(i, ATCC_block_final) for i in ATCC['gbk_end']]

In [22]:
#add new columns and make it a new dataframe
ATCC_new = ATCC.copy()
ATCC_new['block_by_start'] = ATCC_block_by_start
ATCC_new['block_by_end'] = ATCC_block_by_end

#check if the start and end identifies the same block
ATCC_new['Single_block'] = ATCC_new['block_by_start'] == ATCC_new['block_by_end']
print('ATCC has', len(ATCC_new), 'rows and', sum(ATCC_new.Single_block), 'are single blocks')

ATCC has 4825 rows and 3181 are single blocks


## single block and merge with block file 

In [23]:
ATCC_sb = ATCC_new[ATCC_new['Single_block']==True]

In [24]:
ATCC_sb = pd.merge(ATCC_sb, ATCC_block_final, how='left', left_on='block_by_start', right_on='block')

In [25]:
ATCC_sb.head()

Unnamed: 0,gene_id,gbk_start,gbk_end,gbk_midpoint,gbk_gene_id,gbk_old_locus_tag,gbk_strand,GSE94978_1,GSE94978_2,block_by_start,block_by_end,Single_block,block,taxa,start,end,rev_comp,inversion,block_order
0,DR76_RS00005,1,1278,639,,DR76_1,0,107,208,Block611,Block611,True,Block611,CP009072,0,19237,0,0,1
1,DR76_RS00010,1275,2279,1777,,DR76_2,0,128,346,Block611,Block611,True,Block611,CP009072,0,19237,0,0,1
2,DR76_RS00015,2276,3241,2758,,DR76_4,0,221,416,Block611,Block611,True,Block611,CP009072,0,19237,0,0,1
3,DR76_RS00020,3215,3961,3588,,DR76_3,1,888,769,Block611,Block611,True,Block611,CP009072,0,19237,0,0,1
4,DR76_RS00025,4013,4831,4422,,DR76_5,1,161,274,Block611,Block611,True,Block611,CP009072,0,19237,0,0,1


# BW25113

### Omitted blocks that are inside another block 

In [26]:
block_ends=[]
inside_block=[]
for i in range(len(BW25113_block)-1): 
    block_ends.append(BW25113_block.end[i])
    if not(all(BW25113_block.end[i+1] > x for x in block_ends)):
           inside_block.append(BW25113_block.block[i+1])

print('There are',len(inside_block), 'inside blocks')

There are 3 inside blocks


In [27]:
##View inside block df 
inside_block_df = pd.merge(BW25113_block, pd.DataFrame({'block':inside_block}), how='right', on='block')
inside_block_df.head()
print('there are', len(inside_block_df), 'inside blocks and', sum(inside_block_df['inversion']), 'are inverted')
BW25113_inside_inverted = inside_block_df[inside_block_df['inversion']==1]
BW25113_inside_inverted

there are 3 inside blocks and 3 are inverted


Unnamed: 0,block,taxa,start,end,rev_comp,inversion,block_order
0,Block17,NZ_CP009273,751912,751934,1,1,185
1,Block97,NZ_CP009273,1938486,1938513,1,1,500
2,Block96,NZ_CP009273,3591755,3591787,1,1,913


In [28]:
BW25113_block_final = BW25113_block[~BW25113_block['block'].isin(inside_block)]

In [29]:
BW25113_block_final['block'].nunique()

1145

## BW expression data 

In [30]:
BW25113['gbk_end'] = BW25113['gbk_end'].apply(lambda x:x.replace('>',''))

In [31]:
BW25113['gbk_end'] = pd.to_numeric(BW25113['gbk_end'])

In [32]:
#matching blocks by start
BW25113_block_by_start = [match_block_start(i, BW25113_block_final) for i in BW25113['gbk_start']]

In [33]:
#matching block by end
BW25113_block_by_end = [match_block_end(i, BW25113_block_final) for i in BW25113['gbk_end']]

In [34]:
#add new columns and make it a new dataframe
BW25113_new = BW25113.copy()
BW25113_new['block_by_start'] = BW25113_block_by_start
BW25113_new['block_by_end'] = BW25113_block_by_end

#check if the start and end identifies the same block
BW25113_new['Single_block'] = BW25113_new['block_by_start'] == BW25113_new['block_by_end']
print('BW25113 has', len(BW25113_new), 'rows and', sum(BW25113_new.Single_block), 'are single blocks')

BW25113 has 4077 rows and 3040 are single blocks


## single block and merge with block file 

In [35]:
BW25113_sb = BW25113_new[BW25113_new['Single_block']==True]
BW25113_sb = pd.merge(BW25113_sb, BW25113_block, how='left', left_on='block_by_start', right_on='block')
print('There are', len(BW25113_sb), 'single block genes')

There are 3040 single block genes


In [36]:
BW25113.columns

Index(['Locus_tag', 'GSE73673_6', 'GSE73673_7', 'GSE73673_8', 'gene_id',
       'GSE85914', 'gbk_locus_tag', 'gbk_start', 'gbk_end', 'gbk_midpoint',
       'gbk_gene_id', 'gbk_old_locus_tag', 'gbk_strand'],
      dtype='object')

In [37]:
BW25113_sb.columns

Index(['Locus_tag', 'GSE73673_6', 'GSE73673_7', 'GSE73673_8', 'gene_id',
       'GSE85914', 'gbk_locus_tag', 'gbk_start', 'gbk_end', 'gbk_midpoint',
       'gbk_gene_id', 'gbk_old_locus_tag', 'gbk_strand', 'block_by_start',
       'block_by_end', 'Single_block', 'block', 'taxa', 'start', 'end',
       'rev_comp', 'inversion', 'block_order'],
      dtype='object')

# K12DH 

### Omitted blocks that are inside another block 

In [38]:
block_ends=[]
inside_block=[]
for i in range(len(K12DH_block)-1): 
    block_ends.append(K12DH_block.end[i])
    if not(all(K12DH_block.end[i+1] > x for x in block_ends)):
           inside_block.append(K12DH_block.block[i+1])

print('There are',len(inside_block), 'inside blocks')

There are 3 inside blocks


In [39]:
##View inside block df 
inside_block_df = pd.merge(K12DH_block, pd.DataFrame({'block':inside_block}), how='right', on='block')
inside_block_df.head()
print('there are', len(inside_block_df), 'inside blocks and', sum(inside_block_df['inversion']), 'are inverted')
K12DH_inside_inverted = inside_block_df[inside_block_df['inversion']==1]
K12DH_inside_inverted

there are 3 inside blocks and 3 are inverted


Unnamed: 0,block,taxa,start,end,rev_comp,inversion,block_order
0,Block17,NC_010473,808271,808293,1,1,183
1,Block97,NC_010473,2032824,2032851,1,1,500
2,Block96,NC_010473,3694163,3694195,1,1,913


In [40]:
K12DH_block_final = K12DH_block[~K12DH_block['block'].isin(inside_block)]

## K12 DH Expression data 

In [41]:
K12DH['gbk_end'] = K12DH['gbk_end'].apply(lambda x:x.replace('>',''))
K12DH['gbk_end'] = pd.to_numeric(K12DH['gbk_end'])

In [42]:
#matching blocks by start
K12DH_block_by_start = [match_block_start(i, K12DH_block_final) for i in K12DH['gbk_start']]

In [43]:
#matching block by end
K12DH_block_by_end = [match_block_end(i, K12DH_block_final) for i in K12DH['gbk_end']]

In [44]:
#add new columns and make it a new dataframe
K12DH_new = K12DH.copy()
K12DH_new['block_by_start'] = K12DH_block_by_start
K12DH_new['block_by_end'] = K12DH_block_by_end

#check if the start and end identifies the same block
K12DH_new['Single_block'] = K12DH_new['block_by_start'] == K12DH_new['block_by_end']
print('K12DH has', len(K12DH_new), 'rows and', sum(K12DH_new.Single_block), 'are single blocks')

K12DH has 4247 rows and 3058 are single blocks


## single block and merge with block file 

In [45]:
K12DH_sb = K12DH_new[K12DH_new['Single_block']==True]
K12DH_sb = pd.merge(K12DH_sb, K12DH_block, how='left', left_on='block_by_start', right_on='block')
print('There are', len(K12DH_sb), 'single block genes')

There are 3058 single block genes


# K12MG expression data 

## K12MG Expression data

In [46]:
#matching blocks by start
K12MG_block_by_start = [match_block_start(i, K12MG_block) for i in K12MG['gbk_start']]

In [47]:
#matching block by end
K12MG_block_by_end = [match_block_end(i, K12MG_block) for i in K12MG['gbk_end']]

In [48]:
#add new columns and make it a new dataframe
K12MG_new = K12MG
K12MG_new['block_by_start'] = K12MG_block_by_start
K12MG_new['block_by_end'] = K12MG_block_by_end

#check if the start and end identifies the same block
K12MG_new['Single_block'] = K12MG_new['block_by_start'] == K12MG_new['block_by_end']
print('K12MG has', len(K12MG_new), 'rows and', sum(K12MG_new.Single_block), 'are single blocks')

K12MG has 3891 rows and 2969 are single blocks


## single block and merge with block file 

In [49]:
K12MG_sb = K12MG_new[K12MG_new['Single_block']==True]
K12MG_sb = pd.merge(K12MG_sb, K12MG_block, how='left', left_on='block_by_start', right_on='block')
print('There are', len(K12MG_sb), 'single block genes')

There are 2969 single block genes


In [50]:
K12MG_sb['block'].nunique()

563

## Sample dataframe

In [51]:
ATCC_final = pd.melt(ATCC_sb, id_vars =list(ATCC_sb.columns[:7]) + list(ATCC_sb.columns[9:18]), 
                     value_vars =['GSE94978_1', 'GSE94978_2'], 
                     var_name ='replicates', value_name ='raw_exp') 
ATCC_final['Locus_tag'] = np.nan
ATCC_final['exp'] = 'GSE94978'
ATCC_final['exp'].value_counts()

GSE94978    6362
Name: exp, dtype: int64

In [52]:
BW25113_final = pd.melt(BW25113_sb, id_vars =['Locus_tag', 'gene_id'] + list(BW25113_sb.columns[6:22]), 
                     value_vars =['GSE73673_6', 'GSE73673_7', 'GSE73673_8', 'GSE85914'], 
                     var_name ='replicates', value_name ='raw_exp') 
BW25113_final['exp'] = BW25113_final['replicates'].apply(lambda x:x.split('_')[0])
BW25113_final['exp'].value_counts()

GSE73673    9120
GSE85914    3040
Name: exp, dtype: int64

In [53]:
K12DH_sb.columns

Index(['Locus_tag', 'gene_id', 'GSE98890_1', 'GSE98890_2', 'gbk_start',
       'gbk_end', 'gbk_midpoint', 'gbk_gene_id', 'gbk_locus_tag', 'gbk_strand',
       'block_by_start', 'block_by_end', 'Single_block', 'block', 'taxa',
       'start', 'end', 'rev_comp', 'inversion', 'block_order'],
      dtype='object')

In [54]:
K12DH_final = pd.melt(K12DH_sb, id_vars =['Locus_tag', 'gene_id'] + list(K12DH_sb.columns[4:19]), 
                     value_vars =['GSE98890_1', 'GSE98890_2'], 
                     var_name ='replicates', value_name ='raw_exp') 
K12DH_final['exp'] = 'GSE98890'

In [55]:
K12MG_final = pd.melt(K12MG_sb, id_vars =['Locus_tag', 'gene_id'] + list(K12MG_sb.columns[13:28]), 
                     value_vars =list(K12MG_sb.columns[2:13]), 
                     var_name ='replicates', value_name ='raw_exp') 
K12MG_final['exp'] = K12MG_final['replicates'].apply(lambda x:x.split('_')[0])
K12MG_final['exp'].value_counts()

GSE114917    11876
GSE60522      8907
GSE54199      5938
GSE40313      5938
Name: exp, dtype: int64

In [56]:
K12MG_sb.columns

Index(['Locus_tag', 'gene_id', 'GSE60522_1', 'GSE60522_2', 'GSE60522_3',
       'GSE114917_LP1', 'GSE114917_LP2', 'GSE114917_LP3', 'GSE114917_LP4',
       'GSE54199_RNA', 'GSE54199_DNA', 'GSE40313_1', 'GSE40313_2', 'gbk_start',
       'gbk_end', 'gbk_midpoint', 'gbk_gene_id', 'gbk_old_locus_tag',
       'gbk_strand', 'block_by_start', 'block_by_end', 'Single_block', 'block',
       'taxa', 'start', 'end', 'rev_comp', 'inversion', 'block_order'],
      dtype='object')

In [57]:
len(K12MG.columns)

22

In [58]:
K12MG_final['strain'] = 'K12MG'
K12DH_final['strain'] = 'K12DH'
ATCC_final['strain'] = 'ATCC'
BW25113_final['strain'] = 'BW25113'

In [59]:
len(ATCC_final.columns)

21

In [60]:
len(BW25113_final.columns)

22

In [61]:
len(K12DH_final.columns)

21

In [62]:
len(K12MG_final.columns)

21

In [63]:
set(BW25113_final.columns)^set(ATCC_final.columns)

{'gbk_locus_tag'}

In [64]:
df = pd.concat([K12DH_final,K12MG_final,BW25113_final, ATCC_final])

In [65]:
df.columns

Index(['Locus_tag', 'gene_id', 'gbk_start', 'gbk_end', 'gbk_midpoint',
       'gbk_gene_id', 'gbk_locus_tag', 'gbk_strand', 'block_by_start',
       'block_by_end', 'Single_block', 'block', 'taxa', 'start', 'end',
       'rev_comp', 'inversion', 'replicates', 'raw_exp', 'exp', 'strain',
       'gbk_old_locus_tag'],
      dtype='object')

In [66]:
df = df.drop(['gbk_old_locus_tag','Single_block', 'gbk_locus_tag','block_by_end','block_by_start'], axis=1)

In [67]:
df.columns

Index(['Locus_tag', 'gene_id', 'gbk_start', 'gbk_end', 'gbk_midpoint',
       'gbk_gene_id', 'gbk_strand', 'block', 'taxa', 'start', 'end',
       'rev_comp', 'inversion', 'replicates', 'raw_exp', 'exp', 'strain'],
      dtype='object')

In [68]:
df = df[['Locus_tag', 'gene_id', 'gbk_start','gbk_end','gbk_midpoint', 'gbk_strand', 
         'block','start', 'end', 'rev_comp', 'inversion', 'strain', 'taxa', 'replicates',
        'exp','raw_exp']]

In [69]:
df.head()

Unnamed: 0,Locus_tag,gene_id,gbk_start,gbk_end,gbk_midpoint,gbk_strand,block,start,end,rev_comp,inversion,strain,taxa,replicates,exp,raw_exp
0,ECDH10B_0001,thrL,190,255,222,0,Block1,0,5468,0,0,K12DH,NC_010473,GSE98890_1,GSE98890,161.0
1,ECDH10B_0002,thrA,337,2799,1568,0,Block1,0,5468,0,0,K12DH,NC_010473,GSE98890_1,GSE98890,2634.0
2,ECDH10B_0003,thrB,2801,3733,3267,0,Block1,0,5468,0,0,K12DH,NC_010473,GSE98890_1,GSE98890,525.0
3,ECDH10B_0004,thrC,3734,5020,4377,0,Block1,0,5468,0,0,K12DH,NC_010473,GSE98890_1,GSE98890,589.0
4,ECDH10B_0007,yaaJ,6529,7959,7244,1,Block4,5692,9904,0,0,K12DH,NC_010473,GSE98890_1,GSE98890,92.0


In [70]:
df.to_csv('Sample_final_df_RawExp.csv',index=False)