In [50]:
import pandas as pd
import numpy as np
import warnings 
warnings.filterwarnings('ignore')

### DH to W3 mapping merging with W3 gene_info 
+ attained alphabetical names from CF gene info 
+ (preparing to map to full mapping)


In [51]:
##load DH_W3 mapping and W3 gene_info 
DH_mapping = pd.read_table('./DH10B_and_W3110_gene_map.txt')
W3_info = pd.read_table('../Genome/Ecoli_K12_W3110_NC_007779_gene_info.txt')

DH_mapping.columns = ['DH', 'DH_names', 'W3', 'W3_names', 'drop']
#drop bad format of missing column value`
DH_mapping = DH_mapping[['DH', 'DH_names', 'W3', 'W3_names']].dropna(how='any')

In [52]:
#take unique W3
DH_mapped = DH_mapping.groupby(['W3_names']).agg(
    DH_names=pd.NamedAgg(column='DH_names', aggfunc=lambda x:','.join(x))).reset_index()

In [53]:
#merged with W3 gene_info 
W3_mapped = pd.merge(W3_info[['gbk_locus_tag','gbk_gene_id']], DH_mapped, left_on='gbk_locus_tag', right_on='W3_names')

In [54]:
W3_mapped.describe()

Unnamed: 0,gbk_locus_tag,gbk_gene_id,W3_names,DH_names
count,3995,2740,3995,3995
unique,3995,2700,3995,3990
top,Y75_RS00465,rrf,Y75_RS00465,ECDH10B_RS06230
freq,1,5,1,2


In [55]:
W3_mapped = W3_mapped[W3_mapped['gbk_gene_id'].notna()]

In [56]:
#Expand DH names that were combined and separated by ','
extra= W3_mapped['DH_names'].str.split(',', n=4, expand=True)
extra.columns = ['DH_name1','DH_name2']
#merged back with W3 gene alphabetical name
extra['W3_names'] = W3_mapped['W3_names']
extra['gene_id'] = W3_mapped['gbk_gene_id']

In [57]:
extra.describe()

Unnamed: 0,DH_name1,DH_name2,W3_names,gene_id
count,2740,5,2740,2740
unique,2739,5,2740,2700
top,ECDH10B_RS11430,ECDH10B_RS15770,Y75_RS12875,rrf
freq,2,1,1,5


In [58]:
#collapsing the multiple DH names columns into extra rows
new = pd.melt(extra, id_vars=['W3_names','gene_id'], value_vars=['DH_name1', 'DH_name2'], 
       value_name='DH_names')

new = new[['W3_names', 'DH_names', 'gene_id']].dropna(subset=['DH_names'])

#new.fillna('-', inplace=True)

In [59]:
# making unique DH name to prepare to map with full mapping
# combining multiple alphabetical names rows into one row seprated by commas 
W3_final = new.groupby(['DH_names']).agg(
    W3_names=pd.NamedAgg(column='W3_names', aggfunc=lambda x:','.join(x)), 
    gene_id=pd.NamedAgg(column='gene_id', aggfunc=lambda x:','.join(x))).reset_index()

In [60]:
# make seperate columns for the multiple gene names 
extra_gene_id = W3_final['gene_id'].str.split(',', n=3, expand=True)
extra_gene_id.columns = ['W3_name1', 'W3_name2']
W3_final = pd.concat([W3_final, extra_gene_id], axis=1)

In [61]:
#drop original gene_id columns that has multiple columns
W3_final.drop(['gene_id', 'W3_names'],axis=1, inplace=True)
#W3_final = W3_final.replace('-',None)

### ATCC and CF mapping merging with CF gene info 
+ attained alphabetical names from CF gene info 
+ (preparing to map to full mapping)

#### include both CF gene_info files

In [62]:
#include both CF mappings 
CF_info1 = pd.read_table('../Genome/Ecoli_CFT073_NC_004431_gene_info.txt')
CF_info2 = pd.read_table('../Genome/Ecoli_CFT073_AE014075_gene_info.txt')

CF_names1 = CF_info1[['gbk_gene_id', 'gbk_locus_tag', 'gbk_old_locus_tag']]
CF_names2 = CF_info2[['gbk_gene_id', 'gbk_locus_tag']]

In [63]:
CF_name = pd.merge(CF_names1, CF_names2, left_on='gbk_old_locus_tag', right_on='gbk_locus_tag', 
                  how='outer')
CF_name = CF_name[CF_name['gbk_locus_tag_x'].notna()] #only select valid C_RS locus tags 

CF_name['check'] = CF_name['gbk_gene_id_x'] == CF_name['gbk_gene_id_y'] #check if both gen_info provides the same info

## check in consistent naming
# print(len(CF_name))
# CF_name[(CF_name['gbk_gene_id_x'].notna()) & (CF_name['gbk_gene_id_y'].notna()) & (CF_name['check']==False)]

In [64]:
## Organizing
CF_name['CF_alt_name'] = np.where(CF_name['gbk_old_locus_tag'].isna(), CF_name['gbk_locus_tag_y'],
                                  CF_name['gbk_old_locus_tag'])
CF_name.rename(columns={'gbk_gene_id_x':'CF_name1', 'gbk_gene_id_y':'CF_name2', 
                       'gbk_locus_tag_x':'locus_tag'}, inplace=True)
CF_name = CF_name[['CF_name1', 'CF_name2', 'locus_tag', 'CF_alt_name']]

CF_name.describe()

Unnamed: 0,CF_name1,CF_name2,locus_tag,CF_alt_name
count,2855,3532,5024,4738
unique,2756,3532,5024,4738
top,tnpA,ybeY,C_RS23980,c5121
freq,19,1,1,1


#### ATCC CF mapping merging with both CF gene info 

In [65]:
## ATCC to CF ----------
#load data
ATCC_mapping = pd.read_table('./ATCC_and_CFT073_gene_map.txt')
ATCC_mapping.columns = ['ATCC', 'ATCC_names', 'CF', 'CF_names', 'drop']
ATCC_mapping = ATCC_mapping[['ATCC', 'ATCC_names',
                             'CF', 'CF_names']].drop_duplicates(keep='first').dropna(how='any').reset_index(drop=True)


In [66]:
#take unique CF
ATCC_mapped = ATCC_mapping.groupby(['CF_names']).agg(
    ATCC_names=pd.NamedAgg(column='ATCC_names', aggfunc=lambda x:','.join(x))).reset_index()

In [67]:
#merged with CF gene_info 
CF_mapped = pd.merge(CF_name, ATCC_mapped, left_on='locus_tag', right_on='CF_names')

CF_mapped.dropna(subset=['CF_name1','CF_name2','CF_alt_name'], how='all', inplace=True)
CF_mapped.describe()

Unnamed: 0,CF_name1,CF_name2,locus_tag,CF_alt_name,CF_names,ATCC_names
count,2517,3076,3934,3887,3934,3934
unique,2458,3076,3934,3887,3934,3925
top,tnpB,yebB,C_RS11575,c5121,C_RS11575,DR76_RS12305
freq,8,1,1,1,1,2


In [68]:
#Expand ATCC names that were combined and separated by ','
extra= CF_mapped['ATCC_names'].str.split(',', n=4, expand=True)
extra.columns = ['ATCC_name1','ATCC_name2']
#merged back with CF gene alphabetical name
extra = pd.concat([CF_mapped.iloc[:, 0:4], extra], axis=1)

In [69]:
extra.describe()

Unnamed: 0,CF_name1,CF_name2,locus_tag,CF_alt_name,ATCC_name1,ATCC_name2
count,2517,3076,3934,3887,3934,23
unique,2458,3076,3934,3887,3920,23
top,tnpB,yebB,C_RS11575,c5121,DR76_RS12305,DR76_RS19730
freq,8,1,1,1,3,1


In [70]:
#collapsing the multiple ATCC names columns into extra rows
new = pd.melt(extra, id_vars=['CF_name1', 'CF_name2', 'CF_alt_name','locus_tag'],
              value_vars=['ATCC_name1', 'ATCC_name2'], 
              value_name='ATCC_names')

new = new.dropna(subset=['ATCC_names'])

new.fillna('-', inplace=True)

In [71]:
# making unique ATCC name to prepare to map with full mapping
# combining multiple alphabetical names rows into one row seprated by commas 
CF_final = new.groupby(['ATCC_names']).agg(
    CF_name1=pd.NamedAgg(column='CF_name1', aggfunc=lambda x:','.join(x)), 
    CF_name2=pd.NamedAgg(column='CF_name2', aggfunc=lambda x:','.join(x)), 
    CF_alt=pd.NamedAgg(column='CF_alt_name', aggfunc=lambda x:','.join(x)), 
    locus_tag=pd.NamedAgg(column='locus_tag', aggfunc=lambda x:','.join(x))).reset_index()

In [72]:
# collapse the columns for multiple alphabetical names for both CF gene infos 
extra_names = CF_final['CF_name1'].str.split(',', n=4, expand=True)
extra_names.columns = ['CF_name1', 'CF_name2', 'CF_name3']
extra_names2 = CF_final['CF_name2'].str.split(',', n=4, expand=True)
extra_names2.columns = ['CF_name5', 'CF_name6', 'CF_name7']

extra = pd.concat([extra_names, extra_names2], axis=1).reset_index(drop=True)

#change df into np array
temp=np.array(extra)
del extra

#remove NA and only keep unique names
fixed =[]
for i in temp: 
     fixed.append(sorted(list(set(i[~pd.isnull(i)])))[::-1])
del temp

#remove '-' names 
fixed = pd.DataFrame(fixed)
fixed = fixed.replace('-', np.nan)
fixed.columns = ['CF_names1', 'CF_names2', 'CF_names3']

In [73]:
#Collapse CF alternative names into columns
alt_names = CF_final['CF_alt'].str.split(',', n=4, expand=True)
alt_names=np.array(alt_names)
alt=[]
for i in alt_names:
    alt.append(sorted(list(set(i[~pd.isnull(i)])))[::-1]) #taking away NAs and Nones and duplicates 
alt = pd.DataFrame(alt)
alt= alt.replace('-', np.nan)
alt.columns = ['CF_alt1', 'CF_alt2', 'CF_alt3']

In [74]:
CF_final = pd.concat([CF_final[['ATCC_names']], fixed, alt], axis=1)
CF_final.head()

Unnamed: 0,ATCC_names,CF_names1,CF_names2,CF_names3,CF_alt1,CF_alt2,CF_alt3
0,DR76_RS00080,yehE,,,c2640,,
1,DR76_RS00085,mrp,apbC,,c2641,,
2,DR76_RS00090,metG,,,c2642,,
3,DR76_RS00095,,,,c2643,,
4,DR76_RS00100,yehI,,,c2644,,


In [75]:
CF_final.describe(include='all')

Unnamed: 0,ATCC_names,CF_names1,CF_names2,CF_names3,CF_alt1,CF_alt2,CF_alt3
count,3942,3290,697,0.0,3894,14,1
unique,3942,3247,689,,3876,14,1
top,DR76_RS16655,tnpB,gspE,,c0342,c1467,c0278
freq,1,7,2,,2,1,1
mean,,,,,,,
std,,,,,,,
min,,,,,,,
25%,,,,,,,
50%,,,,,,,
75%,,,,,,,


### Full Mapping 

In [76]:
##Full Mapping--------
full_mapping = pd.read_table('./ATCC_CFT073_K12MG_K12DH_BW_gene_map.txt')
full_mapping.columns = ['ATCC', 'ATCC_names', 'BW', 'BW_names', 'DH', 'DH_names', 'MG', 'MG_names', 'drop']
full_mapping = full_mapping[['ATCC_names','BW_names', 'DH_names', 'MG_names']]
full_mapping = full_mapping.drop_duplicates(keep='first') 
#3422

In [77]:
#merge DH names
full_mapping = pd.merge(full_mapping, W3_final, on='DH_names', how='left')


#merge ATCC names
full_mapping = pd.merge(full_mapping, CF_final, on='ATCC_names', how='left') 


#merge MG names
MG_info = pd.read_table('../Genome/Ecoli_K12_MG1655_chrom_U00096_gene_info.txt')[['gbk_gene_id', 'gbk_locus_tag']]
MG_info.columns=['MG_id', 'MG_names']

full_mapping = pd.merge(full_mapping, MG_info, on='MG_names', how='left')


In [78]:
W3_final.describe()

Unnamed: 0,DH_names,W3_name1,W3_name2
count,2744,2744,1
unique,2744,2700,1
top,ECDH10B_RS10360,rrf,gatA
freq,1,5,1


In [79]:
CF_final.describe()

Unnamed: 0,CF_names3
count,0.0
mean,
std,
min,
25%,
50%,
75%,
max,


In [80]:
full_mapping.isna().sum()

ATCC_names       0
BW_names         0
DH_names         0
MG_names         0
W3_name1       979
W3_name2      3421
CF_names1      478
CF_names2     2775
CF_names3     3422
CF_alt1        412
CF_alt2       3416
CF_alt3       3422
MG_id            0
dtype: int64

In [81]:
full_mapping.head()

Unnamed: 0,ATCC_names,BW_names,DH_names,MG_names,W3_name1,W3_name2,CF_names1,CF_names2,CF_names3,CF_alt1,CF_alt2,CF_alt3,MG_id
0,DR76_RS03365,BW25113_RS19835,ECDH10B_RS20405,b3820,,,yigI,,,c4741,,,yigI
1,DR76_RS03360,BW25113_RS19840,ECDH10B_RS20410,b3821,pldA,,pldA,,,c4742,,,pldA
2,DR76_RS03355,BW25113_RS19845,ECDH10B_RS20415,b3822,recQ,,recQ,,,c4744,,,recQ
3,DR76_RS03350,BW25113_RS19850,ECDH10B_RS20420,b3823,rhtC,,rhtC,,,c4745,,,rhtC
4,DR76_RS03345,BW25113_RS19855,ECDH10B_RS20425,b3824,rhtB,,yigK,rhtB,,c4746,,,rhtB


## BLAST 

In [82]:
##Comparing---------------
ATCC_DH_blast = pd.read_table('../BLAST/parameters_more_sensitive/RBBH_ATCC_DH10B_more_sensitive.tab')
MG_DH_blast = pd.read_table('../BLAST/parameters_more_sensitive/RBH_out_MG1655_DH10B_more_sensitive_dmnd_uniq.tab')
MG_ATCC_blast = pd.read_table('../BLAST/parameters_more_sensitive/RBH_out_MG1655_ATCC_more_sensitive_dmnd_uniq.tab')

ATCC_DH_blast = ATCC_DH_blast[['A_gene', 'A_alt_name', 'B_gene']]
ATCC_DH_blast.columns = ['ATCC_gene','ATCC_gene2', 'DH_gene']
#3244 rows from blast 

MG_DH_blast = MG_DH_blast[['A_gene', 'A_alt_name', 'B_gene']]
MG_DH_blast.columns = ['MG_gene','MG_gene2', 'DH_gene']
#4162 rows from blast 

MG_ATCC_blast = MG_ATCC_blast[['A_gene', 'A_alt_name', 'B_gene', 'B_alt_name']]
MG_ATCC_blast.columns = ['MG_gene','MG_gene2', 'ATCC_gene', 'ATCC_gene2']
#3264 rows from blast 

In [83]:
#remove redundant ecpD gene
ATCC_DH_blast = ATCC_DH_blast[ATCC_DH_blast['DH_gene']!='yadV']

## ATCC VS DH 

In [84]:
#subsetting from full mapping
ATCC_DH_mapping = full_mapping.drop(['BW_names', 'MG_names','MG_id'], axis=1)

In [85]:
#drop not found DH
ATCC_DH_mapping = ATCC_DH_mapping.dropna(subset=list(ATCC_DH_mapping.columns[2:4]),how='all').drop_duplicates(keep='first')

#drop not found ATCC
ATCC_DH_mapping = ATCC_DH_mapping.dropna(subset=list(ATCC_DH_mapping.columns[5:]),how='all').drop_duplicates(keep='first')

### Cleaning AD blast 
alt names are in the alphabetical names columns if alp name is missing 

In [86]:
## fix shifting in alt gene names 
ATCC_DH_blast['ATCC_genes'] = ATCC_DH_blast['ATCC_gene']
ATCC_DH_blast['ATCC_gene'] = np.where(ATCC_DH_blast['ATCC_gene2'].isna(), np.NaN, ATCC_DH_blast['ATCC_gene'])
ATCC_DH_blast['ATCC_alt2'] = np.where(ATCC_DH_blast['ATCC_genes'].str.startswith('c'), 
                                      ATCC_DH_blast['ATCC_genes'], np.NaN)
ATCC_DH_blast['ATCC_gene2'] = np.where(ATCC_DH_blast['ATCC_gene2'].isna(), ATCC_DH_blast['ATCC_alt2'],
                                      ATCC_DH_blast['ATCC_gene2'])
ATCC_DH_blast = ATCC_DH_blast[['ATCC_gene', 'ATCC_gene2', 'DH_gene']]
ATCC_DH_blast['ATCC_gene2'] = ATCC_DH_blast['ATCC_gene2'].apply(lambda x:str(x))

In [87]:
## comma problems in more than 1 alternative names
ATCC_DH_blast['ATCC_alt'] = ATCC_DH_blast['ATCC_gene2'].apply(lambda x:x.split(',')[0])
alt2 =[]
for i in ATCC_DH_blast['ATCC_gene2']:
    try: 
        alt2.append(i.split(',')[1])
    except (IndexError):
        alt2.append('NA')

ATCC_DH_blast['ATCC_alt2'] = alt2

In [88]:
#extra rows
temp = ATCC_DH_blast[ATCC_DH_blast['ATCC_alt2'] != 'NA'].reset_index(drop=True)
temp.tail()

Unnamed: 0,ATCC_gene,ATCC_gene2,DH_gene,ATCC_alt,ATCC_alt2
6,ecpD,"c0401, c0171",ecpD,c0401,c0171
7,sucC,"c0805, c5036",sucC,c0805,c5036
8,tdcD,"c3873, c4530",tdcD,c3873,c4530
9,rbsK,"c4680, c0331, c2492",rbsK,c4680,c0331
10,arcB,"c5348, c3970",arcB,c5348,c3970


In [89]:
temp = temp[['ATCC_gene','ATCC_alt2','DH_gene']]
temp.rename(columns={'ATCC_alt2':'ATCC_alt'},inplace=True)

temp = temp.append(pd.DataFrame({'ATCC_gene':['rbsK'],'ATCC_alt':['c2492'],
                                 'DH_gene':['rbsK']}), ignore_index=True)

ATCC_DH_blast = ATCC_DH_blast[['ATCC_gene','ATCC_alt','DH_gene']]
ATCC_DH_blast = pd.concat([ATCC_DH_blast,temp]).reset_index(drop=True)

#the row that was accidentally filter bc of true NA in alt name
ATCC_DH_blast = ATCC_DH_blast.drop([1396]).reset_index(drop=True)

ATCC_DH_blast.describe()

Unnamed: 0,ATCC_gene,ATCC_alt,DH_gene
count,3152,3254,3254
unique,3140,3254,3241
top,rbsK,c5121,rbsK
freq,3,1,3


In [90]:
ATCC_DH_blast = ATCC_DH_blast[~(ATCC_DH_blast['ATCC_alt']=='nan')]

### Comparing

In [91]:
AD_blast = np.array(ATCC_DH_blast[['ATCC_gene','DH_gene']].dropna())
AD_blast_alt = np.array(ATCC_DH_blast[['ATCC_alt', 'DH_gene']].dropna())

In [92]:
#checking blast using alphabet names only
AD_blast_check = pd.DataFrame({})
for i in range(len(AD_blast)):
    df = ATCC_DH_mapping.iloc[:,:6][((ATCC_DH_mapping['CF_names1']== AD_blast[i][0])|
               (ATCC_DH_mapping['CF_names2']==AD_blast[i][0]))]
    if len(df)>=1:
        df['matched_CF']= AD_blast[i][0]
        df['matched_W3']= AD_blast[i][1]
        df['check'] = ((ATCC_DH_mapping['W3_name1']==AD_blast[i][1])|
                (ATCC_DH_mapping['W3_name2']==AD_blast[i][1]))
        AD_blast_check = AD_blast_check.append(df, ignore_index=True)

In [93]:
print(len(AD_blast_check), AD_blast_check['check'].sum())
AD_blast_check.head()

1982 1898


Unnamed: 0,ATCC_names,DH_names,W3_name1,W3_name2,CF_names1,CF_names2,matched_CF,matched_W3,check
0,DR76_RS01100,ECDH10B_RS12475,yfbR,,yfbR,,yfbR,yfbR,True
1,DR76_RS07060,ECDH10B_RS16790,agaR,,agaR,,agaR,agaR,True
2,DR76_RS09325,ECDH10B_RS15175,amiC,,amiC,,amiC,amiC,True
3,DR76_RS03755,ECDH10B_RS20010,asnC,,asnC,,asnC,asnC,True
4,DR76_RS00930,ECDH10B_RS12280,arnC,,arnC,,arnC,arnC,True


In [94]:
#checking blast_alt using alt names only
AD_blast_alt_check = pd.DataFrame({})
for i in range(len(AD_blast_alt)):
    df = ATCC_DH_mapping.drop(ATCC_DH_mapping.columns[4:6], axis=1)[
        ((ATCC_DH_mapping['CF_alt1']== AD_blast_alt[i][0])|
               (ATCC_DH_mapping['CF_alt2']==AD_blast_alt[i][0]))]
    if len(df)>=1:
        df['matched_CF']= AD_blast_alt[i][0]
        df['matched_W3']= AD_blast_alt[i][1]
        df['check'] = ((ATCC_DH_mapping['W3_name1']==AD_blast_alt[i][1])|
                (ATCC_DH_mapping['W3_name2']==AD_blast_alt[i][1]))
        AD_blast_alt_check = AD_blast_alt_check.append(df, ignore_index=True)

In [95]:
print(len(AD_blast_alt_check), AD_blast_alt_check['check'].sum())
AD_blast_alt_check.head()

2024 1913


Unnamed: 0,ATCC_names,DH_names,W3_name1,W3_name2,CF_names3,CF_alt1,CF_alt2,CF_alt3,matched_CF,matched_W3,check
0,DR76_RS01100,ECDH10B_RS12475,yfbR,,,c2832,,,c2832,yfbR,True
1,DR76_RS07060,ECDH10B_RS16790,agaR,,,c3886,,,c3886,agaR,True
2,DR76_RS09325,ECDH10B_RS15175,amiC,,,c3411,,,c3411,amiC,True
3,DR76_RS03755,ECDH10B_RS20010,asnC,,,c4671,,,c4671,asnC,True
4,DR76_RS00930,ECDH10B_RS12280,arnC,,,c2796,,,c2796,arnC,True


### check equivalent names 1

In [96]:
W3_names = pd.read_csv('../BLAST/Ecoli_K12_W3110_NC_007779__all_genes_info_blast.csv', header=None)
W3_names.drop(W3_names.columns[11],axis=1,inplace=True)

In [167]:
AD_check1 = AD_blast_check[AD_blast_check['check']==0][AD_blast_check.columns[[0,1,2,3,7]].to_list()]
AD_check2 = AD_blast_alt_check[AD_blast_alt_check['check']==0][AD_blast_alt_check.columns[[0,1,2,3,9]].to_list()]

In [169]:
AD_check1 = pd.merge(AD_check1, W3_names, left_on='matched_W3', right_on=W3_names.columns[0])
AD_check1['check'] = AD_check1.drop(['matched_W3','W3_name1'],1).isin(AD_check1['W3_name1']).any(1)
AD_include1 = AD_check1[AD_check1['check']]

In [171]:
AD_check2 = pd.merge(AD_check2, W3_names, left_on='matched_W3', right_on=W3_names.columns[0])
AD_check2['check'] = AD_check2.drop(['matched_W3','W3_name1'],1).isin(AD_check2['W3_name1']).any(1)
AD_include2 = AD_check2[AD_check2['check']]

In [100]:
AD_include = pd.concat([AD_include1.iloc[:,[0,1]], AD_include2.iloc[:,[0,1]]], axis=0).drop_duplicates(keep='first').reset_index(drop=True)

### combine the checks

In [101]:
##Comebine the two checks
AD_exclude = pd.concat([AD_blast_check.iloc[:, [0,1,-1]],
                 AD_blast_alt_check.iloc[:, [0,1,-1]]],axis=0).groupby(['ATCC_names','DH_names'])['check'].sum().reset_index()

# show genes that are mismatched
AD_exclude = AD_exclude[AD_exclude['check']==0].reset_index(drop=True)

In [102]:
AD_exclude = AD_exclude[~AD_exclude['ATCC_names'].isin(np.array(AD_include['ATCC_names']))]

## MG VS DH

In [103]:
#subsetting from full mapping
MG_DH_mapping = full_mapping.drop(full_mapping.columns[6:-1].to_list(), axis=1).drop(full_mapping.columns[0:2].to_list(), axis=1)

In [104]:
#drop not found DH - no need to drop MG b/c MG is very complete
MG_DH_mapping = MG_DH_mapping.dropna(subset=list(MG_DH_mapping.columns[2:4]),how='all').drop_duplicates(keep='first')

### Comparing

In [105]:
## Load blast files
MG_DH_blast = pd.read_table('../BLAST/parameters_more_sensitive/RBH_out_MG1655_DH10B_more_sensitive_dmnd_uniq.tab')

MG_DH_blast = MG_DH_blast[['A_gene', 'A_alt_name', 'B_gene']]
MG_DH_blast.columns = ['MG_gene','MG_alt', 'DH_gene']

In [106]:
MD_blast = np.array(MG_DH_blast[['MG_gene','DH_gene']].dropna())
MD_blast_alt = np.array(MG_DH_blast[['MG_alt', 'DH_gene']].dropna())

In [107]:
#checking blast using alphabet names only
MD_blast_check = pd.DataFrame({})
for i in range(len(MD_blast)):
    df = MG_DH_mapping[(MG_DH_mapping['MG_id']== MD_blast[i][0])]
    if len(df)>=1:
        df['matched_MG']= MD_blast[i][0]
        df['matched_DH']= MD_blast[i][1]
        df['check'] = ((MG_DH_mapping['W3_name1']==MD_blast[i][1])|
                (MG_DH_mapping['W3_name2']==MD_blast[i][1]))
        MD_blast_check = MD_blast_check.append(df, ignore_index=True)

In [108]:
print(len(MD_blast_check), MD_blast_check['check'].sum())
MD_blast_check

2289 2201


Unnamed: 0,DH_names,MG_names,W3_name1,W3_name2,MG_id,matched_MG,matched_DH,check
0,ECDH10B_RS20635,b4686,yshB,,yshB,yshB,yshB,True
1,ECDH10B_RS12905,b2376,ypdI,,ypdI,ypdI,ypdI,True
2,ECDH10B_RS11105,b2029,gndA,,gnd,gnd,gnd,False
3,ECDH10B_RS22190,b4154,frdA,,frdA,frdA,frdA,True
4,ECDH10B_RS09710,b1761,gdhA,,gdhA,gdhA,gdhA,True
...,...,...,...,...,...,...,...,...
2284,ECDH10B_RS02260,b0484,copA,,copA,copA,copA,True
2285,ECDH10B_RS19750,b4477,dgoA,,dgoA,dgoA,dgoA,True
2286,ECDH10B_RS19745,b4478,dgoD,,dgoD,dgoD,dgoD,True
2287,ECDH10B_RS18900,b4453,ldrD,,ldrD,ldrD,ldrD,True


In [109]:
#checking blast_alt using alt names only
MD_blast_alt_check = pd.DataFrame({})
for i in range(len(MD_blast_alt)):
    df = MG_DH_mapping[(MG_DH_mapping['MG_names']== MD_blast_alt[i][0])]
    if len(df)>=1:
        df['matched_MG']= MD_blast_alt[i][0]
        df['matched_W3']= MD_blast_alt[i][1]
        df['check'] = ((MG_DH_mapping['W3_name1']==MD_blast_alt[i][1])|
                (MG_DH_mapping['W3_name2']==MD_blast_alt[i][1]))
        MD_blast_alt_check = MD_blast_alt_check.append(df, ignore_index=True)

In [110]:
print(len(MD_blast_alt_check), MD_blast_alt_check['check'].sum())
MD_blast_check

2401 2274


Unnamed: 0,DH_names,MG_names,W3_name1,W3_name2,MG_id,matched_MG,matched_DH,check
0,ECDH10B_RS20635,b4686,yshB,,yshB,yshB,yshB,True
1,ECDH10B_RS12905,b2376,ypdI,,ypdI,ypdI,ypdI,True
2,ECDH10B_RS11105,b2029,gndA,,gnd,gnd,gnd,False
3,ECDH10B_RS22190,b4154,frdA,,frdA,frdA,frdA,True
4,ECDH10B_RS09710,b1761,gdhA,,gdhA,gdhA,gdhA,True
...,...,...,...,...,...,...,...,...
2284,ECDH10B_RS02260,b0484,copA,,copA,copA,copA,True
2285,ECDH10B_RS19750,b4477,dgoA,,dgoA,dgoA,dgoA,True
2286,ECDH10B_RS19745,b4478,dgoD,,dgoD,dgoD,dgoD,True
2287,ECDH10B_RS18900,b4453,ldrD,,ldrD,ldrD,ldrD,True


In [196]:
MD_blast_check[MD_blast_check['check']==0][MD_blast_check.columns[[0,1,2,3,5,6]].to_list()].head()

Unnamed: 0,DH_names,MG_names,W3_name1,W3_name2,matched_MG,matched_DH
2,ECDH10B_RS11105,b2029,gndA,,gnd,gnd
19,ECDH10B_RS22170,b4150,blaEC,,ampC,ampC
26,ECDH10B_RS07080,b1262,trpCF,,trpC,trpC
71,ECDH10B_RS22695,b4254,argF,,argI,argI
117,ECDH10B_RS18680,b3500,gorA,,gor,gor


### check equivalent names

In [181]:
MD_check1 = MD_blast_check[MD_blast_check['check']==0][MD_blast_check.columns[[0,1,2,3,6]].to_list()]
MD_check2 = MD_blast_alt_check[MD_blast_alt_check['check']==0][MD_blast_alt_check.columns[[0,1,2,3,6]].to_list()]

In [183]:
MD_check1 = pd.merge(MD_check1, W3_names, left_on='matched_DH', right_on=W3_names.columns[0])
MD_check1['check'] = MD_check1.drop(['matched_DH','W3_name1'],1).isin(MD_check1['W3_name1']).any(1)
MD_include1 = MD_check1[MD_check1['check']]

In [185]:
MD_check2 = pd.merge(MD_check2, W3_names, left_on='matched_W3', right_on=W3_names.columns[0])
MD_check2['check'] = MD_check2.drop(['matched_W3','W3_name1'],1).isin(MD_check2['W3_name1']).any(1)
MD_include2 = MD_check2[MD_check2['check']]

In [116]:
MD_include = pd.concat([MD_include1.iloc[:,[0,1]], MD_include2.iloc[:,[0,1]]], axis=0).drop_duplicates(keep='first').reset_index(drop=True)

### combine the checks

In [118]:
##Comebine the two checks
MD_exclude = pd.concat([MD_blast_check.iloc[:, [0,1,-1]],
                MD_blast_alt_check.iloc[:, [0,1,-1]]],axis=0).groupby(['MG_names','DH_names'])['check'].sum().reset_index()

##show genes that are mismatched
MD_exclude = MD_exclude[MD_exclude['check']==0]

In [119]:
MD_exclude = MD_exclude[~MD_exclude['DH_names'].isin(np.array(MD_include['DH_names']))]

In [191]:
MD_blast_check[MD_blast_check['MG_names']=='b3581']

Unnamed: 0,DH_names,MG_names,W3_name1,W3_name2,MG_id,matched_MG,matched_DH,check
1657,ECDH10B_RS19145,b3581,ulaD,,sgbH,sgbH,sgbH,False


In [121]:
W3_names[W3_names[W3_names.columns[0]]=='ulaD']

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
3846,ulaD,sgaH,yjfV,b4196,JW4154,,,,,,


In [200]:
temp = pd.read_csv('../BLAST/Ecoli_K12_MG1655_U00096___all_genes_info_blast.csv', header=None)

In [201]:
temp[temp[temp.columns[0]]=='ulaD']

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
3929,ulaD,sgaH,yjfV,b4196,JW4154,,,,,,,


In [204]:
W3_names[W3_names[W3_names.columns[1]]=='sgaH']

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
3846,ulaD,sgaH,yjfV,b4196,JW4154,,,,,,


## MG VS ATCC

In [124]:
#subsetting from full mapping
MG_ATCC_mapping = full_mapping.drop(['BW_names', 'DH_names','W3_name1',
       'W3_name2'], axis=1)

In [125]:
#drop not found ATCC, not dropping MG b/c MG is very complete 
MG_ATCC_mapping = MG_ATCC_mapping.dropna(subset=list(MG_ATCC_mapping.columns[2:-1]),how='all').drop_duplicates(keep='first')

### Cleaning up MA Blast 

In [126]:
MG_ATCC_blast = pd.read_table('../BLAST/parameters_more_sensitive/RBH_out_MG1655_ATCC_more_sensitive_dmnd_uniq.tab')
MG_ATCC_blast = MG_ATCC_blast[['A_gene', 'A_alt_name', 'B_gene', 'B_alt_name']]
MG_ATCC_blast.columns = ['MG_gene','MG_gene2', 'ATCC_gene', 'ATCC_gene2']
#3264 rows from blast 

#remove redundant ecpD gene
MG_ATCC_blast = MG_ATCC_blast[MG_ATCC_blast['MG_gene']!='yadV']
MG_ATCC_blast.columns = ['MG_gene','MG_gene2', 'ATCC_gene', 'ATCC_gene2']

In [127]:
## fix shifting in alt gene names 
MG_ATCC_blast['ATCC_genes'] = MG_ATCC_blast['ATCC_gene']
MG_ATCC_blast['ATCC_gene'] = np.where(MG_ATCC_blast['ATCC_gene2'].isna(), np.NaN, MG_ATCC_blast['ATCC_gene'])


MG_ATCC_blast['ATCC_alt2'] = np.where(MG_ATCC_blast['ATCC_genes'].str.startswith('c'), 
                                      MG_ATCC_blast['ATCC_genes'], np.NaN)

MG_ATCC_blast['ATCC_gene2'] = np.where(MG_ATCC_blast['ATCC_gene2'].isna(), MG_ATCC_blast['ATCC_alt2'],
                                      MG_ATCC_blast['ATCC_gene2'])

MG_ATCC_blast = MG_ATCC_blast[['MG_gene', 'MG_gene2','ATCC_gene', 'ATCC_gene2']]

MG_ATCC_blast['ATCC_gene2'] = MG_ATCC_blast['ATCC_gene2'].apply(lambda x:str(x))

In [128]:
## comma problems in more than 1 alternative names
MG_ATCC_blast['ATCC_alt'] = MG_ATCC_blast['ATCC_gene2'].apply(lambda x:x.split(',')[0])


alt2 =[]
for i in MG_ATCC_blast['ATCC_gene2']:
    try: 
        alt2.append(i.split(',')[1])
    except (IndexError):
        alt2.append('NA')

MG_ATCC_blast['ATCC_alt2'] = alt2

In [129]:
#extra rows
temp = MG_ATCC_blast[MG_ATCC_blast['ATCC_alt2'] != 'NA'].reset_index(drop=True)

In [130]:
temp = temp[['MG_gene', 'MG_gene2','ATCC_gene','ATCC_alt2']]
temp.rename(columns={'ATCC_alt2':'ATCC_alt'},inplace=True)

temp = temp.append(pd.DataFrame({'MG_gene':['rbsK'],'MG_gene2':['b3752'],
                                 'ATCC_gene':['rbsK'],'ATCC_alt':['c2492']}), ignore_index=True)

MG_ATCC_blast = MG_ATCC_blast[['MG_gene', 'MG_gene2','ATCC_gene','ATCC_alt']]
MG_ATCC_blast = pd.concat([MG_ATCC_blast,temp]).reset_index(drop=True)

#the row that was accidentally filter bc of true NA in alt name
#MG_ATCC_blast = MG_ATCC_blast.drop([1396]).reset_index(drop=True)

MG_ATCC_blast.describe()

Unnamed: 0,MG_gene,MG_gene2,ATCC_gene,ATCC_alt
count,3275,3273,3163,3275
unique,3263,3260,3149,3264
top,rbsK,b3752,rbsK,c3611
freq,3,3,3,6


In [131]:
MG_ATCC_blast = MG_ATCC_blast[~(MG_ATCC_blast['ATCC_alt']=='nan')]

In [132]:
MG_ATCC_blast[MG_ATCC_blast['ATCC_alt']=='c4889.1']

Unnamed: 0,MG_gene,MG_gene2,ATCC_gene,ATCC_alt
398,rpmE,b3936,rpmE,c4889.1


### Comparing

In [133]:
MA_blast = np.array(MG_ATCC_blast[['MG_gene','ATCC_gene']].dropna())
MA_blast2 = np.array(MG_ATCC_blast[['MG_gene2','ATCC_gene']].dropna())
MA_blast_alt = np.array(MG_ATCC_blast[['MG_gene','ATCC_alt']].dropna())
MA_blast_alt2 = np.array(MG_ATCC_blast[['MG_gene2','ATCC_alt']].dropna())

In [134]:
#checking blast using alphabet names only
MA_blast_check = pd.DataFrame({})
for i in range(len(MA_blast)):
    df = MG_ATCC_mapping[MG_ATCC_mapping['MG_id']== MA_blast[i][0]]
    if len(df)>=1:
        df['matched_MG']= MA_blast[i][0]
        df['matched_CF']= MA_blast[i][1]
        df['check'] = ((MG_ATCC_mapping['CF_names1']==MA_blast[i][1])|
                      (MG_ATCC_mapping['CF_names2']==MA_blast[i][1]))
        MA_blast_check = MA_blast_check.append(df, ignore_index=True)

print(len(MA_blast_check), MA_blast_check['check'].sum())

2552 2482


In [135]:
#checking blast2 using alt name of MG and alphabet names of ATCC 
MA_blast2_check2 = pd.DataFrame({})
for i in range(len(MA_blast2)):
    df = MG_ATCC_mapping[MG_ATCC_mapping['MG_names']== MA_blast2[i][0]]
    if len(df)>=1:
        df['matched_MG']= MA_blast2[i][0]
        df['matched_CF']= MA_blast2[i][1]
        df['check'] = ((MG_ATCC_mapping['CF_names1']==MA_blast2[i][1])|
                      (MG_ATCC_mapping['CF_names2']==MA_blast2[i][1]))
        MA_blast2_check2 = MA_blast2_check2.append(df, ignore_index=True)

print(len(MA_blast2_check2), MA_blast2_check2['check'].sum())

2664 2577


In [136]:
#checking blast using alp alt of MG and alt name of ATCC 
MA_blast_alt_check = pd.DataFrame({})
for i in range(len(MA_blast_alt)):
    df = MG_ATCC_mapping[MG_ATCC_mapping['MG_id']== MA_blast_alt[i][0]]
    if len(df)>=1:
        df['matched_MG']= MA_blast_alt[i][0]
        df['matched_CF']= MA_blast_alt[i][1]
        df['check'] = ((MG_ATCC_mapping['CF_alt1']==MA_blast_alt[i][1])|
                      (MG_ATCC_mapping['CF_alt2']==MA_blast_alt[i][1]))
        MA_blast_alt_check = MA_blast_alt_check.append(df, ignore_index=True)

print(len(MA_blast_alt_check), MA_blast_alt_check['check'].sum())

2621 2601


In [137]:
#check2ing blast using alt2 names only
MA_blast_alt2_check2 = pd.DataFrame({})
for i in range(len(MA_blast_alt2)):
    df = MG_ATCC_mapping[MG_ATCC_mapping['MG_names']== MA_blast_alt2[i][0]]
    if len(df)>=1:
        df['matched_MG']= MA_blast_alt2[i][0]
        df['matched_CF']= MA_blast_alt2[i][1]
        df['check'] = ((MG_ATCC_mapping['CF_alt1']==MA_blast_alt2[i][1])|
                      (MG_ATCC_mapping['CF_alt2']==MA_blast_alt2[i][1]))
        MA_blast_alt2_check2 = MA_blast_alt2_check2.append(df, ignore_index=True)
        
print(len(MA_blast_alt2_check2), MA_blast_alt2_check2['check'].sum())

2734 2713


### Check equivalent names

In [138]:
MA_check1 = MA_blast_check[MA_blast_check['check']==0][MA_blast_check.columns[[0,1,2,3,5,10]].to_list()]
MA_check2 = MA_blast2_check2[MA_blast2_check2['check']==0][MA_blast2_check2.columns[[0,1,2,3,5,10]].to_list()]
MA_check3 = MA_blast_alt_check[MA_blast_alt_check['check']==0][MA_blast_alt_check.columns[[0,1,2,3,5,10]].to_list()]
MA_check4 = MA_blast_alt2_check2[MA_blast_alt2_check2['check']==0][MA_blast_alt2_check2.columns[[0,1,2,3,5,10]].to_list()]

In [139]:
CF_names = pd.read_csv('./BLAST/Ecoli_CFT073_NC_004431__all_genes_info_blast.csv',header=None)
CF_names.drop(CF_names.columns[8], axis=1, inplace=True)

In [140]:
MA_check1 = pd.merge(MA_check1, CF_names, left_on='matched_CF', right_on=CF_names.columns[0])
MA_check1['check'] = MA_check1.drop(['matched_CF'],1).isin(MA_check1[['CF_names1','CF_names2','CF_alt1']]).any(1)
MA_include1 = MA_check1[MA_check1['check']]

In [141]:
MA_check2 = pd.merge(MA_check2, CF_names, left_on='matched_CF', right_on=CF_names.columns[0])
MA_check2['check'] = MA_check2.drop(['matched_CF'],1).isin(MA_check2[['CF_names1','CF_names2','CF_alt1']]).any(1)
MA_include2 = MA_check2[MA_check2['check']]

In [142]:
MA_check3['matched_CF1']= MA_check3['matched_CF'].apply(lambda x:x.split('c')[1]).str.slice(stop=4)
MA_check3['matched_CF'] = "c"+MA_check3['matched_CF1']
MA_check3['check'] = MA_check3['CF_alt1'] == MA_check3['matched_CF']

In [143]:
MA_include3_1 = MA_check3[MA_check3['check']][['ATCC_names','MG_names']]

In [144]:
MA_check4['matched_CF1']= MA_check4['matched_CF'].apply(lambda x:x.split('c')[1]).str.slice(stop=4)
MA_check4['matched_CF'] = "c"+MA_check4['matched_CF1']
MA_check4['check'] = MA_check4['CF_alt1'] == MA_check4['matched_CF']

In [145]:
MA_include4_1 = MA_check4[MA_check4['check']][['ATCC_names','MG_names']]

In [146]:
MA_check3 = MA_check3[MA_check3['check']]
MA_check3 = pd.merge(MA_check3, CF_names, left_on='matched_CF', right_on=CF_names.columns[1])
MA_check3['check'] = MA_check3.drop(['matched_CF'],1).isin(MA_check3[['CF_names1','CF_names2','CF_alt1']]).any(1)
MA_include3 = MA_check3[MA_check3['check']]

In [147]:
MA_check4 = MA_check4[MA_check4['check']]
MA_check4 = pd.merge(MA_check4, CF_names, left_on='matched_CF', right_on=CF_names.columns[1])
MA_check4['check'] = MA_check4.drop(['matched_CF'],1).isin(MA_check4[['CF_names1','CF_names2','CF_alt1']]).any(1)
MA_include4 = MA_check4[MA_check4['check']]

In [148]:
MA_include = pd.concat([MA_include1.iloc[:,[0,1]],
                        MA_include2.iloc[:,[0,1]],
                        MA_include3.iloc[:,[0,1]],
                        MA_include4.iloc[:,[0,1]],
                        MA_include3_1.iloc[:,[0,1]],
                        MA_include4_1.iloc[:,[0,1]]],axis=0).drop_duplicates(keep='first').reset_index(drop=True)

In [149]:
##Comebine all checks
MA_exclude = pd.concat([MA_blast_check.iloc[:, [0,1,-1]],
                        MA_blast2_check2.iloc[:, [0,1,-1]],
                        MA_blast_alt_check.iloc[:, [0,1,-1]],
                        MA_blast_alt2_check2.iloc[:, [0,1,-1]]],axis=0).groupby(['ATCC_names','MG_names'])['check'].sum().reset_index()

##show genes that are mismatched
MA_exclude = MA_exclude[MA_exclude['check']==0]

In [150]:
MA_exclude = MA_exclude[~MA_exclude['ATCC_names'].isin(np.array(MA_include['ATCC_names']))]

### Summarizing exclusion of genes 

In [151]:
### Get the homologuous genes in the other strain to exclude them as well

AD_exclude_final = pd.merge(AD_exclude, full_mapping.iloc[:, 0:4], on=['ATCC_names','DH_names']).drop(['check'], axis=1)

MD_exclude_final = pd.merge(MD_exclude, full_mapping.iloc[:, 0:4], on=['MG_names','DH_names']).drop(['check'], axis=1)

MA_exclude_final = pd.merge(MA_exclude, full_mapping.iloc[:, 0:4], on=['ATCC_names','MG_names']).drop(['check'], axis=1)

In [152]:
## Combine list of all excluded genes
df = pd.concat([AD_exclude_final, MD_exclude_final, MA_exclude_final], axis=0).drop_duplicates(keep='first')

In [153]:
#df.to_csv('list_of_excluded_genes.csv', index=False)

In [154]:
df.describe()

Unnamed: 0,ATCC_names,DH_names,BW_names,MG_names
count,84,84,84,84
unique,84,82,82,82
top,DR76_RS02260,ECDH10B_RS17860,BW25113_RS17305,b2000
freq,1,2,2,2


In [155]:
print(len(AD_exclude), len(MD_exclude), len(MA_exclude))

50 79 1


In [156]:
len(df)

84

In [157]:
df[df['MG_names']=='b4906']

Unnamed: 0,ATCC_names,DH_names,BW_names,MG_names


In [190]:
MA_exclude_final[MA_exclude_final['MG_names']=='b3581']

Unnamed: 0,ATCC_names,MG_names,BW_names,DH_names


In [None]:
Manual_check_again