In [1]:
import pandas as pd
import numpy as np
import warnings 
warnings.filterwarnings('ignore')

### DH to W3 mapping merging with W3 gene_info 
+ attained alphabetical names from CF gene info 
+ (preparing to map to full mapping)


In [2]:
##load DH_W3 mapping and W3 gene_info 
DH_mapping = pd.read_table('./DH10B_and_W3110_gene_map.txt')
W3_info = pd.read_table('../Ecoli_K12_W3110_NC_007779_gene_info.txt')

DH_mapping.columns = ['DH', 'DH_names', 'W3', 'W3_names', 'drop']
#drop bad format of missing column value`
DH_mapping = DH_mapping[['DH', 'DH_names', 'W3', 'W3_names']].dropna(how='any')

In [3]:
#take unique W3
DH_mapped = DH_mapping.groupby(['W3_names']).agg(
    DH_names=pd.NamedAgg(column='DH_names', aggfunc=lambda x:','.join(x))).reset_index()

In [4]:
#merged with W3 gene_info 
W3_mapped = pd.merge(W3_info[['gbk_locus_tag','gbk_gene_id']], DH_mapped, left_on='gbk_locus_tag', right_on='W3_names')

In [5]:
W3_mapped.describe()

Unnamed: 0,gbk_locus_tag,gbk_gene_id,W3_names,DH_names
count,3995,2740,3995,3995
unique,3995,2700,3995,3990
top,Y75_RS10500,rrf,Y75_RS10500,ECDH10B_RS23570
freq,1,5,1,2


In [6]:
#Expand DH names that were combined and separated by ','
extra= W3_mapped['DH_names'].str.split(',', n=4, expand=True)
extra.columns = ['DH_name1','DH_name2']
#merged back with W3 gene alphabetical name
extra['W3_names'] = W3_mapped['W3_names']
extra['gene_id'] = W3_mapped['gbk_gene_id']

In [7]:
#collapsing the multiple DH names columns into extra rows
new = pd.melt(extra, id_vars=['W3_names','gene_id'], value_vars=['DH_name1', 'DH_name2'], 
       value_name='DH_names')

new = new[['W3_names', 'DH_names', 'gene_id']].dropna(subset=['DH_names'])

new.fillna('-', inplace=True)

In [8]:
# making unique DH name to prepare to map with full mapping
# combining multiple alphabetical names rows into one row seprated by commas 
W3_final = new.groupby(['DH_names']).agg(
    W3_names=pd.NamedAgg(column='W3_names', aggfunc=lambda x:','.join(x)), 
    gene_id=pd.NamedAgg(column='gene_id', aggfunc=lambda x:','.join(x))).reset_index()

In [9]:
# make seperate columns for the multiple gene names 
extra_gene_id = W3_final['gene_id'].str.split(',', n=3, expand=True)
extra_gene_id.columns = ['W3_name1', 'W3_name2']
W3_final = pd.concat([W3_final, extra_gene_id], axis=1)

In [10]:
#drop original gene_id columns that has multiple columns
W3_final.drop(['gene_id', 'W3_names'],axis=1, inplace=True)
W3_final = W3_final.replace('-',None)

In [11]:
W3_final.describe()

Unnamed: 0,DH_names,W3_name1,W3_name2
count,4001,4001,2
unique,4001,2699,2
top,ECDH10B_RS19955,proV,rcsC
freq,1,18,1


### ATCC and CF mapping merging with CF gene info 
+ attained alphabetical names from CF gene info 
+ (preparing to map to full mapping)

#### include both CF gene_info files

In [12]:
#include both CF mappings 
CF_info1 = pd.read_table('../Ecoli_CFT073_NC_004431_gene_info.txt')
CF_info2 = pd.read_table('../Ecoli_CFT073_AE014075_gene_info.txt')

CF_names1 = CF_info1[['gbk_gene_id', 'gbk_locus_tag', 'gbk_old_locus_tag']]
CF_names2 = CF_info2[['gbk_gene_id', 'gbk_locus_tag']]

In [13]:
CF_name = pd.merge(CF_names1, CF_names2, left_on='gbk_old_locus_tag', right_on='gbk_locus_tag', 
                  how='outer')
CF_name = CF_name[CF_name['gbk_locus_tag_x'].notna()] #only select valid C_RS locus tags 

CF_name['check'] = CF_name['gbk_gene_id_x'] == CF_name['gbk_gene_id_y'] #check if both gen_info provides the same info

## check in consistent naming
# print(len(CF_name))
# CF_name[(CF_name['gbk_gene_id_x'].notna()) & (CF_name['gbk_gene_id_y'].notna()) & (CF_name['check']==False)]

In [14]:
## Organizing
CF_name['CF_alt_name'] = np.where(CF_name['gbk_old_locus_tag'].isna(), CF_name['gbk_locus_tag_y'],
                                  CF_name['gbk_old_locus_tag'])
CF_name.rename(columns={'gbk_gene_id_x':'CF_name1', 'gbk_gene_id_y':'CF_name2', 
                       'gbk_locus_tag_x':'locus_tag'}, inplace=True)
CF_name = CF_name[['CF_name1', 'CF_name2', 'locus_tag', 'CF_alt_name']]

CF_name.describe()

Unnamed: 0,CF_name1,CF_name2,locus_tag,CF_alt_name
count,2855,3532,5024,4738
unique,2756,3532,5024,4738
top,tnpA,entF,C_RS23100,c3206
freq,19,1,1,1


#### ATCC CF mapping merging with both CF gene info 

In [15]:
## ATCC to CF ----------
#load data
ATCC_mapping = pd.read_table('./ATCC_and_CFT073_gene_map.txt')
ATCC_mapping.columns = ['ATCC', 'ATCC_names', 'CF', 'CF_names', 'drop']
ATCC_mapping = ATCC_mapping[['ATCC', 'ATCC_names',
                             'CF', 'CF_names']].drop_duplicates(keep='first').dropna(how='any').reset_index(drop=True)


In [16]:
#take unique CF
ATCC_mapped = ATCC_mapping.groupby(['CF_names']).agg(
    ATCC_names=pd.NamedAgg(column='ATCC_names', aggfunc=lambda x:','.join(x))).reset_index()

In [17]:
#merged with CF gene_info 
CF_mapped = pd.merge(CF_name, ATCC_mapped, left_on='locus_tag', right_on='CF_names')

CF_mapped.describe()

Unnamed: 0,CF_name1,CF_name2,locus_tag,CF_alt_name,CF_names,ATCC_names
count,2517,3076,4021,3887,4021,4021
unique,2458,3076,4021,3887,4021,4011
top,tnpB,yrbK,C_RS23100,c4418,C_RS23100,DR76_RS12305
freq,8,1,1,1,1,2


In [18]:
#Expand ATCC names that were combined and separated by ','
extra= CF_mapped['ATCC_names'].str.split(',', n=4, expand=True)
extra.columns = ['ATCC_name1','ATCC_name2']
#merged back with CF gene alphabetical name
extra = pd.concat([CF_mapped.iloc[:, 0:4], extra], axis=1)

In [19]:
extra.describe()

Unnamed: 0,CF_name1,CF_name2,locus_tag,CF_alt_name,ATCC_name1,ATCC_name2
count,2517,3076,4021,3887,4021,25
unique,2458,3076,4021,3887,4006,25
top,tnpB,yrbK,C_RS23100,c4418,DR76_RS12305,DR76_RS19730
freq,8,1,1,1,3,1


In [20]:
#collapsing the multiple ATCC names columns into extra rows
new = pd.melt(extra, id_vars=['CF_name1', 'CF_name2', 'CF_alt_name','locus_tag'],
              value_vars=['ATCC_name1', 'ATCC_name2'], 
              value_name='ATCC_names')

new = new.dropna(subset=['ATCC_names'])

new.fillna('-', inplace=True)

In [21]:
# making unique ATCC name to prepare to map with full mapping
# combining multiple alphabetical names rows into one row seprated by commas 
CF_final = new.groupby(['ATCC_names']).agg(
    CF_name1=pd.NamedAgg(column='CF_name1', aggfunc=lambda x:','.join(x)), 
    CF_name2=pd.NamedAgg(column='CF_name2', aggfunc=lambda x:','.join(x)), 
    CF_alt=pd.NamedAgg(column='CF_alt_name', aggfunc=lambda x:','.join(x)), 
    locus_tag=pd.NamedAgg(column='locus_tag', aggfunc=lambda x:','.join(x))).reset_index()

In [22]:
# collapse the columns for multiple alphabetical names for both CF gene infos 
extra_names = CF_final['CF_name1'].str.split(',', n=4, expand=True)
extra_names.columns = ['CF_name1', 'CF_name2', 'CF_name3']
extra_names2 = CF_final['CF_name2'].str.split(',', n=4, expand=True)
extra_names2.columns = ['CF_name5', 'CF_name6', 'CF_name7']

extra = pd.concat([extra_names, extra_names2], axis=1).reset_index(drop=True)

#change df into np array
temp=np.array(extra)
del extra

#remove NA and only keep unique names
fixed =[]
for i in temp: 
     fixed.append(sorted(list(set(i[~pd.isnull(i)])))[::-1])
del temp

#remove '-' names 
fixed = pd.DataFrame(fixed)
fixed = fixed.replace('-', None)
fixed.columns = ['CF_names1', 'CF_names2', 'CF_names3']

In [23]:
#Collapse CF alternative names into columns
alt_names = CF_final['CF_alt'].str.split(',', n=4, expand=True)
alt_names=np.array(alt_names)
alt=[]
for i in alt_names:
    alt.append(sorted(list(set(i[~pd.isnull(i)])))[::-1]) #taking away NAs and Nones and duplicates 
alt = pd.DataFrame(alt)
alt= alt.replace('-', None)
alt.columns = ['CF_alt1', 'CF_alt2', 'CF_alt3']

In [24]:
CF_final = pd.concat([CF_final[['ATCC_names']], fixed, alt], axis=1)
CF_final.head()

Unnamed: 0,ATCC_names,CF_names1,CF_names2,CF_names3,CF_alt1,CF_alt2,CF_alt3
0,DR76_RS00080,yehE,-,,c2640,,
1,DR76_RS00085,mrp,apbC,,c2641,,
2,DR76_RS00090,metG,,,c2642,,
3,DR76_RS00095,metG,,,c2643,,
4,DR76_RS00100,yehI,,,c2644,,


In [25]:
CF_final.drop(['CF_names3','CF_alt3'], axis=1, inplace=True)

### Full Mapping 

In [26]:
##Full Mapping--------
full_mapping = pd.read_table('./ATCC_CFT073_K12MG_K12DH_BW_gene_map.txt')
full_mapping.columns = ['ATCC', 'ATCC_names', 'BW', 'BW_names', 'DH', 'DH_names', 'MG', 'MG_names', 'drop']
full_mapping = full_mapping[['ATCC_names','BW_names', 'DH_names', 'MG_names']]
full_mapping = full_mapping.drop_duplicates(keep='first') 
#3422

In [27]:
#merge DH names
full_mapping = pd.merge(full_mapping, W3_final, on='DH_names', how='left')


#merge ATCC names
full_mapping = pd.merge(full_mapping, CF_final, on='ATCC_names', how='left') 


#merge MG names
MG_info = pd.read_table('../Ecoli_K12_MG1655_chrom_U00096_gene_info.txt')[['gbk_gene_id', 'gbk_locus_tag']]
MG_info.columns=['MG_id', 'MG_names']

full_mapping = pd.merge(full_mapping, MG_info, on='MG_names', how='left')


In [28]:
W3_final.describe()

Unnamed: 0,DH_names,W3_name1,W3_name2
count,4001,4001,2
unique,4001,2699,2
top,ECDH10B_RS19955,proV,rcsC
freq,1,18,1


In [29]:
CF_final.describe()

Unnamed: 0,ATCC_names,CF_names1,CF_names2,CF_alt1,CF_alt2
count,4029,4029,988,4029,14
unique,4029,3247,690,3876,14
top,DR76_RS16505,tnpB,smrB,c0289,c1460
freq,1,58,12,6,1


In [30]:
full_mapping.isna().sum()

ATCC_names       0
BW_names         0
DH_names         0
MG_names         0
W3_name1       136
W3_name2      3420
CF_names1      358
CF_names2     2503
CF_alt1        358
CF_alt2       3416
MG_id            0
dtype: int64

## BLAST 

In [31]:
##Comparing---------------
ATCC_DH_blast = pd.read_table('../RHfiles/RBBH_ATCC_DH10B_more_sensitive.tab')
MG_DH_blast = pd.read_table('../RHfiles/RBH_out_MG1655_DH10B_more_sensitive_dmnd_uniq.tab')
MG_ATCC_blast = pd.read_table('../RHfiles/RBH_out_MG1655_ATCC_more_sensitive_dmnd_uniq.tab')

ATCC_DH_blast = ATCC_DH_blast[['A_gene', 'A_alt_name', 'B_gene']]
ATCC_DH_blast.columns = ['ATCC_gene','ATCC_gene2', 'DH_gene']
#3244 rows from blast 

MG_DH_blast = MG_DH_blast[['A_gene', 'A_alt_name', 'B_gene']]
MG_DH_blast.columns = ['MG_gene','MG_gene2', 'DH_gene']
#4162 rows from blast 

MG_ATCC_blast = MG_ATCC_blast[['A_gene', 'A_alt_name', 'B_gene', 'B_alt_name']]
MG_ATCC_blast.columns = ['MG_gene','MG_gene2', 'ATCC_gene', 'ATCC_gene2']
#3264 rows from blast 

In [32]:
#remove redundant ecpD gene
ATCC_DH_blast = ATCC_DH_blast[ATCC_DH_blast['DH_gene']!='yadV']

## ATCC VS DH 

In [33]:
#subsetting from full mapping
ATCC_DH_mapping = full_mapping.drop(['BW_names', 'MG_names','MG_id'], axis=1)

In [34]:
#drop not found DH
ATCC_DH_mapping = ATCC_DH_mapping.dropna(subset=list(ATCC_DH_mapping.columns[2:4]),how='all').drop_duplicates(keep='first')

#drop not found ATCC
ATCC_DH_mapping = ATCC_DH_mapping.dropna(subset=list(ATCC_DH_mapping.columns[5:]),how='all').drop_duplicates(keep='first')

### Cleaning AD blast 
alt names are in the alphabetical names columns if alp name is missing 

In [35]:
## fix shifting in alt gene names 
ATCC_DH_blast['ATCC_genes'] = ATCC_DH_blast['ATCC_gene']
ATCC_DH_blast['ATCC_gene'] = np.where(ATCC_DH_blast['ATCC_gene2'].isna(), np.NaN, ATCC_DH_blast['ATCC_gene'])
ATCC_DH_blast['ATCC_alt2'] = np.where(ATCC_DH_blast['ATCC_genes'].str.startswith('c'), 
                                      ATCC_DH_blast['ATCC_genes'], np.NaN)
ATCC_DH_blast['ATCC_gene2'] = np.where(ATCC_DH_blast['ATCC_gene2'].isna(), ATCC_DH_blast['ATCC_alt2'],
                                      ATCC_DH_blast['ATCC_gene2'])
ATCC_DH_blast = ATCC_DH_blast[['ATCC_gene', 'ATCC_gene2', 'DH_gene']]
ATCC_DH_blast['ATCC_gene2'] = ATCC_DH_blast['ATCC_gene2'].apply(lambda x:str(x))

In [36]:
## comma problems in more than 1 alternative names
ATCC_DH_blast['ATCC_alt'] = ATCC_DH_blast['ATCC_gene2'].apply(lambda x:x.split(',')[0])
alt2 =[]
for i in ATCC_DH_blast['ATCC_gene2']:
    try: 
        alt2.append(i.split(',')[1])
    except (IndexError):
        alt2.append('NA')

ATCC_DH_blast['ATCC_alt2'] = alt2

In [37]:
#extra rows
temp = ATCC_DH_blast[ATCC_DH_blast['ATCC_alt2'] != 'NA'].reset_index(drop=True)
temp.tail()

Unnamed: 0,ATCC_gene,ATCC_gene2,DH_gene,ATCC_alt,ATCC_alt2
6,ecpD,"c0401, c0171",ecpD,c0401,c0171
7,sucC,"c0805, c5036",sucC,c0805,c5036
8,tdcD,"c3873, c4530",tdcD,c3873,c4530
9,rbsK,"c4680, c0331, c2492",rbsK,c4680,c0331
10,arcB,"c5348, c3970",arcB,c5348,c3970


In [38]:
temp = temp[['ATCC_gene','ATCC_alt2','DH_gene']]
temp.rename(columns={'ATCC_alt2':'ATCC_alt'},inplace=True)

temp = temp.append(pd.DataFrame({'ATCC_gene':['rbsK'],'ATCC_alt':['c2492'],
                                 'DH_gene':['rbsK']}), ignore_index=True)

ATCC_DH_blast = ATCC_DH_blast[['ATCC_gene','ATCC_alt','DH_gene']]
ATCC_DH_blast = pd.concat([ATCC_DH_blast,temp]).reset_index(drop=True)

#the row that was accidentally filter bc of true NA in alt name
ATCC_DH_blast = ATCC_DH_blast.drop([1396]).reset_index(drop=True)

ATCC_DH_blast.describe()

Unnamed: 0,ATCC_gene,ATCC_alt,DH_gene
count,3152,3254,3254
unique,3140,3254,3241
top,rbsK,c0019,rbsK
freq,3,1,3


### Comparing

In [39]:
AD_blast = np.array(ATCC_DH_blast[['ATCC_gene','DH_gene']])
AD_blast_alt = np.array(ATCC_DH_blast[['ATCC_alt', 'DH_gene']])

In [40]:
ATCC_DH_mapping.describe()

Unnamed: 0,ATCC_names,DH_names,W3_name1,W3_name2,CF_names1,CF_names2,CF_alt1,CF_alt2
count,2943,2943,2943,2,2943,888,2943,4
unique,2933,2933,2214,2,2820,621,2890,3
top,DR76_RS28370,ECDH10B_RS10840,queD,rcsC,hofG,smrB,c4999,c5471
freq,2,3,11,1,5,11,3,2


In [41]:
ATCC_DH_mapping.columns[4:6]

Index(['CF_names1', 'CF_names2'], dtype='object')

In [42]:
#checking blast using alphabet names only
AD_blast_check = pd.DataFrame({})
for i in range(len(AD_blast)):
    df = ATCC_DH_mapping.iloc[:,:6][((ATCC_DH_mapping['CF_names1']== AD_blast[i][0])|
               (ATCC_DH_mapping['CF_names2']==AD_blast[i][0]))]
    if len(df)>=1:
        df['matched_CF']= AD_blast[i][0]
        df['matched_W3']= AD_blast[i][1]
        df['check'] = ((ATCC_DH_mapping['W3_name1']==AD_blast[i][1])|
                (ATCC_DH_mapping['W3_name2']==AD_blast[i][1]))
        AD_blast_check = AD_blast_check.append(df, ignore_index=True)

In [43]:
print(len(AD_blast_check), AD_blast_check['check'].sum())
AD_blast_check.head()

2720 1941


Unnamed: 0,ATCC_names,DH_names,W3_name1,W3_name2,CF_names1,CF_names2,matched_CF,matched_W3,check
0,DR76_RS01100,ECDH10B_RS12475,yfbR,,yfbR,,yfbR,yfbR,True
1,DR76_RS07060,ECDH10B_RS16790,agaR,,agaR,,agaR,agaR,True
2,DR76_RS09325,ECDH10B_RS15175,amiC,,amiC,,amiC,amiC,True
3,DR76_RS03755,ECDH10B_RS20010,asnC,,asnC,,asnC,asnC,True
4,DR76_RS00930,ECDH10B_RS12280,arnC,,arnC,arnB,arnC,arnC,True


In [44]:
#checking blast_alt using alt names only
AD_blast_alt_check = pd.DataFrame({})
for i in range(len(AD_blast_alt)):
    df = ATCC_DH_mapping.drop(ATCC_DH_mapping.columns[4:6], axis=1)[
        ((ATCC_DH_mapping['CF_alt1']== AD_blast_alt[i][0])|
               (ATCC_DH_mapping['CF_alt2']==AD_blast_alt[i][0]))]
    if len(df)>=1:
        df['matched_CF']= AD_blast_alt[i][0]
        df['matched_W3']= AD_blast_alt[i][1]
        df['check'] = ((ATCC_DH_mapping['W3_name1']==AD_blast_alt[i][1])|
                (ATCC_DH_mapping['W3_name2']==AD_blast_alt[i][1]))
        AD_blast_alt_check = AD_blast_alt_check.append(df, ignore_index=True)

In [45]:
print(len(AD_blast_alt_check), AD_blast_alt_check['check'].sum())
AD_blast_alt_check.head()

2660 1915


Unnamed: 0,ATCC_names,DH_names,W3_name1,W3_name2,CF_alt1,CF_alt2,matched_CF,matched_W3,check
0,DR76_RS01100,ECDH10B_RS12475,yfbR,,c2832,,c2832,yfbR,True
1,DR76_RS07060,ECDH10B_RS16790,agaR,,c3886,,c3886,agaR,True
2,DR76_RS09325,ECDH10B_RS15175,amiC,,c3411,,c3411,amiC,True
3,DR76_RS03755,ECDH10B_RS20010,asnC,,c4671,,c4671,asnC,True
4,DR76_RS00930,ECDH10B_RS12280,arnC,,c2796,,c2796,arnC,True


In [92]:
temp = AD_blast_alt_check[~AD_blast_alt_check['check']]

In [100]:
temp.isnull().sum()

ATCC_names      0
DH_names        0
W3_name1        0
W3_name2      745
CF_alt1         0
CF_alt2       742
matched_CF      0
matched_W3      0
check           0
dtype: int64

### combine the checks

In [46]:
##Comebine the two checks
AD_exclude = pd.concat([AD_blast_check.iloc[:, [0,1,-1]],
                AD_blast_alt_check.iloc[:, [0,1,-1]]],axis=0).groupby(['ATCC_names','DH_names'])['check'].sum().reset_index()

##show genes that are mismatched
AD_exclude = AD_exclude[AD_exclude['check']==0]

## MG VS DH

In [47]:
#subsetting from full mapping
MG_DH_mapping = full_mapping.drop(full_mapping.columns[6:-1].to_list(), axis=1).drop(full_mapping.columns[0:2].to_list(), axis=1)

In [110]:
#drop not found DH - no need to drop MG b/c MG is very complete
MG_DH_mapping = MG_DH_mapping.dropna(subset=list(MG_DH_mapping.columns[2:4]),how='all').drop_duplicates(keep='first')

### Comparing

In [113]:
## Load blast files
MG_DH_blast = pd.read_table('../RHfiles/RBH_out_MG1655_DH10B_more_sensitive_dmnd_uniq.tab')

MG_DH_blast = MG_DH_blast[['A_gene', 'A_alt_name', 'B_gene']]
MG_DH_blast.columns = ['MG_gene','MG_alt', 'DH_gene']

In [114]:
MD_blast = np.array(MG_DH_blast[['MG_gene','DH_gene']])
MD_blast_alt = np.array(MG_DH_blast[['MG_alt', 'DH_gene']])

In [115]:
#checking blast using alphabet names only
MD_blast_check = pd.DataFrame({})
for i in range(len(MD_blast)):
    df = MG_DH_mapping[(MG_DH_mapping['MG_id']== MD_blast[i][0])]
    if len(df)>=1:
        df['matched_MG']= MD_blast[i][0]
        df['matched_DH']= MD_blast[i][1]
        df['check'] = ((MG_DH_mapping['W3_name1']==MD_blast[i][1])|
                (MG_DH_mapping['W3_name2']==MD_blast[i][1]))
        MD_blast_check = MD_blast_check.append(df, ignore_index=True)

In [116]:
print(len(MD_blast_check), MD_blast_check['check'].sum())
MD_blast_check

3072 2201


Unnamed: 0,DH_names,MG_names,W3_name1,W3_name2,MG_id,matched_MG,matched_DH,check
0,ECDH10B_RS07215,b4596,osmB,,yciZ,yciZ,yciZ,False
1,ECDH10B_RS01450,b4671,rpmJ,,ykgR,ykgR,ykgR,False
2,ECDH10B_RS15545,b4684,uacT,,yqfG,yqfG,yqfG,False
3,ECDH10B_RS20635,b4686,yshB,,yshB,yshB,yshB,True
4,ECDH10B_RS12905,b2376,ypdI,,ypdI,ypdI,ypdI,True
...,...,...,...,...,...,...,...,...
3067,ECDH10B_RS22565,b4485,ytfQ,,ytfR,ytfR,ytfR,False
3068,ECDH10B_RS19750,b4477,dgoA,,dgoA,dgoA,dgoA,True
3069,ECDH10B_RS19745,b4478,dgoD,,dgoD,dgoD,dgoD,True
3070,ECDH10B_RS18900,b4453,ldrD,,ldrD,ldrD,ldrD,True


In [117]:
#checking blast_alt using alt names only
MD_blast_alt_check = pd.DataFrame({})
for i in range(len(MD_blast_alt)):
    df = MG_DH_mapping[(MG_DH_mapping['MG_names']== MD_blast_alt[i][0])]
    if len(df)>=1:
        df['matched_MG']= MD_blast_alt[i][0]
        df['matched_W3']= MD_blast_alt[i][1]
        df['check'] = ((MG_DH_mapping['W3_name1']==MD_blast_alt[i][1])|
                (MG_DH_mapping['W3_name2']==MD_blast_alt[i][1]))
        MD_blast_alt_check = MD_blast_alt_check.append(df, ignore_index=True)

In [118]:
print(len(MD_blast_alt_check), MD_blast_alt_check['check'].sum())
MD_blast_check

3214 2274


Unnamed: 0,DH_names,MG_names,W3_name1,W3_name2,MG_id,matched_MG,matched_DH,check
0,ECDH10B_RS07215,b4596,osmB,,yciZ,yciZ,yciZ,False
1,ECDH10B_RS01450,b4671,rpmJ,,ykgR,ykgR,ykgR,False
2,ECDH10B_RS15545,b4684,uacT,,yqfG,yqfG,yqfG,False
3,ECDH10B_RS20635,b4686,yshB,,yshB,yshB,yshB,True
4,ECDH10B_RS12905,b2376,ypdI,,ypdI,ypdI,ypdI,True
...,...,...,...,...,...,...,...,...
3067,ECDH10B_RS22565,b4485,ytfQ,,ytfR,ytfR,ytfR,False
3068,ECDH10B_RS19750,b4477,dgoA,,dgoA,dgoA,dgoA,True
3069,ECDH10B_RS19745,b4478,dgoD,,dgoD,dgoD,dgoD,True
3070,ECDH10B_RS18900,b4453,ldrD,,ldrD,ldrD,ldrD,True


### combine the checks

In [121]:
##Comebine the two checks
MD_exclude = pd.concat([MD_blast_check.iloc[:, [0,1,-1]],
                MD_blast_alt_check.iloc[:, [0,1,-1]]],axis=0).groupby(['MG_names','DH_names'])['check'].sum().reset_index()

##show genes that are mismatched
MD_exclude = MD_exclude[MD_exclude['check']==0]

## MG VS ATCC

In [56]:
#subsetting from full mapping
MG_ATCC_mapping = full_mapping.drop(['BW_names', 'DH_names','W3_name1',
       'W3_name2'], axis=1)

In [57]:
MG_ATCC_mapping.head()

Unnamed: 0,ATCC_names,MG_names,CF_names1,CF_names2,CF_alt1,CF_alt2,MG_id
0,DR76_RS03365,b3820,yigI,,c4741,,yigI
1,DR76_RS03360,b3821,pldA,,c4742,,pldA
2,DR76_RS03355,b3822,recQ,,c4744,,recQ
3,DR76_RS03350,b3823,rhtC,rhtB,c4745,,rhtC
4,DR76_RS03345,b3824,yigK,rhtB,c4746,,rhtB


In [58]:
#drop not found ATCC, not dropping MG b/c MG is very complete 
MG_ATCC_mapping = MG_ATCC_mapping.dropna(subset=list(MG_ATCC_mapping.columns[2:-1]),how='all').drop_duplicates(keep='first')

### Cleaning up MA Blast 

In [59]:
MG_ATCC_blast = pd.read_table('../RHfiles/RBH_out_MG1655_ATCC_more_sensitive_dmnd_uniq.tab')
MG_ATCC_blast = MG_ATCC_blast[['A_gene', 'A_alt_name', 'B_gene', 'B_alt_name']]
MG_ATCC_blast.columns = ['MG_gene','MG_gene2', 'ATCC_gene', 'ATCC_gene2']
#3264 rows from blast 

#remove redundant ecpD gene
MG_ATCC_blast = MG_ATCC_blast[MG_ATCC_blast['MG_gene']!='yadV']
MG_ATCC_blast.columns = ['MG_gene','MG_gene2', 'ATCC_gene', 'ATCC_gene2']

In [60]:
## fix shifting in alt gene names 
MG_ATCC_blast['ATCC_genes'] = MG_ATCC_blast['ATCC_gene']
MG_ATCC_blast['ATCC_gene'] = np.where(MG_ATCC_blast['ATCC_gene2'].isna(), np.NaN, MG_ATCC_blast['ATCC_gene'])


MG_ATCC_blast['ATCC_alt2'] = np.where(MG_ATCC_blast['ATCC_genes'].str.startswith('c'), 
                                      MG_ATCC_blast['ATCC_genes'], np.NaN)

MG_ATCC_blast['ATCC_gene2'] = np.where(MG_ATCC_blast['ATCC_gene2'].isna(), MG_ATCC_blast['ATCC_alt2'],
                                      MG_ATCC_blast['ATCC_gene2'])

MG_ATCC_blast = MG_ATCC_blast[['MG_gene', 'MG_gene2','ATCC_gene', 'ATCC_gene2']]

MG_ATCC_blast['ATCC_gene2'] = MG_ATCC_blast['ATCC_gene2'].apply(lambda x:str(x))

In [61]:
## comma problems in more than 1 alternative names
MG_ATCC_blast['ATCC_alt'] = MG_ATCC_blast['ATCC_gene2'].apply(lambda x:x.split(',')[0])


alt2 =[]
for i in MG_ATCC_blast['ATCC_gene2']:
    try: 
        alt2.append(i.split(',')[1])
    except (IndexError):
        alt2.append('NA')

MG_ATCC_blast['ATCC_alt2'] = alt2

In [62]:
#extra rows
temp = MG_ATCC_blast[MG_ATCC_blast['ATCC_alt2'] != 'NA'].reset_index(drop=True)

In [63]:
temp = temp[['MG_gene', 'MG_gene2','ATCC_gene','ATCC_alt2']]
temp.rename(columns={'ATCC_alt2':'ATCC_alt'},inplace=True)

temp = temp.append(pd.DataFrame({'MG_gene':['rbsK'],'MG_gene2':['b3752'],
                                 'ATCC_gene':['rbsK'],'ATCC_alt':['c2492']}), ignore_index=True)

MG_ATCC_blast = MG_ATCC_blast[['MG_gene', 'MG_gene2','ATCC_gene','ATCC_alt']]
MG_ATCC_blast = pd.concat([MG_ATCC_blast,temp]).reset_index(drop=True)

#the row that was accidentally filter bc of true NA in alt name
#MG_ATCC_blast = MG_ATCC_blast.drop([1396]).reset_index(drop=True)

MG_ATCC_blast.describe()

Unnamed: 0,MG_gene,MG_gene2,ATCC_gene,ATCC_alt
count,3275,3273,3163,3275
unique,3263,3260,3149,3264
top,rbsK,b3752,rbsK,c3611
freq,3,3,3,6


In [64]:
MG_ATCC_mapping.describe()

Unnamed: 0,ATCC_names,MG_names,CF_names1,CF_names2,CF_alt1,CF_alt2,MG_id
count,3054,3054,3054,918,3054,6,3054
unique,3049,3043,2928,640,3003,5,3043
top,DR76_RS10475,b1978,hofG,smrB,c4999,c5471,yeeJ
freq,2,3,5,11,3,2,3


### Comparing

In [65]:
MA_blast = np.array(MG_ATCC_blast[['MG_gene','ATCC_gene']])
MA_blast2 = np.array(MG_ATCC_blast[['MG_gene2','ATCC_gene']])
MA_blast_alt = np.array(MG_ATCC_blast[['MG_gene','ATCC_alt']])
MA_blast_alt2 = np.array(MG_ATCC_blast[['MG_gene2','ATCC_alt']])

In [66]:
#checking blast using alphabet names only
MA_blast_check = pd.DataFrame({})
for i in range(len(MA_blast)):
    df = MG_ATCC_mapping[MG_ATCC_mapping['MG_id']== MA_blast[i][0]]
    if len(df)>=1:
        df['matched_MG']= MA_blast[i][0]
        df['matched_CF']= MA_blast[i][1]
        df['check'] = ((MG_ATCC_mapping['CF_names1']==MA_blast[i][1])|
                      (MG_ATCC_mapping['CF_names2']==MA_blast[i][1]))
        MA_blast_check = MA_blast_check.append(df, ignore_index=True)

print(len(MA_blast_check), MA_blast_check['check'].sum())

2631 2483


In [67]:
#checking blast2 using alt name of MG and alphabet names of ATCC 
MA_blast2_check2 = pd.DataFrame({})
for i in range(len(MA_blast2)):
    df = MG_ATCC_mapping[MG_ATCC_mapping['MG_names']== MA_blast2[i][0]]
    if len(df)>=1:
        df['matched_MG']= MA_blast2[i][0]
        df['matched_CF']= MA_blast2[i][1]
        df['check'] = ((MG_ATCC_mapping['CF_names1']==MA_blast2[i][1])|
                      (MG_ATCC_mapping['CF_names2']==MA_blast2[i][1]))
        MA_blast2_check2 = MA_blast2_check2.append(df, ignore_index=True)

print(len(MA_blast2_check2), MA_blast2_check2['check'].sum())

2743 2578


In [68]:
#checking blast using alp alt of MG and alt name of ATCC 
MA_blast_alt_check = pd.DataFrame({})
for i in range(len(MA_blast_alt)):
    df = MG_ATCC_mapping[MG_ATCC_mapping['MG_id']== MA_blast_alt[i][0]]
    if len(df)>=1:
        df['matched_MG']= MA_blast_alt[i][0]
        df['matched_CF']= MA_blast_alt[i][1]
        df['check'] = ((MG_ATCC_mapping['CF_alt1']==MA_blast_alt[i][1])|
                      (MG_ATCC_mapping['CF_alt2']==MA_blast_alt[i][1]))
        MA_blast_alt_check = MA_blast_alt_check.append(df, ignore_index=True)

print(len(MA_blast_alt_check), MA_blast_alt_check['check'].sum())

2631 2601


In [69]:
#check2ing blast using alt2 names only
MA_blast_alt2_check2 = pd.DataFrame({})
for i in range(len(MA_blast_alt2)):
    df = MG_ATCC_mapping[MG_ATCC_mapping['MG_names']== MA_blast_alt2[i][0]]
    if len(df)>=1:
        df['matched_MG']= MA_blast_alt2[i][0]
        df['matched_CF']= MA_blast_alt2[i][1]
        df['check'] = ((MG_ATCC_mapping['CF_alt1']==MA_blast_alt2[i][1])|
                      (MG_ATCC_mapping['CF_alt2']==MA_blast_alt2[i][1]))
        MA_blast_alt2_check2 = MA_blast_alt2_check2.append(df, ignore_index=True)
        
print(len(MA_blast_alt2_check2), MA_blast_alt2_check2['check'].sum())

2743 2713


In [70]:
##Comebine all checks
MA_exclude = pd.concat([MA_blast_check.iloc[:, [0,1,-1]],
                        MA_blast2_check2.iloc[:, [0,1,-1]],
                        MA_blast_alt_check.iloc[:, [0,1,-1]],
                        MA_blast_alt2_check2.iloc[:, [0,1,-1]]],axis=0).groupby(['ATCC_names','MG_names'])['check'].sum().reset_index()

##show genes that are mismatched
MA_exclude = MA_exclude[MA_exclude['check']==0]

### Summarizing exclusion of genes 

In [123]:
### Get the homologuous genes in the other strain to exclude them as well

AD_exclude_final = pd.merge(AD_exclude, full_mapping.iloc[:, 0:4], on=['ATCC_names','DH_names']).drop(['check'], axis=1)

MD_exclude_final = pd.merge(MD_exclude, full_mapping.iloc[:, 0:4], on=['MG_names','DH_names']).drop(['check'], axis=1)

MA_exclude_final = pd.merge(MA_exclude, full_mapping.iloc[:, 0:4], on=['ATCC_names','MG_names']).drop(['check'], axis=1)

In [126]:
## Combine list of all excluded genes
df = pd.concat([AD_exclude_final, MD_exclude_final, MA_exclude_final], axis=0).drop_duplicates(keep='first')
#df.to_csv('list_of_excluded_genes.csv', index=False)

In [125]:
df.describe()

Unnamed: 0,ATCC_names,DH_names,BW_names,MG_names
count,999,999,999,999
unique,992,988,985,985
top,DR76_RS00355,ECDH10B_RS10840,BW25113_RS10335,b1978
freq,2,3,3,3


In [122]:
print(len(AD_exclude), len(MD_exclude), len(MA_exclude))

737 942 13
