# Knockouts for CCO

1.  Given  molar percentage measurements from step N, run `MaxMass` (our modified version of `MinGenome`)  using those molar percentages to predict which genes should be knocked out in step N+1. Compare our `MaxMass` predictions with the choice of actual step N+1 and `MinGenome` step to see how similar they are.
2.  Take the iBAQ measurements for step N, remove all genes that were knocked out in actual step N+1 and recalculate the molar percentage.  Compare with actual molar percentages in step N+1 using [KL divergence](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence) 
$$D_{KL}(P\|Q) = -\sum_iP(i)\log\frac{Q(i)}{P(i)}$$

or, even more simply, the squared Euclidean distance 

$$D_2(P,Q) = \sum_i\|P(i)-Q(i)\|^2$$

where $P$ is the actual molar percentages and $Q$ is the predicted molar percentages. This gives us a measure of how much protein expression changed as a result of the knockouts. If $D_{KL}=0$ then it is an exact match. If it is greater than zero, the predicted and actual distributions differ, and larger values indicate a larger difference. This doesn't tell us how much protein capacity we reclaimed, but it will give us an idea of how valid our assumptions are for using molar percentage to choose which genes to knock out.
 

In [16]:
def explode(df, lst_cols, fill_value=''):
    """Unnest list-valued columns so that every list element gets its own row.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with one or more columns whose cells hold list-likes.
    lst_cols : str or list of str
        Column(s) to unnest.  Row repetition counts are taken from the first
        column, so when several columns are given their lists must have the
        same length in every row.
    fill_value : scalar, default ''
        Value placed in the list column(s) of rows whose list is empty or
        missing.

    Returns
    -------
    pandas.DataFrame
        Same column order as ``df``.  Unnested rows come first with a fresh
        0..n-1 index; rows whose list was empty/missing follow, keeping their
        original index labels.

    Raises
    ------
    ValueError
        If ``lst_cols`` is empty.
    """
    if not lst_cols:
        raise ValueError("lst_cols must name at least one list column")
    # make sure `lst_cols` is a list
    if not isinstance(lst_cols, list):
        lst_cols = [lst_cols]
    # all columns except `lst_cols`
    idx_cols = df.columns.difference(lst_cols)

    # Length of each list; missing cells (NaN/None) are treated as empty lists
    # so they take the `fill_value` path instead of breaking np.repeat.
    lens = df[lst_cols[0]].str.len().fillna(0).astype(int)
    has_items = (lens > 0).values

    # Flatten only the non-empty lists: feeding empty lists to np.concatenate
    # would drag the result dtype towards float64.
    nonempty_rows = df.loc[has_items, lst_cols]
    flattened = {
        col: (np.concatenate(nonempty_rows[col].values)
              if len(nonempty_rows) else np.array([], dtype=object))
        for col in lst_cols
    }

    exploded = pd.DataFrame({
        col: np.repeat(df[col].values, lens.values)
        for col in idx_cols
    }).assign(**flattened)

    if not has_items.all():
        # Rows without list elements are kept once, with `fill_value` written
        # only into the list column(s).  pd.concat replaces DataFrame.append
        # (removed in pandas 2.0), and NaNs in the other columns are no longer
        # overwritten by a frame-wide fillna.
        empties = df.loc[~has_items, idx_cols].assign(
            **{col: fill_value for col in lst_cols}
        )
        exploded = pd.concat([exploded, empties])

    return exploded.loc[:, df.columns]

In [108]:
import numpy as np
import pandas as pd



Unnamed: 0_level_0,gene,Step
locus,Unnamed: 1_level_1,Unnamed: 2_level_1
b0986,ymcC,1
b0987,ymcD,1
b4516,insA,1
b0988,insB,1
b0989,cspH,1
b0990,cspG,1
b0991,ymcE,1
b4517,gnsA,1
b0992,yccM,1
b0993,torS,1


In [7]:
# Lookup table between Blattner loci and UniProt accessions (tab-separated).
uniprot2blattner = pd.read_csv(
    '../data/Ecoli/blatter-to-uniprot.tab',
    sep='\t',
)
uniprot2blattner

Unnamed: 0,Blattner,Uniprot
0,b0778,P13000
1,b1589,P0AAJ1
2,b4230,P39328
3,b1478,P39451
4,b1338,P77357
5,b3076,P06864
6,b0774,P12995
7,b3592,P0ACA1
8,b0198,P31547
9,b2579,P68066


In [125]:
# K12 <-> W3110 best-hit table: pull the UniProt accession (second '|' field of
# the 'E_coli_K12' identifier) and attach the matching rows of `uniprot2blattner`.
K12toW3110 = (
    pd.read_csv('E_coli_K12_and_E_coli_W3110_BBH.tab', sep='\t')
      .assign(Uniprot=lambda bbh: bbh['E_coli_K12'].str.split('|').str[1])
      .join(uniprot2blattner.set_index('Uniprot'), on='Uniprot')
)

# Knockout list indexed by locus, annotated with the W3110 identifier found
# through the Blattner column of the table above.
ecoli_ko = (
    pd.read_csv('E.coli_kos.tab', sep='\t', index_col='locus')
      .join(K12toW3110.set_index('Blattner'))
      .loc[:, ['gene', 'Step', 'E_coli_W3110']]
)

# Save the knockouts that did not receive a W3110 identifier for inspection.
ecoli_ko.loc[ecoli_ko['E_coli_W3110'].isnull()].to_csv('missing_mapping.tab', sep='\t')

In [145]:
# Disabled follow-up kept for reference: re-keying `ecoli_ko` with
#   .dropna().reset_index().set_index('E_coli_W3110')
# is not applied, so the frame keeps its current index.
# Show only the knockouts whose 'Step' is 10 or lower.
ecoli_ko.loc[ecoli_ko['Step'].le(10)]

Unnamed: 0,gene,Step,E_coli_W3110
b0223,yafJ,9,W3110_lambdaRed.CDS.219
b0224,yafK,9,W3110_lambdaRed.CDS.220
b0225,yafQ,9,W3110_lambdaRed.CDS.221
b0226,dinJ,9,W3110_lambdaRed.CDS.222
b0227,yafL,9,W3110_lambdaRed.CDS.223
b0228,yafM,9,W3110_lambdaRed.CDS.224
b0231,dinB,9,W3110_lambdaRed.CDS.227
b0232,yafN,9,W3110_lambdaRed.CDS.228
b0233,yafO,9,W3110_lambdaRed.CDS.229
b0234,yafP,9,W3110_lambdaRed.CDS.230


# Preliminary Mol %

In [129]:
# Columns to load: the protein identifier plus three replicate columns per strain.
cols = [
    'protein_ID',
    'iBAQ_Step04_1', 'iBAQ_Step04_2', 'iBAQ_Step04_3',
    'iBAQ_Step05_1', 'iBAQ_Step05_2', 'iBAQ_Step05_3',
    'iBAQ_Step09_1', 'iBAQ_Step09_2', 'iBAQ_Step09_3',
    'iBAQ_Step10_1', 'iBAQ_Step10_2', 'iBAQ_Step10_3',
    'iBAQ_W3110_1', 'iBAQ_W3110_2', 'iBAQ_W3110_3',
]
# Tab-separated table restricted to `cols`, indexed by protein identifier.
ibaq = pd.read_csv(
    'E_coli_data_frame.txt',
    sep='\t',
    index_col='protein_ID',
    usecols=cols,
)
ibaq

Unnamed: 0_level_0,iBAQ_Step04_1,iBAQ_Step04_2,iBAQ_Step04_3,iBAQ_Step05_1,iBAQ_Step05_2,iBAQ_Step05_3,iBAQ_Step09_1,iBAQ_Step09_2,iBAQ_Step09_3,iBAQ_Step10_1,iBAQ_Step10_2,iBAQ_Step10_3,iBAQ_W3110_1,iBAQ_W3110_2,iBAQ_W3110_3
protein_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
W3110_lambdaRed.CDS.1,48022000.0,24511000.0,22610000.0,37573000.0,30924000.0,23502000.0,48790000.0,29735000.0,16835000.0,43719000.0,44150000.0,18430000.0,28640000.0,18533000.0,9904200.0
W3110_lambdaRed.CDS.100,1654700.0,565980.0,531380.0,500520.0,465140.0,336120.0,971770.0,659310.0,89329.0,339930.0,344140.0,116910.0,915480.0,219240.0,216360.0
W3110_lambdaRed.CDS.1007,1462800.0,1126600.0,1133400.0,741430.0,1002000.0,359410.0,533150.0,244960.0,283440.0,332380.0,595220.0,279790.0,987190.0,1744100.0,669350.0
W3110_lambdaRed.CDS.101,19219000.0,9363700.0,11542000.0,11923000.0,10054000.0,7747000.0,12960000.0,11146000.0,7598200.0,11815000.0,11033000.0,5908200.0,16522000.0,11304000.0,7059200.0
W3110_lambdaRed.CDS.1011,17565000.0,8614300.0,8547600.0,5892500.0,3463100.0,2441300.0,8540000.0,6713100.0,2363500.0,8529400.0,8116300.0,2375300.0,11760000.0,7108000.0,3729900.0
W3110_lambdaRed.CDS.1012,1729800.0,793410.0,788310.0,845620.0,1042200.0,1369100.0,1070600.0,850960.0,366750.0,1097600.0,1149600.0,328090.0,872420.0,905830.0,443360.0
W3110_lambdaRed.CDS.1014,24976000.0,35504000.0,17694000.0,12897000.0,12062000.0,8149600.0,4839600.0,1456200.0,2214100.0,6287800.0,4353100.0,2217600.0,23839000.0,16932000.0,4986800.0
W3110_lambdaRed.CDS.1017,236550.0,216360.0,99950.0,245490.0,35887.0,41136.0,208160.0,134670.0,95152.0,237960.0,151470.0,40239.0,216360.0,216360.0,216360.0
W3110_lambdaRed.CDS.1018,7285300.0,3377900.0,3436800.0,2820100.0,1596500.0,1700100.0,2932700.0,2562800.0,1335700.0,3327100.0,3637900.0,1429200.0,5758200.0,3385200.0,2195000.0
W3110_lambdaRed.CDS.1022,4580800.0,3448000.0,3245200.0,2462200.0,2120600.0,1363700.0,2925400.0,2605800.0,1343500.0,3203800.0,3191900.0,1354300.0,3087000.0,2139100.0,1527700.0


In [130]:
# Long format: one row per (protein, measurement column).  The column name is
# split at its last underscore into the strain label and the replicate number.
melted_ibaq = (
    ibaq.reset_index()
        .melt(id_vars=['protein_ID'], value_name='iBAQ')
        .assign(
            Replicate=lambda long: long['variable'].str.split('_').str[-1],
            Strain=lambda long: long['variable'].str.rsplit('_', n=1).str[0],
        )
)
melted_ibaq

Unnamed: 0,protein_ID,variable,iBAQ,Replicate,Strain
0,W3110_lambdaRed.CDS.1,iBAQ_Step04_1,48022000.0,1,iBAQ_Step04
1,W3110_lambdaRed.CDS.100,iBAQ_Step04_1,1654700.0,1,iBAQ_Step04
2,W3110_lambdaRed.CDS.1007,iBAQ_Step04_1,1462800.0,1,iBAQ_Step04
3,W3110_lambdaRed.CDS.101,iBAQ_Step04_1,19219000.0,1,iBAQ_Step04
4,W3110_lambdaRed.CDS.1011,iBAQ_Step04_1,17565000.0,1,iBAQ_Step04
5,W3110_lambdaRed.CDS.1012,iBAQ_Step04_1,1729800.0,1,iBAQ_Step04
6,W3110_lambdaRed.CDS.1014,iBAQ_Step04_1,24976000.0,1,iBAQ_Step04
7,W3110_lambdaRed.CDS.1017,iBAQ_Step04_1,236550.0,1,iBAQ_Step04
8,W3110_lambdaRed.CDS.1018,iBAQ_Step04_1,7285300.0,1,iBAQ_Step04
9,W3110_lambdaRed.CDS.1022,iBAQ_Step04_1,4580800.0,1,iBAQ_Step04


In [140]:
# Average the replicates: one row per protein, one column per strain.
mean_ibaq = (
    melted_ibaq
    .groupby(['protein_ID', 'Strain'])['iBAQ']
    .mean()
    .unstack()
)

# Attach the Blattner locus of each W3110 protein and re-key the table by it.
# NOTE(review): the bare dropna() removes every row with a NaN in *any* column,
# not only rows that lack a Blattner mapping -- confirm this is intended.
blattner_mean_ibaq = (
    mean_ibaq
    .join(K12toW3110.set_index('E_coli_W3110')['Blattner'])
    .reset_index()
    .dropna()
    .set_index('Blattner')
)

# Add the knockout annotations (gene, Step, E_coli_W3110) by locus.
blattner_mean_ibaq_steps = blattner_mean_ibaq.join(ecoli_ko)

# TODO: a 'Predicted_iBAQ_Step04' column was started in this cell but never finished.
blattner_mean_ibaq_steps.loc[blattner_mean_ibaq_steps['Step'].le(10)]

Unnamed: 0,index,iBAQ_Step04,iBAQ_Step05,iBAQ_Step09,iBAQ_Step10,iBAQ_W3110,gene,Step,E_coli_W3110
b0223,W3110_lambdaRed.CDS.219,6.906567e+04,2.163600e+05,2.163600e+05,2.163600e+05,1.185387e+05,yafJ,9.0,W3110_lambdaRed.CDS.219
b0224,W3110_lambdaRed.CDS.220,1.363707e+05,1.600310e+05,2.163600e+05,2.163600e+05,5.166000e+04,yafK,9.0,W3110_lambdaRed.CDS.220
b0226,W3110_lambdaRed.CDS.222,1.967633e+06,1.135767e+06,2.163600e+05,2.163600e+05,1.196633e+06,dinJ,9.0,W3110_lambdaRed.CDS.222
b0261,W3110_lambdaRed.CDS.258,3.055200e+05,1.590133e+05,2.163600e+05,2.163600e+05,1.660300e+05,mmuM,9.0,W3110_lambdaRed.CDS.258
b0273,W3110_lambdaRed.CDS.270,1.828567e+06,1.965100e+06,2.120167e+05,1.952467e+05,5.549867e+05,argF,9.0,W3110_lambdaRed.CDS.270
b0281,W3110_lambdaRed.CDS.278,2.533167e+04,8.993267e+04,2.163600e+05,2.163600e+05,3.821600e+04,intF,9.0,W3110_lambdaRed.CDS.278
b0287,W3110_lambdaRed.CDS.284,5.417000e+06,2.344800e+06,2.163600e+05,2.163600e+05,3.306733e+06,yagU,9.0,W3110_lambdaRed.CDS.284
b0296,W3110_lambdaRed.CDS.293,4.819930e+07,1.473853e+06,1.781800e+05,2.163600e+05,5.949550e+07,ykgM,9.0,W3110_lambdaRed.CDS.293
b0304,W3110_lambdaRed.CDS.300,1.269653e+06,5.710100e+05,3.842533e+05,4.122867e+05,5.715300e+05,ykgC,9.0,W3110_lambdaRed.CDS.300
b0312,W3110_lambdaRed.CDS.308,8.257200e+05,4.228567e+05,2.163600e+05,2.163600e+05,5.602300e+05,betB,9.0,W3110_lambdaRed.CDS.308


## Steps for 

In [126]:
# Divide every strain column by that column's total.
# NOTE(review): the division matches totals to columns by label, so sorting the
# totals does not change which total divides which column -- the sort_values
# call looks like a leftover; confirm before removing it.
mol_pct = mean_ibaq.div(
    mean_ibaq.sum(axis=0).sort_values(ascending=False),
    axis='columns',
)
mol_pct

Unnamed: 0,iBAQ_Step04,iBAQ_Step05,iBAQ_Step09,iBAQ_Step10,iBAQ_W3110,E_coli_K12,Identity_E_coli_K12.to.E_coli_W3110,Identity_E_coli_W3110.to.E_coli_K12,Uniprot,Blattner
W3110_lambdaRed.CDS.1,0.001011,0.001866,0.001746,0.001937,0.000934,sp|P00561|AK1H_ECOLI,99.88,99.88,P00561,b0002
W3110_lambdaRed.CDS.100,0.000029,0.000026,0.000032,0.000015,0.000022,sp|P0A6I9|COAE_ECOLI,100.00,100.00,P0A6I9,b0103
W3110_lambdaRed.CDS.1007,0.000040,0.000043,0.000019,0.000022,0.000056,sp|P0A8D6|YMDB_ECOLI,100.00,100.00,P0A8D6,b1045
W3110_lambdaRed.CDS.101,0.000426,0.000603,0.000581,0.000524,0.000571,sp|P60560|GUAC_ECOLI,100.00,100.00,P60560,b0104
W3110_lambdaRed.CDS.1011,0.000369,0.000239,0.000323,0.000347,0.000370,sp|P33136|OPGG_ECOLI,100.00,100.00,P33136,b1048
W3110_lambdaRed.CDS.1012,0.000035,0.000066,0.000042,0.000047,0.000036,sp|P62517|OPGH_ECOLI,100.00,100.00,P62517,b1049
W3110_lambdaRed.CDS.1014,0.000831,0.000672,0.000156,0.000234,0.000749,sp|P25738|MSYB_ECOLI,100.00,100.00,P25738,b1051
W3110_lambdaRed.CDS.1017,0.000006,0.000007,0.000008,0.000008,0.000011,sp|P24188|YCEA_ECOLI,100.00,100.00,P24188,b1055
W3110_lambdaRed.CDS.1018,0.000150,0.000124,0.000125,0.000153,0.000186,sp|P0A8X2|YCEI_ECOLI,100.00,100.00,P0A8X2,b1056
W3110_lambdaRed.CDS.1022,0.000120,0.000121,0.000126,0.000141,0.000111,sp|P40874|MTOX_ECOLI,100.00,100.00,P40874,b1059


# iBAQ excel

In [31]:
# Read the sheet with its two header rows and keep only the columns filed under
# the top-level 'iBAQ' header.
# NOTE(review): this rebinds `ibaq`, replacing the frame loaded earlier from
# 'E_coli_data_frame.txt'.
ibaq = (
    pd.read_excel(
        'CCO_iBAQ_MolPercentage.xlsx',
        sheet_name='iBAQ_MolPerc',
        header=[0, 1],
    )
    .xs(key='iBAQ', axis='columns')
)
ibaq.columns

Index(['iBAQ DGF-298_22', 'iBAQ DGF-298_23', 'iBAQ DGF-298_24',
       'iBAQ MGF-01_10', 'iBAQ MGF-01_11', 'iBAQ MGF-01_12', 'iBAQ MGF-02_16',
       'iBAQ MGF-02_17', 'iBAQ MGF-02_18', 'iBAQ Step04_07', 'iBAQ Step04_08',
       'iBAQ Step04_09', 'iBAQ Step05_13', 'iBAQ Step05_14', 'iBAQ Step05_15',
       'iBAQ Step09_19', 'iBAQ Step09_20', 'iBAQ Step09_21', 'iBAQ Step10_04',
       'iBAQ Step10_05', 'iBAQ Step10_06', 'iBAQ W3110_01', 'iBAQ W3110_02',
       'iBAQ W3110_03'],
      dtype='object', name='Protein IDs')

In [33]:
# Long format of the Excel iBAQ table.  Column labels look like
# 'iBAQ <strain>_<replicate>': the replicate is the text after the last
# underscore, the strain is the last space-separated token before the first one.
melted_ibaq = (
    ibaq.reset_index()
        .melt(id_vars=['index'], value_name='iBAQ')
        .assign(
            Replicate=lambda long: long['Protein IDs'].str.split('_').str[-1],
            Strain=lambda long: (
                long['Protein IDs'].str.split('_').str[0].str.split(' ').str[-1]
            ),
        )
)
melted_ibaq

Unnamed: 0,index,Protein IDs,iBAQ,Replicate,Strain
0,W3110_lambdaRed.CDS.1007,iBAQ DGF-298_22,0.000000e+00,22,DGF-298
1,W3110_lambdaRed.CDS.1014,iBAQ DGF-298_22,0.000000e+00,22,DGF-298
2,W3110_lambdaRed.CDS.1034,iBAQ DGF-298_22,0.000000e+00,22,DGF-298
3,W3110_lambdaRed.CDS.1035,iBAQ DGF-298_22,0.000000e+00,22,DGF-298
4,W3110_lambdaRed.CDS.1036,iBAQ DGF-298_22,0.000000e+00,22,DGF-298
5,W3110_lambdaRed.CDS.1038,iBAQ DGF-298_22,0.000000e+00,22,DGF-298
6,W3110_lambdaRed.CDS.1041,iBAQ DGF-298_22,0.000000e+00,22,DGF-298
7,W3110_lambdaRed.CDS.1042,iBAQ DGF-298_22,0.000000e+00,22,DGF-298
8,W3110_lambdaRed.CDS.1043,iBAQ DGF-298_22,0.000000e+00,22,DGF-298
9,W3110_lambdaRed.CDS.1127,iBAQ DGF-298_22,0.000000e+00,22,DGF-298


In [28]:
# Mean of the numeric measurement columns per protein ('index') and strain; the
# recorded output shows a single 'mol %' column.
# numeric_only=True is spelled out because pandas >= 2.0 no longer drops
# non-numeric columns silently and would raise on them instead.
# NOTE(review): `melted_molpct` is not created in the cells visible here; it
# must be built before this cell runs -- TODO confirm where it is defined.
melted_molpct.groupby(by=['index','Strain']).mean(numeric_only=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,mol %
index,Strain,Unnamed: 2_level_1
W3110_lambdaRed.CDS.1,DGF-298,0.179089
W3110_lambdaRed.CDS.1,MGF-01,0.093845
W3110_lambdaRed.CDS.1,MGF-02,0.109474
W3110_lambdaRed.CDS.1,Step04,0.101340
W3110_lambdaRed.CDS.1,Step05,0.186276
W3110_lambdaRed.CDS.1,Step09,0.171936
W3110_lambdaRed.CDS.1,Step10,0.194278
W3110_lambdaRed.CDS.1,W3110,0.091442
W3110_lambdaRed.CDS.100,DGF-298,0.005234
W3110_lambdaRed.CDS.100,MGF-01,0.003249
