In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import re

from matplotlib import pyplot as plt
from matplotlib.lines import Line2D

from collections import Counter

all_markers = list(Line2D.markers.keys())

# Analyzing NOSC of the E. coli proteome
This notebook calculates the condition dependent proteome NOSC for the datasets aggregated by Chure & Bellivue et al. Cell Systems 2021. To calculate the proteome NOSC we need near-complete proteome datasets such that the uncertainty related to unmeasured proteins is small. Recent datasets of this quality were collected by Griffin Chure and Nathan Bellivue. 

Since proteome NOSC is a weighted average of contributions from individual proteins, we do not need absolute measurements with real units. Rather, we need accurate and complete compositional data, i.e. protein A makes P% of the proteome. 

# Differences between lab practices
Valgepea 2013 uses data collected in Valgepea 2010, where E. coli was cultured in a custom minimal medium documented in Nahku et al. 2010. Peebo et al. 2015 is from the same group (Vilu) and appears to use the same growth medium base with some amino acid supplementation. 

Minimal media used by Schmidt et al. 2015 uses an M9 base that is distinct from the medium in the Vilu group. 

# Known issues
I am using MG1655 coding sequences for all samples, but Schmidt et al. and Peebo et al. are working with BW25113. The BW25113 is derived from MG1655 with a small number deletions and other changes documented in the genome announcement. https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4200154/

# Some additional E. coli data sources
* Hui et al. MSB 2015 is from Hwa & Williamson groups
* Maser et al. 2021 from Vilu and Nahku groups
* Mori et al. 2022 

In [2]:
raw_abund_df = pd.read_csv('../data/proteomes/Coli/Chure_compiled_absolute_measurements.csv', index_col=0).reset_index().set_index('b_number')
nosc_df = pd.read_csv('../data/genomes/Coli/MG1655/MG1655_ref_prot_NOSC.csv').set_index('b_number')

In [3]:
nosc_df

Unnamed: 0_level_0,aa_seq,num_aas,mw_daltons,transmembrane_aas,fraction_transmembrane,primary_accession,accessions,gene_name,description,locus_tags,...,COG_IDs,KEGG_IDs,isoform_accessions,KEGG_path_IDs,KEGG_pathways,Ce_total,NC,NOSC,eC_ratio,organism
b_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
b4586,MRLHVKLKEFLSMFFMAILFFPAFNASLFFTGVKPLYSIIKCSTEI...,159,18118,60,0.377358,A5A605,"A5A605,P71282",ykfM,Uncharacterized protein YkfM,"ykfM,b4586",...,ENOG5031M8I,eco:b4586,,,,-301.25,862.0,-0.349478,4.349478,coli
b1252,MTLDLPRRFPWPTLLSVCIHGAVVAGLLYTSVHQVIELPAPAQPIS...,239,26094,31,0.129707,P02929,"P02929,P76831,P94719,P94722,P94726,P94728,P947...",tonB,Protein TonB,"tonB,exbA,b1252,JW5195",...,COG0810,"ecj:JW5195,eco:b1252",,,,-244.16,1173.0,-0.208150,4.208150,coli
b3635,MPELPEVETSRRGIEPHLVGATILHAVVRNGRLRWPVSEEIYRLSD...,269,30290,0,0.000000,P05523,"P05523,Q2M7U9",mutM,Formamidopyrimidine-DNA glycosylase,"mutM,fpg,b3635,JW3610",...,COG0266,"ecj:JW3610,eco:b3635",,path:eco03410,Base excision repair - Escherichia coli K-12 M...,-231.79,1358.0,-0.170685,4.170685,coli
b1709,MSIVMQLQDVAESTRLGPLSGEVRAGEILHLVGPNGAGKSTLLARM...,249,27081,0,0.000000,P06611,P06611,btuD,Vitamin B12 import ATP-binding protein BtuD,"btuD,b1709,JW1699",...,COG4138,"ecj:JW1699,eco:b1709",,path:eco02010,ABC transporters - Escherichia coli K-12 MG1655,-157.82,1189.0,-0.132733,4.132733,coli
b2306,MSENKLNVIDLHKRYGEHEVLKGVSLQANAGDVISIIGSSGSGKST...,257,28653,0,0.000000,P07109,"P07109,P77299",hisP,Histidine transport ATP-binding protein HisP,"hisP,b2306,JW2303",...,COG4598,"ecj:JW2303,eco:b2306",,path:eco02010,ABC transporters - Escherichia coli K-12 MG1655,-201.82,1266.0,-0.159415,4.159415,coli
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
b4346-P15005-2,MSKTESYCLEDALNDLFIPETTIETILKRLTIKKNIILQGPPGVGK...,298,53157,0,0.000000,P15005-2,"P15005,Q2M5X0",mcrB,Type IV methyl-directed restriction enzyme Eco...,"mcrB,rglB,b4346,JW5871",...,COG1401,"ecj:JW5871,eco:b4346","P15005-1,P15005-2",,,-239.32,1563.0,-0.153116,4.153116,coli
b1120-P75960-2,MEKPRVLVLTGAGISAESGIRTFRAADGLWEEHRVEDVATPEGFDR...,242,31464,0,0.000000,P75960-2,P75960,cobB,NAD-dependent protein deacylase,"cobB,ycfY,b1120,JW1106",...,COG0846,"ecj:JW1106,eco:b1120","P75960-1,P75960-2","path:eco00760,path:eco01100",Nicotinate and nicotinamide metabolism - Esche...,-133.51,1189.0,-0.112288,4.112288,coli
b0149-P02919-2,MPRKGKGKGKGRKPRGKRGWLWLLLKLAIVFAVLIAIYGVYLDQKI...,799,94293,23,0.027251,P02919-2,"P02919,P75664",mrcB,Penicillin-binding protein 1B,"mrcB,pbpF,ponB,b0149,JW0145",...,COG0744,"ecj:JW0145,eco:b0149","P02919-1,P02919-2","path:eco00550,path:eco01100",Peptidoglycan biosynthesis - Escherichia coli ...,-559.41,3952.0,-0.141551,4.141551,coli
b2234-P00452-2,MDWAAEGLHNVSISQVELRSHIQFYDGIKTSDIHETIIKAAADLIS...,736,85775,0,0.000000,P00452-2,"P00452,P78088,P78177",nrdA,Ribonucleoside-diphosphate reductase 1 subunit...,"nrdA,dnaF,b2234,JW2228",...,COG0209,"ecj:JW2228,eco:b2234","P00452-1,P00452-2","path:eco00230,path:eco00240,path:eco01100,path...",Purine metabolism - Escherichia coli K-12 MG16...,-478.53,3700.0,-0.129332,4.129332,coli


In [4]:
# Take the mean of replicates for the same gene. 
# Note 1: have to do this because didn't report which measurement is from which replicate.
# Note 2: the Schmidt data is already aggregated across two measurement methods.
# NB: Assuming same dataset, strain, cond, gene and growth rate implies replicate. 
counts = raw_abund_df.reset_index().groupby('dataset,strain,condition,b_number,growth_rate_hr'.split(',')).mean()
# renaming tot_per_cell for clarity and to match how we treat other datasets
abund_df = counts.reset_index().set_index('b_number').rename(
    columns=dict(tot_per_cell='copies_per_cell'))

# Keep only the b-numbers that we have abundance and NOSC data for.
overlapping_idx = set(nosc_df.index.values).intersection(abund_df.index.values)
abund_df = abund_df.loc[overlapping_idx].copy()
nosc_df = nosc_df.loc[overlapping_idx].copy()

# Add NOSC and NC data to the abundance dataframe
abund_df['NOSC'] = nosc_df.loc[abund_df.index.values].NOSC
abund_df['NC_per'] = nosc_df.loc[abund_df.index.values].NC
abund_df['num_aas'] = nosc_df.loc[abund_df.index.values].num_aas
abund_df['mw_daltons'] = nosc_df.loc[abund_df.index.values].mw_daltons
abund_df['organism_key'] = 'coli'
abund_df['species'] = 'E. coli'
abund_df['majority_protein_ids'] = nosc_df.loc[abund_df.index.values].primary_accession
abund_df['fraction_transmembrane'] = nosc_df.loc[abund_df.index.values].fraction_transmembrane

# Formal Carbon-bound e- per protein copy
abund_df['Ce_per'] = nosc_df.loc[abund_df.index].Ce_total
# Total Carbon-bound e-/cell on this protein, i.e. weighted by copies/cell 
abund_df['Ce_total'] = abund_df.Ce_per * abund_df.copies_per_cell
# Total Carbon atoms on this protein, i.e. weighted by copies/cell 
abund_df['NC_total'] = abund_df.NC_per * abund_df.copies_per_cell
abund_df
abund_df.to_csv('../data/proteomes/Coli/Chure_mean_absolute_measurements.csv')
abund_df.head()

Unnamed: 0_level_0,dataset,strain,condition,growth_rate_hr,copies_per_cell,fg_per_cell,NOSC,NC_per,num_aas,mw_daltons,organism_key,species,majority_protein_ids,fraction_transmembrane,Ce_per,Ce_total,NC_total
b_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
b0167,li_2014,MG1655,MOPS complete,1.934364,206.892387,0.035165,-0.126187,4569.0,890,102390,coli,E. coli,P27249,0.0,-576.55,-119283.805932,945291.317842
b0167,li_2014,MG1655,MOPS complete without methionine,1.56939,122.291713,0.020786,-0.126187,4569.0,890,102390,coli,E. coli,P27249,0.0,-576.55,-70507.286956,558750.835314
b0167,li_2014,MG1655,MOPS minimal,0.7387,190.617683,0.032399,-0.126187,4569.0,890,102390,coli,E. coli,P27249,0.0,-576.55,-109900.625254,870932.194582
b0167,peebo_2015,BW25113,glucose_minimal,0.21,,,-0.126187,4569.0,890,102390,coli,E. coli,P27249,0.0,-576.55,,
b0167,peebo_2015,BW25113,glucose_minimal,0.22,4.488067,0.000763,-0.126187,4569.0,890,102390,coli,E. coli,P27249,0.0,-576.55,-2587.595263,20505.97998


In [5]:
# amino acid counts per protein
aa_counts = [Counter(a) for a in nosc_df.aa_seq]
aa_counts_df = pd.DataFrame(aa_counts, index=nosc_df.index).replace({np.NaN: 0})
aa_counts_df.head()

Unnamed: 0_level_0,M,N,T,L,P,E,Q,Y,A,G,...,V,W,R,D,I,K,H,F,S,U
b_number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
b0167,19,28.0,48.0,122.0,40.0,62.0,54.0,16.0,65.0,46.0,...,50.0,16.0,77.0,52.0,46.0,22.0,38.0,38.0,42.0,0.0
b0284,19,26.0,52.0,60.0,38.0,41.0,27.0,17.0,95.0,71.0,...,51.0,8.0,39.0,43.0,34.0,32.0,19.0,18.0,36.0,0.0
b3725,8,12.0,17.0,27.0,10.0,16.0,14.0,9.0,19.0,16.0,...,11.0,2.0,13.0,15.0,18.0,16.0,4.0,12.0,14.0,0.0
b1760,1,4.0,3.0,8.0,8.0,4.0,7.0,3.0,8.0,4.0,...,10.0,1.0,5.0,4.0,3.0,3.0,1.0,3.0,6.0,0.0
b2597,4,8.0,8.0,9.0,4.0,12.0,6.0,1.0,11.0,4.0,...,7.0,1.0,5.0,4.0,9.0,9.0,5.0,2.0,4.0,0.0


In [6]:
# get the fast growth sample from schmidt
mask = np.logical_and(abund_df.dataset == 'schmidt_2016',
                      abund_df.condition == 'chemostat_u0.5')
mean_abund_fast_growth = abund_df[mask].reset_index().set_index('b_number').replace({np.NaN: 0})

overlapping_ids = set(mean_abund_fast_growth.index).intersection(aa_counts_df.index)
tmp = aa_counts_df.loc[overlapping_ids].multiply(mean_abund_fast_growth.loc[overlapping_ids].copies_per_cell, axis=0).sum()
expression_weighted_aas = tmp / tmp.sum()
expression_weighted_aas.name = 'aa_freq'

expression_weighted_aas.to_csv('../data/proteomes/Coli/Schmidt_mu0.5_expression_weighted_aa_freqs.csv')

In [7]:
proteome_nosc_df = abund_df.groupby('dataset,strain,condition,growth_rate_hr'.split(',')).sum()
proteome_nosc_df = proteome_nosc_df[['Ce_total', 'NC_total']].copy() 
proteome_nosc_df['proteome_NOSC'] = proteome_nosc_df.Ce_total / proteome_nosc_df.NC_total
proteome_nosc_df = proteome_nosc_df.reset_index()
proteome_nosc_df.to_csv('../data/proteomes/Coli/Chure_proteome_NOSC.csv', index=False)
proteome_nosc_df

Unnamed: 0,dataset,strain,condition,growth_rate_hr,Ce_total,NC_total,proteome_NOSC
0,li_2014,MG1655,MOPS complete,1.934364,-2873416000.0,19793880000.0,-0.145167
1,li_2014,MG1655,MOPS complete without methionine,1.56939,-2085635000.0,14560260000.0,-0.143242
2,li_2014,MG1655,MOPS minimal,0.7387,-1020722000.0,7199098000.0,-0.141785
3,peebo_2015,BW25113,glucose_minimal,0.21,-632492900.0,4438966000.0,-0.142487
4,peebo_2015,BW25113,glucose_minimal,0.22,-637481400.0,4479156000.0,-0.142322
5,peebo_2015,BW25113,glucose_minimal,0.26,-663498300.0,4644507000.0,-0.142857
6,peebo_2015,BW25113,glucose_minimal,0.31,-698134700.0,4862400000.0,-0.143578
7,peebo_2015,BW25113,glucose_minimal,0.36,-734931200.0,5084070000.0,-0.144556
8,peebo_2015,BW25113,glucose_minimal,0.41,-767130300.0,5321234000.0,-0.144164
9,peebo_2015,BW25113,glucose_minimal,0.46,-805216500.0,5566319000.0,-0.144659
