# Standardizing Mass Spectrometry Data to Calculate Periplasmic Protein Density

In [2]:
import numpy as np
import pandas as pd 
import size.empirical
import size.analytical 
import size.viz
import altair as alt
import scipy.stats
# Call the project's Altair styling helper and unpack the two values it returns.
# NOTE(review): presumably this also registers the plotting theme as a side
# effect -- confirm against size.viz.altair_style.
colors, palette = size.viz.altair_style()

In [3]:
# Read the calibration measurements (cell dimensions, protein per OD, cells per OD).
cal_data = pd.read_csv('../../data/source/Basan2015/Basan2015_calibration_curve.csv')

# Two volume estimates per row: one from the reported length/width, and one
# from the empirical growth-rate relation.
cal_data['reported_volume'] = size.analytical.volume(
    cal_data['cell_length'].to_numpy(),
    cal_data['cell_width'].to_numpy(),
)
cal_data['empirical_volume'] = size.empirical.lambda2size(cal_data['growth_rate_hr'])

# Protein per cell: (ug protein / OD) / (cells / OD), scaled by 1E9 to go from ug to fg.
cal_data['fg_protein_per_cell_reported'] = (
    1E9 * cal_data['ug_protein_per_OD'] / cal_data['cells_per_OD']
)

# Ordinary least-squares line of protein per cell against the reported volume.
popt = scipy.stats.linregress(
    x=cal_data['reported_volume'],
    y=cal_data['fg_protein_per_cell_reported'],
)
popt

LinregressResult(slope=148.12387407516337, intercept=-30.064200131549057, rvalue=0.9934372704584403, pvalue=6.446280207916842e-05, stderr=8.527037199039187, intercept_stderr=28.343268287566772)

In [4]:
# Evaluate the fitted line on an evenly spaced grid of volumes.
vol_range = np.linspace(0, 10, 100)
fit = popt.slope * vol_range + popt.intercept
fit_df = pd.DataFrame({'volume': vol_range, 'protein_per_cell': fit})

data_base = alt.Chart(cal_data)
fit_base = alt.Chart(fit_df)

# Measured calibration points.
points = data_base.mark_point(size=100, opacity=0.75).encode(
    x=alt.X('reported_volume:Q', title='volume per cell [fL]'),
    y=alt.Y('fg_protein_per_cell_reported:Q', title='protein per cell [fg]'),
)

# Linear fit drawn over the same axes.
lines = fit_base.mark_line().encode(
    x=alt.X('volume:Q', title='volume per cell [fL]'),
    y=alt.Y('protein_per_cell:Q', title='protein per cell [fg]'),
)

alt.layer(points, lines)

In [5]:
# Load the proteomics data and keep only rows that carry GO annotations.
# The explicit .copy() makes `data` an independent frame, so the column
# assignments below do not write into a slice of the freshly read table
# (which would raise SettingWithCopyWarning).
data = pd.read_csv('../../data/compiled_mass_fractions.csv')
data = data[~data['go_terms'].isnull()].copy()

# Thickness of the periplasm
# NOTE(review): presumably in the same length units as cell_length/cell_width -- confirm.
delta = 0.025

# Compute the cell dimensions and volumes from the growth rate
data['cell_width'] = size.empirical.lambda2width(data['growth_rate_hr'])
data['cell_length'] = size.empirical.lambda2length(data['growth_rate_hr'])
data['cell_volume'] = size.analytical.volume(data['cell_length'], data['cell_width'])
data['envelope_volume'] = size.analytical.envelope_volume(data['cell_length'],
                                                          data['cell_width'],
                                                          delta)

# Given the calibration slope and intercept, compute the total protein per cell
# and apportion it to each protein by its mass fraction.
tot_mass = popt[0] * data['cell_volume'] + popt[1]
data['fg_per_cell'] = data['mass_frac'] * tot_mass


# Using the GO classification, compute the mass, mass fraction, and periplasmic protein density
periplasm = data[data['go_terms'].str.contains('GO:0042597')]

# Sum only the numeric columns within each dataset/condition group. Being
# explicit with numeric_only=True keeps the result independent of the pandas
# version: older releases silently dropped non-numeric columns, newer ones
# would try to add up the string columns (gene names, GO terms, ...).
periplasm_grouped = periplasm.groupby(['dataset_name', 'condition',
                                       'growth_rate_hr', 'cell_volume',
                                       'envelope_volume']).sum(numeric_only=True).reset_index()
periplasm_grouped['density'] = periplasm_grouped['fg_per_cell'].values / periplasm_grouped['envelope_volume']

In [6]:
# Scatter of the summed periplasmic protein mass per cell against growth rate,
# colored by source dataset.
base = alt.Chart(periplasm_grouped)

# The y channel encodes `fg_per_cell` (a mass per cell, not a fraction), so the
# axis title describes that quantity.
# NOTE(review): the variable name suggests mass fractions were intended; if so,
# encode 'mass_frac:Q' instead and restore the fraction title.
mass_fracs = base.mark_point().encode(
                x=alt.X('growth_rate_hr:Q', title='growth rate [per hr]'),
                y=alt.Y('fg_per_cell:Q', title='periplasmic protein per cell [fg]'),
                color=alt.Color('dataset_name:N',)
)

mass_fracs

In [7]:

# Re-read the unfiltered table and show only the rows from the Soufi et al. dataset.
data = pd.read_csv('../../data/compiled_mass_fractions.csv')
data.loc[data['dataset_name'].eq('Soufi et al. 2015')]


Unnamed: 0,gene_name,b_number,mass_frac,condition,strain,growth_rate_hr,dataset_name,go_terms,cog_class,cog_letter
47054,isph,b0029,0.000050,M9_glucose,BW25113,0.693147,Soufi et al. 2015,,metabolism,I
47055,upps,b0174,0.000008,M9_glucose,BW25113,0.693147,Soufi et al. 2015,,metabolism,I
47056,phoe,b0241,0.000000,M9_glucose,BW25113,0.693147,Soufi et al. 2015,,cellular processes and signaling,M
47057,malz,b0403,0.000004,M9_glucose,BW25113,0.693147,Soufi et al. 2015,,metabolism,G
47058,fes,b0585,0.000502,M9_glucose,BW25113,0.693147,Soufi et al. 2015,,metabolism,P
...,...,...,...,...,...,...,...,...,...,...
49313,yhev,b4551,0.000000,M9_glucose,BW25113,0.693147,Soufi et al. 2015,,poorly characterized,R
49314,yibt,b4554,0.000064,M9_glucose,BW25113,0.693147,Soufi et al. 2015,,Not Assigned,Not Assigned
49315,yifl,b4558,0.000003,M9_glucose,BW25113,0.693147,Soufi et al. 2015,,poorly characterized,S
49316,copa,b0484,0.000095,M9_glucose,BW25113,0.693147,Soufi et al. 2015,,metabolism,P


In [32]:
# Reload the compiled mass-fraction table, replacing any earlier `data` frame.
data = pd.read_csv(filepath_or_buffer='../../data/compiled_mass_fractions.csv')

In [33]:
# Show only the rows from the Caglar et al. dataset.
data.loc[data['dataset_name'].eq('Caglar et al. 2017')]

Unnamed: 0,gene_name,b_number,mass_frac,condition,strain,growth_rate_hr,dataset_name,go_terms,cog_class,cog_letter
49318,yehB,,0.000009,gluconate_growth,REL606,0.663012,Caglar et al. 2017,GO:0055085; GO:0009297; GO:0006974; GO:0043711...,cellular processes and signaling,W
49319,carA,,0.000521,gluconate_growth,REL606,0.663012,Caglar et al. 2017,GO:0006807; GO:0008652; GO:0044205; GO:0005951...,metabolism,E
49320,carA,,0.000634,gluconate_growth,REL606,0.663012,Caglar et al. 2017,GO:0006807; GO:0008652; GO:0044205; GO:0005951...,metabolism,E
49321,carA,,0.000404,gluconate_growth,REL606,0.663012,Caglar et al. 2017,GO:0006807; GO:0008652; GO:0044205; GO:0005951...,metabolism,E
49322,carA,,0.000404,gluconate_growth,REL606,0.663012,Caglar et al. 2017,GO:0006807; GO:0008652; GO:0044205; GO:0005951...,metabolism,E
...,...,...,...,...,...,...,...,...,...,...
60011,yjjX,,0.000003,glucose_time_course,REL606,0.774755,Caglar et al. 2017,GO:0005829; GO:0016787; GO:0046872; GO:0046677...,metabolism,F
60012,creA,,0.000025,glucose_time_course,REL606,0.774755,Caglar et al. 2017,GO:0005829,cellular processes and signaling,T
60013,creB,,0.000010,glucose_time_course,REL606,0.774755,Caglar et al. 2017,GO:0032993; GO:0003700; GO:0001216; GO:0000976...,information storage and processing,K
60014,creC,,0.000002,glucose_time_course,REL606,0.774755,Caglar et al. 2017,GO:0047484; GO:0005887; GO:0046777; GO:0019660...,cellular processes and signaling,T
