In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import re

from matplotlib import pyplot as plt

# Analyzing NOSC of the Cyanobacterial (PCC6803) proteome
Data from Zavrel et al., eLife 2019.

# Known issues
We use a reference genome for 6803 GT-S, but the proteomics data are for GT-L. Mapping is by gene name, which is imperfect. 

In [2]:
# Per-condition metadata, keyed by cond_id (kept as a string so it can be
# matched against the string column labels of the abundance table).
conds_df = pd.read_csv(
    '../data/proteomes/Synechocystis/Zavrel_PCC6803_conditions.csv',
    dtype={'cond_id': 'str'},
).set_index('cond_id')

# Per-protein NOSC table for the reference proteome.
nosc_df = pd.read_csv('../data/genomes/Synechocystis/PCC6803/PCC6803_ref_prot_NOSC.csv')

# Index the abundance table by majority_protein_ids, using ':' instead of ';'
# as the separator so the format matches the yeast data.
raw_abund_df = (
    pd.read_csv('../data/proteomes/Synechocystis/Zavrel_PCC6803_proteome.csv')
    .assign(majority_protein_ids=lambda df: [
        ids.replace(';', ':') for ids in df['Majority protein IDs']])
    .set_index('majority_protein_ids')
)
raw_abund_df.head(2)

Unnamed: 0_level_0,Protein names,Gene names,Majority protein IDs,Entry name,C: Gene ontology (biological process),C: Gene ontology (cellular component),C: Gene ontology (molecular function),C: Gene ontology (GO),Length,Peptides,...,Q-value,Score,Intensity,MS/MS count,27.5,55,110,220,440,1100
majority_protein_ids,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P05429,Photosystem II CP47 reaction center protein (P...,psbB slr0906,P05429,PSBB_SYNY3,photosynthetic electron transport in photosyst...,integral component of membrane [GO:0016021];pl...,chlorophyll binding [GO:0016168];electron tran...,chlorophyll binding [GO:0016168];electron tran...,507,29,...,0.0,323.31,105000000000.0,1058,69.73314,62.31298,52.86816,64.775,54.62756,37.33008
P09190,Cytochrome b559 subunit alpha (PSII reaction c...,psbE ssr3451,P09190,PSBE_SYNY3,photosynthetic electron transport chain [GO:00...,integral component of membrane [GO:0016021];ph...,heme binding [GO:0020037];metal ion binding [G...,heme binding [GO:0020037];integral component o...,81,5,...,0.0,180.14,14311000000.0,107,30.13294,24.67194,20.30978,22.72638,19.82798,15.74768


In [3]:
# Compare the protein IDs in the abundance table against those in the NOSC table.
abund_ids = set(raw_abund_df.index.values.tolist())
cds_ids = set(nosc_df.primary_accession.values.tolist())

# A handful of entries have > 1 majority hit (several accessions joined by ':').
# That is: the relevant peptides report on multiple proteins, often
# because they are alternate translations of the same gene.
missing_ids = abund_ids.difference(cds_ids)
print(missing_ids)
print(len(missing_ids), 'not found')

# The missing IDs are mostly due to isoforms of proteins that differ slightly in sequence.
# This code identifies the individual IDs and makes a fictional row that represents the average
# of each of the isoforms.
print('Adding fictional IDs for those representing a mixture of isoforms')
lookup_table = nosc_df.set_index('primary_accession')
fakes = []
# Iterate in sorted order so the appended rows (and the log lines) come out
# in the same order on every run; set iteration order is not stable.
for my_id in sorted(missing_ids):
    NCs = []
    Ces = []
    for x in my_id.split(':'):
        if x in lookup_table.index:
            row = lookup_table.loc[x]
            NCs.append(row.NC)
            Ces.append(row.Ce)

    # None of the constituent accessions is in the NOSC table: leave it missing.
    if len(NCs) == 0:
        continue
    print('Adding fictional protein for {0} representing {1} isoforms'.format(
        my_id, len(NCs)))

    # The fictional protein gets the mean NC and mean Ce of the isoforms found;
    # its NOSC is the ratio of those two means.
    NC = np.mean(NCs)
    Ce = np.mean(Ces)
    fake_protein = dict(primary_accession=my_id, NC=NC, Ce=Ce, NOSC=(Ce/NC))
    fakes.append(fake_protein)

# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# Columns absent from the fictional rows are filled with NaN.
if fakes:
    extended_nosc_df = pd.concat([nosc_df, pd.DataFrame(fakes)], ignore_index=True)
else:
    extended_nosc_df = nosc_df.reset_index(drop=True)

# recheck which IDs are missing
cds_ids = set(extended_nosc_df.primary_accession.values.tolist())
missing_ids = abund_ids.difference(cds_ids)
print('After update, missing {0} IDs'.format(len(missing_ids)))
print(missing_ids)
shared_ids = abund_ids.intersection(cds_ids)

{'P74732', 'Q6ZEL1:Q6ZEQ3:P73620:P72822:Q6ZEP8:Q6ZEL2:Q6ZEK6', 'P73734', 'Q6ZEM5:Q6YRW8', 'P74195:P74627:P74059', 'Q6YRS8:P73838', 'Q6YRW7:Q6ZEM6', 'P74303', 'P73009:P74083', 'P73037'}
10 not found
Adding fictional IDs for those representing a mixture of isoforms
Adding fictional protein for Q6ZEL1:Q6ZEQ3:P73620:P72822:Q6ZEP8:Q6ZEL2:Q6ZEK6 representing 7 isoforms
Adding fictional protein for Q6ZEM5:Q6YRW8 representing 2 isoforms
Adding fictional protein for P74195:P74627:P74059 representing 3 isoforms
Adding fictional protein for Q6YRS8:P73838 representing 2 isoforms
Adding fictional protein for Q6YRW7:Q6ZEM6 representing 2 isoforms
Adding fictional protein for P73009:P74083 representing 2 isoforms
After update, missing 4 IDs
{'P74732', 'P73734', 'P73037', 'P74303'}


In [4]:
# Checking the percentage of unmapped genes.
data_cols = ['27.5', '55', '110', '220', '440', '1100']
# .loc needs a list-like indexer: passing a set raises TypeError in pandas >= 2.0.
mapped_ids = sorted(shared_ids)
mapped_sum = raw_abund_df.loc[mapped_ids, data_cols].sum()
total = raw_abund_df[data_cols].sum()
# Per-condition share of the summed abundance that belongs to IDs without NOSC data.
pct_diff = 100*(total - mapped_sum)/total

# Missing only ≈0.05% of the proteome due to mapping failure.
# Should be OK to proceed.
pct_diff

27.5    0.048848
55      0.049458
110     0.052726
220     0.051666
440     0.055324
1100    0.060204
dtype: float64

In [5]:
# Reshape the abundance table to long format: one row per (protein, light condition).
data_cols = ['27.5', '55', '110', '220', '440', '1100']
long_abund_df = raw_abund_df.reset_index().melt(
    id_vars=['majority_protein_ids', 'Gene names', 'Length', 'Mol. weight [kDa]'],
    value_vars=data_cols,
    var_name='red_light_intensity_uE_m_s',
    value_name='copies_per_cell').rename(
    columns={'Gene names': 'gene_name',
             'Mol. weight [kDa]': 'mw_daltons',
             'Length': 'num_aas'})
# NOTE(review): the later proteome-NOSC cell treats these same columns as fg/ul
# and converts to copies using the molecular weight, whereas here the values are
# labelled 'copies_per_cell' and used unconverted -- confirm the units.

# use the extended_nosc_df to calculate the condition-dependent proteome NOSC
# first need to add a few empty rows for which we don't have NOSC data for some reason.
# TODO: figure out why not? we already know they account for very little expression (above).
missing_ids = set(long_abund_df.majority_protein_ids).difference(
    extended_nosc_df.primary_accession)
empty_rows = pd.DataFrame(dict(primary_accession=sorted(missing_ids)))
my_nosc_df = pd.concat([extended_nosc_df, empty_rows]).set_index('primary_accession')

# Look up the NOSC row for every measurement once, in long_abund_df row order.
per_row_nosc = my_nosc_df.loc[long_abund_df.majority_protein_ids]
# Duplicate accessions in the NOSC table would return extra rows and
# misalign the positional assignments below.
assert len(per_row_nosc) == len(long_abund_df), (
    'NOSC lookup returned {0} rows for {1} measurements; '
    'check for duplicate primary_accession values'.format(
        len(per_row_nosc), len(long_abund_df)))

NCs = per_row_nosc.NC.values
Ces = per_row_nosc.Ce.values
long_abund_df['NC_per'] = NCs
long_abund_df['Ce_per'] = Ces
long_abund_df['NC_total'] = long_abund_df.copies_per_cell.multiply(NCs)
long_abund_df['Ce_total'] = long_abund_df.copies_per_cell.multiply(Ces)
long_abund_df['NOSC'] = per_row_nosc.NOSC.values
long_abund_df['fraction_transmembrane'] = per_row_nosc.fraction_transmembrane.values
long_abund_df['fraction_transmembrane_C'] = per_row_nosc.fraction_transmembrane_C.values
long_abund_df['condition'] = (
    'photobio_' + long_abund_df.red_light_intensity_uE_m_s.astype(str).values + '_uE')

# Add annotation of the growth mode -- here photobioreactors in turbidostat mode
long_abund_df['growth_mode'] = 'photobioreactor'
# Add annotation of stress conds -- these are all growth in variable light conds
long_abund_df['stress'] = False

# convert to daltons (the source column is in kDa)
long_abund_df['mw_daltons'] = long_abund_df['mw_daltons'] * 1000
# commas for multiple gene names; literal (non-regex) replacement of spaces
long_abund_df['gene_name'] = long_abund_df['gene_name'].str.replace(' ', ',', regex=False)
long_abund_df.head()

Unnamed: 0,majority_protein_ids,gene_name,num_aas,mw_daltons,red_light_intensity_uE_m_s,copies_per_cell,NC_per,Ce_per,NC_total,Ce_total,NOSC,fraction_transmembrane,fraction_transmembrane_C,condition,growth_mode,stress
0,P05429,"psbB,slr0906",507,55902.0,27.5,69.73314,2589.0,-403.72,180539.09946,-28152.663281,-0.155937,0.177515,0.183855,photobio_27.5_uE,photobioreactor,False
1,P09190,"psbE,ssr3451",81,9448.5,27.5,30.13294,432.0,-63.67,13017.43008,-1918.56429,-0.147384,0.17284,0.180556,photobio_27.5_uE,photobioreactor,False
2,P09192,"psbD,sll0849",352,39492.0,27.5,51.21058,1856.0,-358.02,95046.83648,-18334.411852,-0.192899,0.241477,0.235991,photobio_27.5_uE,photobioreactor,False
3,P09193,"psbC,sll0851",460,50302.0,27.5,60.46868,2337.0,-450.17,141315.30516,-27221.185676,-0.192627,0.195652,0.204536,photobio_27.5_uE,photobioreactor,False
4,P0DJF8,"mtnP,sll0135",326,35318.0,27.5,1.255867,1587.0,-289.36,1993.061564,-363.397791,-0.182331,0.0,0.0,photobio_27.5_uE,photobioreactor,False


In [6]:
# Data are in fg/ul, convert to proteins/ul
# TODO: save this unit conversion
data_cols = ['27.5', '55', '110', '220', '440', '1100']
# .loc needs a list-like indexer: passing a set raises TypeError in pandas >= 2.0.
# One fixed ordering is reused below so the positional .values arrays line up
# with the rows of copies_per_ul.
mapped_ids = sorted(shared_ids)
# fg/mol = kg/mol * 1e18 fg/kg
mws_fg_mol = raw_abund_df.loc[mapped_ids, 'Mol. weight [kDa]'] * 1e18
# copies/ul = 6.02e23 copies/mol * fg/ul / (fg/mol)
copies_per_ul = raw_abund_df.loc[mapped_ids, data_cols].multiply(6.02e23/mws_fg_mol, axis=0)

# Weight each protein's Ce and NC by its copy number in each condition.
Ces = my_nosc_df.loc[mapped_ids].Ce.values
NCs = my_nosc_df.loc[mapped_ids].NC.values
Ce_total = copies_per_ul.multiply(Ces, axis=0)
NC_total = copies_per_ul.multiply(NCs, axis=0)

# Proteome NOSC per condition = summed Ce / summed NC over all mapped proteins.
cond_NOSC = Ce_total.sum()/NC_total.sum()
# Assignment aligns on the index: conditions with no proteome column get NaN.
conds_df['proteome_NOSC'] = cond_NOSC
conds_df.to_csv('../data/proteomes/Synechocystis/Zavrel_proteome_NOSC.csv')

conds_df

Unnamed: 0_level_0,red_light_intensity_uE_m_s,growth_rate_hr,spare_media_flow_rate_hr,dry_weight_mg_L,proteome_NOSC
cond_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
27.5,27.5,0.025,0.028,132,-0.132964
55.0,55.0,0.039,0.034,166,-0.134464
110.0,110.0,0.059,0.048,173,-0.135985
220.0,220.0,0.081,0.07,220,-0.136507
440.0,440.0,0.104,0.121,221,-0.136293
660.0,660.0,0.104,0.118,260,
880.0,880.0,0.99,0.089,256,
1100.0,1100.0,0.93,0.086,231,-0.139379


In [7]:
# Annotate every measurement row with dataset-level metadata and the growth
# rate of its light condition, then write the full table to disk.
growth_rates = conds_df.loc[long_abund_df.red_light_intensity_uE_m_s].growth_rate_hr.values

tmp = long_abund_df.assign(
    dataset='zavrel_2019',
    strain='PCC6803',
    species='Synechocystis sp.',
    organism_key='PCC6803',
    growth_rate_hr=growth_rates,
)

tmp.to_csv('../data/proteomes/Synechocystis/Zavrel_protein_measurements.csv')

In [8]:
# Display the annotated measurements table that the previous cell wrote to disk.
tmp

Unnamed: 0,majority_protein_ids,gene_name,num_aas,mw_daltons,red_light_intensity_uE_m_s,copies_per_cell,NC_per,Ce_per,NC_total,Ce_total,...,fraction_transmembrane,fraction_transmembrane_C,condition,growth_mode,stress,dataset,strain,species,organism_key,growth_rate_hr
0,P05429,"psbB,slr0906",507,55902.0,27.5,69.733140,2589.0,-403.72,180539.099460,-28152.663281,...,0.177515,0.183855,photobio_27.5_uE,photobioreactor,False,zavrel_2019,PCC6803,Synechocystis sp.,PCC6803,0.025
1,P09190,"psbE,ssr3451",81,9448.5,27.5,30.132940,432.0,-63.67,13017.430080,-1918.564290,...,0.172840,0.180556,photobio_27.5_uE,photobioreactor,False,zavrel_2019,PCC6803,Synechocystis sp.,PCC6803,0.025
2,P09192,"psbD,sll0849",352,39492.0,27.5,51.210580,1856.0,-358.02,95046.836480,-18334.411852,...,0.241477,0.235991,photobio_27.5_uE,photobioreactor,False,zavrel_2019,PCC6803,Synechocystis sp.,PCC6803,0.025
3,P09193,"psbC,sll0851",460,50302.0,27.5,60.468680,2337.0,-450.17,141315.305160,-27221.185676,...,0.195652,0.204536,photobio_27.5_uE,photobioreactor,False,zavrel_2019,PCC6803,Synechocystis sp.,PCC6803,0.025
4,P0DJF8,"mtnP,sll0135",326,35318.0,27.5,1.255867,1587.0,-289.36,1993.061564,-363.397791,...,0.000000,0.000000,photobio_27.5_uE,photobioreactor,False,zavrel_2019,PCC6803,Synechocystis sp.,PCC6803,0.025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10099,Q6ZEU6,sll5034,207,23200.0,1100,0.321208,1019.0,-133.93,327.310870,-43.019377,...,0.000000,0.000000,photobio_1100_uE,photobioreactor,False,zavrel_2019,PCC6803,Synechocystis sp.,PCC6803,0.930
10100,Q79EE4,ggtA,363,40755.0,1100,0.166779,1812.0,-301.54,302.202714,-50.290401,...,0.000000,0.000000,photobio_1100_uE,photobioreactor,False,zavrel_2019,PCC6803,Synechocystis sp.,PCC6803,0.930
10101,Q79EE5,slr1588,421,47088.0,1100,0.147759,2114.0,-383.21,312.362526,-56.622726,...,0.000000,0.000000,photobio_1100_uE,photobioreactor,False,zavrel_2019,PCC6803,Synechocystis sp.,PCC6803,0.930
10102,Q79EF1,desB,359,41918.0,1100,0.161586,1989.0,-361.97,321.394594,-58.489292,...,0.264624,0.284062,photobio_1100_uE,photobioreactor,False,zavrel_2019,PCC6803,Synechocystis sp.,PCC6803,0.930
