In [1]:
from collections import Counter
from itertools import combinations

import numpy as np
import pandas as pd
from scipy.stats import pearsonr

In [2]:
datasets_list = pd.read_excel('../data/processed/datasets_list.xlsx')

In [3]:
datasets_interim = [pd.read_excel(f'../data/interim/{i[1]["object"]}/{i[1]["study"]}_int.xlsx') for i in datasets_list.iterrows()]

In [4]:
n_proteins = []
n_assigned_proteins = []
assigned_percent = []
total_copies_per_cell = []
assigned_copies_percent = []

for i in datasets_interim:
    nps = len(i)
    nap = len(i[~i['assigned_id'].isna()])
    acp = i[~i['assigned_id'].isna()]['copies_per_cell'].sum() / i['copies_per_cell'].sum()
    
    n_proteins.append(nps)
    n_assigned_proteins.append(nap)
    assigned_percent.append(round(nap / nps * 100, 2))
    assigned_copies_percent.append(round(acp * 100, 2))
    total_copies_per_cell.append(round(i['copies_per_cell'].sum()))

In [5]:
datasets_list['n_proteins'] = n_proteins
datasets_list['n_assigned_proteins'] = n_assigned_proteins
datasets_list['assigned_percent'] = assigned_percent
datasets_list['assigned_copies_percent'] = assigned_copies_percent
datasets_list['total_copies_per_cell'] = total_copies_per_cell

datasets_list['total_copies_per_um3'] = datasets_list['object'].map({'ecoli': 2.15, 'scerevisiae': 42.0, 'hela': 2800.0})
datasets_list['total_copies_per_um3'] = np.round(datasets_list['total_copies_per_cell'] / datasets_list['total_copies_per_um3']).astype(int)

datasets_list.to_excel('../results/datasets_processing_stats.xlsx', index=False)

In [6]:
datasets_list = datasets_list.iloc[:-2]

In [7]:
datasets_ecoli = {i[1]["code"]: pd.read_excel(f'../data/processed/ecoli/{i[1]["study"]}_processed.xlsx') for i in datasets_list[datasets_list['object'] == 'ecoli'].iterrows()}
datasets_ecoli_core = datasets_ecoli.copy()
for i in ['TA10']: del datasets_ecoli_core[i]
datasets_ecoli_core = datasets_ecoli_core.values()

datasets_scerevisiae = {i[1]["code"]: pd.read_excel(f'../data/processed/scerevisiae/{i[1]["study"]}_processed.xlsx') for i in datasets_list[datasets_list['object'] == 'scerevisiae'].iterrows()}
datasets_scerevisiae_core = datasets_scerevisiae.copy()
for i in ['GH03', 'LA16']: del datasets_scerevisiae_core[i]
datasets_scerevisiae_core = datasets_scerevisiae_core.values()

datasets_hela = {i[1]["code"]: pd.read_excel(f'../data/processed/hela/{i[1]["study"]}_processed.xlsx') for i in datasets_list[datasets_list['object'] == 'hela'].iterrows()}
datasets_hela_core = datasets_hela.values()

In [8]:
ecoli_id_counter = Counter()
for d in datasets_ecoli_core:
    ecoli_id_counter += Counter(d['assigned_id'])

if '_other' in ecoli_id_counter: del ecoli_id_counter['_other']
ecoli_id_counter = pd.Series(ecoli_id_counter)
ecoli_core_n0 = ecoli_id_counter[ecoli_id_counter == len(datasets_ecoli_core)].index.to_list()
ecoli_core_n1 = ecoli_id_counter[ecoli_id_counter >= len(datasets_ecoli_core) - 1].index.to_list()

scerevisiae_id_counter = Counter()
for d in datasets_scerevisiae_core:
    scerevisiae_id_counter += Counter(d['assigned_id'])

if '_other' in scerevisiae_id_counter: del scerevisiae_id_counter['_other']
scerevisiae_id_counter = pd.Series(scerevisiae_id_counter)
scerevisiae_core_n0 = scerevisiae_id_counter[scerevisiae_id_counter == len(datasets_scerevisiae_core)].index.to_list()
scerevisiae_core_n1 = scerevisiae_id_counter[scerevisiae_id_counter >= len(datasets_scerevisiae_core) - 1].index.to_list()

hela_id_counter = Counter()
for d in datasets_hela_core:
    hela_id_counter += Counter(d['assigned_id'])

if '_other' in hela_id_counter: del hela_id_counter['_other']
hela_id_counter = pd.Series(hela_id_counter)
hela_core_n0 = hela_id_counter[hela_id_counter == len(datasets_hela_core)].index.to_list()
hela_core_n1 = hela_id_counter[hela_id_counter >= len(datasets_hela_core) - 1].index.to_list()

In [9]:
len(ecoli_core_n0), len(scerevisiae_core_n0), len(hela_core_n0)

(934, 1452, 5051)

In [10]:
len(ecoli_core_n1), len(scerevisiae_core_n1), len(hela_core_n1)

(1501, 2151, 6734)

In [11]:
cores = {'ecoli': ecoli_core_n0, 'scerevisiae': scerevisiae_core_n0, 'hela': hela_core_n0}

In [12]:
datasets_list['core_input'] = (datasets_list['code']
    .map({'TA09': False, 'GH03': False, 'LA16': False})
    .combine_first(pd.Series(np.repeat(True, len(datasets_list))))
)

In [13]:
core_input_percent = []
core_copies_prop_n0 = []
core_copies_prop_n1 = []

for d in datasets_ecoli:
    core_input_percent.append(len(datasets_ecoli[d][datasets_ecoli[d]['assigned_id'].isin(ecoli_core_n0)]) / len(ecoli_core_n0))
    core_copies_prop_n0.append(datasets_ecoli[d][datasets_ecoli[d]['assigned_id'].isin(ecoli_core_n0)]['copies_per_cell'].sum() / datasets_ecoli[d]['copies_per_cell'].sum())
    core_copies_prop_n1.append(datasets_ecoli[d][datasets_ecoli[d]['assigned_id'].isin(ecoli_core_n1)]['copies_per_cell'].sum() / datasets_ecoli[d]['copies_per_cell'].sum())
    
for d in datasets_scerevisiae:
    core_input_percent.append(len(datasets_scerevisiae[d][datasets_scerevisiae[d]['assigned_id'].isin(scerevisiae_core_n0)]) / len(scerevisiae_core_n0))
    core_copies_prop_n0.append(datasets_scerevisiae[d][datasets_scerevisiae[d]['assigned_id'].isin(scerevisiae_core_n0)]['copies_per_cell'].sum() / datasets_scerevisiae[d]['copies_per_cell'].sum())
    core_copies_prop_n1.append(datasets_scerevisiae[d][datasets_scerevisiae[d]['assigned_id'].isin(scerevisiae_core_n1)]['copies_per_cell'].sum() / datasets_scerevisiae[d]['copies_per_cell'].sum())
    
for d in datasets_hela:
    core_input_percent.append(len(datasets_hela[d][datasets_hela[d]['assigned_id'].isin(hela_core_n0)]) / len(hela_core_n0))
    core_copies_prop_n0.append(datasets_hela[d][datasets_hela[d]['assigned_id'].isin(hela_core_n0)]['copies_per_cell'].sum() / datasets_hela[d]['copies_per_cell'].sum())
    core_copies_prop_n1.append(datasets_hela[d][datasets_hela[d]['assigned_id'].isin(hela_core_n1)]['copies_per_cell'].sum() / datasets_hela[d]['copies_per_cell'].sum())

In [14]:
datasets_list['core_copies_percent_n0'] = core_copies_prop_n0
datasets_list['core_copies_percent_n1'] = core_copies_prop_n1
datasets_list['core_input_percent'] = core_input_percent

In [15]:
datasets_list.to_excel('../results/datasets_calculations.xlsx', index=False)

In [16]:
ecoli_id = ecoli_id_counter.to_frame().rename({0: 'n_datasets'}, axis=1)
ecoli_id.index.name = 'assigned_id'

datasets_ecoli_core_norm = []

for i in datasets_ecoli_core:
    i = i[i['assigned_id'] != '_other'].copy()
    i['parts_per_mil'] = i['copies_per_cell'] / i['copies_per_cell'].sum() * 1000000
    
    datasets_ecoli_core_norm.append(i[['assigned_id', 'parts_per_mil']])
    
x = pd.concat(datasets_ecoli_core_norm)
x = x[x['assigned_id'] != '_other'].groupby('assigned_id').mean()

ecoli_id = ecoli_id.join(x)

ecoli_id.to_excel('../results/ecoli_cores.xlsx')

scerevisiae_id = scerevisiae_id_counter.to_frame().rename({0: 'n_datasets'}, axis=1)
scerevisiae_id.index.name = 'assigned_id'

datasets_scerevisiae_core_norm = []

for i in datasets_scerevisiae_core:
    i = i[i['assigned_id'] != '_other'].copy()
    i['parts_per_mil'] = i['copies_per_cell'] / i['copies_per_cell'].sum() * 1000000
    
    datasets_scerevisiae_core_norm.append(i[['assigned_id', 'parts_per_mil']])
    
x = pd.concat(datasets_scerevisiae_core_norm)
x = x[x['assigned_id'] != '_other'].groupby('assigned_id').mean()

scerevisiae_id = scerevisiae_id.join(x)

scerevisiae_id.to_excel('../results/scerevisiae_cores.xlsx')

hela_id = hela_id_counter.to_frame().rename({0: 'n_datasets'}, axis=1)
hela_id.index.name = 'assigned_id'

datasets_hela_core_norm = []

for i in datasets_hela_core:
    i = i[i['assigned_id'] != '_other'].copy()
    i['parts_per_mil'] = i['copies_per_cell'] / i['copies_per_cell'].sum() * 1000000
    
    datasets_hela_core_norm.append(i[['assigned_id', 'parts_per_mil']])
    
x = pd.concat(datasets_hela_core_norm)
x = x[x['assigned_id'] != '_other'].groupby('assigned_id').mean()

hela_id = hela_id.join(x)

hela_id.to_excel('../results/hela_cores.xlsx')

In [17]:
correlations = {}

for o in datasets_list['object'].unique():
    studies_object = datasets_list.loc[datasets_list['object'] == o, 'study']
    
    datasets_object = {i: pd.read_excel(f'../data/processed/{o}/{i}_processed.xlsx') for i in studies_object}
    datasets_object_r = {i: datasets_object[i][datasets_object[i]['assigned_id'] != '_other'] for i in studies_object}
    
    c = pd.DataFrame(index=studies_object, columns=studies_object)
    
    for j,l in combinations(studies_object, 2):
        jdf = datasets_object_r[j].set_index('assigned_id').join(datasets_object_r[l].set_index('assigned_id'), how='inner', rsuffix='_2').loc[:, ['copies_per_cell', 'copies_per_cell_2']]    
        c.loc[j, l] = pearsonr(jdf['copies_per_cell'], jdf['copies_per_cell_2'])[0]
        
        jdf = jdf[jdf.index.isin(cores[o])]
        c.loc[l, j] = pearsonr(jdf['copies_per_cell'], jdf['copies_per_cell_2'])[0]
        
    correlations[o] = c

In [18]:
for o in datasets_list['object'].unique():
    correlations[o].index.name = None
    correlations[o].columns.name = None
    
    correlations[o].to_excel(f'../results/correlation_{o}.xlsx')

In [19]:
datasets_list

Unnamed: 0,object,study,code,method,n_proteins,n_assigned_proteins,assigned_percent,assigned_copies_percent,total_copies_per_cell,total_copies_per_um3,core_input,core_copies_percent_n0,core_copies_percent_n1,core_input_percent
0,ecoli,taniguchi2010,TA10,FL,1018,1015,99.71,100.0,94571,43987,True,0.739465,0.87164,0.444325
1,ecoli,valgepea2013,VA13,MS,1179,1179,100.0,100.0,4293284,1996876,True,0.877605,0.973178,1.0
2,ecoli,li2014,LI14,RP,3883,3587,92.38,95.18,5627623,2617499,True,0.759078,0.866133,1.0
3,ecoli,wisniewski2014,WI14,MS,2261,2228,98.54,96.88,1321542,614671,True,0.793106,0.902692,1.0
4,ecoli,soufi2015,SO15,MS,1913,1912,99.95,100.0,11214979,5216269,True,0.799098,0.925206,1.0
5,ecoli,schmidt2016,SC16,MS,2355,2350,99.79,99.94,5070410,2358330,True,0.81507,0.913042,1.0
6,ecoli,radzikowski2016,RA16,MS,1959,1954,99.74,99.89,2220410,1032749,True,0.807299,0.930865,1.0
7,scerevisiae,ghaemmaghami2003,GH03,WB,3868,3844,99.38,92.35,46664471,1111059,False,0.559706,0.70173,0.86708
8,scerevisiae,kulak2014,KU14,MS,4570,4494,98.34,89.09,48114163,1145575,True,0.687232,0.856777,1.0
9,scerevisiae,lawless2016,LA16,SRM,1167,1118,95.8,76.75,56322039,1341001,False,0.651786,0.741809,0.409091


In [20]:
data_untargeted = datasets_list[~datasets_list['code'].isin(['TA10', 'GH03', 'LA16'])]

In [21]:
avg_mass = [[], [], []]

for k, o in enumerate(data_untargeted['object'].unique()):
    studies_object = data_untargeted.loc[data_untargeted['object'] == o, 'study']
    proteome = pd.read_excel(f'../data/processed/proteomes/{o}_proteome.xlsx')
    proteome = proteome.set_index('uniprot_id')['mass']
    
    datasets_object = [pd.read_excel(f'../data/processed/{o}/{i}_processed.xlsx') for i in studies_object]
    
    for i in datasets_object:
        i = i[i['assigned_id'] != '_other']
        masses = i['assigned_id'].map(proteome)
        avg_mass[k].append((i['copies_per_cell'] * masses).sum() / i['copies_per_cell'].sum())
        
avg_mass = pd.DataFrame(data=avg_mass).T
avg_mass.columns = ['ecoli', 'scerevisiae', 'hela']

In [22]:
avg_mass_avg = avg_mass.mean()

In [23]:
protein_per_cell = pd.Series({'ecoli': 0.280, 'scerevisiae': 5.6, 'hela': 250.0})

In [24]:
protein_per_cell

ecoli            0.28
scerevisiae      5.60
hela           250.00
dtype: float64

In [25]:
copies_per_cell_milo = protein_per_cell * (6*10**23) * (10**(-12)) / avg_mass_avg

In [26]:
copies_per_cell_milo

ecoli             5724808.094557
scerevisiae      91394886.112417
hela           3798443501.313684
dtype: object

In [27]:
volumes = pd.Series({'ecoli': 2.15, 'scerevisiae': 42.0, 'hela': 2425.0})

In [28]:
copies_per_cell_milo / volumes

ecoli          2662701.439329
scerevisiae    2176068.716962
hela            1566368.45415
dtype: object

In [29]:
min_core_n0_percent = data_untargeted.groupby('object')['core_copies_percent_n0'].min()

In [30]:
all_ids = {'ecoli': ecoli_id_counter, 'scerevisiae': scerevisiae_id_counter, 'hela': hela_id_counter}

In [60]:
int_proteomes = {}

for i, j in enumerate(data_untargeted['object'].unique()):
    data_object = datasets_list.loc[datasets_list['object'] == j]
    
    int_proteomes[j] = pd.DataFrame(index=all_ids[j].index)
    
    datasets_object = [pd.read_excel(f'../data/processed/{j}/{m}_processed.xlsx') for m in data_object['study'].values]
    
    for k, l in enumerate(datasets_object):
        l = l[l['assigned_id'] != '_other']
        
        factor = (copies_per_cell_milo.loc[j] * (data_object['assigned_copies_percent'].iloc[k] / 100) * min_core_n0_percent.loc[j] * data_object['core_input_percent'].iloc[k]) / l[l['assigned_id'].isin(cores[j])]['copies_per_cell'].sum()
        
        l = l.set_index('assigned_id')['copies_per_cell']
        
        l = l * factor
        
        int_proteomes[j][data_object['code'].iloc[k]] = int_proteomes[j].index.map(l)
        
    int_proteomes[j]['avg'] = int_proteomes[j].mean(axis=1).apply(np.ceil).astype(int)
    int_proteomes[j]['std'] = int_proteomes[j].iloc[:, :-1].std(axis=1)
    
    int_proteomes[j].index.name = 'uniprot_id'
    
    int_proteomes[j].to_excel(f'../results/proteome_{j}_normalized.xlsx')

In [49]:
copies_per_cell_milo

ecoli             5724808.094557
scerevisiae      91394886.112417
hela           3798443501.313684
dtype: object

In [59]:
int_proteomes['hela'].sum()

NA11    3.100263e+09
WI12    3.031016e+09
KU14    2.972689e+09
HE15    2.890236e+09
IT16    3.220158e+09
BJ17    3.178046e+09
MO21    2.858023e+09
avg     3.135458e+09
std     2.038891e+09
dtype: float64

In [33]:
#ribosomes
data_ecoli = datasets_list.loc[datasets_list['object'] == 'ecoli']
datasets_ecoli = [pd.read_excel(f'../data/processed/ecoli/{i}_processed.xlsx') for i in data_ecoli['study']]
ribosome_ecoli = pd.read_excel(f'../data/processed/ribosomes/ecoli_ribosome.xlsx')['uniprot_id']
int_ecoli = pd.read_excel('../results/proteome_ecoli_normalized.xlsx', index_col=0)

ratio_ecoli_ribo = pd.Series(index=ribosome_ecoli.values, data=1)
ratio_ecoli_ribo.loc['P0A7K2'] = 4

ribosome_ecoli = pd.DataFrame(index=ribosome_ecoli.values, columns=data_ecoli['code'].to_list() + ['int'])
ribosome_ecoli.index.name = 'uniprot_id'

for i, j in enumerate(datasets_ecoli):
    ribosome_ecoli[data_ecoli['code'].iloc[i]] = ribosome_ecoli.index.map(j.set_index('assigned_id')['copies_per_cell'])
    
ribosome_ecoli['int'] = ribosome_ecoli.index.map(int_ecoli['avg'])

ribosome_ecoli = ribosome_ecoli.divide(ratio_ecoli_ribo, axis=0)

ribosome_ecoli = ribosome_ecoli.drop('TA10', axis=1)

ribosome_ecoli.to_excel('../results/ribosome_ecoli.xlsx')

In [34]:
#ribosomes
data_scerevisiae = datasets_list.loc[datasets_list['object'] == 'scerevisiae']
datasets_scerevisiae = [pd.read_excel(f'../data/processed/scerevisiae/{i}_processed.xlsx') for i in data_scerevisiae['study']]
ribosome_scerevisiae = pd.read_excel(f'../data/processed/ribosomes/scerevisiae_ribosome.xlsx', index_col=0)
int_scerevisiae = pd.read_excel('../results/proteome_scerevisiae_normalized.xlsx', index_col=0)


for i, j in enumerate(datasets_scerevisiae):
    ribosome_scerevisiae[data_scerevisiae['code'].iloc[i]] = ribosome_scerevisiae.index.map(j.set_index('assigned_id')['copies_per_cell'])
    
ribosome_scerevisiae['int'] = ribosome_scerevisiae.index.map(int_scerevisiae['avg'])
ribosome_scerevisiae = ribosome_scerevisiae.groupby('ribosomal_part').sum()

ribosome_scerevisiae.to_excel('../results/ribosome_scerevisiae.xlsx')

In [35]:
#ribosomes
data_hela = datasets_list.loc[datasets_list['object'] == 'hela']
datasets_hela = [pd.read_excel(f'../data/processed/hela/{i}_processed.xlsx') for i in data_hela['study']]
ribosome_hela = pd.read_excel(f'../data/processed/ribosomes/hela_ribosome.xlsx', index_col=0)
int_hela = pd.read_excel('../results/proteome_hela_normalized.xlsx', index_col=0)

for i, j in enumerate(datasets_hela):
    ribosome_hela[data_hela['code'].iloc[i]] = ribosome_hela.index.map(j.set_index('assigned_id')['copies_per_cell'])
    
ribosome_hela['int'] = ribosome_hela.index.map(int_hela['avg'])

ribosome_hela.to_excel('../results/ribosome_hela.xlsx')

In [36]:
int_proteomes['scerevisiae'].loc[:, ['avg']]

Unnamed: 0_level_0,avg
uniprot_id,Unnamed: 1_level_1
A5Z2X5,86523
D6VTK4,735
O13297,2232
O13329,1419
O13516,69575
...,...
Q07500,5107
Q08969,6420
Q12031,11995
Q12405,2493


In [37]:
ho2018 = pd.read_excel('../data/prepared/scerevisiae/ho2018_int.xlsx')

In [38]:
proteome_s = pd.read_excel('../data/processed/proteomes/scerevisiae_proteome.xlsx')

In [39]:
proteome_s = proteome_s.set_index('orf_id')['uniprot_id']

In [40]:
ho2018['uniprot_id'] = ho2018['orf_id'].map(proteome_s)

In [41]:
ho2018 = ho2018[['uniprot_id', 'copies_per_cell']].set_index('uniprot_id')

In [42]:
ho2018 = ho2018[~ho2018['copies_per_cell'].isna()]

In [43]:
mys = int_proteomes['scerevisiae'].loc[:, ['avg']]

In [44]:
x = ho2018.join(mys, how='right')

In [45]:
x = x[~x.isna().any(1)]

In [46]:
#Mean
pearsonr(x['copies_per_cell'].to_numpy(), x['avg'].to_numpy())

(0.8121435590714823, 0.0)

In [47]:
#Median
pearsonr(x['copies_per_cell'].to_numpy(), x['avg'].to_numpy())

(0.8121435590714823, 0.0)

In [48]:
int_proteomes['scerevisiae'].sum()

GH03    6.891814e+07
KU14    6.164851e+07
LA16    1.971771e+07
LT17    5.491866e+07
MP17    8.602268e+07
WA19    9.139468e+07
XI22    6.592621e+07
avg     8.671872e+07
std     7.185476e+07
dtype: float64