In [1]:
from collections import Counter
from itertools import combinations

import numpy as np
import pandas as pd
from scipy.stats import pearsonr

In [2]:
datasets_list = pd.read_excel('../data/processed/datasets_list.xlsx')

In [3]:
datasets_list = datasets_list[datasets_list['object'] == 'scerevisiae']

In [4]:
datasets_list = datasets_list.iloc[:-1]

In [5]:
datasets_interim = [pd.read_excel(f'../data/interim/{i[1]["object"]}/{i[1]["study"]}_int.xlsx') for i in datasets_list.iterrows()]

In [6]:
n_proteins = []
n_assigned_proteins = []
assigned_percent = []
total_copies_per_cell = []
assigned_copies_percent = []

for i in datasets_interim:
    nps = len(i)
    nap = len(i[~i['assigned_id'].isna()])
    acp = i[~i['assigned_id'].isna()]['copies_per_cell'].sum() / i['copies_per_cell'].sum()
    
    n_proteins.append(nps)
    n_assigned_proteins.append(nap)
    assigned_percent.append(round(nap / nps * 100, 2))
    assigned_copies_percent.append(round(acp * 100, 2))
    total_copies_per_cell.append(round(i['copies_per_cell'].sum()))

In [7]:
datasets_list['n_proteins'] = n_proteins
datasets_list['n_assigned_proteins'] = n_assigned_proteins
datasets_list['assigned_percent'] = assigned_percent
datasets_list['assigned_copies_percent'] = assigned_copies_percent
datasets_list['total_copies_per_cell'] = total_copies_per_cell

datasets_list['total_copies_per_um3'] = datasets_list['object'].map({'ecoli': 2.15, 'scerevisiae': 42.0, 'hela': 2800.0})
datasets_list['total_copies_per_um3'] = np.round(datasets_list['total_copies_per_cell'] / datasets_list['total_copies_per_um3']).astype(int)

datasets_list.to_excel('../results/datasets_processing_stats.xlsx', index=False)

In [8]:
datasets_scerevisiae = {i[1]["code"]: pd.read_excel(f'../data/processed/scerevisiae/{i[1]["study"]}_processed.xlsx') for i in datasets_list[datasets_list['object'] == 'scerevisiae'].iterrows()}
datasets_scerevisiae_core = datasets_scerevisiae.copy()
for i in ['GH03', 'LA16']: del datasets_scerevisiae_core[i]
datasets_scerevisiae_core = datasets_scerevisiae_core.values()

In [9]:
scerevisiae_id_counter = Counter()
for d in datasets_scerevisiae_core:
    scerevisiae_id_counter += Counter(d['assigned_id'])

if '_other' in scerevisiae_id_counter: del scerevisiae_id_counter['_other']
scerevisiae_id_counter = pd.Series(scerevisiae_id_counter)
scerevisiae_core_n0 = scerevisiae_id_counter[scerevisiae_id_counter == len(datasets_scerevisiae_core)].index.to_list()
scerevisiae_core_n1 = scerevisiae_id_counter[scerevisiae_id_counter >= len(datasets_scerevisiae_core) - 1].index.to_list()

In [10]:
len(scerevisiae_core_n0)

1452

In [11]:
len(scerevisiae_core_n1)

2151

In [12]:
core_input_percent = []
core_copies_prop_n0 = []
core_copies_prop_n1 = []
    
for d in datasets_scerevisiae:
    core_input_percent.append(len(datasets_scerevisiae[d][datasets_scerevisiae[d]['assigned_id'].isin(scerevisiae_core_n0)]) / len(scerevisiae_core_n0))
    core_copies_prop_n0.append(datasets_scerevisiae[d][datasets_scerevisiae[d]['assigned_id'].isin(scerevisiae_core_n0)]['copies_per_cell'].sum() / datasets_scerevisiae[d]['copies_per_cell'].sum())
    core_copies_prop_n1.append(datasets_scerevisiae[d][datasets_scerevisiae[d]['assigned_id'].isin(scerevisiae_core_n1)]['copies_per_cell'].sum() / datasets_scerevisiae[d]['copies_per_cell'].sum())

In [13]:
datasets_list['core_copies_percent_n0'] = core_copies_prop_n0
datasets_list['core_copies_percent_n1'] = core_copies_prop_n1
datasets_list['core_input_percent'] = core_input_percent

In [18]:
core_copies_prop_n0

[0.5597061516379759,
 0.6872321190560658,
 0.6517859191556431,
 0.9242184062111324,
 0.6206245784120902,
 0.5841441777307798,
 0.6863475405350354]

In [19]:
core_input_percent

[0.8670798898071626, 1.0, 0.4090909090909091, 1.0, 1.0, 1.0, 1.0]

In [20]:
scerevisiae_id = scerevisiae_id_counter.to_frame().rename({0: 'n_datasets'}, axis=1)
scerevisiae_id.index.name = 'assigned_id'

datasets_scerevisiae_core_norm = []

for i in datasets_scerevisiae_core:
    i = i[i['assigned_id'] != '_other'].copy()
    i['parts_per_mil'] = i['copies_per_cell'] / i['copies_per_cell'].sum() * 1000000
    
    datasets_scerevisiae_core_norm.append(i[['assigned_id', 'parts_per_mil']])
    
x = pd.concat(datasets_scerevisiae_core_norm)
x = x[x['assigned_id'] != '_other'].groupby('assigned_id').mean()

scerevisiae_id = scerevisiae_id.join(x)

#scerevisiae_id.to_excel('../results/scerevisiae_cores.xlsx')

In [21]:
data_core = datasets_list[~datasets_list['code'].isin(['TA10', 'GH03', 'LA16', 'LT17'])]

In [47]:
min_core_n0_percent = pd.Series({'scerevisiae': data_core['core_copies_percent_n0'].max()})

In [23]:
copies_per_cell_milo = pd.Series(index=['scerevisiae'], data=91394886.112417)

In [24]:
copies_per_cell_milo

scerevisiae    9.139489e+07
dtype: float64

In [48]:
int_proteomes = {}

for i, j in enumerate(data_core['object'].unique()):
    data_object = datasets_list.loc[datasets_list['object'] == j]
    
    int_proteomes[j] = pd.DataFrame(index=scerevisiae_id_counter.index)
    
    datasets_object = [pd.read_excel(f'../data/processed/{j}/{m}_processed.xlsx') for m in data_object['study'].values]
    
    for k, l in enumerate(datasets_object):
        l = l[l['assigned_id'] != '_other']
        
        factor = (copies_per_cell_milo.loc[j] * (data_object['assigned_copies_percent'].iloc[k] / 100) * min_core_n0_percent.loc[j] * data_object['core_input_percent'].iloc[k]) / l[l['assigned_id'].isin(scerevisiae_core_n0)]['copies_per_cell'].sum()
        
        l = l.set_index('assigned_id')['copies_per_cell']
        
        l = l * factor
        
        int_proteomes[j][data_object['code'].iloc[k]] = int_proteomes[j].index.map(l)
        
    int_proteomes[j]['avg'] = int_proteomes[j].median(axis=1).apply(np.ceil).astype(int)
    int_proteomes[j]['std'] = int_proteomes[j].iloc[:, :-1].std(axis=1)
    
    int_proteomes[j].index.name = 'uniprot_id'
    
    #int_proteomes[j].to_excel(f'../results/proteome_{j}_normalized.xlsx')

In [50]:
int_proteomes['scerevisiae'].sum()

GH03    8.108060e+07
KU14    7.252804e+07
LA16    2.319743e+07
LT17    6.461053e+07
MP17    1.012037e+08
WA19    1.075237e+08
XI22    7.756066e+07
avg     8.384352e+07
std     8.453547e+07
dtype: float64

In [36]:
int_proteomes['scerevisiae'].sum()

GH03    6.891814e+07
KU14    6.164851e+07
LA16    1.971771e+07
LT17    5.491866e+07
MP17    8.602268e+07
WA19    9.139468e+07
XI22    6.592621e+07
avg     7.126698e+07
std     7.185476e+07
dtype: float64

In [28]:
min_core_n0_percent

scerevisiae    0.584144
dtype: float64

In [32]:
for k, l in enumerate(datasets_object):
    l = l[l['assigned_id'] != '_other']
    print(l['copies_per_cell'].sum())
    print(l[l['assigned_id'].isin(scerevisiae_core_n0)]['copies_per_cell'].sum())
    print('\n')

43090664.49010545
26117116.947281618


42808321.54850869
33027564.621488303


43225561.23101596
36708823.0010378


93695864.61482428
88816084.19754827


250751159.25286677
155622332.49765334


71781635.98751713
41930919.197625116


67964632.98451167
50668470.10325055


