In [1]:
from collections import Counter
from itertools import combinations

import numpy as np
import pandas as pd
from scipy.stats import pearsonr

In [2]:
datasets_list = pd.read_excel('../data/processed/datasets_list.xlsx')

In [3]:
datasets_interim = [pd.read_excel(f'../data/interim/{i[1]["object"]}/{i[1]["study"]}_int.xlsx') for i in datasets_list.iterrows()]

In [4]:
n_proteins = []
n_assigned_proteins = []
assigned_percent = []
total_copies_per_cell = []
assigned_copies_percent = []

for i in datasets_interim:
    nps = len(i)
    nap = len(i[~i['assigned_id'].isna()])
    acp = i[~i['assigned_id'].isna()]['copies_per_cell'].sum() / i['copies_per_cell'].sum()
    
    n_proteins.append(nps)
    n_assigned_proteins.append(nap)
    assigned_percent.append(round(nap / nps * 100, 2))
    assigned_copies_percent.append(round(acp * 100, 2))
    total_copies_per_cell.append(round(i['copies_per_cell'].sum()))

In [5]:
datasets_list['n_proteins'] = n_proteins
datasets_list['n_assigned_proteins'] = n_assigned_proteins
datasets_list['assigned_percent'] = assigned_percent
datasets_list['assigned_copies_percent'] = assigned_copies_percent
datasets_list['total_copies_per_cell'] = total_copies_per_cell

datasets_list['total_copies_per_um3'] = datasets_list['object'].map({'ecoli': 2.15, 'scerevisiae': 42.0, 'hela': 2800.0})
datasets_list['total_copies_per_um3'] = np.round(datasets_list['total_copies_per_cell'] / datasets_list['total_copies_per_um3']).astype(int)

datasets_list.to_excel('../results/datasets_processing_stats.xlsx', index=False)

In [6]:
datasets_list = datasets_list.iloc[:-2]

In [58]:
datasets_ecoli = {i[1]["code"]: pd.read_excel(f'../data/processed/ecoli/{i[1]["study"]}_processed.xlsx') for i in datasets_list[datasets_list['object'] == 'ecoli'].iterrows()}
datasets_ecoli_core = datasets_ecoli.copy()
for i in ['TA10']: del datasets_ecoli_core[i]
datasets_ecoli_core = datasets_ecoli_core.values()

datasets_scerevisiae = {i[1]["code"]: pd.read_excel(f'../data/processed/scerevisiae/{i[1]["study"]}_processed.xlsx') for i in datasets_list[datasets_list['object'] == 'scerevisiae'].iterrows()}
datasets_scerevisiae_core = datasets_scerevisiae.copy()
for i in ['GH03', 'LA16']: del datasets_scerevisiae_core[i]
datasets_scerevisiae_core = datasets_scerevisiae_core.values()

datasets_hela = {i[1]["code"]: pd.read_excel(f'../data/processed/hela/{i[1]["study"]}_processed.xlsx') for i in datasets_list[datasets_list['object'] == 'hela'].iterrows()}
datasets_hela_core = datasets_hela.values()

In [59]:
ecoli_id_counter = Counter()
for d in datasets_ecoli_core:
    ecoli_id_counter += Counter(d['assigned_id'])

if '_other' in ecoli_id_counter: del ecoli_id_counter['_other']
ecoli_id_counter = pd.Series(ecoli_id_counter)
ecoli_core_n0 = ecoli_id_counter[ecoli_id_counter == len(datasets_ecoli_core)].index.to_list()
ecoli_core_n1 = ecoli_id_counter[ecoli_id_counter >= len(datasets_ecoli_core) - 1].index.to_list()

scerevisiae_id_counter = Counter()
for d in datasets_scerevisiae_core:
    scerevisiae_id_counter += Counter(d['assigned_id'])

if '_other' in scerevisiae_id_counter: del scerevisiae_id_counter['_other']
scerevisiae_id_counter = pd.Series(scerevisiae_id_counter)
scerevisiae_core_n0 = scerevisiae_id_counter[scerevisiae_id_counter == len(datasets_scerevisiae_core)].index.to_list()
scerevisiae_core_n1 = scerevisiae_id_counter[scerevisiae_id_counter >= len(datasets_scerevisiae_core) - 1].index.to_list()

hela_id_counter = Counter()
for d in datasets_hela_core:
    hela_id_counter += Counter(d['assigned_id'])

if '_other' in hela_id_counter: del hela_id_counter['_other']
hela_id_counter = pd.Series(hela_id_counter)
hela_core_n0 = hela_id_counter[hela_id_counter == len(datasets_hela_core)].index.to_list()
hela_core_n1 = hela_id_counter[hela_id_counter >= len(datasets_hela_core) - 1].index.to_list()

In [9]:
len(ecoli_core_n0), len(scerevisiae_core_n0), len(hela_core_n0)

(934, 1452, 5051)

In [10]:
len(ecoli_core_n1), len(scerevisiae_core_n1), len(hela_core_n1)

(1501, 2151, 6734)

In [11]:
cores = {'ecoli': ecoli_core_n0, 'scerevisiae': scerevisiae_core_n0, 'hela': hela_core_n0}

In [60]:
datasets_list['core_input'] = (datasets_list['code']
    .map({'TA09': False, 'GH03': False, 'LA16': False})
    .combine_first(pd.Series(np.repeat(True, len(datasets_list))))
)

In [13]:
core_copies_prop_n0 = []
core_copies_prop_n1 = []

for d in datasets_ecoli:
    core_copies_prop_n0.append(datasets_ecoli[d][datasets_ecoli[d]['assigned_id'].isin(ecoli_core_n0)]['copies_per_cell'].sum() / datasets_ecoli[d]['copies_per_cell'].sum())
    core_copies_prop_n1.append(datasets_ecoli[d][datasets_ecoli[d]['assigned_id'].isin(ecoli_core_n1)]['copies_per_cell'].sum() / datasets_ecoli[d]['copies_per_cell'].sum())
    
for d in datasets_scerevisiae:
    core_copies_prop_n0.append(datasets_scerevisiae[d][datasets_scerevisiae[d]['assigned_id'].isin(scerevisiae_core_n0)]['copies_per_cell'].sum() / datasets_scerevisiae[d]['copies_per_cell'].sum())
    core_copies_prop_n1.append(datasets_scerevisiae[d][datasets_scerevisiae[d]['assigned_id'].isin(scerevisiae_core_n1)]['copies_per_cell'].sum() / datasets_scerevisiae[d]['copies_per_cell'].sum())
    
for d in datasets_hela:
    core_copies_prop_n0.append(datasets_hela[d][datasets_hela[d]['assigned_id'].isin(hela_core_n0)]['copies_per_cell'].sum() / datasets_hela[d]['copies_per_cell'].sum())
    core_copies_prop_n1.append(datasets_hela[d][datasets_hela[d]['assigned_id'].isin(hela_core_n1)]['copies_per_cell'].sum() / datasets_hela[d]['copies_per_cell'].sum())

In [14]:
datasets_list['core_copies_percent_n0'] = core_copies_prop_n0
datasets_list['core_copies_percent_n1'] = core_copies_prop_n1

In [15]:
datasets_list.to_excel('../results/datasets_calculations.xlsx', index=False)

In [64]:
ecoli_id = ecoli_id_counter.to_frame().rename({0: 'n_datasets'}, axis=1)
ecoli_id.index.name = 'assigned_id'

datasets_ecoli_core_norm = []

for i in datasets_ecoli_core:
    i = i[i['assigned_id'] != '_other']
    i['parts_per_mil'] = i['copies_per_cell'] / i['copies_per_cell'].sum() * 1000000
    
    datasets_ecoli_core_norm.append(i[['assigned_id', 'parts_per_mil']])
    
x = pd.concat(datasets_ecoli_core_norm)
x = x[x['assigned_id'] != '_other'].groupby('assigned_id').mean()

ecoli_id = ecoli_id.join(x)

ecoli_id.to_excel('../results/ecoli_cores.xlsx')

scerevisiae_id = scerevisiae_id_counter.to_frame().rename({0: 'n_datasets'}, axis=1)
scerevisiae_id.index.name = 'assigned_id'

datasets_scerevisiae_core_norm = []

for i in datasets_scerevisiae_core:
    i = i[i['assigned_id'] != '_other']
    i['parts_per_mil'] = i['copies_per_cell'] / i['copies_per_cell'].sum() * 1000000
    
    datasets_scerevisiae_core_norm.append(i[['assigned_id', 'parts_per_mil']])
    
x = pd.concat(datasets_scerevisiae_core_norm)
x = x[x['assigned_id'] != '_other'].groupby('assigned_id').mean()

scerevisiae_id = scerevisiae_id.join(x)

scerevisiae_id.to_excel('../results/scerevisiae_cores.xlsx')

hela_id = hela_id_counter.to_frame().rename({0: 'n_datasets'}, axis=1)
hela_id.index.name = 'assigned_id'

datasets_hela_core_norm = []

for i in datasets_hela_core:
    i = i[i['assigned_id'] != '_other']
    i['parts_per_mil'] = i['copies_per_cell'] / i['copies_per_cell'].sum() * 1000000
    
    datasets_hela_core_norm.append(i[['assigned_id', 'parts_per_mil']])
    
x = pd.concat(datasets_hela_core_norm)
x = x[x['assigned_id'] != '_other'].groupby('assigned_id').mean()

hela_id = hela_id.join(x)

hela_id.to_excel('../results/hela_cores.xlsx')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  i['parts_per_mil'] = i['copies_per_cell'] / i['copies_per_cell'].sum() * 1000000
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  i['parts_per_mil'] = i['copies_per_cell'] / i['copies_per_cell'].sum() * 1000000
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  i['parts_per_mil'] = i['copies_per_cell'] /

In [16]:
correlations = {}

for o in datasets_list['object'].unique():
    studies_object = datasets_list.loc[datasets_list['object'] == o, 'study']
    
    datasets_object = {i: pd.read_excel(f'../data/processed/{o}/{i}_processed.xlsx') for i in studies_object}
    datasets_object_r = {i: datasets_object[i][datasets_object[i]['assigned_id'] != '_other'] for i in studies_object}
    
    c = pd.DataFrame(index=studies_object, columns=studies_object)
    
    for j,l in combinations(studies_object, 2):
        jdf = datasets_object_r[j].set_index('assigned_id').join(datasets_object_r[l].set_index('assigned_id'), how='inner', rsuffix='_2').loc[:, ['copies_per_cell', 'copies_per_cell_2']]    
        c.loc[j, l] = pearsonr(jdf['copies_per_cell'], jdf['copies_per_cell_2'])[0]
        
        jdf = jdf[jdf.index.isin(cores[o])]
        c.loc[l, j] = pearsonr(jdf['copies_per_cell'], jdf['copies_per_cell_2'])[0]
        
    correlations[o] = c

In [17]:
for o in datasets_list['object'].unique():
    correlations[o].index.name = None
    correlations[o].columns.name = None
    
    correlations[o].to_excel(f'../results/correlation_{o}.xlsx')

In [26]:
datasets_ecoli_core

dict_values([     assigned_id  copies_per_cell
0         P00350          17503.0
1         P00363            136.5
2         P00370           7482.0
3         P00393            709.5
4         P00448          36846.5
...          ...              ...
1175      Q47622             41.0
1176      Q47690             74.0
1177      Q59385            477.0
1178      Q93K97            233.5
1179      _other              0.0

[1180 rows x 2 columns],      assigned_id  copies_per_cell
0         A5A605                1
1         A5A607                5
2         A5A612                2
3         A5A613              895
4         A5A614              216
...          ...              ...
3583      Q7DFV3              124
3584      Q93K97              223
3585      Q9XB42                1
3586      V9HVX0               96
3587      _other           271507

[3588 rows x 2 columns],      assigned_id  copies_per_cell
0         P00350      3166.941750
1         P00363      2105.713931
2         P00370 

In [None]:
datasets_list_untarg