# NOC processing
#### This is the fourth and final step of the enrichment calculation
Calculate the NOC (nuclear / organelle / cytosol) proportions for each protein group and append them to the enrichment table.  

Key output files
- `yyyy-mm-dd_enrichment_table_NOC_prop.csv`: enrichment values and NOC proportion  
- `yyyy-mm-dd_volcano_table.csv`: enrichment values and p-values  
  
Both files are located in the directory `output/enrichment_and_volcano_tables`.

In [1]:
import os
import sys
from datetime import datetime
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# The shared "script" and "data" folders live four directory levels above
# the notebook's working directory.
_four_levels_up = Path.cwd().parent.parent.parent.parent
script_path = _four_levels_up / "script"
data_path = _four_levels_up / "data"

# Make the project-local packages under "script" importable.
sys.path.append(str(script_path))

from pyseus import basic_processing as ip
from pyseus import contrast_tools as ct

### Load imputed NOC/Bulk table
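The input file names start with a date stamp (`yyyy-mm-dd`). `timestamp` defaults to today's date, so set it manually if the previous steps were run on a different day.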

In [2]:
# The input file name is prefixed with the date on which it was written.
# Today's date is used by default; replace it with the matching "YYYY-MM-DD"
# string if the file was produced on a different day.
timestamp = datetime.now().strftime("%Y-%m-%d")
print(f"Timestamp: {timestamp}")
outprefix = f"{timestamp}_QC_filter_impute"

outdir = Path.cwd() / "output"
preprocessing_out = outdir / "preprocessing"

NOC_path = preprocessing_out / f"{outprefix}_filtered_table.csv"

# Read the filtered table. Each failure mode prints a hint for the user and is
# then re-raised: continuing without `noc_table` would only surface later as an
# unrelated NameError in the cells below.
try:
    noc_table = pd.read_csv(NOC_path, header=[0], index_col=0)
except FileNotFoundError:
    print(f"File {NOC_path} not found.\nPlease rerun the previous steps or specify the correct timestamp, current value is {timestamp}")
    raise
except pd.errors.ParserError:
    print(f"There was an error parsing the CSV file at {NOC_path}.")
    raise
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    raise


Timestamp: 2023-12-04


### Calculation of proportion

In [3]:
# Pick the NOC sample columns: the name must contain one of the fraction
# keywords and must not contain "Proteome".
NOC_kw = ["Cytosol", "Nuclear", "Organelle"]

noc_cols = [
    name
    for name in noc_table.columns
    if "Proteome" not in name and any(keyword in name for keyword in NOC_kw)
]
print(noc_cols)

['Infected_Cytosol_1', 'Infected_Cytosol_2', 'Infected_Cytosol_3', 'Infected_Nuclear_1', 'Infected_Nuclear_2', 'Infected_Nuclear_3', 'Infected_Organelle_1', 'Infected_Organelle_2', 'Infected_Organelle_3']


In [4]:
# Keep only "Protein IDs" (used later as the merge key) and the NOC sample columns
noc_table = noc_table.loc[:, ["Protein IDs", *noc_cols]].copy()

In [5]:
# Group the replicate columns of each NOC fraction and take the median per group

# Create a RawTables object (pyseus basic_processing) without loading a
# protein-groups file; only the info/sample column names are supplied here.
noc_process = ip.RawTables(
    proteingroup=None, file_designated=True,
    info_cols=['Protein IDs'], sample_cols=noc_cols
)

# Hand the pre-computed, already filtered NOC table to the object directly
noc_process.filtered_table = noc_table

# Optional step, left disabled: convert the intensities with np.exp2.
# NOTE(review): pyseus prints a reminder when this step is skipped; presumably
# the filtered table is already in linear space — confirm before enabling.
#noc_process.transform_intensities(func=np.exp2)

# Group replicates: columns sharing the same name once the trailing
# "_<number>" suffix is removed are treated as one sample
noc_process.group_replicates(reg_exp=r'(.*)_\d+$')

# Median across the replicates of each group; preview the first rows
noc_medians = ip.median_replicates(noc_process.grouped_table)
noc_medians.head()

Intensity values have not been transformed yet from filtered table,
we recommend using transform_intensities() method before grouping replicates.

Using filtered_table to group replicates.


Unnamed: 0,Infected_Nuclear,Infected_Cytosol,Infected_Organelle,Protein IDs
2,267320000.0,116740000.0,100650000.0,A0A023T6R1;Q96A72;F5H6P7;F5H6N1;F5H3U9;F5H124
3,302530000.0,171030000.0,170150000.0,Q9Y5S9;A0A023T787;A0A0J9YW13
5,14070000.0,2313300.0,45439000.0,A0A0C4DFM1;A0A024QYR3;Q92544;B4DH88;B4DKC1;Q6Z...
6,0.0,2772200.0,1104400.0,A0A024QYR6;A0A1V0DNR7;A0A6G6A825;F6KD02;F6KD01...
7,30991000.0,3596800.0,55897000.0,Q99805;A0A024QYR8;B3KSG9


In [6]:
# Total NOC signal per protein group: sum of the three infected fraction medians
noc_medians['noc_sum_inf'] = (
    noc_medians['Infected_Cytosol']
    + noc_medians['Infected_Organelle']
    + noc_medians['Infected_Nuclear']
)

# Keep only protein groups with a positive summed intensity; without any
# signal there is no information to compute a proportion from.
noc_medians = noc_medians.loc[noc_medians['noc_sum_inf'].gt(0)].reset_index(drop=True)

# New dataframe holding only the proportions: each fraction median divided by the sum
noc_proportion = noc_medians[['Protein IDs']].assign(
    NOC_cytosol_Infected=noc_medians['Infected_Cytosol'].div(noc_medians['noc_sum_inf']),
    NOC_organelle_Infected=noc_medians['Infected_Organelle'].div(noc_medians['noc_sum_inf']),
    NOC_nuclear_Infected=noc_medians['Infected_Nuclear'].div(noc_medians['noc_sum_inf']),
)


### Merge NOC proportion with IP enrichment

In [7]:
# Load the enrichment table written earlier in the pipeline.
# The CSV has a two-row header; the top level includes a "metadata" group.
enrich_out_dir = outdir / "enrichment_and_volcano_tables"
enrichment_csv_path = enrich_out_dir / f"{timestamp}_enrichment_table.csv"
enrichments = pd.read_csv(
    enrichment_csv_path,
    header=[0, 1],
    index_col=0,
)

# Remember the column names under "metadata" before flattening the header
# to a single level, so they remain available afterwards.
meta_cols = enrichments["metadata"].columns
enrichments = enrichments.droplevel(level=0, axis=1)

In [8]:
# Attach the NOC proportions to the enrichment values.
# The inner join on "Protein IDs" keeps only protein groups present in both tables.
enrichment_nocs = pd.merge(
    enrichments,
    noc_proportion,
    how="inner",
    on="Protein IDs",
)

# Apply the standard pyseus header formatting, passing the metadata column names
enrichment_nocs = ct.standard_pyseus_headers(enrichment_nocs, meta_cols=meta_cols)

In [9]:
# Write the merged table (enrichment values plus NOC proportions) to the
# enrichment output directory, prefixed with the current timestamp.
enrichment_NOC_csv_path = enrich_out_dir / f"{timestamp}_enrichment_table_NOC_prop.csv"
enrichment_nocs.to_csv(path_or_buf=enrichment_NOC_csv_path)

In [10]:
# Display the final table: enrichment values with the NOC proportion columns appended
enrichment_nocs

Unnamed: 0_level_0,metadata,metadata,metadata,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample
Unnamed: 0_level_1,Protein IDs,Majority protein IDs,Gene names,17-SLC30A2_Infected,14-GOLGA2_Infected,17-ATP1B3_Infected,17-RPL36_Infected,12-WT_Infected,12-LAMP1_Infected,12-YWHAQ_Infected,...,09-WT_Infected,10-RTN4_Infected,11-SEC31A_Infected,09-HSP90AA1_Infected,10-EXOC2_Infected,09-TOMM20_Infected,10-WT_Infected,NOC_cytosol_Infected,NOC_organelle_Infected,NOC_nuclear_Infected
0,A0A023T6R1;Q96A72;F5H6P7;F5H6N1;F5H3U9;F5H124,A0A023T6R1;Q96A72;F5H6P7;F5H6N1,FLJ10292;MAGOHB,-0.153721,-0.991876,-0.487636,1.801878,0.397270,-0.945137,-0.524625,...,2.427443,-1.404833,-0.933127,-0.652504,1.772409,0.502488,1.373609,0.240845,0.207650,0.551505
1,Q9Y5S9;A0A023T787;A0A0J9YW13,Q9Y5S9;A0A023T787,RBM8A;RBM8,-1.118800,-0.461100,-1.304100,0.610900,1.549150,-0.127200,0.362400,...,1.601700,-1.694300,0.352900,-3.383000,1.100000,-1.311100,0.557200,0.265694,0.264327,0.469979
2,A0A0C4DFM1;A0A024QYR3;Q92544;B4DH88;B4DKC1;Q6Z...,A0A0C4DFM1;A0A024QYR3;Q92544;B4DH88;B4DKC1;Q6ZTK5,TM9SF4,-3.445300,1.197850,-4.945500,-3.140200,-1.216500,-0.995400,-3.524800,...,0.196500,-1.205400,1.233000,-6.680200,-0.819500,-3.319200,0.337500,0.037419,0.734994,0.227588
3,A0A024QYR6;A0A1V0DNR7;A0A6G6A825;F6KD02;F6KD01...,A0A024QYR6;A0A1V0DNR7;A0A6G6A825;F6KD02;F6KD01...,PTEN,0.370895,0.264732,0.716220,-0.431213,-1.352463,-0.327094,0.322768,...,0.205414,-0.509183,-0.280560,0.976346,0.275284,-0.471726,-0.908906,0.715111,0.284889,0.000000
4,Q99805;A0A024QYR8;B3KSG9,Q99805;A0A024QYR8;B3KSG9,TM9SF2,-1.637100,1.353050,-2.353700,-1.692200,-3.085650,-0.651500,-3.231350,...,0.151900,-1.434300,1.472700,-7.951400,-0.159900,-2.908700,-0.052200,0.039750,0.617750,0.342500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8371,X5D2T3;X1WI28;P27635;B8A6G2;X5D2W5;A6QRI9;Q96L21,X5D2T3;X1WI28;P27635,RPL10,0.746000,-0.111700,0.928900,2.935600,0.676750,-0.265450,0.236100,...,-0.374000,0.124800,0.171400,-1.241500,0.163500,0.540700,0.752400,0.552010,0.318835,0.129154
8372,X5D7P8,X5D7P8,STK39,0.509498,0.032787,0.718537,-1.866703,0.026481,-0.576165,1.023599,...,0.372828,-1.091708,-0.136871,0.914852,0.988118,-0.020222,0.344576,0.876850,0.123150,0.000000
8373,X5DQV1;X5DNI1;B3KV96;E9PD68;B3KXQ5;Q14194;B3KT...,X5DQV1;X5DNI1;B3KV96;E9PD68;B3KXQ5;Q14194;B3KT...,CRMP1,-0.339776,0.753024,-0.496244,-0.291901,-0.336175,1.006970,-0.764510,...,-0.643068,0.232216,1.209000,0.052735,-0.552581,0.055227,-0.431497,1.000000,0.000000,0.000000
8374,X5DQZ7,X5DQZ7,GPX1,-1.462731,3.127882,-0.747647,-0.300528,-0.126492,-1.682565,-1.341266,...,-1.720063,1.696521,-0.648949,4.766470,-1.034065,4.240770,-1.249227,0.616662,0.383338,0.000000
