# NOC processing
#### This is the fourth and final step of the enrichment calculation
Calculate the NOC (nuclear / organelle / cytosol) proportion of each protein group and append it to the enrichment table  

Key output files
- `yyyy-mm-dd_enrichment_table_NOC_prop.csv`: enrichment values and NOC proportion  
- `yyyy-mm-dd_volcano_table.csv`: enrichment values and p-values  
  
both files are in the directory `output/enrichment_and_volcano_tables`

In [1]:
import os, sys
from pathlib import Path
import pandas as pd

# Both the "script" and "data" folders sit four directory levels above the
# notebook's working directory; resolve that shared root once.
project_root = Path.cwd().parent.parent.parent.parent
script_path = project_root.joinpath("script")
data_path = project_root.joinpath("data")

# Make the modules under "script" importable from this notebook.
sys.path.append(str(script_path))

from pyseus import basic_processing as ip
from pyseus import contrast_tools as ct

### Load imputed NOC/Bulk table
The correct timestamp (restored from the stored `fig5_timestamp` variable via `%store -r`) is required to find the input files

In [2]:
%store -r fig5_timestamp FIG5_USE_FROZEN
if FIG5_USE_FROZEN:
    raise Exception("USE_FROZEN is true, you probably want to skip enrichment and proceed from 3.aligned_umap")
timestamp = fig5_timestamp
print(f"Timestamp: {timestamp}")

outprefix = f"{timestamp}_QC_filter_impute"

outdir = Path.cwd() / "output"
preprocessing_out = outdir / f"preprocessing"

NOC_path = preprocessing_out / f"{outprefix}_filtered_table.csv"

try:
    noc_table = pd.read_csv(NOC_path, header=[0], index_col=0)
except FileNotFoundError:
    print(f"File {NOC_path} not found.\nPlease rerun the previous steps or specify the correct timestamp, current value is {timestamp}")
except pd.errors.ParserError:
    print(f"There was an error parsing the CSV file at {NOC_path}.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")


Timestamp: 2023-12-04


### Calculation of proportion

In [3]:
# Collect the NOC sample columns: a column qualifies when its name contains
# one of the fraction keywords and does not contain "Proteome".
NOC_kw = ["Cytosol", "Nuclear", "Organelle"]
noc_cols = []
for col in noc_table.columns:
    if "Proteome" in col:
        continue
    if any(kw in col for kw in NOC_kw):
        noc_cols.append(col)
print(noc_cols)

['UnInfected_Cytosol_1', 'UnInfected_Cytosol_2', 'UnInfected_Cytosol_3', 'UnInfected_Nuclear_1', 'UnInfected_Nuclear_2', 'UnInfected_Nuclear_3', 'UnInfected_Organelle_1', 'UnInfected_Organelle_2', 'UnInfected_Organelle_3']


In [4]:
# Keep only the merge key ("Protein IDs") together with the NOC sample columns.
keep_cols = ['Protein IDs', *noc_cols]
noc_table = noc_table.loc[:, keep_cols].copy()

In [5]:
# Collapse the replicate columns into one median value per condition.

# Set up a RawTables object (pyseus basic_processing) with the info and
# sample columns designated directly, then hand it the filtered table that
# was prepared in the previous cells.
noc_process = ip.RawTables(
    proteingroup=None,
    file_designated=True,
    info_cols=["Protein IDs"],
    sample_cols=noc_cols,
)
noc_process.filtered_table = noc_table

# NOTE(review): transform_intensities() is not called before grouping, which
# is why pyseus prints its recommendation message below. Presumably the
# filtered table already holds linear-space intensities -- TODO confirm.

# Replicates share a name up to the trailing "_<number>"; the captured group
# becomes the name of the grouped condition.
replicate_pattern = r"(.*)_\d+$"
noc_process.group_replicates(reg_exp=replicate_pattern)

# Median across the replicates of each condition.
noc_medians = ip.median_replicates(noc_process.grouped_table)
noc_medians.head()

Intensity values have not been transformed yet from filtered table,
we recommend using transform_intensities() method before grouping replicates.

Using filtered_table to group replicates.


Unnamed: 0,UnInfected_Nuclear,UnInfected_Organelle,UnInfected_Cytosol,Protein IDs
2,166520000.0,76811000.0,75060000.0,A0A023T6R1;Q96A72;F5H6P7;F5H6N1;F5H3U9;F5H124
3,238020000.0,194700000.0,245940000.0,Q9Y5S9;A0A023T787;A0A0J9YW13
5,22978000.0,51652000.0,4330700.0,A0A0C4DFM1;A0A024QYR3;Q92544;B4DH88;B4DKC1;Q6Z...
6,0.0,0.0,3272600.0,A0A024QYR6;A0A1V0DNR7;A0A6G6A825;F6KD02;F6KD01...
7,24210000.0,77290000.0,9258400.0,Q99805;A0A024QYR8;B3KSG9


In [6]:
# Per protein group, add up the median intensities of the three uninfected
# fractions (row-wise total across cytosol, organelle and nuclear).
noc_medians['noc_sum_uninf'] = (
    noc_medians['UnInfected_Cytosol']
    .add(noc_medians['UnInfected_Organelle'])
    .add(noc_medians['UnInfected_Nuclear'])
)

# Keep only protein groups whose uninfected total is strictly positive; a
# total of 0 carries no information and cannot be used as a denominator.
has_signal = noc_medians['noc_sum_uninf'].gt(0)
noc_medians = noc_medians.loc[has_signal].reset_index(drop=True)

# Build the proportion table: each fraction's intensity divided by the total.
# The mapping order fixes the order of the output columns.
fraction_to_proportion = {
    'UnInfected_Cytosol': 'NOC_cytosol_UnInfected',
    'UnInfected_Organelle': 'NOC_organelle_UnInfected',
    'UnInfected_Nuclear': 'NOC_nuclear_UnInfected',
}
noc_proportion = noc_medians[['Protein IDs']].copy()
for fraction_col, proportion_col in fraction_to_proportion.items():
    noc_proportion[proportion_col] = noc_medians[fraction_col] / noc_medians['noc_sum_uninf']


### Merge NOC proportion with IP enrichment

In [7]:
# Load the IP enrichment table that the NOC proportions will be merged into.

# Location of the enrichment table for the current timestamp.
enrich_out_dir = outdir.joinpath("enrichment_and_volcano_tables")
enrichment_csv_path = enrich_out_dir.joinpath(f"{timestamp}_enrichment_table.csv")

# The CSV carries a two-row header; remember which columns sit under the
# "metadata" group before flattening the header to a single level.
enrichments_two_level = pd.read_csv(enrichment_csv_path, header=[0, 1], index_col=0)
meta_cols = enrichments_two_level["metadata"].columns
enrichments = enrichments_two_level.droplevel(level=0, axis=1)

In [8]:
# Attach the NOC proportions to the enrichment values. The inner join on
# "Protein IDs" keeps only protein groups present in both tables.
merged_flat = pd.merge(enrichments, noc_proportion, how="inner", on="Protein IDs")

# Restore the standard pyseus column headers, passing along the metadata
# columns recorded when the enrichment table was read.
enrichment_nocs = ct.standard_pyseus_headers(merged_flat, meta_cols=meta_cols)

In [9]:
# Write the combined enrichment + NOC proportion table next to the
# enrichment table it was built from.
noc_prop_filename = f"{timestamp}_enrichment_table_NOC_prop.csv"
enrichment_NOC_csv_path = enrich_out_dir.joinpath(noc_prop_filename)
enrichment_nocs.to_csv(path_or_buf=enrichment_NOC_csv_path)

In [10]:
# Display the final table as a visual check: enrichment values for each
# sample column, followed by the three appended NOC_* proportion columns.
enrichment_nocs

Unnamed: 0_level_0,metadata,metadata,metadata,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample
Unnamed: 0_level_1,Protein IDs,Majority protein IDs,Gene names,14-GOLGA2,14-RAB14,17-ATP1B3,14-RAB11A,17-MAP1LC3B,12-LAMP1,12-WT,...,10-AP2B1,11-GPR107,11-CEP350,09-PSMB7,09-EDC4,09-HSP90AA1,10-RTN4,NOC_cytosol_UnInfected,NOC_organelle_UnInfected,NOC_nuclear_UnInfected
0,A0A023T6R1;Q96A72;F5H6P7;F5H6N1;F5H3U9;F5H124,A0A023T6R1;Q96A72;F5H6P7;F5H6N1,FLJ10292;MAGOHB,2.153417,1.892810,-0.572400,1.456910,2.979922,-1.784115,-1.925398,...,0.569077,-1.941187,-0.501610,0.407900,2.150198,-1.514841,0.252975,0.235748,0.241247,0.523005
1,Q9Y5S9;A0A023T787;A0A0J9YW13,Q9Y5S9;A0A023T787,RBM8A;RBM8,0.023500,-0.330450,-1.834500,-1.285350,0.131400,-0.118800,1.427950,...,-0.336600,-2.043600,0.895300,-0.327100,1.614550,-2.443700,-1.899050,0.362391,0.286889,0.350721
2,A0A0C4DFM1;A0A024QYR3;Q92544;B4DH88;B4DKC1;Q6Z...,A0A0C4DFM1;A0A024QYR3;Q92544;B4DH88;B4DKC1;Q6ZTK5,TM9SF4,4.584300,4.124300,-3.557200,4.259000,4.301950,-1.206450,-5.139850,...,0.620900,5.885700,0.397300,-5.327942,-0.149000,-5.032831,1.852200,0.054846,0.654148,0.291006
3,A0A024QYR6;A0A1V0DNR7;A0A6G6A825;F6KD02;F6KD01...,A0A024QYR6;A0A1V0DNR7;A0A6G6A825;F6KD02;F6KD01...,PTEN,0.454289,0.499897,0.578235,0.382422,-0.555647,0.213112,-0.135801,...,0.517645,-0.749186,-0.364061,1.318186,0.035807,-0.054300,-0.000587,1.000000,0.000000,0.000000
4,Q99805;A0A024QYR8;B3KSG9,Q99805;A0A024QYR8;B3KSG9,TM9SF2,2.469750,2.903350,-1.687400,2.420950,3.002300,0.968700,-6.612522,...,1.823800,5.827450,-0.438800,-1.587400,-1.223000,-3.389100,1.524950,0.083591,0.697825,0.218584
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8532,X5D7P8,X5D7P8,STK39,-0.363124,-0.845920,0.500627,0.603359,-0.009725,0.045149,-0.478913,...,0.122412,0.041333,0.087444,0.745895,0.028347,1.780592,-0.887082,0.765637,0.145727,0.088636
8533,X5D8X9,X5D8X9,CNTNAP2,2.571864,3.155311,3.373042,2.671111,-1.736598,-2.082729,-1.978545,...,-0.985532,-0.133089,2.203500,-0.512365,0.204038,-0.310244,-1.684899,0.000000,0.802964,0.197036
8534,X5DQV1;X5DNI1;B3KV96;E9PD68;B3KXQ5;Q14194;B3KT...,X5DQV1;X5DNI1;B3KV96;E9PD68;B3KXQ5;Q14194;B3KT...,CRMP1,0.766317,0.519500,-0.460152,0.239464,-1.826895,0.405054,0.481500,...,-0.328330,-0.351563,-0.076900,-0.273607,1.418425,-0.032250,0.604392,1.000000,0.000000,0.000000
8535,X5DQZ7,X5DQZ7,GPX1,2.632244,3.655411,-0.809530,3.298411,1.251901,-0.055754,-0.293289,...,-0.878981,0.200192,0.969230,-0.284345,0.045823,4.108577,0.575631,0.720741,0.279259,0.000000
