### Pre-processing and significance testing of Organellar IPs in the uninfected state

#### This is the third step of the Organellar IPs analysis

Some Org-IP samples may have very similar profiles, which may lead to underestimation of significance if outlier proteins overlap. 

To tackle this problem, enrichment values are pre-calculated using every other sample in the batch, and these enrichment values are correlated across samples.

The correlation values are then used to prevent highly correlated Org-IP samples from being used together in the significance test.

In [8]:
import os, sys
from pathlib import Path
import pandas as pd

# Resolve the "script" and "data" folders four directory levels above the current
# working directory (assumes the notebook is run from its own folder — TODO confirm).
script_path = Path.cwd().parent.parent.parent.parent / "script"
data_path = Path.cwd().parent.parent.parent.parent / "data"
# Put the script folder on the import path so the pyseus imports below can resolve
# (presumably pyseus lives inside that folder).
sys.path.append(str(script_path))
from pyseus import primary_analysis as pa
from pyseus import spatial_tools as st

### Load the imputed IP table
The correct timestamp is required to find the input files.

In [9]:
# Restore the timestamp and the "use frozen results" flag from IPython's %store
# (presumably saved by an earlier notebook of this analysis).
%store -r fig5_timestamp FIG5_USE_FROZEN
# Abort when frozen results are requested: in that mode the enrichment step is meant to be skipped.
if FIG5_USE_FROZEN:
    raise Exception("USE_FROZEN is true, you probably want to skip enrichment and proceed from 3.aligned_umap")
timestamp = fig5_timestamp
print(f"Timestamp: {timestamp}")

# file-name prefix of the imputed table that is loaded further down
outprefix = f"{timestamp}_QC_filter_impute"

# output folders, resolved relative to the current working directory
outdir = Path.cwd() / "output"
preprocessing_out = outdir / f"preprocessing"

Timestamp: 2024-10-27


In [10]:
# Load the bait imputed table from the preprocessing output folder.
# header=[0, 1] reads the first two rows as a two-level column index and
# index_col=0 uses the first column as the row index.
IP_path = preprocessing_out / f"{outprefix}_imputed_table.csv"

# Each handler prints a hint and then re-raises: every later cell needs
# bait_imputed_table, so continuing after a failed load would only surface as an
# unrelated NameError (or silently reuse a stale table from a previous run).
try:
    bait_imputed_table = pd.read_csv(IP_path, header=[0, 1], index_col=0)
except FileNotFoundError:
    print(f"File {IP_path} not found.\nPlease run 1.QC_filter_and_impute.ipynb first or specify the correct timestamp, current value is {timestamp}")
    raise
except pd.errors.ParserError:
    print(f"There was an error parsing the CSV file at {IP_path}.")
    raise
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    raise

### Correlation filter

In [11]:
# Each inner list holds the experiment prefixes (the text before the first "-" in a
# sample column name) that are processed together as one batch.
batches = [["14", "12", "17"], ["09", "10", "11"]]

# work on a copy of the imputed, preprocessed table from the earlier steps
grouped = bait_imputed_table.copy()

# per-batch results are collected here and concatenated once the loop is done
volcano_tables, enrichment_tables = [], []
volcano_tables_againstWT, enrichment_tables_againstWT = [], []

# folder that receives the control and correlation tables
correlation_table_dir = outdir / "correlation_tables"
correlation_table_dir.mkdir(parents=True, exist_ok=True)

for i, batch in enumerate(batches):
    print(f"batch: {i}")

    # keep every metadata column plus the sample columns whose experiment prefix
    # (second-level column name, split at the dash) belongs to the current batch
    new_cols = [
        col
        for col in grouped.columns
        if col[0] == "metadata" or col[1].split("-")[0] in batch
    ]
    selected = grouped[new_cols].copy()

    # first pass: enrichments against the selected controls, used afterwards to
    # identify highly correlated samples
    analysis = pa.AnalysisTables(grouped_table=selected, auto_group=False)
    analysis.generate_export_bait_matrix()

    # analysis.select_wildtype_controls(wt_re='-WT')  # alternative: select only WT as controls; this was confirmed by Manu to be better than using all samples as the initial controls
    analysis.select_matching_WT_control(wt_re="-WT")  # select matching WT as controls.

    # save the exclusion matrix of this batch
    # (print(analysis.exclusion_matrix) can be used to inspect it)
    negative_control_csv = correlation_table_dir / f"{timestamp}_negative_control_batch_{i}.csv"
    analysis.exclusion_matrix.to_csv(negative_control_csv, index=False)

    analysis.simple_pval_enrichment(std_enrich=False)
    analysis.convert_to_enrichment_table()
    complements = analysis.enrichment_table.copy()
    control_mat = analysis.exclusion_matrix.copy()

    # keep the enrichments and p-values calculated against WT for export
    enrichment_tables_againstWT.append(analysis.enrichment_table.copy())
    volcano_tables_againstWT.append(analysis.simple_pval_table.copy())

    # second pass: use the first-pass enrichments to find highly correlated
    # samples (the filter is at >0.35)
    spatial = st.SpatialTables(
        preprocessed_table=selected,
        enrichment_table=complements,
        control_mat=control_mat,
    )
    spatial.enrichment_corr_control_mat(corr=0.35)

    # save the correlation tables of this batch
    corr_table_csv = correlation_table_dir / f"{timestamp}_corr_table_batch_{i}.csv"
    corr_val_table_csv = correlation_table_dir / f"{timestamp}_corr_val_table_batch_{i}.csv"
    spatial.corr_mat.to_csv(corr_table_csv, index=False)
    spatial.sample_corrs.to_csv(corr_val_table_csv, index=True)

    # final p-value and enrichment tables for the batch
    spatial.new_corr_ARI(just_enrichment=True, labels=None, reference=None, std_enrich=False)

    # queue the batch results for concatenation
    volcano_tables.append(spatial.corr_pval_table)
    enrichment_tables.append(spatial.corr_enrichment_table)

batch: 0
P-val calculations..
Finished!
P-val calculations..
Finished!
batch: 1
P-val calculations..
Finished!
P-val calculations..
Finished!


The final step is to concatenate results from the batches. 

The enrichment table can be used for multiple purposes, including UMAP generation.

In [12]:
# Final enrichment table: the metadata columns are taken once, from the first batch,
# followed by the sample columns of every batch placed side by side.
enrichment_metadata = enrichment_tables[0].loc[:, ["metadata"]].copy()
enrichment_samples = [
    batch_table.drop(columns=["metadata"], level=0) for batch_table in enrichment_tables
]
final_enrichments = pd.concat([enrichment_metadata, *enrichment_samples], axis=1)

# Final volcano table, assembled the same way.
volcano_metadata = volcano_tables[0].loc[:, ["metadata"]].copy()
volcano_samples = [
    batch_table.drop(columns=["metadata"], level=0) for batch_table in volcano_tables
]
final_volcano = pd.concat([volcano_metadata, *volcano_samples], axis=1)

In [13]:
# write the concatenated tables into their own folder below the output directory
enrich_out_dir = outdir / "enrichment_and_volcano_tables"
enrich_out_dir.mkdir(parents=True, exist_ok=True)

# volcano table
volcano_csv_path = enrich_out_dir / f"{timestamp}_volcano_table.csv"
final_volcano.to_csv(volcano_csv_path)

# enrichment table
enrichment_csv_path = enrich_out_dir / f"{timestamp}_enrichment_table.csv"
final_enrichments.to_csv(enrichment_csv_path)

In [14]:
# show the concatenated enrichment table as the cell output
final_enrichments

Unnamed: 0_level_0,metadata,metadata,metadata,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample
Unnamed: 0_level_1,Protein IDs,Majority protein IDs,Gene names,12-YWHAQ,14-GOLGA2,14-WT,14-RAB7A,17-RPL36,17-MAP1LC3B,17-G3BP1,...,09-PEX3,10-AP2B1,10-TOMM20,10-WT,10-RTN4,09-PSMB7,11-CEP350,11-EEA1,09-WT,09-EDC4
0,A0A023T6R1;Q96A72;F5H6P7;F5H6N1;F5H3U9;F5H124,A0A023T6R1;Q96A72;F5H6P7;F5H6N1,FLJ10292;MAGOHB,-2.365154,1.903443,1.492111,0.410130,0.131175,3.167371,-0.844879,...,0.268200,0.193761,0.520921,2.873520,-0.550241,-0.999255,-1.663909,0.169378,2.185520,2.286857
1,Q9Y5S9;A0A023T787;A0A0J9YW13,Q9Y5S9;A0A023T787,RBM8A;RBM8,0.844500,0.023500,0.249350,-1.191250,-0.067500,0.131400,0.825300,...,-0.496950,-0.336600,-2.969400,1.972000,-1.899050,-0.327100,0.895300,-1.618800,1.836900,1.614550
2,A0A0C4DFM1;A0A024QYR3;Q92544;B4DH88;B4DKC1;Q6Z...,A0A0C4DFM1;A0A024QYR3;Q92544;B4DH88;B4DKC1;Q6ZTK5,TM9SF4,-5.392200,4.584300,1.166750,2.824950,-1.936000,4.301950,-1.202100,...,-0.092300,0.620900,-2.685600,0.006200,1.852200,-4.414473,0.397300,-1.672800,0.315600,-0.149000
3,A0A024QYR6;A0A1V0DNR7;A0A6G6A825;F6KD02;F6KD01...,A0A024QYR6;A0A1V0DNR7;A0A6G6A825;F6KD02;F6KD01...,PTEN,0.292543,0.061496,0.554866,-0.403786,-0.889107,-0.557692,-1.011447,...,-0.027236,-0.152848,0.974687,-0.359181,-0.353914,-0.007011,0.117718,-0.425096,0.214107,0.711269
4,Q99805;A0A024QYR8;B3KSG9,Q99805;A0A024QYR8;B3KSG9,TM9SF2,0.442400,2.469750,-0.580900,2.383950,-2.063500,3.002300,-1.847700,...,0.016550,1.823800,-2.382900,-0.608600,1.524950,-1.587400,-0.438800,2.055000,-0.744500,-1.223000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8587,X5D7P8,X5D7P8,STK39,0.553451,0.017426,0.560006,0.217108,-0.581026,0.236772,-0.177288,...,-0.079309,-0.587919,0.821211,0.287963,-0.071477,0.206176,-0.027301,0.200878,0.202897,-0.480174
8588,X5D8X9,X5D8X9,CNTNAP2,-2.114554,2.865008,2.535765,1.996572,-2.510131,-2.170080,-3.362914,...,1.445820,-0.975355,0.550481,-0.415994,-0.021098,-0.591036,1.924365,1.081366,-0.351450,0.757855
8589,X5DQV1;X5DNI1;B3KV96;E9PD68;B3KXQ5;Q14194;B3KT...,X5DQV1;X5DNI1;B3KV96;E9PD68;B3KXQ5;Q14194;B3KT...,CRMP1,-0.498195,-0.335685,0.592048,-0.029638,0.191383,-0.969134,-0.248104,...,-0.289652,-0.161352,0.486597,0.249164,-0.678933,0.670191,1.661657,1.424154,0.051400,0.278300
8590,X5DQZ7,X5DQZ7,GPX1,-1.167690,2.563687,0.173506,2.126644,-0.268470,0.031190,-0.335495,...,0.209912,-0.291020,3.251738,-0.797759,0.706188,-0.409023,-0.045873,-0.304209,-0.219237,-0.578410
