### Correlation filter

#### This is the third step of the enrichment calculation

Some OrgIP samples have very similar profiles, which may lead to underestimation of significance if outlier proteins overlap. 

To tackle this problem, enrichment values are pre-calculated using every other sample in the batch, and these enrichment values are correlated across samples.

The correlation values are then used to prevent highly correlated org-IP samples from being used together in the significance test.

In [2]:
import os, sys
from pathlib import Path
import pandas as pd

# Locate the project-level "script" and "data" folders, two directories above the
# notebook's working directory, and add the script folder to the module search
# path so that the project imports that follow can be resolved from it.
project_root = Path.cwd().parent.parent
script_path, data_path = (project_root / folder for folder in ("script", "data"))
sys.path.append(str(script_path))
from pyseus import primary_analysis as pa
from pyseus import spatial_tools as st # NOTE: if you run into the "No module named '_curses'" error, and are using Windows, please pip-install windows-curses using: pip install windows-curses

### Load the imputed IP table
The correct timestamp is required to find the input files.

In [3]:
%store -r timestamp USE_FROZEN
# Abort early when USE_FROZEN is set: as the message says, the enrichment
# notebooks should be skipped in that mode.
if USE_FROZEN:
    raise RuntimeError("USE_FROZEN is true, you probably want to skip enrichment and proceed from Fig1")
print(f"Timestamp: {timestamp}")

# File-name prefix of the inputs for this timestamp (used to build the
# imputed-table path in the next cell)
outprefix = f"{timestamp}_QC_filter_impute"

# All outputs live under ./output relative to the notebook's working directory
outdir = Path.cwd() / "output"
preprocessing_out = outdir / "preprocessing"

Timestamp: 2024-10-26


In [4]:
# load the bait imputed table
IP_path = preprocessing_out / f"{outprefix}_imputed_table.csv"

# The CSV is read with a two-row column header and its first column as the row index.
# Every handler prints a hint and then re-raises: nothing downstream can run without
# this table, and swallowing the error would either resurface later as a confusing
# NameError or let the notebook continue with a stale `bait_imputed_table` left over
# from an earlier run in the same kernel.
try:
    bait_imputed_table = pd.read_csv(IP_path, header=[0, 1], index_col=0)
except FileNotFoundError:
    print(f"File {IP_path} not found.\nPlease run 1.QC_filter_and_impute.ipynb first or specify the correct timestamp, current value is {timestamp}")
    raise
except pd.errors.ParserError:
    print(f"There was an error parsing the CSV file at {IP_path}.")
    raise
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    raise

### Correlation filter

Set up the batches according to `2.Batch_selection.ipynb`.

In [5]:
# Each inner list defines one batch. Its entries are matched against the part of a
# sample's column name that precedes the first "-" (e.g. "12" for "12-WT"), so all
# samples whose prefix is listed together are processed as one batch in the loop below.
batches = [
    ["12", "13"],
    ["01", "03", "04", "05"],
    ["02", "06", "07"],
    ["14", "15", "17"],
    ["09", "10", "11"],
]

In [6]:
# work on a copy of the imputed table
grouped = bait_imputed_table.copy()

# lists that collect one dataframe per batch, for concatenation at the end
volcano_tables = []
enrichment_tables = []
# NOTE(review): the two *_againstWT lists are filled in the loop but are not consumed
# by any of the cells shown here -- confirm whether a later step still needs them.
volcano_tables_againstWT = []
enrichment_tables_againstWT = []

# create a folder to save the correlation tables
correlation_table_dir = outdir / "correlation_tables"
os.makedirs(correlation_table_dir, exist_ok=True)

# iterate through batches; `i` is the 0-based batch index used in the output file names
for i, batch in enumerate(batches):
    print('batch: ' + str(i))
    new_cols = []
    # select all the samples from each batch
    for col in list(grouped):
        # columns are two-level tuples; every 'metadata' column is always kept
        if col[0] == 'metadata':
            new_cols.append(col)
        # identifying each batch: keep a sample when the part of its name
        # before the first '-' is listed in the current batch
        elif col[1].split('-')[0] in batch:
            new_cols.append(col) 
    selected = grouped[new_cols].copy()

    # initial calculation of complement enrichments to identify highly correlated samples
    analysis = pa.AnalysisTables(grouped_table=selected, auto_group=False)
    analysis.generate_export_bait_matrix()

    # analysis.select_wildtype_controls(wt_re='-WT') # select only WT as controls.
    analysis.select_matching_WT_control(wt_re='-WT') # select matching WT as controls; this was confirmed to be better than using only WT samples as the initial controls

    # print(analysis.exclusion_matrix) # uncomment to check the exclusion matrix
    # save the exclusion (negative control) matrix chosen for this batch
    analysis.exclusion_matrix.to_csv(correlation_table_dir / f"{timestamp}_negative_control_batch_{i}.csv", index=False)

    
    analysis.simple_pval_enrichment(std_enrich=False)
    analysis.convert_to_enrichment_table()
    # copies of the first-pass results, handed to the correlation step below
    complements = analysis.enrichment_table.copy()
    control_mat = analysis.exclusion_matrix.copy()

    enrichment_tables_againstWT.append(analysis.enrichment_table.copy()) # export the enrichments against WT
    volcano_tables_againstWT.append(analysis.simple_pval_table.copy()) # export the pvals against WT
    
    # using the previous calculations, find highly correlated samples (filter is at >0.35)
    spatial = st.SpatialTables(
        preprocessed_table=selected,
        enrichment_table=complements,
        control_mat=control_mat)
    spatial.enrichment_corr_control_mat(corr=0.35)

    # save the correlation tables for each batch
    spatial.corr_mat.to_csv(correlation_table_dir / f"{timestamp}_corr_table_batch_{i}.csv", index=False)  
    spatial.sample_corrs.to_csv(correlation_table_dir / f"{timestamp}_corr_val_table_batch_{i}.csv", index=True)
    
    # calculate the final pval and enrichment table for the batch
    spatial.new_corr_ARI(just_enrichment=True, labels=None, reference=None, std_enrich=False)

    # appending respective dataframes to the concatenation lists
    volcano_tables.append(spatial.corr_pval_table)
    enrichment_tables.append(spatial.corr_enrichment_table)
    

batch: 0
P-val calculations..
Finished!
P-val calculations..
Finished!
batch: 1
P-val calculations..
Finished!
P-val calculations..
Finished!
batch: 2
P-val calculations..
Finished!
P-val calculations..
Finished!
batch: 3
P-val calculations..
Finished!
P-val calculations..
Finished!
batch: 4
P-val calculations..
Finished!
P-val calculations..
Finished!


The final step is to concatenate results from the batches. 

The enrichment table can be used for multiple purposes, including UMAP generation.

In [7]:
def _split_metadata(batch_tables):
    """Split per-batch tables into one metadata frame and the sample-only frames.

    The metadata columns are copied from the first batch table only; every table
    (including the first) is returned with its top-level "metadata" columns dropped.
    """
    metadata = batch_tables[0].loc[:, ["metadata"]].copy()
    samples = [table.drop(["metadata"], axis=1, level=0) for table in batch_tables]
    return metadata, samples


# final enrichment table: metadata columns once, then the samples of every batch side by side
enrichment_metadata, enrichment_samples = _split_metadata(enrichment_tables)
final_enrichments = pd.concat([enrichment_metadata, *enrichment_samples], axis=1)

# final volcano table, assembled the same way
volcano_metadata, volcano_samples = _split_metadata(volcano_tables)
final_volcano = pd.concat([volcano_metadata, *volcano_samples], axis=1)

In [8]:
# write the concatenated volcano and enrichment tables to CSV files
enrich_out_dir = outdir / "enrichment_and_volcano_tables"
enrich_out_dir.mkdir(parents=True, exist_ok=True)

volcano_csv_path = enrich_out_dir / f"{timestamp}_volcano_table.csv"
enrichment_csv_path = enrich_out_dir / f"{timestamp}_enrichment_table.csv"

# volcano table first, then the enrichment table
for table, csv_path in (
    (final_volcano, volcano_csv_path),
    (final_enrichments, enrichment_csv_path),
):
    table.to_csv(csv_path)

In [9]:
# check the enrichment table: as the last expression of the cell it is rendered as the cell output
final_enrichments

Unnamed: 0_level_0,metadata,metadata,metadata,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample
Unnamed: 0_level_1,Protein IDs,Majority protein IDs,Gene names,13-GOLGA2,12-PNPLA2,12-SEC61B,12-WT,13-RAB14,12-LAMP1,13-RAB7A,...,11-SEC31A,11-GPR107,09-ATG101,09-WT,09-HSP90AA1,09-PSMB7,10-AP2B1,10-WT,11-CEP350,10-TOMM20
0,A0A023T6R1;Q96A72;F5H6P7;F5H6N1;F5H3U9;F5H124,A0A023T6R1;Q96A72;F5H6P7;F5H6N1,FLJ10292;MAGOHB,-0.832567,0.590713,-0.265127,-0.829845,0.218249,0.006886,-0.538597,...,-0.275391,-1.179204,3.033243,2.089331,-0.768255,-0.628961,-0.824471,2.777331,-0.213170,-0.559929
1,Q9Y5S9;A0A023T787;A0A0J9YW13,Q9Y5S9;A0A023T787,RBM8A;RBM8,-0.430800,0.677400,-0.631700,1.425600,-0.984800,-0.588500,-1.357550,...,0.979850,-2.108900,-0.728450,1.836900,-2.443700,-0.327100,-0.336600,1.972000,0.895300,-2.969400
2,A0A0C4DFM1;A0A024QYR3;Q92544;B4DH88;B4DKC1;Q6Z...,A0A0C4DFM1;A0A024QYR3;Q92544;B4DH88;B4DKC1;Q6ZTK5,TM9SF4,6.283809,-4.335088,4.012383,-3.924100,4.320300,3.148130,4.496779,...,1.688300,5.934600,2.393000,0.315600,-4.311600,-3.643343,0.620900,0.006200,0.397300,-2.685600
3,A0A024QYR6;A0A1V0DNR7;A0A6G6A825;F6KD02;F6KD01...,A0A024QYR6;A0A1V0DNR7;A0A6G6A825;F6KD02;F6KD01...,PTEN,0.739701,-0.845441,0.848930,-0.035370,-0.035675,0.088362,0.075128,...,-0.231775,0.270198,-0.442885,0.085321,-0.052780,0.522294,-0.755410,-0.056601,0.233463,0.841546
4,Q99805;A0A024QYR8;B3KSG9,Q99805;A0A024QYR8;B3KSG9,TM9SF2,6.830250,-8.207671,6.156061,-5.951607,7.166215,5.852807,8.552372,...,1.179800,5.970200,-0.412350,-0.744500,-3.389100,-1.587400,1.823800,-0.608600,-0.438800,-2.382900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8594,X5D7P8,X5D7P8,STK39,-0.471505,-0.867377,-0.384673,1.035184,-0.725610,-0.089538,-0.244117,...,0.780834,-0.197799,-0.041305,-0.297478,-0.446006,-0.220150,-0.297105,-0.177771,0.846096,-1.447514
8595,X5D8X9,X5D8X9,CNTNAP2,0.919475,0.094208,-0.425143,0.300054,0.481218,-0.368300,-0.933941,...,1.987213,-0.036976,1.450198,-0.635438,-0.468167,-0.489276,-1.360433,-0.010147,2.103270,-1.001487
8596,X5DQV1;X5DNI1;B3KV96;E9PD68;B3KXQ5;Q14194;B3KT...,X5DQV1;X5DNI1;B3KV96;E9PD68;B3KXQ5;Q14194;B3KT...,CRMP1,-0.808379,-0.123037,0.274291,-0.414747,0.292167,0.734020,0.619879,...,0.478744,-1.138186,-0.736214,0.154687,-0.534041,-0.057126,0.138068,0.263834,0.939305,-0.347383
8597,X5DQZ7,X5DQZ7,GPX1,-0.961491,-0.766564,0.437071,-0.238496,0.106310,0.130053,0.224640,...,0.794681,-0.321079,-0.026692,-0.156213,3.753081,0.060683,0.283482,-0.283292,0.831158,3.091007
