### Correlation filter

#### This is the third step of the enrichment calculation

Some Org-IP samples may have very similar profiles, which may lead to underestimation of significance if outlier proteins overlap. 

To tackle this problem, enrichment values are pre-calculated using every other sample in the batch, and these enrichment values are correlated across samples.

The correlation values are then used to prevent highly correlated Org-IP samples from being used together in the significance test.

This notebook conducts a sweep of the correlation cutoff values, using the ARI as an evaluation metric.

In [1]:
import pandas as pd
import sys
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
from pathlib import Path

# Both the helper scripts and the data folder are located four directory
# levels above the current working directory.
_base_dir = Path.cwd().parent.parent.parent.parent
script_path = _base_dir / "script"
data_path = _base_dir / "data"

# Make the modules under the script folder importable from this notebook.
sys.path.append(str(script_path))
from pyseus import primary_analysis as pa
from pyseus import spatial_tools as st

### Load the imputed IP table
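The datestamp must match the one in the file names written by the preprocessing notebook (`1.QC_filter_and_impute.ipynb`); otherwise the input files will not be found.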

In [2]:
# Date stamp (YYYY-MM-DD) taken from the current date; it is embedded in the
# names of the files this notebook reads and writes.
timestamp = f"{datetime.now():%Y-%m-%d}"
print(f"Timestamp: {timestamp}")

# File-name prefix used to locate the imputed table loaded below.
outprefix = f"{timestamp}_QC_filter_impute"

# Output locations, relative to the current working directory.
outdir = Path.cwd() / "output"
preprocessing_out = outdir / "preprocessing"

Timestamp: 2023-12-04


In [3]:
#### Load bait imputed tables (IPs)
# The CSV is read with its first two rows as a two-level column header and
# its first column as the row index.
IP_path = preprocessing_out / f"{outprefix}_imputed_table.csv"

# Every handler prints a hint and then re-raises. Swallowing the error would
# leave `bait_imputed_table` undefined (or stale from a previous run), so the
# following cells would fail later with a less informative error or silently
# use old data.
try:
    bait_imputed_table = pd.read_csv(IP_path, header=[0,1], index_col = 0)
except FileNotFoundError:
    print(f"File {IP_path} not found.\nPlease run 1.QC_filter_and_impute.ipynb first or specify the correct timestamp, current value is {timestamp}")
    raise
except pd.errors.ParserError:
    print(f"There was an error parsing the CSV file at {IP_path}.")
    raise
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    raise

### Correlation filter

In [4]:
# Each inner list holds the experiment prefixes (the text before the first
# '-' in a sample name) that belong to one batch.
batches = [['14','12','17'], ['09', '10', '11']]

# Work on a copy of the imputed, preprocessed table from the earlier steps.
grouped = bait_imputed_table.copy()

# Per-batch result tables; they are concatenated after the loop.
volcano_tables = []
enrichment_tables = []
volcano_tables_againstWT = []
enrichment_tables_againstWT = []

# Folder that receives the per-batch correlation tables.
correlation_table_dir = outdir / "correlation_tables"
os.makedirs(correlation_table_dir, exist_ok=True)

for i, batch in enumerate(batches):
    print('batch: ' + str(i))

    # Keep every metadata column, plus each sample column whose experiment
    # prefix is part of the batch currently being processed.
    new_cols = [
        col for col in grouped.columns
        if col[0] == 'metadata' or col[1].split('-')[0] in batch
    ]
    selected = grouped[new_cols].copy()

    # First pass: enrichments against the initial controls, used below to
    # identify highly correlated samples.
    analysis = pa.AnalysisTables(grouped_table=selected, auto_group=False)
    analysis.generate_export_bait_matrix()

    # Alternative kept for reference: use only WT samples as controls.
    #analysis.select_wildtype_controls(wt_re='-WT')
    # Active choice: use the matching WT as controls.
    analysis.select_matching_WT_control(wt_re='-WT')

    #print(analysis.exclusion_matrix) # uncomment to check the exclusion matrix
    analysis.exclusion_matrix.to_csv(
        correlation_table_dir / f"{timestamp}_negative_control_batch_{i}.csv",
        index=False,
    )

    analysis.simple_pval_enrichment(std_enrich=False)
    analysis.convert_to_enrichment_table()
    complements = analysis.enrichment_table.copy()
    control_mat = analysis.exclusion_matrix.copy()

    # Keep the against-WT enrichments and p-values for export.
    enrichment_tables_againstWT.append(analysis.enrichment_table.copy())
    volcano_tables_againstWT.append(analysis.simple_pval_table.copy())

    # Second pass: use the first-pass enrichments to find highly correlated
    # samples (the cutoff passed as `corr` is 0.35).
    spatial = st.SpatialTables(
        preprocessed_table=selected,
        enrichment_table=complements,
        control_mat=control_mat,
    )
    spatial.enrichment_corr_control_mat(corr=0.35)

    # Save the correlation tables for this batch.
    spatial.corr_mat.to_csv(
        correlation_table_dir / f"{timestamp}_corr_table_batch_{i}.csv",
        index=False,
    )
    spatial.sample_corrs.to_csv(
        correlation_table_dir / f"{timestamp}_corr_val_table_batch_{i}.csv",
        index=True,
    )

    # Final p-value and enrichment tables for the batch.
    spatial.new_corr_ARI(just_enrichment=True, labels=None, reference=None, std_enrich=False)

    # Collect this batch's results for the concatenation step.
    volcano_tables.append(spatial.corr_pval_table)
    enrichment_tables.append(spatial.corr_enrichment_table)
    

batch: 0
P-val calculations..
Finished!
P-val calculations..
Finished!
batch: 1
P-val calculations..
Finished!
P-val calculations..
Finished!


The final step is to concatenate results from the batches. 

The enrichment table can be used for multiple purposes, including UMAP generation.

In [5]:
# Final enrichment table: the metadata columns are taken once, from the first
# batch, and followed by the sample columns of every batch, side by side.
enrichment_metadata = enrichment_tables[0].loc[:, ['metadata']].copy()
enrichment_samples = [
    table.drop(columns=['metadata'], level=0) for table in enrichment_tables
]
final_enrichments = pd.concat([enrichment_metadata, *enrichment_samples], axis=1)

# Final volcano table, assembled the same way from the per-batch volcano tables.
volcano_metadata = volcano_tables[0].loc[:, ['metadata']].copy()
volcano_samples = [
    table.drop(columns=['metadata'], level=0) for table in volcano_tables
]
final_volcano = pd.concat([volcano_metadata, *volcano_samples], axis=1)

In [6]:
# Write the final volcano and enrichment tables to CSV files.
enrich_out_dir = outdir / "enrichment_and_volcano_tables"
enrich_out_dir.mkdir(parents=True, exist_ok=True)

# Both file names carry the run's date stamp.
volcano_csv_path = enrich_out_dir / f'{timestamp}_volcano_table.csv'
enrichment_csv_path = enrich_out_dir / f'{timestamp}_enrichment_table.csv'

final_volcano.to_csv(volcano_csv_path)
final_enrichments.to_csv(enrichment_csv_path)


In [7]:
# Show the concatenated enrichment table as this cell's output
# (the rendered view abbreviates rows and columns with "...").
final_enrichments

Unnamed: 0_level_0,metadata,metadata,metadata,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample
Unnamed: 0_level_1,Protein IDs,Majority protein IDs,Gene names,17-SLC30A2_Infected,14-GOLGA2_Infected,17-ATP1B3_Infected,17-RPL36_Infected,12-WT_Infected,12-LAMP1_Infected,12-YWHAQ_Infected,...,10-TOMM20_Infected,11-CEP350_Infected,09-PEX3_Infected,09-WT_Infected,10-RTN4_Infected,11-SEC31A_Infected,09-HSP90AA1_Infected,10-EXOC2_Infected,09-TOMM20_Infected,10-WT_Infected
0,A0A023T6R1;Q96A72;F5H6P7;F5H6N1;F5H3U9;F5H124,A0A023T6R1;Q96A72;F5H6P7;F5H6N1,FLJ10292;MAGOHB,-0.153721,-0.991876,-0.487636,1.801878,0.397270,-0.945137,-0.524625,...,0.450149,-0.763785,0.064155,2.427443,-1.404833,-0.933127,-0.652504,1.772409,0.502488,1.373609
1,Q9Y5S9;A0A023T787;A0A0J9YW13,Q9Y5S9;A0A023T787,RBM8A;RBM8,-1.118800,-0.461100,-1.304100,0.610900,1.549150,-0.127200,0.362400,...,-1.624750,0.044000,1.003500,1.601700,-1.694300,0.352900,-3.383000,1.100000,-1.311100,0.557200
2,A0A0C4DFM1;A0A024QYR3;Q92544;B4DH88;B4DKC1;Q6Z...,A0A0C4DFM1;A0A024QYR3;Q92544;B4DH88;B4DKC1;Q6ZTK5,TM9SF4,-3.445300,1.197850,-4.945500,-3.140200,-1.216500,-0.995400,-3.524800,...,-3.785350,0.481200,-1.405800,0.196500,-1.205400,1.233000,-6.680200,-0.819500,-3.319200,0.337500
3,A0A024QYR6;A0A1V0DNR7;A0A6G6A825;F6KD02;F6KD01...,A0A024QYR6;A0A1V0DNR7;A0A6G6A825;F6KD02;F6KD01...,PTEN,0.370895,0.264732,0.716220,-0.431213,-1.352463,-0.327094,0.322768,...,-0.139591,-0.775576,0.501110,0.205414,-0.509183,-0.280560,0.976346,0.275284,-0.471726,-0.908906
4,Q99805;A0A024QYR8;B3KSG9,Q99805;A0A024QYR8;B3KSG9,TM9SF2,-1.637100,1.353050,-2.353700,-1.692200,-3.085650,-0.651500,-3.231350,...,-4.548400,1.789500,-0.575100,0.151900,-1.434300,1.472700,-7.951400,-0.159900,-2.908700,-0.052200
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8436,X5D2T3;X1WI28;P27635;B8A6G2;X5D2W5;A6QRI9;Q96L21,X5D2T3;X1WI28;P27635,RPL10,0.746000,-0.111700,0.928900,2.935600,0.676750,-0.265450,0.236100,...,-0.115050,0.992550,0.271200,-0.374000,0.124800,0.171400,-1.241500,0.163500,0.540700,0.752400
8437,X5D7P8,X5D7P8,STK39,0.509498,0.032787,0.718537,-1.866703,0.026481,-0.576165,1.023599,...,0.592130,-0.053515,-0.240133,0.372828,-1.091708,-0.136871,0.914852,0.988118,-0.020222,0.344576
8438,X5DQV1;X5DNI1;B3KV96;E9PD68;B3KXQ5;Q14194;B3KT...,X5DQV1;X5DNI1;B3KV96;E9PD68;B3KXQ5;Q14194;B3KT...,CRMP1,-0.339776,0.753024,-0.496244,-0.291901,-0.336175,1.006970,-0.764510,...,-0.053984,-0.409149,3.856170,-0.643068,0.232216,1.209000,0.052735,-0.552581,0.055227,-0.431497
8439,X5DQZ7,X5DQZ7,GPX1,-1.462731,3.127882,-0.747647,-0.300528,-0.126492,-1.682565,-1.341266,...,2.825392,-1.111433,-0.506581,-1.720063,1.696521,-0.648949,4.766470,-1.034065,4.240770,-1.249227
