### Correlation filter

#### This is the third step of the enrichment calculation

Some Org-IP samples may have very similar profiles, which may lead to underestimation of significance if outlier proteins overlap. 

To tackle this problem, enrichment values are pre-calculated using every other sample in the batch, and these enrichment values are correlated across samples.

The correlation values are then used to prevent highly correlated Org-IP samples from being used together in the same significance test.

In [1]:
import os
import sys
from pathlib import Path
import pandas as pd

# Climb seven directory levels above the current working directory; the
# "script" and "data" folders are resolved relative to that location.
base_dir = Path.cwd()
for _ in range(7):
    base_dir = base_dir.parent

script_path = base_dir / "script"
data_path = base_dir / "data"
# Add the script folder to the module search path so it can be imported from.
sys.path.append(str(script_path))

### Load the imputed IP table
The correct timestamp is required to find the input files.

In [2]:
# Date stamp embedded in the names of the input files read by this notebook.
# The disabled magic below is left in place; the value is set explicitly instead.
#%store -r timestamp
timestamp = "2024-07-16"
print(f"Timestamp: {timestamp}")

# File-name prefix built from the date stamp.
outprefix = f"{timestamp}_QC_filter_impute"

# Output folders, located under the current working directory.
outdir = Path.cwd() / "output"
preprocessing_out = outdir / "preprocessing"

Timestamp: 2024-07-16


In [3]:
#### Load bait imputed tables (IPs)
# The CSV is read with two header rows (giving two-level column labels) and
# its first column as the row index.
IP_path = preprocessing_out / f"{outprefix}_imputed_table.csv"

# Each handler prints a readable hint and then re-raises. Swallowing the error
# would leave `bait_imputed_table` undefined (or stale from an earlier run), and
# the following cells would then fail with a misleading NameError or silently
# use old data.
try:
    bait_imputed_table = pd.read_csv(IP_path, header=[0, 1], index_col=0)
except FileNotFoundError:
    print(f"File {IP_path} not found.\nPlease run 1.QC_filter_and_impute.ipynb first or specify the correct timestamp, current value is {timestamp}")
    raise
except pd.errors.ParserError:
    print(f"There was an error parsing the CSV file at {IP_path}.")
    raise
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    raise

### Create fraction table

In [5]:
# Work on the imputed table under a shorter name.
df = bait_imputed_table
# Keep only the measurement columns: drop every column whose first-level
# label is 'metadata' or 'synthetic_proteome'.
filtered_df = df[[col for col in df.columns if col[0] not in ['metadata', 'synthetic_proteome']]]
# Average the columns that share the same first-level label (the replicate
# columns ending with _1, _2, and _3).
# The frame is transposed, grouped by row level 0, and transposed back; this
# avoids `groupby(..., axis=1)`, whose `axis=1` argument is deprecated in pandas.
grouped_df = filtered_df.T.groupby(level=0).mean().T
# Re-label the averaged columns with a two-level header: ('sample', <label>).
grouped_df.columns = pd.MultiIndex.from_product([['sample'], grouped_df.columns])
grouped_df

Unnamed: 0_level_0,sample,sample,sample,sample,sample
Samples,03K,06K,12K,24K,80K
0,0.120000,-0.624683,-0.398767,0.571433,1.538233
1,-0.281217,0.188500,-0.149583,-0.492483,-0.693417
2,-1.738557,-1.332183,-2.056200,-2.026821,-1.672267
3,-1.020700,-0.868417,-1.445783,-1.402567,-0.873433
4,-1.180700,-0.643750,3.548467,3.283900,4.336467
...,...,...,...,...,...
4923,-1.347828,-0.980580,-1.817261,-1.845244,-2.083272
4924,-0.930238,-1.118053,-1.059400,-0.709979,-0.711571
4925,-1.124483,-0.924583,-1.733533,-1.705583,-1.195067
4926,0.274350,-0.622217,-0.613917,0.441200,1.235767


In [6]:
# Select the columns whose first-level label is 'metadata'.
metadata_cols = [col for col in df.columns if col[0] == 'metadata']
metadata_df = df[metadata_cols]

# Put the metadata columns to the left of the averaged sample columns.
final_fraction_table = pd.concat([metadata_df, grouped_df], axis=1)

# Clear the names of both column levels and of the row index.
final_fraction_table.columns.names = [None, None]
final_fraction_table.index.names = [None]
final_fraction_table

Unnamed: 0_level_0,metadata,sample,sample,sample,sample,sample
Unnamed: 0_level_1,Unnamed: 0,03K,06K,12K,24K,80K
0,Q9NRG9,0.120000,-0.624683,-0.398767,0.571433,1.538233
1,Q2M2I8,-0.281217,0.188500,-0.149583,-0.492483,-0.693417
2,Q13685,-1.738557,-1.332183,-2.056200,-2.026821,-1.672267
3,P49588,-1.020700,-0.868417,-1.445783,-1.402567,-0.873433
4,Q5JTZ9,-1.180700,-0.643750,3.548467,3.283900,4.336467
...,...,...,...,...,...,...
4923,Q9NWK9,-1.347828,-0.980580,-1.817261,-1.845244,-2.083272
4924,Q8NHG8,-0.930238,-1.118053,-1.059400,-0.709979,-0.711571
4925,O95218,-1.124483,-0.924583,-1.733533,-1.705583,-1.195067
4926,O43264,0.274350,-0.622217,-0.613917,0.441200,1.235767


In [7]:
# Write the combined fraction table to a CSV file named after the timestamp,
# inside the "fraction_tables" folder of the output directory.
enrich_out_dir = outdir / "fraction_tables"
# Create the folder (and any missing parents); do nothing if it already exists.
enrich_out_dir.mkdir(parents=True, exist_ok=True)

enrichment_csv_path = enrich_out_dir / f"{timestamp}_fraction_table.csv"
final_fraction_table.to_csv(enrichment_csv_path)