### Correlation filter

#### This is the third step of the enrichment calculation

Some Org-IP samples may have very similar profiles, which may lead to underestimation of significance if outlier proteins overlap. 

To tackle this problem, enrichment values are pre-calculated using every other sample in the batch, and these enrichment values are correlated across samples.

The correlation values are then used to prevent highly correlated Org-IP samples from being used together in the significance test.

In [1]:
import os
import sys
from pathlib import Path
import pandas as pd

# Walk seven directory levels up from the notebook's working directory; the
# "script" and "data" folders are expected to sit side by side at that level.
project_root = Path.cwd()
for _ in range(7):
    project_root = project_root.parent

script_path = project_root / "script"
data_path = project_root / "data"

# Make the modules in the script folder importable from this notebook.
sys.path.append(str(script_path))

### Load the imputed IP table
The correct timestamp is required to find the input files.

In [2]:
# Timestamp that prefixes the input file names read by this notebook.
# It is set by hand here; the IPython magic `%store -r timestamp` is the
# (currently disabled) alternative for restoring a stored value.
timestamp = "2024-07-16"
print(f"Timestamp: {timestamp}")

# File-name prefix shared by the QC / filter / impute outputs.
outprefix = f"{timestamp}_QC_filter_impute"

# Output folders, resolved relative to the notebook's working directory.
outdir = Path.cwd() / "output"
preprocessing_out = outdir / "preprocessing"

Timestamp: 2024-07-16


In [3]:
#### Load bait imputed tables (IPs)
IP_path = preprocessing_out / f"{outprefix}_imputed_table.csv"

# Fail fast if the table cannot be loaded. Printing the error and carrying on
# would leave `bait_imputed_table` undefined (or stale from an earlier run of
# the kernel), so every later cell would either crash with a confusing
# NameError or silently work on the wrong data.
try:
    # The CSV has a two-row column header (read as a column MultiIndex) and
    # the row index stored in its first column.
    bait_imputed_table = pd.read_csv(IP_path, header=[0, 1], index_col=0)
except FileNotFoundError as err:
    raise FileNotFoundError(
        f"File {IP_path} not found.\nPlease run 1.QC_filter_and_impute.ipynb first or specify the correct timestamp, current value is {timestamp}"
    ) from err
except pd.errors.ParserError as err:
    raise pd.errors.ParserError(
        f"There was an error parsing the CSV file at {IP_path}."
    ) from err

### check fraction

In [4]:
## create synthetic proteome
# Start from the loaded table without any 'synthetic_proteome' columns left
# over from a previous run of this cell. This keeps the cell idempotent:
# re-running it neither double counts nor appends duplicate columns.
df = bait_imputed_table.loc[
    :, bait_imputed_table.columns.get_level_values(0) != 'synthetic_proteome'
]

# Replicate numbers to sum over; one synthetic proteome column is produced per replicate
replicates = [str(i) for i in range(1, 4)]
new_columns = [('synthetic_proteome', f'synthetic_proteome_{i}') for i in replicates]

# Only non-metadata columns may contribute to the sums
is_sample = df.columns.get_level_values(0) != 'metadata'
replicate_labels = df.columns.get_level_values(1)

# For each replicate, sum every sample column whose second-level label ends
# with "_<replicate>" (e.g. "01K_1", "03K_1", ... for replicate 1). Matching on
# the underscore avoids accidental hits on labels that merely end in the digit.
synthetic_proteome_data = {
    new_col: df.loc[
        :, is_sample & replicate_labels.str.endswith(f"_{replicate}", na=False)
    ].sum(axis=1)
    for new_col, replicate in zip(new_columns, replicates)
}

# Collect the per-replicate sums into a DataFrame (rows aligned with df)
synthetic_proteome_df = pd.DataFrame(synthetic_proteome_data)

# Give the new columns the same two-level header layout as the input table
synthetic_proteome_df.columns = pd.MultiIndex.from_tuples(synthetic_proteome_df.columns, names=['Samples', 'Replicates'])

# Append the synthetic proteome columns to the right of the original columns
df = pd.concat([df, synthetic_proteome_df], axis=1)

bait_imputed_table = df

In [5]:
# Show the table, which now also carries the synthetic_proteome columns appended above.
bait_imputed_table

Samples,01K,01K,01K,03K,03K,03K,06K,06K,06K,12K,...,24K,80K,80K,80K,metadata,metadata,metadata,synthetic_proteome,synthetic_proteome,synthetic_proteome
Replicates,01K_1,01K_2,01K_3,03K_1,03K_2,03K_3,06K_1,06K_2,06K_3,12K_1,...,24K_3,80K_1,80K_2,80K_3,Protein IDs,Compartment,Gene names,synthetic_proteome_1,synthetic_proteome_2,synthetic_proteome_3
0,0.1081,0.1064,0.1162,0.2516,0.2792,0.2389,0.2599,0.2499,0.2684,0.2350,...,0.1156,0.0393,0.0421,0.0404,Q92692,Plasma membrane,NECTIN2,0.9998,1.0000,0.9999
1,0.1040,0.1016,0.1029,0.2510,0.2712,0.2425,0.2521,0.2654,0.2592,0.2440,...,0.1117,0.0477,0.0394,0.0465,Q969P0,Plasma membrane,IGSF8,1.0000,1.0000,1.0000
2,0.0890,0.1073,0.1484,0.2754,0.2679,0.2341,0.2598,0.2471,0.2478,0.2296,...,0.0983,0.0372,0.0390,0.0367,P15151,Plasma membrane,PVR,1.0002,1.0000,1.0000
3,0.1163,0.1031,0.1234,0.2453,0.2563,0.2297,0.2548,0.2583,0.2693,0.2238,...,0.1024,0.0523,0.0479,0.0511,P15529,Plasma membrane,CD46,1.0001,1.0000,1.0001
4,0.1083,0.1093,0.1211,0.2548,0.2741,0.2480,0.2685,0.2565,0.2565,0.2223,...,0.1037,0.0447,0.0477,0.0437,Q9ULF5,undefined,SLC39A10,1.0000,0.9999,0.9999
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7438,0.1426,0.1393,0.1675,0.2513,0.2405,0.2641,0.2082,0.1767,0.1783,0.1434,...,0.1390,0.1334,0.1076,0.0982,O14641,undefined,DVL2,1.0000,0.9999,1.0000
7439,0.1838,0.1822,0.2030,0.2293,0.2273,0.2682,0.1718,0.1630,0.1456,0.1654,...,0.1314,0.0940,0.0896,0.1037,P67936,undefined,TPM4,0.9998,1.0000,1.0000
7440,0.3061,0.2743,0.2790,0.2657,0.2519,0.2275,0.1862,0.1983,0.2206,0.1306,...,0.0806,0.0437,0.0391,0.0435,Q12800,undefined,TFCP2,1.0000,1.0001,0.9999
7441,0.1230,0.2183,0.2293,0.3047,0.3199,0.2272,0.2923,0.1313,0.1209,0.1110,...,0.2044,0.0581,0.0793,0.1444,Q9UPS6,undefined,SETD1B,1.0000,1.0000,0.9999


### create fraction table

In [6]:
# Keep only the measurement columns: drop the metadata and synthetic proteome columns
filtered_df = df[[col for col in df.columns if col[0] not in ['metadata', 'synthetic_proteome']]]

# Average the replicate columns (_1, _2, _3) that share the same first-level label.
# `groupby(..., axis=1)` is deprecated in recent pandas, so group the transposed
# frame by its first index level and transpose back; the result is the same.
grouped_df = filtered_df.T.groupby(level=0).mean().T

# Put the averaged columns under a common 'sample' top-level header
grouped_df.columns = pd.MultiIndex.from_product([['sample'], grouped_df.columns])
grouped_df

Unnamed: 0_level_0,sample,sample,sample,sample,sample,sample
Samples,01K,03K,06K,12K,24K,80K
0,0.110233,0.256567,0.259400,0.226100,0.107000,0.040600
1,0.102833,0.254900,0.258900,0.238300,0.100533,0.044533
2,0.114900,0.259133,0.251567,0.234233,0.102600,0.037633
3,0.114267,0.243767,0.260800,0.226367,0.104433,0.050433
4,0.112900,0.258967,0.260500,0.222533,0.099667,0.045367
...,...,...,...,...,...,...
7438,0.149800,0.251967,0.187733,0.149467,0.147933,0.113067
7439,0.189667,0.241600,0.160133,0.157267,0.155500,0.095767
7440,0.286467,0.248367,0.201700,0.143433,0.077933,0.042100
7441,0.190200,0.283933,0.181500,0.120000,0.130400,0.093933


In [7]:
# Pick out the annotation columns stored under the 'metadata' top-level header
metadata_columns = [label for label in df.columns if label[0] == 'metadata']
metadata_df = df[metadata_columns]

# Place the metadata to the left of the per-sample averages
fraction_parts = [metadata_df, grouped_df]
final_fraction_table = pd.concat(fraction_parts, axis=1)

# Clear the axis names so neither the row index nor the two column levels carry a name
final_fraction_table.index.names = [None]
final_fraction_table.columns.names = [None, None]
final_fraction_table

Unnamed: 0_level_0,metadata,metadata,metadata,sample,sample,sample,sample,sample,sample
Unnamed: 0_level_1,Protein IDs,Compartment,Gene names,01K,03K,06K,12K,24K,80K
0,Q92692,Plasma membrane,NECTIN2,0.110233,0.256567,0.259400,0.226100,0.107000,0.040600
1,Q969P0,Plasma membrane,IGSF8,0.102833,0.254900,0.258900,0.238300,0.100533,0.044533
2,P15151,Plasma membrane,PVR,0.114900,0.259133,0.251567,0.234233,0.102600,0.037633
3,P15529,Plasma membrane,CD46,0.114267,0.243767,0.260800,0.226367,0.104433,0.050433
4,Q9ULF5,undefined,SLC39A10,0.112900,0.258967,0.260500,0.222533,0.099667,0.045367
...,...,...,...,...,...,...,...,...,...
7438,O14641,undefined,DVL2,0.149800,0.251967,0.187733,0.149467,0.147933,0.113067
7439,P67936,undefined,TPM4,0.189667,0.241600,0.160133,0.157267,0.155500,0.095767
7440,Q12800,undefined,TFCP2,0.286467,0.248367,0.201700,0.143433,0.077933,0.042100
7441,Q9UPS6,undefined,SETD1B,0.190200,0.283933,0.181500,0.120000,0.130400,0.093933


In [8]:
# Write the final fraction table to <outdir>/fraction_tables/<timestamp>_fraction_table.csv
enrich_out_dir = outdir / "fraction_tables"
# Create the target folder (and any missing parents); no error if it already exists
enrich_out_dir.mkdir(parents=True, exist_ok=True)

enrichment_csv_path = enrich_out_dir / f"{timestamp}_fraction_table.csv"
final_fraction_table.to_csv(enrichment_csv_path)