### Correlation filter

#### This is the third step of the enrichment calculation

Some Org-IP samples may have very similar profiles, which may lead to underestimation of significance if outlier proteins overlap. 

To tackle this problem, enrichment values are pre-calculated using every other sample in the batch, and these enrichment values are correlated across samples.

The correlation values are then used to filter out highly correlated org-IP samples from being used in the significance test together.

In [1]:
import os
import sys
from pathlib import Path
import pandas as pd

script_path = Path.cwd().parent.parent.parent.parent.parent.parent.parent / "script"
data_path = Path.cwd().parent.parent.parent.parent.parent.parent.parent / "data"
sys.path.append(str(script_path))

### Load the imputed IP table
The correct datestamp is required to find the input files

In [2]:
#%store -r timestamp
timestamp = "2024-07-27"

print(f"Timestamp: {timestamp}")

outprefix = f"{timestamp}_QC_filter_impute"

outdir = Path.cwd() / "output"
preprocessing_out = outdir / f"preprocessing"

Timestamp: 2024-07-27


In [3]:
#### Load bait imputed tables (IPs)
IP_path = preprocessing_out / f"{outprefix}_imputed_table.csv"

try:
    bait_imputed_table = pd.read_csv(IP_path, header=[0, 1], index_col = 0)
except FileNotFoundError:
    print(f"File {IP_path} not found.\nPlease run 1.QC_filter_and_impute.ipynb first or specify the correct timestamp, current value is {timestamp}")
except pd.errors.ParserError:
    print(f"There was an error parsing the CSV file at {IP_path}.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

### check synthetic proteome (sum of fractions should = 1)

In [4]:
## create synthetic proteome
df = bait_imputed_table

# Define the replicates and the new column names
replicates = [str(i) for i in range(1, 7)]
new_columns = [('synthetic_proteome', f'synthetic_proteome_{i}') for i in range(1, 3)]

# Sum the corresponding replicates 
synthetic_proteome_data = {
    new_col: df.loc[:, df.columns.get_level_values(1).str.endswith(replicate)].sum(axis=1)
    for new_col, replicate in zip(new_columns, replicates)
}

# Convert to DataFrame and concatenate with the original DataFrame
synthetic_proteome_df = pd.DataFrame(synthetic_proteome_data)

# Convert the new DataFrame to a multi-indexed DataFrame
synthetic_proteome_df.columns = pd.MultiIndex.from_tuples(synthetic_proteome_df.columns, names=['Samples', 'Replicates'])

# Concatenate the new synthetic proteome columns with the original DataFrame
df = pd.concat([df, synthetic_proteome_df], axis=1)

bait_imputed_table = df

In [5]:
bait_imputed_table

Samples,126,126,126,127C,127C,127C,127N,127N,127N,128C,...,129C,129N,129N,129N,130N,130N,130N,metadata,synthetic_proteome,synthetic_proteome
Replicates,126_1,126_2,126_3,127C_1,127C_2,127C_3,127N_1,127N_2,127N_3,128C_1,...,129C_3,129N_1,129N_2,129N_3,130N_1,130N_2,130N_3,Protein IDs,synthetic_proteome_1,synthetic_proteome_2
0,0.0760,0.0702,0.0561,0.0439,0.0544,0.0536,0.0528,0.0674,0.0677,0.1664,...,0.1489,0.1545,0.2105,0.2070,0.2453,0.1791,0.2460,A0AVF1,0.9999,1.0000
1,0.0614,0.0413,0.0478,0.0543,0.0395,0.0576,0.0310,0.0335,0.0409,0.1735,...,0.1518,0.1501,0.1915,0.1800,0.2386,0.2042,0.2562,A0AVT1,1.0000,1.0000
2,0.4773,0.4861,0.6590,0.1079,0.1450,0.0543,0.3557,0.2974,0.2352,0.0113,...,0.0050,0.0092,0.0085,0.0067,0.0041,0.0073,0.0035,A0FGR8,1.0000,1.0000
3,0.1854,0.1027,0.1746,0.1714,0.1278,0.1773,0.0723,0.0576,0.0644,0.1056,...,0.0801,0.0778,0.1106,0.1026,0.1176,0.1255,0.1004,A0MZ66,1.0000,0.9999
4,0.3752,0.3270,0.4915,0.1293,0.1755,0.0931,0.4311,0.3740,0.3159,0.0115,...,0.0083,0.0081,0.0146,0.0137,0.0044,0.0111,0.0061,A1L0T0,1.0001,1.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5309,0.2997,0.3207,0.4133,0.0597,0.0845,0.0503,0.1662,0.1654,0.1626,0.0763,...,0.1114,0.1086,0.0976,0.0768,0.1419,0.1373,0.1279,Q9Y6W5,1.0001,0.9999
5310,0.8341,0.7673,0.8770,0.0266,0.0543,0.0187,0.1062,0.1434,0.0776,0.0041,...,0.0047,0.0077,0.0045,0.0041,0.0056,0.0050,0.0060,Q9Y6X5,1.0001,1.0000
5311,0.0497,0.0422,0.0486,0.0598,0.0514,0.0674,0.0602,0.0416,0.0786,0.1655,...,0.2246,0.1926,0.1768,0.1694,0.1535,0.1571,0.1254,Q9Y6X9,1.0000,1.0000
5312,0.0201,0.0860,0.0446,0.0213,0.0633,0.0399,0.0326,0.0855,0.0772,0.1819,...,0.2081,0.2594,0.2012,0.2468,0.2175,0.1994,0.2403,Q9Y6Y0,1.0001,1.0002


### create fraction table

In [6]:
# Filter out the metadata and synthetic proteome columns
filtered_df = df[[col for col in df.columns if col[0] not in ['metadata', 'synthetic_proteome']]]
# Average columns ending with _1, _2, and _3 under the same first level index
grouped_df = filtered_df.groupby(level=0, axis=1).mean()
grouped_df.columns = pd.MultiIndex.from_product([['sample'], grouped_df.columns])
grouped_df

Unnamed: 0_level_0,sample,sample,sample,sample,sample,sample,sample,sample
Samples,126,127C,127N,128C,128N,129C,129N,130N
0,0.067433,0.050633,0.062633,0.155900,0.076067,0.173167,0.190667,0.223467
1,0.050167,0.050467,0.035133,0.170467,0.120033,0.166900,0.173867,0.233000
2,0.540800,0.102400,0.296100,0.012467,0.029333,0.005800,0.008133,0.004967
3,0.154233,0.158833,0.064767,0.125700,0.195633,0.089300,0.097000,0.114500
4,0.397900,0.132633,0.373667,0.020833,0.047367,0.008300,0.012133,0.007200
...,...,...,...,...,...,...,...,...
5309,0.344567,0.064833,0.164733,0.052500,0.023800,0.119533,0.094333,0.135700
5310,0.826133,0.033200,0.109067,0.005100,0.009933,0.005633,0.005433,0.005533
5311,0.046833,0.059533,0.060133,0.156700,0.136333,0.215533,0.179600,0.145333
5312,0.050233,0.041500,0.065100,0.120600,0.042767,0.225000,0.235800,0.219067


In [7]:
metadata_df = df[[col for col in df.columns if col[0] == 'metadata']]
final_fraction_table = pd.concat([metadata_df, grouped_df], axis=1)
final_fraction_table.columns.names = [None, None]
final_fraction_table.index.names = [None]
final_fraction_table

Unnamed: 0_level_0,metadata,sample,sample,sample,sample,sample,sample,sample,sample
Unnamed: 0_level_1,Protein IDs,126,127C,127N,128C,128N,129C,129N,130N
0,A0AVF1,0.067433,0.050633,0.062633,0.155900,0.076067,0.173167,0.190667,0.223467
1,A0AVT1,0.050167,0.050467,0.035133,0.170467,0.120033,0.166900,0.173867,0.233000
2,A0FGR8,0.540800,0.102400,0.296100,0.012467,0.029333,0.005800,0.008133,0.004967
3,A0MZ66,0.154233,0.158833,0.064767,0.125700,0.195633,0.089300,0.097000,0.114500
4,A1L0T0,0.397900,0.132633,0.373667,0.020833,0.047367,0.008300,0.012133,0.007200
...,...,...,...,...,...,...,...,...,...
5309,Q9Y6W5,0.344567,0.064833,0.164733,0.052500,0.023800,0.119533,0.094333,0.135700
5310,Q9Y6X5,0.826133,0.033200,0.109067,0.005100,0.009933,0.005633,0.005433,0.005533
5311,Q9Y6X9,0.046833,0.059533,0.060133,0.156700,0.136333,0.215533,0.179600,0.145333
5312,Q9Y6Y0,0.050233,0.041500,0.065100,0.120600,0.042767,0.225000,0.235800,0.219067


In [8]:
# saving the final tables to files
enrich_out_dir = outdir / "fraction_tables"
os.makedirs(enrich_out_dir, exist_ok=True)

enrichment_csv_path = enrich_out_dir / f"{timestamp}_fraction_table.csv"

final_fraction_table.to_csv(enrichment_csv_path)