### Correlation filter

#### This is the third step of the enrichment calculation

Some Org-IP samples may have very similar profiles, which may lead to underestimation of significance if outlier proteins overlap. 

To tackle this problem, enrichment values are pre-calculated using every other sample in the batch, and these enrichment values are correlated across samples.

The correlation values are then used to filter out highly correlated org-IP samples from being used in the significance test together.

In [1]:
import os
import sys
from pathlib import Path
import pandas as pd

script_path = Path.cwd().parent.parent.parent.parent.parent.parent.parent / "script"
data_path = Path.cwd().parent.parent.parent.parent.parent.parent.parent / "data"
sys.path.append(str(script_path))

### Load the imputed IP table
The correct datestamp is required to find the input files

In [2]:
#%store -r timestamp
timestamp = "2024-07-27"

print(f"Timestamp: {timestamp}")

outprefix = f"{timestamp}_QC_filter_impute"

outdir = Path.cwd() / "output"
preprocessing_out = outdir / f"preprocessing"

Timestamp: 2024-07-27


In [3]:
#### Load bait imputed tables (IPs)
IP_path = preprocessing_out / f"{outprefix}_imputed_table.csv"

try:
    bait_imputed_table = pd.read_csv(IP_path, header=[0, 1], index_col = 0)
except FileNotFoundError:
    print(f"File {IP_path} not found.\nPlease run 1.QC_filter_and_impute.ipynb first or specify the correct timestamp, current value is {timestamp}")
except pd.errors.ParserError:
    print(f"There was an error parsing the CSV file at {IP_path}.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

### merge set1 and set2

In [4]:
setMerged = bait_imputed_table.copy()
first_level = list(set([i[0] for i in bait_imputed_table.columns]))

for s in first_level:
    if s.startswith("set1X126"): # X126 had missing columns, thus we deal with in a hard-coded way
        setMerged[("X126", f"X126_1")] = setMerged[("set2X126", f"set2X126_1")]
        setMerged[("X126", f"X126_2")] = setMerged[("set2X126", f"set2X126_2")]
        setMerged[("X126", f"X126_3")] = (setMerged[("set1X126", f"set1X126_3")] + setMerged[("set2X126", f"set2X126_3")])/2
        setMerged.drop(columns=[("set1X126", "set1X126_3"), ("set2X126", "set2X126_3"), ("set2X126", f"set2X126_2"), ("set2X126", "set2X126_1")], inplace=True)
    elif s.startswith("set1X127N"):
        setMerged[("X127N", f"X127N_1")] =  setMerged[("set2X127N", f"set2X127N_3")]
        setMerged[("X127N", f"X127N_2")] = (setMerged[("set1X127N", f"set1X127N_2")] + setMerged[("set2X127N", f"set2X127N_2")])/2
        setMerged[("X127N", f"X127N_3")] = (setMerged[("set1X127N", f"set1X127N_3")] + setMerged[("set2X127N", f"set2X127N_3")])/2
        setMerged.drop(columns=[("set1X127N","set1X127N_2"), ("set1X127N","set1X127N_3"), ("set2X127N","set2X127N_1"), ("set2X127N","set2X127N_2"), ("set2X127N","set2X127N_3")], inplace=True)
    elif s.startswith("set1"):
        # check if S2 is present
        matchedname = s.replace("set1", "set2") # predict the name of the matched set
        basename = s.replace("set1", "")
        if matchedname not in first_level:
            print(f"{s} is missing its matched pair {matchedname}")
        else:
            print(f"Found matched set pair {s} and {matchedname}")
            # merge the two sets across reps
            setMerged[(basename, f"{basename}_1")] = (setMerged[(s, f"{s}_1")] + setMerged[(matchedname, f"{matchedname}_1")]) / 2
            setMerged[(basename, f"{basename}_2")] = (setMerged[(s, f"{s}_2")] + setMerged[(matchedname, f"{matchedname}_2")]) / 2
            setMerged[(basename, f"{basename}_3")] = (setMerged[(s, f"{s}_3")] + setMerged[(matchedname, f"{matchedname}_3")]) / 2
            # drop the original columns
            setMerged.drop(columns=[(s, f"{s}_1"), (s, f"{s}_2"), (s, f"{s}_3"), (matchedname, f"{matchedname}_1"), (matchedname, f"{matchedname}_2"), (matchedname, f"{matchedname}_3")], inplace=True)

Found matched set pair set1X128C and set2X128C
Found matched set pair set1X129N and set2X129N
Found matched set pair set1X129C and set2X129C
Found matched set pair set1X128N and set2X128N
Found matched set pair set1X131 and set2X131
Found matched set pair set1X130N and set2X130N
Found matched set pair set1X127C and set2X127C
Found matched set pair set1X130C and set2X130C


In [5]:
setMerged.columns

MultiIndex([('metadata', 'Unnamed: 0'),
            (   'X128C',    'X128C_1'),
            (   'X128C',    'X128C_2'),
            (   'X128C',    'X128C_3'),
            (   'X127N',    'X127N_1'),
            (   'X127N',    'X127N_2'),
            (   'X127N',    'X127N_3'),
            (   'X129N',    'X129N_1'),
            (   'X129N',    'X129N_2'),
            (   'X129N',    'X129N_3'),
            (   'X129C',    'X129C_1'),
            (   'X129C',    'X129C_2'),
            (   'X129C',    'X129C_3'),
            (   'X128N',    'X128N_1'),
            (   'X128N',    'X128N_2'),
            (   'X128N',    'X128N_3'),
            (    'X126',     'X126_1'),
            (    'X126',     'X126_2'),
            (    'X126',     'X126_3'),
            (    'X131',     'X131_1'),
            (    'X131',     'X131_2'),
            (    'X131',     'X131_3'),
            (   'X130N',    'X130N_1'),
            (   'X130N',    'X130N_2'),
            (   'X130N',    'X130N_3'),


### check synthetic proteome (sum of fractions should = 1)

In [6]:
## create synthetic proteome
df = setMerged

# Define the replicates and the new column names
replicates = [str(i) for i in range(1, 7)]
new_columns = [('synthetic_proteome', f'synthetic_proteome_{i}') for i in range(1, 3)]

# Sum the corresponding replicates 
synthetic_proteome_data = {
    new_col: df.loc[:, df.columns.get_level_values(1).str.endswith(replicate)].sum(axis=1)
    for new_col, replicate in zip(new_columns, replicates)
}

# Convert to DataFrame and concatenate with the original DataFrame
synthetic_proteome_df = pd.DataFrame(synthetic_proteome_data)

# Convert the new DataFrame to a multi-indexed DataFrame
synthetic_proteome_df.columns = pd.MultiIndex.from_tuples(synthetic_proteome_df.columns, names=['Samples', 'Replicates'])

# Concatenate the new synthetic proteome columns with the original DataFrame
df = pd.concat([df, synthetic_proteome_df], axis=1)

bait_imputed_table = df

In [7]:
bait_imputed_table

Samples,metadata,X128C,X128C,X128C,X127N,X127N,X127N,X129N,X129N,X129N,...,X130N,X130N,X127C,X127C,X127C,X130C,X130C,X130C,synthetic_proteome,synthetic_proteome
Replicates,Unnamed: 0,X128C_1,X128C_2,X128C_3,X127N_1,X127N_2,X127N_3,X129N_1,X129N_2,X129N_3,...,X130N_2,X130N_3,X127C_1,X127C_2,X127C_3,X130C_1,X130C_2,X130C_3,synthetic_proteome_1,synthetic_proteome_2
0,P05387,0.1020,0.0800,0.1130,0.082,0.0365,0.0675,0.1015,0.1030,0.0665,...,0.0310,0.0285,0.0420,0.0325,0.0395,0.3490,0.4725,0.4310,1.0495,1.0045
1,P04406,0.0745,0.0465,0.0515,0.061,0.0345,0.0565,0.0760,0.1285,0.0835,...,0.0380,0.0390,0.0785,0.0530,0.0550,0.2010,0.2170,0.1510,1.0385,0.9960
2,P60903,0.2090,0.1930,0.1510,0.180,0.1070,0.1780,0.0935,0.0650,0.0550,...,0.0440,0.0725,0.1235,0.1145,0.1780,0.0570,0.0895,0.0625,1.1160,1.0190
3,P57105,0.1150,0.1085,0.1115,0.077,0.0595,0.0665,0.2130,0.1395,0.1515,...,0.0100,0.0040,0.0460,0.0670,0.0575,0.0965,0.1010,0.0840,1.0450,0.9965
4,P21964-2,0.1940,0.1900,0.1835,0.155,0.1050,0.1370,0.1235,0.0970,0.1000,...,0.0045,0.0050,0.1065,0.1050,0.1110,0.0520,0.0820,0.0665,1.0865,0.9995
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4878,Q5T011-5,0.1060,0.0860,0.0395,0.063,0.0560,0.0595,0.0945,0.1095,0.1310,...,0.0585,0.0695,0.0985,0.0600,0.0765,0.2695,0.3640,0.3325,1.0425,1.0270
4879,Q8IWV7,0.0700,0.1110,0.1000,0.050,0.0670,0.1100,0.1035,0.0520,0.0790,...,0.0365,0.0175,0.0595,0.0675,0.1510,0.1440,0.0515,0.0275,1.0295,1.0090
4880,Q14055,0.1700,0.1570,0.1905,0.139,0.0940,0.1215,0.1330,0.0975,0.0840,...,0.0260,0.0140,0.0800,0.0920,0.0955,0.1090,0.0910,0.0820,1.1055,1.0270
4881,P04196,0.0970,0.0555,0.0455,0.035,0.0335,0.0320,0.1080,0.0970,0.0695,...,0.0155,0.0205,0.0620,0.0435,0.0325,0.2635,0.3010,0.1685,1.0245,1.0100


### create fraction table

In [8]:
# Filter out the metadata and synthetic proteome columns
filtered_df = df[[col for col in df.columns if col[0] not in ['metadata', 'synthetic_proteome']]]
# Average columns ending with _1, _2, and _3 under the same first level index
grouped_df = filtered_df.groupby(level=0, axis=1).mean()
grouped_df.columns = pd.MultiIndex.from_product([['sample'], grouped_df.columns])
grouped_df

Unnamed: 0_level_0,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample
Samples,X126,X127C,X127N,X128C,X128N,X129C,X129N,X130C,X130N,X131
0,0.018833,0.038000,0.062000,0.098333,0.114000,0.115833,0.090333,0.417500,0.034000,0.031500
1,0.039667,0.062167,0.050667,0.057500,0.055333,0.062167,0.096000,0.189667,0.043000,0.356167
2,0.066333,0.138667,0.155000,0.184333,0.157500,0.134167,0.071167,0.069667,0.053333,0.006667
3,0.021500,0.056833,0.067667,0.111667,0.220667,0.262167,0.168000,0.093833,0.010000,0.005500
4,0.038167,0.107500,0.132333,0.189167,0.190833,0.162500,0.106833,0.066833,0.004833,0.020333
...,...,...,...,...,...,...,...,...,...,...
4878,0.101333,0.078333,0.059500,0.077167,0.064000,0.078833,0.111667,0.322000,0.065500,0.055000
4879,0.070667,0.092667,0.075667,0.093667,0.074000,0.080333,0.078167,0.074333,0.047500,0.325667
4880,0.055500,0.089167,0.118167,0.172500,0.190000,0.158667,0.104833,0.094000,0.019167,0.042000
4881,0.027167,0.046000,0.033500,0.066000,0.070000,0.081667,0.091500,0.244333,0.015000,0.336333


In [9]:
metadata_df = df[[col for col in df.columns if col[0] == 'metadata']]
final_fraction_table = pd.concat([metadata_df, grouped_df], axis=1)
final_fraction_table.columns.names = [None, None]
final_fraction_table.index.names = [None]
final_fraction_table

Unnamed: 0_level_0,metadata,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample
Unnamed: 0_level_1,Unnamed: 0,X126,X127C,X127N,X128C,X128N,X129C,X129N,X130C,X130N,X131
0,P05387,0.018833,0.038000,0.062000,0.098333,0.114000,0.115833,0.090333,0.417500,0.034000,0.031500
1,P04406,0.039667,0.062167,0.050667,0.057500,0.055333,0.062167,0.096000,0.189667,0.043000,0.356167
2,P60903,0.066333,0.138667,0.155000,0.184333,0.157500,0.134167,0.071167,0.069667,0.053333,0.006667
3,P57105,0.021500,0.056833,0.067667,0.111667,0.220667,0.262167,0.168000,0.093833,0.010000,0.005500
4,P21964-2,0.038167,0.107500,0.132333,0.189167,0.190833,0.162500,0.106833,0.066833,0.004833,0.020333
...,...,...,...,...,...,...,...,...,...,...,...
4878,Q5T011-5,0.101333,0.078333,0.059500,0.077167,0.064000,0.078833,0.111667,0.322000,0.065500,0.055000
4879,Q8IWV7,0.070667,0.092667,0.075667,0.093667,0.074000,0.080333,0.078167,0.074333,0.047500,0.325667
4880,Q14055,0.055500,0.089167,0.118167,0.172500,0.190000,0.158667,0.104833,0.094000,0.019167,0.042000
4881,P04196,0.027167,0.046000,0.033500,0.066000,0.070000,0.081667,0.091500,0.244333,0.015000,0.336333


In [10]:
# saving the final tables to files
enrich_out_dir = outdir / "fraction_tables"
os.makedirs(enrich_out_dir, exist_ok=True)

enrichment_csv_path = enrich_out_dir / f"{timestamp}_fraction_table.csv"

final_fraction_table.to_csv(enrichment_csv_path)