### Correlation filter

#### This is the third step of the enrichment calculation

Some Org-IP samples may have very similar profiles, which may lead to underestimation of significance if outlier proteins overlap. 

To tackle this problem, enrichment values are pre-calculated using every other sample in the batch, and these enrichment values are correlated across samples.

The correlation values are then used to filter out highly correlated org-IP samples from being used in the significance test together.

In [1]:
import os
import sys
from pathlib import Path
import pandas as pd

script_path = Path.cwd().parent.parent.parent.parent.parent.parent.parent / "script"
data_path = Path.cwd().parent.parent.parent.parent.parent.parent.parent / "data"
sys.path.append(str(script_path))

### Load the imputed IP table
The correct datestamp is required to find the input files

In [2]:
#%store -r timestamp
timestamp = "2024-07-27"

print(f"Timestamp: {timestamp}")

outprefix = f"{timestamp}_QC_filter_impute"

outdir = Path.cwd() / "output"
preprocessing_out = outdir / f"preprocessing"

Timestamp: 2024-07-27


In [3]:
#### Load bait imputed tables (IPs)
IP_path = preprocessing_out / f"{outprefix}_imputed_table.csv"

try:
    bait_imputed_table = pd.read_csv(IP_path, header=[0, 1], index_col = 0)
except FileNotFoundError:
    print(f"File {IP_path} not found.\nPlease run 1.QC_filter_and_impute.ipynb first or specify the correct timestamp, current value is {timestamp}")
except pd.errors.ParserError:
    print(f"There was an error parsing the CSV file at {IP_path}.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

### merge set1 and set2

In [4]:
setMerged = bait_imputed_table.copy()
first_level = list(set([i[0] for i in bait_imputed_table.columns]))

for s in first_level:
    if s.startswith("set1X126"): # X126 had missing columns, thus we deal with in a hard-coded way
        setMerged[("X126", f"X126_1")] = setMerged[("set2X126", f"set2X126_1")]
        setMerged[("X126", f"X126_2")] = setMerged[("set2X126", f"set2X126_2")]
        setMerged[("X126", f"X126_3")] = (setMerged[("set1X126", f"set1X126_3")] + setMerged[("set2X126", f"set2X126_3")])/2
        setMerged.drop(columns=[("set1X126", "set1X126_3"), ("set2X126", "set2X126_3"), ("set2X126", f"set2X126_2"), ("set2X126", "set2X126_1")], inplace=True)
    elif s.startswith("set1X127N"):
        setMerged[("X127N", f"X127N_1")] =  setMerged[("set2X127N", f"set2X127N_3")]
        setMerged[("X127N", f"X127N_2")] = (setMerged[("set1X127N", f"set1X127N_2")] + setMerged[("set2X127N", f"set2X127N_2")])/2
        setMerged[("X127N", f"X127N_3")] = (setMerged[("set1X127N", f"set1X127N_3")] + setMerged[("set2X127N", f"set2X127N_3")])/2
        setMerged.drop(columns=[("set1X127N","set1X127N_2"), ("set1X127N","set1X127N_3"), ("set2X127N","set2X127N_1"), ("set2X127N","set2X127N_2"), ("set2X127N","set2X127N_3")], inplace=True)
    elif s.startswith("set1"):
        # check if S2 is present
        matchedname = s.replace("set1", "set2") # predict the name of the matched set
        basename = s.replace("set1", "")
        if matchedname not in first_level:
            print(f"{s} is missing its matched pair {matchedname}")
        else:
            print(f"Found matched set pair {s} and {matchedname}")
            # merge the two sets across reps
            setMerged[(basename, f"{basename}_1")] = (setMerged[(s, f"{s}_1")] + setMerged[(matchedname, f"{matchedname}_1")]) / 2
            setMerged[(basename, f"{basename}_2")] = (setMerged[(s, f"{s}_2")] + setMerged[(matchedname, f"{matchedname}_2")]) / 2
            setMerged[(basename, f"{basename}_3")] = (setMerged[(s, f"{s}_3")] + setMerged[(matchedname, f"{matchedname}_3")]) / 2
            # drop the original columns
            setMerged.drop(columns=[(s, f"{s}_1"), (s, f"{s}_2"), (s, f"{s}_3"), (matchedname, f"{matchedname}_1"), (matchedname, f"{matchedname}_2"), (matchedname, f"{matchedname}_3")], inplace=True)

In [5]:
setMerged.columns

MultiIndex([(      'P1',        'P1_1'),
            (      'P1',        'P1_2'),
            (      'P1',        'P1_3'),
            (      'P2',        'P2_1'),
            (      'P2',        'P2_2'),
            (      'P2',        'P2_3'),
            (      'P3',        'P3_1'),
            (      'P3',        'P3_2'),
            (      'P3',        'P3_3'),
            (      'P4',        'P4_1'),
            (      'P4',        'P4_2'),
            (      'P4',        'P4_3'),
            (      'P5',        'P5_1'),
            (      'P5',        'P5_2'),
            (      'P5',        'P5_3'),
            (      'P6',        'P6_1'),
            (      'P6',        'P6_2'),
            (      'P6',        'P6_3'),
            (      'P7',        'P7_1'),
            (      'P7',        'P7_2'),
            (      'P7',        'P7_3'),
            (      'P8',        'P8_1'),
            (      'P8',        'P8_2'),
            (      'P8',        'P8_3'),
            (   

### check synthetic proteome (sum of fractions should = 1)

In [6]:
## create synthetic proteome
df = setMerged

# Define the replicates and the new column names
replicates = [str(i) for i in range(1, 7)]
new_columns = [('synthetic_proteome', f'synthetic_proteome_{i}') for i in range(1, 3)]

# Sum the corresponding replicates 
synthetic_proteome_data = {
    new_col: df.loc[:, df.columns.get_level_values(1).str.endswith(replicate)].sum(axis=1)
    for new_col, replicate in zip(new_columns, replicates)
}

# Convert to DataFrame and concatenate with the original DataFrame
synthetic_proteome_df = pd.DataFrame(synthetic_proteome_data)

# Convert the new DataFrame to a multi-indexed DataFrame
synthetic_proteome_df.columns = pd.MultiIndex.from_tuples(synthetic_proteome_df.columns, names=['Samples', 'Replicates'])

# Concatenate the new synthetic proteome columns with the original DataFrame
df = pd.concat([df, synthetic_proteome_df], axis=1)

bait_imputed_table = df

In [7]:
bait_imputed_table

Samples,P1,P1,P1,P2,P2,P2,P3,P3,P3,P4,...,P8,P9,P9,P9,SN,SN,SN,metadata,synthetic_proteome,synthetic_proteome
Replicates,P1_1,P1_2,P1_3,P2_1,P2_2,P2_3,P3_1,P3_2,P3_3,P4_1,...,P8_3,P9_1,P9_2,P9_3,SN_1,SN_2,SN_3,Protein IDs,synthetic_proteome_1,synthetic_proteome_2
0,0.084,0.142,0.117,0.116,0.100,0.103,0.099,0.107,0.106,0.090,...,0.058,0.066,0.067,0.054,0.252,0.258,0.228,P09382,0.976,1.000
1,0.112,0.202,0.230,0.199,0.177,0.152,0.207,0.134,0.177,0.163,...,0.033,0.013,0.034,0.007,0.003,0.004,0.002,P63218,1.018,0.988
2,0.054,0.068,0.056,0.070,0.092,0.071,0.093,0.140,0.113,0.100,...,0.060,0.078,0.046,0.055,0.222,0.203,0.195,P04792,0.994,1.000
3,0.051,0.052,0.049,0.069,0.079,0.066,0.086,0.119,0.105,0.093,...,0.057,0.061,0.057,0.061,0.243,0.251,0.226,P60174-1,0.976,0.971
4,0.048,0.096,0.045,0.058,0.064,0.063,0.082,0.146,0.099,0.107,...,0.064,0.072,0.048,0.059,0.166,0.072,0.230,P00558,0.966,0.965
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6832,0.081,0.110,0.094,0.103,0.091,0.072,0.103,0.124,0.107,0.118,...,0.094,0.100,0.095,0.088,0.117,0.150,0.144,Q86XP1-5,1.000,1.001
6833,0.079,0.153,0.103,0.186,0.268,0.229,0.189,0.159,0.221,0.133,...,0.014,0.052,0.005,0.000,0.016,0.004,0.001,Q01955,1.000,0.999
6834,0.046,0.036,0.048,0.082,0.079,0.078,0.088,0.067,0.091,0.101,...,0.149,0.035,0.068,0.062,0.003,0.004,0.007,Q9BXX2,1.001,0.999
6835,0.047,0.044,0.126,0.080,0.052,0.070,0.062,0.043,0.093,0.067,...,0.144,0.197,0.253,0.117,0.014,0.008,0.040,Q9UKN7,1.000,0.976


### create fraction table

In [8]:
# Filter out the metadata and synthetic proteome columns
filtered_df = df[[col for col in df.columns if col[0] not in ['metadata', 'synthetic_proteome']]]
# Average columns ending with _1, _2, and _3 under the same first level index
grouped_df = filtered_df.groupby(level=0, axis=1).mean()
grouped_df.columns = pd.MultiIndex.from_product([['sample'], grouped_df.columns])
grouped_df

Unnamed: 0_level_0,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample
Samples,P1,P2,P3,P4,P5,P6,P7,P8,P9,SN
0,0.114333,0.106333,0.104000,0.094667,0.077333,0.069333,0.059333,0.053667,0.062333,0.246000
1,0.181333,0.176000,0.172667,0.178333,0.105000,0.060667,0.066333,0.036333,0.018000,0.003000
2,0.059333,0.077667,0.115333,0.108000,0.117333,0.126667,0.077000,0.049333,0.059667,0.206667
3,0.050667,0.071333,0.103333,0.096667,0.108000,0.124000,0.075000,0.049333,0.059667,0.240000
4,0.063000,0.061667,0.109000,0.119333,0.122333,0.149333,0.082000,0.054000,0.059667,0.156000
...,...,...,...,...,...,...,...,...,...,...
6832,0.095000,0.088667,0.111333,0.121000,0.103333,0.104667,0.070000,0.075667,0.094333,0.137000
6833,0.111667,0.227667,0.189667,0.172000,0.128000,0.067333,0.049667,0.027667,0.019000,0.007000
6834,0.043333,0.079667,0.082000,0.109667,0.128000,0.113333,0.212000,0.172333,0.055000,0.004667
6835,0.072333,0.067333,0.066000,0.073667,0.076667,0.078667,0.129000,0.218667,0.189000,0.020667


In [9]:
metadata_df = df[[col for col in df.columns if col[0] == 'metadata']]
final_fraction_table = pd.concat([metadata_df, grouped_df], axis=1)
final_fraction_table.columns.names = [None, None]
final_fraction_table.index.names = [None]
final_fraction_table

Unnamed: 0_level_0,metadata,sample,sample,sample,sample,sample,sample,sample,sample,sample,sample
Unnamed: 0_level_1,Protein IDs,P1,P2,P3,P4,P5,P6,P7,P8,P9,SN
0,P09382,0.114333,0.106333,0.104000,0.094667,0.077333,0.069333,0.059333,0.053667,0.062333,0.246000
1,P63218,0.181333,0.176000,0.172667,0.178333,0.105000,0.060667,0.066333,0.036333,0.018000,0.003000
2,P04792,0.059333,0.077667,0.115333,0.108000,0.117333,0.126667,0.077000,0.049333,0.059667,0.206667
3,P60174-1,0.050667,0.071333,0.103333,0.096667,0.108000,0.124000,0.075000,0.049333,0.059667,0.240000
4,P00558,0.063000,0.061667,0.109000,0.119333,0.122333,0.149333,0.082000,0.054000,0.059667,0.156000
...,...,...,...,...,...,...,...,...,...,...,...
6832,Q86XP1-5,0.095000,0.088667,0.111333,0.121000,0.103333,0.104667,0.070000,0.075667,0.094333,0.137000
6833,Q01955,0.111667,0.227667,0.189667,0.172000,0.128000,0.067333,0.049667,0.027667,0.019000,0.007000
6834,Q9BXX2,0.043333,0.079667,0.082000,0.109667,0.128000,0.113333,0.212000,0.172333,0.055000,0.004667
6835,Q9UKN7,0.072333,0.067333,0.066000,0.073667,0.076667,0.078667,0.129000,0.218667,0.189000,0.020667


In [10]:
# saving the final tables to files
enrich_out_dir = outdir / "fraction_tables"
os.makedirs(enrich_out_dir, exist_ok=True)

enrichment_csv_path = enrich_out_dir / f"{timestamp}_fraction_table.csv"

final_fraction_table.to_csv(enrichment_csv_path)