# Preprocessing

Preprocessing includes QC filtering, log2 intensity transformation, removal of rows with insufficient data, and global imputation.  
In this notebook we process the IP columns; the N/O/C columns are processed separately in another notebook.

In [19]:
import os, sys
from pathlib import Path
import pandas as pd

# Both the helper scripts and the data live two levels above the notebook's
# working directory.
repo_root = Path.cwd().parent.parent
script_path = repo_root / "script"
data_path = repo_root / "data"

# Make the project-local packages under script/ importable.
sys.path.append(str(script_path))

from pyseus import basic_processing as ip

## Define input and output

In [21]:
%store -r timestamp USE_FROZEN
if USE_FROZEN:
    raise Exception("USE_FROZEN is true, you can skip enrichment and proceed from Fig1")

maxQuant_csv = "proteinGroups.txt"
# the above file is available through the PRIDE repository under the identifier PXD046440
# it is also available via the following FTP link: ftp://ftp.pride.ebi.ac.uk/pride/data/archive/2023/12/PXD046440/proteinGroups.txt

outprefix = f"{timestamp}_QC_filter_impute"

outdir = Path.cwd() / "output"

preprocessing_out = outdir / f"preprocessing"
os.makedirs(preprocessing_out, exist_ok=True)

preprocessing_log = preprocessing_out / f"logs"
os.makedirs(preprocessing_log, exist_ok=True)

## Import proteingroups table and process the column names


In [6]:
# Import the protein groups table (MaxQuant output, tab-delimited).
pgroups_path = data_path / "MaxQuant" / maxQuant_csv

# Every later cell needs `pgroups`, so a failed load must stop the notebook.
# Printing and carrying on would either hit a NameError further down or, worse,
# silently reuse a stale `pgroups` left in the kernel by a previous run.
# Any other exception is left to propagate with its normal traceback.
try:
    pgroups = pd.read_csv(pgroups_path, index_col=None, low_memory=False, sep='\t')
except FileNotFoundError:
    # Keep the download hint, then re-raise so execution stops here.
    print(f"File {pgroups_path} not found.\nPlease note that this file is ~670MB and is not included in the repository.\nYou can download it from the PRIDE repository under the identifier PXD046440.")
    raise
except pd.errors.ParserError:
    print(f"There was an error parsing the CSV file at {pgroups_path}.")
    raise

In [7]:
# Build the list of columns to drop from the mass spectrometry master file.
# A column is excluded when its name contains any of the tags below:
#   1. infected IPs (they should not be used in calculating the reference UMAP)
#   2. samples tagged "Harsh" or "Unsorted"
#   3. brefeldin / arsenite samples
# NOTE(review): an earlier comment here also mentioned removing WT samples,
# but no WT tag is matched by this cell - confirm the intent.
all_columns = list(pgroups.columns)
exclusion_cols1 = [col for col in all_columns
                   if any(tag in col for tag in ('Infected', '-infected', '_infected'))]
exclusion_cols2 = [col for col in all_columns
                   if any(tag in col for tag in ('Harsh', "Unsorted"))]
exclusion_cols3 = [col for col in all_columns
                   if any(tag in col for tag in ('brefeldin', 'arsenite'))]

# De-duplicate across the three groups and keep a stable, sorted order.
exclusion_cols = sorted(set(exclusion_cols1) | set(exclusion_cols2) | set(exclusion_cols3))

# Log the excluded LFQ columns, one per line, for examination.
exclusion_log = preprocessing_log / f"{outprefix}_exclusion_cols.txt"
with open(exclusion_log, 'w') as f:
    f.writelines(f"{col}\n" for col in exclusion_cols if col.startswith("LFQ"))

In [8]:
# The inclusion columns are every column that was not excluded above;
# the original column order is preserved.
excluded = set(exclusion_cols)
inclusion_cols = [col for col in pgroups.columns if col not in excluded]

# Log the retained LFQ columns, one per line, for examination.
inclusion_log = preprocessing_log / f"{outprefix}_inclusion_cols.txt"
with open(inclusion_log, 'w') as f:
    f.writelines(f"{col}\n" for col in inclusion_cols if col.startswith("LFQ"))

In [11]:
# Sample naming rule:
#   - a single underscore separates the sample name from the replicate number,
#     e.g. SampleA_1, SampleA_2 and SampleA_3 are replicates of SampleA
#   - the sample name is what groups replicates together
#   - experimental conditions (e.g. infection) are part of the sample name and
#     are therefore joined with a dash instead of an underscore

# Renaming rules as (pattern to search for, replacement) pairs.
# They are blanket rules and are passed to pyseus in the order listed.
rename_rules = [
    ("infected", "Infected"),
    ('_InfectedOC43', '-infected'),
    ('WT_HEK', 'WT'),
    ('_Unsorted', '-unsorted'),
    ('_HarshLysis', '-HarshLysis'),
    ('_rep1', '_1'),
    ('_rep2', '_2'),
    ('_rep3', '_3'),
    (r'( \d{2})_', r'\1-'),
]
# NOTE: `re` is a plain list of patterns here, not the stdlib `re` module;
# the name is kept because a later cell refers to it.
re = [pattern for pattern, _replacement in rename_rules]
replacement_re = [replacement for _pattern, replacement in rename_rules]

# Dry run on the column names only, to check that the renaming works as intended.
sample_cols = ip.sample_rename(inclusion_cols, RE=re, replacement_RE=replacement_re, repl_search=False)

# Subset pgroups to the inclusion columns.
pgroups = pgroups[inclusion_cols]

In [12]:
# Metadata (non-intensity) columns to carry along with the samples.
meta_cols = ['Protein IDs', 'Majority protein IDs', 'Gene names']

# Wrap the subsetted table in a RawTables object; all columns of pgroups are
# passed as sample columns.
preprocessing = ip.RawTables(
    proteingroup=pgroups,
    sample_cols=list(pgroups),
    file_designated=True,
    info_cols=meta_cols,
    intensity_type='LFQ intensity',  # Duo: type selection can't be use together with sample_cols selection
)

# Apply the renaming rules to the table columns.
# Duo: this is a dry run too right?
preprocessing.rename_columns(RE=re, replacement_RE=replacement_re, repl_search=False)

# The renamed table and the renamed columns are available as these attributes.
_ = preprocessing.renamed_table
_ = preprocessing.sample_cols

In [13]:
# Log the renamed LFQ sample columns, one per line, for examination.
sample_cols_log = preprocessing_log / f"{outprefix}__sample_cols.txt"
with open(sample_cols_log, "w") as f:
    f.writelines(f"{col}\n" for col in preprocessing.sample_cols if col.startswith("LFQ"))

## Preprocessing


In [14]:
# Continue with the RawTables object created for the renaming step.

# 1. Filter the table based on the MaxQuant QC columns (prints how many rows were dropped).
preprocessing.filter_table(select_intensity=True, verbose=True)

# 2. Transform the intensities.
preprocessing.transform_intensities()

# 3. Group replicates: everything before the final "_<number>" is the sample name.
#    Rows with insufficient data are removed in the following cells.
replicate_pattern = r"(.*)_\d+$"
preprocessing.group_replicates(reg_exp=replicate_pattern)

Filtered 3699 of 14551 rows. Now 10852 rows.


In [15]:
# Apply the validity filter to the proteome samples only.
# Per the original note, proteins need at least two valid values in this group;
# the threshold itself lives inside pyseus - TODO confirm.
proteome_groups = ["UnInfected_Proteome"]
preprocessing.remove_invalid_rows_custom(proteome_groups)

Removing invalid rows for 1 groups
Removed invalid rows. 8641 from 10852 rows remaining.


In [16]:
# Drop the remaining rows that pyseus considers invalid.
preprocessing.remove_invalid_rows()

# Save the table as it stands before imputation.
preimpute_csv = preprocessing_out / f"{outprefix}_preimpute_table.csv"
preprocessing.preimpute_table.to_csv(preimpute_csv)

# Impute the NaN values. local=False selects global imputation;
# distance and width are passed through to pyseus unchanged.
impute_settings = dict(distance=1.8, width=0.3, local=False)
preprocessing.bait_impute(**impute_settings)

Removed invalid rows. 8599 from 8641 rows remaining.


Save the imputed table

In [17]:
# Preprocessing is complete; the result is available as this attribute and can
# be saved or used directly.
_ = preprocessing.bait_imputed_table

# Column names of the imputed table, written out for examination.
imputed_cols_csv = preprocessing_out / f"{outprefix}_imputed_table_cols.csv"
preprocessing.bait_imputed_table.columns.to_frame().to_csv(imputed_cols_csv)

# The preprocessed protein groups table as CSV (required for the next steps).
imputed_csv = preprocessing_out / f"{outprefix}_imputed_table.csv"
preprocessing.bait_imputed_table.to_csv(imputed_csv)

# Optional: the same table as a tab-separated file (for Perseus).
#preprocessing.bait_imputed_table.to_csv(preprocessing_out / f"{outprefix}_imputed_table.tab", sep='\t')

# The filtered table, used by the NOC processing notebook.
filtered_csv = preprocessing_out / f"{outprefix}_filtered_table.csv"
preprocessing.filtered_table.to_csv(filtered_csv)

In [18]:
# View the bait_imputed_table: as the last expression in the cell it is shown
# with the notebook's rich display.
preprocessing.bait_imputed_table

Samples,01-CAPRIN1,01-CAPRIN1,01-CAPRIN1,01-WT,01-WT,01-WT,02-ATG101,02-ATG101,02-ATG101,02-COPE,...,UnInfected_Nuclear,UnInfected_Organelle,UnInfected_Organelle,UnInfected_Organelle,UnInfected_Proteome,UnInfected_Proteome,UnInfected_Proteome,metadata,metadata,metadata
Replicates,01-CAPRIN1_1,01-CAPRIN1_2,01-CAPRIN1_3,01-WT_1,01-WT_2,01-WT_3,02-ATG101_1,02-ATG101_2,02-ATG101_3,02-COPE_1,...,UnInfected_Nuclear_3,UnInfected_Organelle_1,UnInfected_Organelle_2,UnInfected_Organelle_3,UnInfected_Proteome_1,UnInfected_Proteome_2,UnInfected_Proteome_3,Protein IDs,Majority protein IDs,Gene names
0,19.956102,19.323727,19.061507,17.631034,21.820300,18.031756,22.213900,22.508000,17.138793,21.318200,...,27.402000,26.166300,26.194800,26.580500,26.6141,26.7082,26.391500,A0A023T6R1;Q96A72;F5H6P7;F5H6N1;F5H3U9;F5H124,A0A023T6R1;Q96A72;F5H6P7;F5H6N1,FLJ10292;MAGOHB
1,27.599200,27.686800,27.994100,26.047200,25.926300,26.058400,25.535200,25.355900,25.559500,24.354100,...,27.826500,27.561900,27.218000,27.536700,27.4517,27.5977,27.166400,Q9Y5S9;A0A023T787;A0A0J9YW13,Q9Y5S9;A0A023T787,RBM8A;RBM8
2,22.974200,23.060700,22.077500,20.868400,23.255800,23.916100,25.747000,26.199600,25.321800,27.098100,...,24.453700,25.622300,25.880800,25.308600,24.5685,24.6034,24.675900,A0A0C4DFM1;A0A024QYR3;Q92544;B4DH88;B4DKC1;Q6Z...,A0A0C4DFM1;A0A024QYR3;Q92544;B4DH88;B4DKC1;Q6ZTK5,TM9SF4
3,18.622806,19.593637,22.123400,18.085099,19.198930,19.627753,19.960159,18.038397,19.447209,18.609645,...,19.346971,19.940100,20.431940,19.092715,21.1816,22.3871,21.586900,A0A024QYR6;A0A1V0DNR7;A0A6G6A825;F6KD02;F6KD01...,A0A024QYR6;A0A1V0DNR7;A0A6G6A825;F6KD02;F6KD01...,PTEN
4,24.979200,24.601300,24.323600,25.159500,26.003900,25.245500,27.260100,27.272800,27.695400,28.656600,...,24.529100,26.203800,26.274400,26.066100,24.9183,24.8291,24.590300,Q99805;A0A024QYR8;B3KSG9,Q99805;A0A024QYR8;B3KSG9,TM9SF2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8594,18.734276,18.823468,18.741720,19.456703,18.283421,18.834527,18.392407,18.709709,18.993311,19.050077,...,21.519700,22.276600,22.522900,22.111200,23.2888,23.1555,23.526700,X5D7P8,X5D7P8,STK39
8595,18.067610,19.594506,17.602267,18.402874,21.304148,22.656200,21.678400,22.075300,21.941200,22.102400,...,22.190600,24.613400,24.594600,24.748000,22.2467,22.3373,18.716295,X5D8X9,X5D8X9,CNTNAP2
8596,19.773653,18.991080,17.976966,18.590005,18.113036,18.277395,18.412942,17.893417,17.981675,20.402748,...,19.186834,18.440103,18.319022,19.167048,19.9645,21.0641,19.740000,X5DQV1;X5DNI1;B3KV96;E9PD68;B3KXQ5;Q14194;B3KT...,X5DQV1;X5DNI1;B3KV96;E9PD68;B3KXQ5;Q14194;B3KT...,CRMP1
8597,19.532135,18.603817,17.728151,16.830791,19.440189,18.805794,19.237272,19.939457,18.349270,20.083819,...,21.009700,21.660700,21.886400,19.290562,22.4563,22.7356,22.443400,X5DQZ7,X5DQZ7,GPX1
