# Preprocessing

Pre-processing includes QC filtering, Log2 intensity transformation, removing rows that have insufficient data, and global imputation  
In this notebook, we process the IP columns; the N/O/C columns are processed separately in another notebook.

Note: All notebooks in this folder are for processing **infected** samples

In [1]:
import os, sys
from pathlib import Path
import pandas as pd

# Both shared folders sit four directories above the working directory
# (presumably the repository root — confirm if the notebook is moved).
repo_root = Path.cwd().parent.parent.parent.parent
script_path = repo_root / "script"
data_path = repo_root / "data"

# expose the script folder to the import system so the pyseus import resolves
sys.path.append(str(script_path))

from pyseus import basic_processing as ip

## Define input and output files

In [2]:
%store -r fig5_timestamp FIG5_USE_FROZEN
if FIG5_USE_FROZEN:
    raise Exception("USE_FROZEN is true, you probably want to skip enrichment and proceed from 3.aligned_umap")
timestamp = fig5_timestamp

maxQuant_csv = "proteinGroups.txt" # name of the MaxQuant output file
# see data/MaxQuant/readme.md for more information about this file

outprefix = f"{timestamp}_QC_filter_impute"

outdir = Path.cwd() / "output"

preprocessing_out = outdir / f"preprocessing"
os.makedirs(preprocessing_out, exist_ok=True)

preprocessing_log = preprocessing_out / f"logs"
os.makedirs(preprocessing_log, exist_ok=True)

## Import the proteinGroups table and rename columns

In [3]:
# import the protein groups table (tab-separated MaxQuant output)
pgroups_path = data_path / "MaxQuant" / maxQuant_csv

# Print a readable hint for the common failure modes, then re-raise.
# Every later cell needs `pgroups`: swallowing the error here would either
# surface as an unrelated NameError further down or, worse, let the notebook
# continue with a stale `pgroups` left in the kernel by a previous run.
try:
    pgroups = pd.read_csv(pgroups_path, index_col=None, low_memory=False, sep='\t')
except FileNotFoundError:
    print(f"File {pgroups_path} not found.\nPlease note that this file is 670MB and is not included in the repository.")
    raise
except pd.errors.ParserError:
    print(f"There was an error parsing the CSV file at {pgroups_path}.")
    raise
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    raise

In [4]:
all_columns = list(pgroups.columns)

# candidates for removal (all matches are case-sensitive):
# 1. columns that do not belong to infected samples
exclusion_cols1 = [col for col in all_columns if 'Infected' not in col]
# 2. non-relevant samples from the mass spectrometry master file
exclusion_cols2 = [col for col in all_columns if 'Harsh' in col or "Unsorted" in col]
# 3. drug-treated samples
exclusion_cols3 = [col for col in all_columns if 'brefeldin' in col or 'arsenite' in col]

# Merge the candidates (deduplicated, sorted) and keep only those that
#   - are LFQ columns, so non-LFQ columns are never excluded, and
#   - do not contain lowercase '_infected': such columns fail the case-sensitive
#     'Infected' test above because of inconsistent capitalization and are
#     rescued here instead of being dropped.
candidates = set(exclusion_cols1) | set(exclusion_cols2) | set(exclusion_cols3)
exclusion_cols = sorted(
    col for col in candidates
    if col.startswith("LFQ") and '_infected' not in col
)

# write the excluded column names to a log file for examination
# (every entry already starts with "LFQ", so no further filtering is needed)
with open(preprocessing_log / f"{outprefix}_exclusion_cols.txt", 'w') as f:
    f.writelines(f"{col}\n" for col in exclusion_cols)

In [5]:
# Keep every column that was not excluded (original column order is preserved).
# Membership is tested against a set so the scan stays linear in the number of columns.
excluded = set(exclusion_cols)
inclusion_cols = [col for col in pgroups.columns if col not in excluded]

# write inclusion column names to file for examination
# (only the LFQ columns are listed; other kept columns are not logged)
with open(preprocessing_log / f"{outprefix}_inclusion_cols.txt", 'w') as f:
    f.writelines(f"{col}\n" for col in inclusion_cols if col.startswith("LFQ"))

In [6]:
# Sample naming rule:
#   - a single underscore separates the sample name from the replicate number,
#     e.g. SampleA_1, SampleA_2 and SampleA_3 are replicates of SampleA
#   - the sample name is what groups replicates together
#   - experimental conditions (e.g. infection) are part of the sample name and
#     are therefore joined with a dash instead of an underscore

# Blanket renaming rules as (pattern to search for, replacement) pairs.
# The order of the pairs is preserved in the two lists built below.
rename_rules = [
    ("infected", "Infected"),
    ('_InfectedOC43', '-infected'),
    ('WT_HEK', 'WT'),
    ('_Unsorted', '-unsorted'),
    ('_HarshLysis', '-HarshLysis'),
    ('_rep1', '_1'),
    ('_rep2', '_2'),
    ('_rep3', '_3'),
    (r'( \d{2})_', r'\1-'),
]
# NOTE: `re` shadows the name of the stdlib regex module; the name is kept
# because the following cell passes it on as `RE=re`.
re = [pattern for pattern, _repl in rename_rules]  # regular expressions to search for
replacement_re = [repl for _pattern, repl in rename_rules]  # replacement regular expressions

# dry run: apply the rules to the column names only, to check that the
# renaming works as intended (the table itself is not renamed here)
sample_cols = ip.sample_rename(
    inclusion_cols,
    RE=re,
    replacement_RE=replacement_re,
    repl_search=False,
)

# subset pgroups to the inclusion columns
pgroups = pgroups[inclusion_cols]

In [7]:
# metadata columns that are declared as info columns rather than intensities
meta_cols = ["Protein IDs", "Majority protein IDs", "Gene names"]

# wrap the subsetted table; all of its columns are passed as sample columns
preprocessing = ip.RawTables(
    proteingroup=pgroups,
    sample_cols=list(pgroups.columns),
    info_cols=meta_cols,
    file_designated=True,
    # Duo: type selection can't be used together with sample_cols selection
    intensity_type="LFQ intensity",
)

# apply the renaming rules to the columns of the wrapped table
preprocessing.rename_columns(RE=re, replacement_RE=replacement_re, repl_search=False)

# the renamed table and the renamed column names are available as attributes
_ = preprocessing.renamed_table
_ = preprocessing.sample_cols

In [8]:
# write the renamed sample column names to a log file for examination
# NOTE(review): only names starting with "LFQ" are written — if the renaming
# removes that prefix this file will be empty; confirm against the output.
sample_cols_log = preprocessing_log / f"{outprefix}__sample_cols.txt"
with open(sample_cols_log, "w") as f:
    f.writelines(f"{col}\n" for col in preprocessing.sample_cols if col.startswith("LFQ"))

## Pre-processing steps
Pre-processing includes QC filtering, Log2 intensity transformation, removing rows that have insufficient data,
and global imputation

In [9]:
# filter table based on MaxQuant qc
# (with verbose=True the number of removed rows is printed below the cell)
preprocessing.filter_table(select_intensity=True, verbose=True)

# transform intensities
# (Log2 according to the notebook introduction — confirm against the pyseus default)
preprocessing.transform_intensities()

# group replicates: the regular expression captures everything before the
# trailing "_<number>" as the sample name, so SampleA_1, SampleA_2 and
# SampleA_3 end up in one group; rows with insufficient data are removed
# in the following cells
preprocessing.group_replicates(reg_exp=r"(.*)_\d+$")

Filtered 3699 of 14551 rows. Now 10852 rows.


In [10]:
# require proteins to have at least two valid values in the proteome samples
# (the threshold itself is defined inside pyseus — TODO confirm);
# only the "Infected_Proteome" replicate group is checked here
preprocessing.remove_invalid_rows_custom(["Infected_Proteome"])

Removing invalid rows for 1 groups
Removed invalid rows. 8518 from 10852 rows remaining.


In [11]:
# second pass of invalid-row removal, this time with the default criteria
# (the remaining row count is printed below the cell)
preprocessing.remove_invalid_rows()

# save the pre-imputation table, i.e. the state before missing values are filled in
preprocessing.preimpute_table.to_csv(preprocessing_out / f"{outprefix}_preimpute_table.csv")

# impute for NaN values. Here we are using global imputation (local=False)
# NOTE(review): distance=1.8 and width=0.3 are presumably the down-shift and
# width of the imputation distribution in standard-deviation units — confirm
# against pyseus. If the imputation draws random numbers, consider fixing a
# seed so the notebook reproduces exactly.
preprocessing.bait_impute(distance=1.8, width=0.3, local=False)

Removed invalid rows. 8441 from 8518 rows remaining.


Save the imputed table

In [12]:
# pre-processing is done; the result lives on the object as `bait_imputed_table`
_ = preprocessing.bait_imputed_table

# output files, all under the preprocessing folder and sharing the run prefix
imputed_cols_csv = preprocessing_out / f"{outprefix}_imputed_table_cols.csv"
imputed_table_csv = preprocessing_out / f"{outprefix}_imputed_table.csv"
filtered_table_csv = preprocessing_out / f"{outprefix}_filtered_table.csv"

# column names of the imputed table, for examination
preprocessing.bait_imputed_table.columns.to_frame().to_csv(imputed_cols_csv)

# preprocessed pg table as csv (this is required for next steps)
preprocessing.bait_imputed_table.to_csv(imputed_table_csv)

# optional: preprocessed pg table as a tabular file (for Perseus)
#preprocessing.bait_imputed_table.to_csv(preprocessing_out / f"{outprefix}_imputed_table.tab", sep='\t')

# filtered table (for NOC processing)
preprocessing.filtered_table.to_csv(filtered_table_csv)

In [13]:
# view the bait_imputed_table: as the last expression of the cell it is
# rendered as the cell output. The columns carry two header levels
# ("Samples" and "Replicates"), with the metadata columns at the end.
preprocessing.bait_imputed_table

Samples,09-ATG101_Infected,09-ATG101_Infected,09-ATG101_Infected,09-EDC4_Infected,09-EDC4_Infected,09-EDC4_Infected,09-HSP90AA1_Infected,09-HSP90AA1_Infected,09-HSP90AA1_Infected,09-PEX3_Infected,...,Infected_Nuclear,Infected_Organelle,Infected_Organelle,Infected_Organelle,Infected_Proteome,Infected_Proteome,Infected_Proteome,metadata,metadata,metadata
Replicates,09-ATG101_Infected_1,09-ATG101_Infected_2,09-ATG101_Infected_3,09-EDC4_Infected_1,09-EDC4_Infected_2,09-EDC4_Infected_3,09-HSP90AA1_Infected_1,09-HSP90AA1_Infected_2,09-HSP90AA1_Infected_3,09-PEX3_Infected_1,...,Infected_Nuclear_3,Infected_Organelle_1,Infected_Organelle_2,Infected_Organelle_3,Infected_Proteome_1,Infected_Proteome_2,Infected_Proteome_3,Protein IDs,Majority protein IDs,Gene names
0,24.260500,24.358600,24.385900,22.865300,22.151000,21.708500,18.260169,19.456726,17.998453,19.024308,...,27.994000,26.584800,26.425500,26.627600,26.6353,26.9362,26.805900,A0A023T6R1;Q96A72;F5H6P7;F5H6N1;F5H3U9;F5H124,A0A023T6R1;Q96A72;F5H6P7;F5H6N1,FLJ10292;MAGOHB
1,27.591100,28.154700,28.081300,27.331100,27.514700,27.981200,22.705800,24.896200,23.354900,27.589800,...,28.172500,27.342200,27.501200,27.181700,27.7727,27.6019,27.669100,Q9Y5S9;A0A023T787;A0A0J9YW13,Q9Y5S9;A0A023T787,RBM8A;RBM8
2,26.030600,24.924900,25.224900,26.387600,25.425800,25.138600,18.279800,19.030200,20.627700,24.304600,...,23.545700,25.142600,25.437400,25.503300,24.2675,24.0137,24.122400,A0A0C4DFM1;A0A024QYR3;Q92544;B4DH88;B4DKC1;Q6Z...,A0A0C4DFM1;A0A024QYR3;Q92544;B4DH88;B4DKC1;Q6ZTK5,TM9SF4
3,19.702075,19.414632,19.529941,19.453909,18.586885,19.041059,18.292705,18.927967,19.270496,20.209011,...,19.108638,20.074800,20.354100,19.794800,21.3708,20.5724,18.415118,A0A024QYR6;A0A1V0DNR7;A0A6G6A825;F6KD02;F6KD01...,A0A024QYR6;A0A1V0DNR7;A0A6G6A825;F6KD02;F6KD01...,PTEN
4,25.737100,25.856600,25.781300,26.411500,26.306700,25.613100,19.890700,21.075600,16.337600,26.685400,...,24.733400,25.443300,25.786600,25.736300,24.6176,24.5205,24.443000,Q99805;A0A024QYR8;B3KSG9,Q99805;A0A024QYR8;B3KSG9,TM9SF2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8436,28.299800,28.585900,28.480400,29.245500,29.423700,29.562700,28.106200,28.122500,29.017900,29.427200,...,26.718800,28.170100,27.997400,28.150700,28.7207,28.7476,28.720700,X5D2T3;X1WI28;P27635;B8A6G2;X5D2W5;A6QRI9;Q96L21,X5D2T3;X1WI28;P27635,RPL10
8437,20.271537,18.268604,18.130920,18.470466,20.179247,18.769912,18.616934,18.424734,17.915424,19.594402,...,22.146000,22.301400,22.682000,18.914129,23.6920,23.8120,23.717700,X5D7P8,X5D7P8,STK39
8438,19.575159,20.193766,18.664982,18.349417,18.725533,18.687180,19.582969,17.822112,18.062296,23.278700,...,19.462708,18.912684,20.089757,19.120399,20.3041,21.3493,21.605300,X5DQV1;X5DNI1;B3KV96;E9PD68;B3KXQ5;Q14194;B3KT...,X5DQV1;X5DNI1;B3KV96;E9PD68;B3KXQ5;Q14194;B3KT...,CRMP1
8439,23.316600,22.441400,22.072400,19.262856,18.947131,18.794093,23.674700,23.114700,23.378900,18.252960,...,18.552358,22.925300,22.674300,22.802600,22.7995,22.7228,23.150300,X5DQZ7,X5DQZ7,GPX1
