# Preprocessing

Pre-processing includes QC filtering, Log2 intensity transformation, removing rows that have insufficient data, and global imputation  
In this notebook, we processes the IP columns, and the N/O/C columns will be separately processed in another notebook

In [1]:
import os
import sys
from pathlib import Path
import numpy as np
import pandas as pd

script_path = Path.cwd().parent.parent.parent.parent / "script"
data_path = Path.cwd().parent.parent.parent.parent / "data"
sys.path.append(str(script_path))

from pyseus import basic_processing as ip

### Define input and output files

In [2]:
%store -r fig5_timestamp FIG5_USE_FROZEN
if FIG5_USE_FROZEN:
    raise Exception("USE_FROZEN is true, you probably want to skip enrichment and proceed from 3.aligned_umap")
timestamp = fig5_timestamp

maxQuant_csv = "proteinGroups.txt"
# the above file is available through the PRIDE repository under the identifier PXD046440
# it is also available via the following FTP link: ftp://ftp.pride.ebi.ac.uk/pride/data/archive/2023/12/PXD046440/proteinGroups.txt

outprefix = f"{timestamp}_QC_filter_impute"

outdir = Path.cwd() / "output"

preprocessing_out = outdir / f"preprocessing"
os.makedirs(preprocessing_out, exist_ok=True)

preprocessing_log = preprocessing_out / f"logs"
os.makedirs(preprocessing_log, exist_ok=True)

### Read the infected enrichment table
The purpose of this is to get a list of infected samples, so we can subset the uninfected samples to create matching sets of samples

In [3]:
filename = f'{timestamp}_enrichment_table_NOC_prop.csv'
infected_enrichment_path = Path.cwd().parent / "1.infected_enrichment" / "output" / "enrichment_and_volcano_tables" / filename 

try:
    infected_enrichments = pd.read_csv(infected_enrichment_path, header=[0, 1], index_col=0)
except FileNotFoundError:
    print(f"File {infected_enrichment_path} not found.\n please run the infected_enrichment notebooks first.")
except pd.errors.ParserError:
    print(f"There was an error parsing the CSV file at {infected_enrichment_path}.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

In [4]:
infected_samples = infected_enrichments["sample"].columns
# predict the names of uninfected samples from infected samples
uninfected_pred = [
    i.split("_")[0] for i in infected_samples if not i.startswith("NOC")
] + ["Uninfected-Cytosol", "Uninfected-Nuclear", "Uninfected-Organelle", "Uninfected-Proteome"]

In [5]:
print(f"the number of infected samples is {len(infected_samples)}")

the number of infected samples is 38


### Load proteingroups table

In [6]:
# import the protein groups table
pgroups_path = data_path / "MaxQuant" / maxQuant_csv

try:
    pgroups = pd.read_csv(pgroups_path, index_col=None, low_memory=False, sep='\t')
except FileNotFoundError:
    print(f"File {pgroups_path} not found.\nPlease note that this file is 670MB and is not included in the repository.")
except pd.errors.ParserError:
    print(f"There was an error parsing the CSV file at {pgroups_path}.")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

In [7]:
# define exclusion columns to remove non-relavent samples from the mass spectrometry master file
exclusion_cols1 = [x for x in list(pgroups.columns) if 'Infected' in x or "_infected" in x or '-infected' in x] # exclude infected samples
exclusion_cols2 = [x for x in list(pgroups.columns) if 'Harsh' in x or "Unsorted" in x] # exclude harsh and unsorted samples
exclusion_cols3 = [x for x in list(pgroups.columns) if 'brefeldin' in x or 'arsenite' in x]  # exclude brefeldin and arsenite samples

# select uninfected samples that matches the infected samples
exclusion_cols4 = [x for x in list(pgroups.columns) if not any([i in x.replace("_", "-") for i in uninfected_pred])] 

exclusion_cols = sorted(list(set(exclusion_cols1 + exclusion_cols2 + exclusion_cols3 + exclusion_cols4)))

# fix capitalization errors in column names
exclusion_cols = [i for i in exclusion_cols if i.startswith("LFQ")]

# write exclusion_col to file
with open(preprocessing_log / f"{outprefix}_exclusion_cols.txt", 'w') as f:
    for item in exclusion_cols:
        if item.startswith("LFQ"):
            f.write(f"{item}\n")

In [8]:
# use the inverse of the exclusion columns to select the inclusion columns
inclusion_cols = [x for x in list(pgroups.columns) if x not in exclusion_cols]
# write inclusion column names to file for examination
count = 0
with open(preprocessing_log / f"{outprefix}_inclusion_cols.txt", 'w') as f:
    for item in inclusion_cols:
        if item.startswith("LFQ"):
            f.write("%s\n" % item)

print(f"the number of columns in the protein groups table is {int(count/3)}, not including the NOC columns")

the number of columns in the protein groups table is 0, not including the NOC columns


In [9]:
# sample naming rule: a single underscore is used to separate the sample name from the replicate number, for example SampleA_1, SampleA_2 and SampleA_3 are replicates of SampleA
# the sample name is used to group replicates together
# experimental conditions (eg infection) are part of sample names, and therefore gets a dash instead of underscore

re = ["infected", '_InfectedOC43', 'WT_HEK', '_Unsorted', '_HarshLysis', '_rep1', '_rep2', '_rep3', r'( \d{2})_'] # regular expression to search for
replacement_re=["Infected",'-infected', 'WT', '-unsorted', '-HarshLysis', '_1', '_2', '_3', r'\1-'] # replacement regular expression

# the method will reformat strings by the order given in the previous list
# this is a dry run and intended to test whether the renaming is working as intended
sample_cols = ip.sample_rename(inclusion_cols, RE=re, replacement_RE=replacement_re, repl_search=False) 

# subset pggroups
pgroups = pgroups[inclusion_cols]

In [10]:
# rename samples
meta_cols = ["Protein IDs", "Majority protein IDs", "Gene names"]
preprocessing = ip.RawTables(
    proteingroup=pgroups,
    sample_cols=list(pgroups),
    file_designated=True,
    info_cols=meta_cols,
    intensity_type="LFQ intensity",  # Duo: type selection can't be use together with sample_cols selection
)

# rename files
preprocessing.rename_columns(RE=re, replacement_RE=replacement_re, repl_search=False) 

# you can access renamed table and renamed cols by the following class objects
_ = preprocessing.renamed_table
_ = preprocessing.sample_cols

In [11]:
# write sample column names to file for examination
with open(os.path.join(preprocessing_log / f"{outprefix}__sample_cols.txt"), "w") as f:
    for item in preprocessing.sample_cols:
        if item.startswith("LFQ"):
            f.write("%s\n" % item)

### Pre-processing steps
Pre-processing includes QC filtering, Log2 intensity transformation, removing rows that have insufficient data,
and global imputation

In [12]:
# using the same RawTables class initiated from renaming

# filter table based on MaxQuant qc
preprocessing.filter_table(select_intensity=True, verbose=True)

# transform intensities
preprocessing.transform_intensities()

# group replicates and remove insufficient rows
preprocessing.group_replicates(reg_exp=r'(.*)_\d+$')


Filtered 3699 of 14551 rows. Now 10852 rows.


In [13]:
# require proteins to have at leaset two valid values in the proteome samples
preprocessing.remove_invalid_rows_custom(["UnInfected_Proteome"])

Removing invalid rows for 1 groups
Removed invalid rows. 8641 from 10852 rows remaining.


In [14]:
preprocessing.remove_invalid_rows()

# save preimpute table
preprocessing.preimpute_table.to_csv(preprocessing_out / f"{outprefix}_preimpute_table.csv")

# impute for NaN values. Here we are using global imputation 
preprocessing.bait_impute(distance=1.8, width=0.3, local=False)

Removed invalid rows. 8592 from 8641 rows remaining.


Save the imputed table

In [15]:
# the pre-processing is done, you can save/continue using the below class object
_ = preprocessing.bait_imputed_table

# print out the column names to file for examination
preprocessing.bait_imputed_table.columns.to_frame().to_csv(preprocessing_out / f"{outprefix}_imputed_table_cols.csv")

# write preprocessed pg table to a csv file (this is required for next steps)
preprocessing.bait_imputed_table.to_csv(preprocessing_out / f"{outprefix}_imputed_table.csv")

# write preprossed pg table to tabular file (for Perseus) 
#preprocessing.bait_imputed_table.to_csv(preprocessing_out / f"{outprefix}_imputed_table.tab", sep='\t')

# write the filtered table to file (for NOC processing)
preprocessing.filtered_table.to_csv(preprocessing_out / f"{outprefix}_filtered_table.csv")

In [16]:
# view the bait_imputed_table
preprocessing.bait_imputed_table

Samples,09-ATG101,09-ATG101,09-ATG101,09-EDC4,09-EDC4,09-EDC4,09-HSP90AA1,09-HSP90AA1,09-HSP90AA1,09-PEX3,...,UnInfected_Nuclear,UnInfected_Organelle,UnInfected_Organelle,UnInfected_Organelle,UnInfected_Proteome,UnInfected_Proteome,UnInfected_Proteome,metadata,metadata,metadata
Replicates,09-ATG101_1,09-ATG101_2,09-ATG101_3,09-EDC4_1,09-EDC4_2,09-EDC4_3,09-HSP90AA1_1,09-HSP90AA1_2,09-HSP90AA1_3,09-PEX3_1,...,UnInfected_Nuclear_3,UnInfected_Organelle_1,UnInfected_Organelle_2,UnInfected_Organelle_3,UnInfected_Proteome_1,UnInfected_Proteome_2,UnInfected_Proteome_3,Protein IDs,Majority protein IDs,Gene names
0,22.758700,22.055800,22.344900,22.127400,21.555700,21.013300,17.290477,17.945600,18.595024,21.518943,...,27.402000,26.166300,26.194800,26.580500,26.6141,26.7082,26.391500,A0A023T6R1;Q96A72;F5H6P7;F5H6N1;F5H3U9;F5H124,A0A023T6R1;Q96A72;F5H6P7;F5H6N1,FLJ10292;MAGOHB
1,26.841000,25.911100,26.163100,27.631500,27.575900,26.966300,24.802500,24.010800,24.522600,26.394600,...,27.826500,27.561900,27.218000,27.536700,27.4517,27.5977,27.166400,Q9Y5S9;A0A023T787;A0A0J9YW13,Q9Y5S9;A0A023T787,RBM8A;RBM8
2,25.983100,25.236700,26.205200,24.804400,23.441100,23.368900,18.780569,17.268354,19.501800,23.497800,...,24.453700,25.622300,25.880800,25.308600,24.5685,24.6034,24.675900,A0A0C4DFM1;A0A024QYR3;Q92544;B4DH88;B4DKC1;Q6Z...,A0A0C4DFM1;A0A024QYR3;Q92544;B4DH88;B4DKC1;Q6ZTK5,TM9SF4
3,19.246555,19.072108,18.296520,18.219692,18.835037,25.261300,18.780737,19.214067,17.970534,19.240206,...,19.748326,19.940100,20.019664,17.783050,21.1816,22.3871,21.586900,A0A024QYR6;A0A1V0DNR7;A0A6G6A825;F6KD02;F6KD01...,A0A024QYR6;A0A1V0DNR7;A0A6G6A825;F6KD02;F6KD01...,PTEN
4,26.503800,25.960400,26.380000,24.380600,26.643500,25.297700,23.746600,23.685600,24.023200,26.877700,...,24.529100,26.203800,26.274400,26.066100,24.9183,24.8291,24.590300,Q99805;A0A024QYR8;B3KSG9,Q99805;A0A024QYR8;B3KSG9,TM9SF2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8587,19.592690,17.786008,19.754034,18.415576,18.731608,19.962270,20.484305,20.479634,18.411649,18.036718,...,21.519700,22.276600,22.522900,22.111200,23.2888,23.1555,23.526700,X5D7P8,X5D7P8,STK39
8588,18.020178,20.588900,19.047112,18.805262,21.233000,19.262843,19.094337,18.686778,19.193989,20.639900,...,22.190600,24.613400,24.594600,24.748000,22.2467,22.3373,19.689434,X5D8X9,X5D8X9,CNTNAP2
8589,18.434029,18.280585,19.565471,18.252669,21.148677,20.189538,19.814076,18.754988,18.558138,19.826361,...,19.156836,18.720150,17.989758,18.279934,19.9645,21.0641,19.740000,X5DQV1;X5DNI1;B3KV96;E9PD68;B3KXQ5;Q14194;B3KT...,X5DQV1;X5DNI1;B3KV96;E9PD68;B3KXQ5;Q14194;B3KT...,CRMP1
8590,18.432711,19.144382,19.378049,19.615134,19.229470,18.122063,23.099800,22.859800,22.939800,21.363700,...,21.009700,21.660700,21.886400,20.423280,22.4563,22.7356,22.443400,X5DQZ7,X5DQZ7,GPX1
