# Preprocessing

Preprocessing includes QC filtering, log2 intensity transformation, removal of rows with insufficient data, and global imputation.  
In this notebook we process the IP columns; the N/O/C columns are processed separately in another notebook.

In [5]:
import sys,os
from pathlib import Path
import pandas as pd

# Climb seven directory levels up from the current working directory; the
# "script" and "data" folders are expected to sit side by side at that level.
base_dir = Path.cwd()
for _level in range(7):
    base_dir = base_dir.parent

script_path = base_dir / "script"
data_path = base_dir / "data"

# Make the packages stored under script/ importable from this notebook.
sys.path.append(str(script_path))

from pyseus import basic_processing as ip

## Define input and output

In [6]:
# Run identifier used as the prefix of every output file.
# To restore it from another notebook instead, use: %store -r timestamp
timestamp = "2024-07-16"
print(f"Timestamp: {timestamp}")

# Name of the input table and common prefix for the output files
csv = "itzhak2016stcSILAC.csv"
outprefix = f"{timestamp}_QC_filter_impute"

# Output layout: <cwd>/output/preprocessing/ with a logs/ sub-folder
outdir = Path.cwd() / "output"
preprocessing_out = outdir / "preprocessing"
preprocessing_log = preprocessing_out / "logs"

# Creating the deepest folder also creates any missing parent folders
os.makedirs(preprocessing_log, exist_ok=True)

Timestamp: 2024-07-16


## Import the protein groups table and process the column names


In [7]:
# Import the protein groups table.
# Each handler prints a human-readable hint and then re-raises: without the
# re-raise, `pgroups` would stay undefined and the next cell would fail with
# an unrelated NameError that hides the real cause.
pgroups_path = data_path / "external" / csv

try:
    pgroups = pd.read_csv(pgroups_path, index_col=None, low_memory=False, sep=',')
except FileNotFoundError:
    print(f"File {pgroups_path} not found.\nPlease note that this file is 670MB and is not included in the repository.")
    raise
except pd.errors.ParserError:
    print(f"There was an error parsing the CSV file at {pgroups_path}.")
    raise
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    raise

In [8]:
# Define the columns to exclude: a column is flagged when its name contains
# any of the substrings in one of the tag groups below.
infection_tags = ('Infected', '-infected', '_infected')
lysis_sorting_tags = ('Harsh', "Unsorted")
treatment_tags = ('brefeldin', 'arsenite')

all_columns = list(pgroups.columns)
exclusion_cols1 = [col for col in all_columns if any(tag in col for tag in infection_tags)]
exclusion_cols2 = [col for col in all_columns if any(tag in col for tag in lysis_sorting_tags)]
exclusion_cols3 = [col for col in all_columns if any(tag in col for tag in treatment_tags)]

# De-duplicated, alphabetically sorted union of the three groups
exclusion_cols = sorted(set(exclusion_cols1).union(exclusion_cols2, exclusion_cols3))

In [9]:
# The inclusion columns are simply every column that was not flagged for
# exclusion; the original column order of the table is preserved.
excluded_lookup = set(exclusion_cols)
inclusion_cols = [col for col in pgroups.columns if col not in excluded_lookup]

In [10]:
# Rename columns based on rules defined as regular expressions.
# Experimental states (e.g. infection) are part of the sample name and are
# therefore joined with a dash instead of an underscore.
# NOTE: the name `re` is reused by a later cell, so it is kept even though it
# would shadow the standard-library `re` module if that were ever imported here.
re = ["infected", '_InfectedOC43', 'WT_HEK', '_Unsorted', '_HarshLysis', '_rep1', '_rep2', '_rep3', r'MAP([123456])_(.*)', r'( \d{2})_', 'cyt'] 
# Replacement strings, matched by position to the patterns above
replacement_re=["Infected",'-infected', 'WT', '-unsorted', '-HarshLysis', '_1', '_2', '_3', r'\2_\1', r'\1-', 'Cyt']

# The rules are applied in the order given in the lists above.
# This call only checks that the renaming works as intended; its result is
# not used to modify the table.
sample_cols = ip.sample_rename(inclusion_cols, RE=re, replacement_RE=replacement_re, repl_search=False) 

# Subset pgroups to the inclusion columns.
# The explicit .copy() makes the subset an independent frame, so adding the
# columns below does not raise pandas' SettingWithCopyWarning.
pgroups = pgroups[inclusion_cols].copy()

# Create empty QC columns (all values None)
pgroups["Potential contaminant"] = None
pgroups["Reverse"] = None
pgroups["Only identified by site"] = None

In [11]:
# Actual renaming of the table

# Column passed as info_cols (kept apart from the sample columns)
meta_cols = ['Unnamed: 0']

# Initiate RawTables.
# Duo: type selection can't be used together with sample_cols selection
preprocessing = ip.RawTables(
    proteingroup=pgroups,
    sample_cols=list(pgroups),
    file_designated=True,
    info_cols=meta_cols,
    intensity_type='log H/L',
)

# Rename the columns using the pattern/replacement lists defined earlier.
# TODO(Duo): open question from the author - is this call a dry run too?
preprocessing.rename_columns(RE=re, replacement_RE=replacement_re, repl_search=False)

# The renamed table and the renamed column names are available through
# these attributes of the class object
_ = preprocessing.renamed_table
_ = preprocessing.sample_cols

In [12]:
# Write the sample column names to a log file for examination.
# Only names beginning with "log H/L" are written, one per line.
sample_cols_log = preprocessing_log / f"{outprefix}__sample_cols.txt"
with open(sample_cols_log, "w") as f:
    f.writelines(
        f"{name}\n"
        for name in preprocessing.sample_cols
        if name.startswith("log H/L")
    )

## Preprocessing


In [13]:
# Continue with the RawTables object that was created for the renaming step.

# Filter the table based on the MaxQuant QC columns
preprocessing.filter_table(
    select_intensity=True,
    verbose=True,
)

# Intensity transformation is deliberately skipped:
#preprocessing.transform_intensities() # do not transform intensities b/c they are log10 H/L ratio

# Group replicates and remove insufficient rows.
# The pattern captures everything before a trailing "_<digits>" suffix.
replicate_pattern = r"(.*)_\d+$"
preprocessing.group_replicates(reg_exp=replicate_pattern)

Filtered 0 of 5265 rows. Now 5265 rows.
Intensity values have not been transformed yet from filtered table,
we recommend using transform_intensities() method before grouping replicates.

Using filtered_table to group replicates.


In [14]:
# require proteins to have at least two valid values in the proteome samples
#preprocessing.remove_invalid_rows_custom(["UnInfected_Proteome"])

In [15]:
# Author's description of the filter: remove rows that do not have at least
# one group that has less than n invalid values in all replicates.
preprocessing.remove_invalid_rows_custom(n=5)

# Save the table as it is before imputation
preimpute_csv = preprocessing_out / f"{outprefix}_preimpute_table.csv"
preprocessing.preimpute_table.to_csv(preimpute_csv)

# Impute NaN values; local=False selects global imputation.
# NOTE(review): imputation presumably draws random values and no seed is set
# in this notebook, so results may differ between runs - confirm.
preprocessing.bait_impute(
    distance=1.8,
    width=0.3,
    local=False,
)

Removing invalid rows for 5 groups
Removed invalid rows. 4928 from 5265 rows remaining.


Save the imputed table

In [16]:
# Preprocessing is finished; the result lives in this attribute of the class
# object and can be saved or used directly from here on.
imputed_table = preprocessing.bait_imputed_table

# Write the column names to a file for examination
imputed_cols_csv = preprocessing_out / f"{outprefix}_imputed_table_cols.csv"
imputed_table.columns.to_frame().to_csv(imputed_cols_csv)

# Write the preprocessed pg table to a csv file (this is required for next steps)
imputed_csv = preprocessing_out / f"{outprefix}_imputed_table.csv"
imputed_table.to_csv(imputed_csv)

# Optional: write the preprocessed pg table to a tabular file (for Perseus)
#preprocessing.bait_imputed_table.to_csv(preprocessing_out / f"{outprefix}_imputed_table.tab", sep='\t')

# Write the filtered table to file (for NOC processing)
filtered_csv = preprocessing_out / f"{outprefix}_filtered_table.csv"
preprocessing.filtered_table.to_csv(filtered_csv)

In [17]:
# View the bait_imputed_table.
# As the last expression of the cell, the table is rendered as the cell output.
preprocessing.bait_imputed_table

Samples,03K,03K,03K,03K,03K,03K,06K,06K,06K,06K,...,24K,24K,24K,80K,80K,80K,80K,80K,80K,metadata
Replicates,03K_1,03K_2,03K_3,03K_4,03K_5,03K_6,06K_1,06K_2,06K_3,06K_4,...,24K_4,24K_5,24K_6,80K_1,80K_2,80K_3,80K_4,80K_5,80K_6,Unnamed: 0
0,0.088700,0.191800,0.290200,0.137400,-0.1380,0.14990,-0.624800,-0.547800,-0.485800,-0.602300,...,0.419300,0.6630,0.791100,1.594100,1.505200,1.664100,1.551400,1.4232,1.491400,Q9NRG9
1,-0.297300,-0.217300,-0.306900,-0.290800,-0.4641,-0.11090,0.151100,0.269800,0.294700,0.086100,...,-0.346400,-0.6437,-0.541900,-0.899400,-0.567200,-0.471900,-0.698800,-0.8377,-0.685500,Q2M2I8
2,-0.834000,-2.095883,-2.195958,-2.542010,-0.8049,-1.95859,-0.367800,-2.325760,-1.259481,-1.575801,...,-2.452897,-0.7046,-2.977117,-0.409800,-1.857060,-2.628215,-2.280164,-0.9385,-1.919861,Q13685
3,-0.963600,-0.987400,-0.986600,-1.309200,-1.1398,-0.73760,-0.924300,-0.892500,-0.816600,-0.700200,...,-1.303900,-1.3813,-1.370500,-0.766100,-0.810300,-0.722700,-0.890900,-0.9703,-1.080300,P49588
4,-1.185800,-1.168600,-1.169400,-1.354200,-1.3094,-0.89680,-0.697900,-0.491700,-0.487700,-0.663600,...,4.980100,0.1054,8.050300,4.789300,4.624000,4.849600,4.603400,-0.6064,7.758900,Q5JTZ9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4923,-3.608385,-2.487903,-3.070146,-2.008237,1.7643,1.32340,-2.585043,-1.909081,-2.003014,-2.813043,...,-2.444080,-1.0664,-1.107200,-2.323559,-2.645602,-2.127740,-2.394431,-1.5355,-1.472800,Q9NWK9
4924,-2.511720,-2.188211,-0.381500,0.026600,-0.6059,0.07930,-2.661468,-3.336249,-0.054400,0.185200,...,-0.136400,0.4569,0.636700,-2.926703,-2.817323,0.265700,0.320700,0.4349,0.453300,Q8NHG8
4925,-1.108300,-1.429200,-0.934800,-1.429400,-1.2095,-0.63570,-0.856800,-1.031100,-0.846200,-1.122500,...,-1.637200,-1.3830,-1.715400,-1.065100,-0.766000,-1.773800,-0.994500,-1.2307,-1.340300,O95218
4926,0.252900,0.310500,0.475700,0.240600,0.0116,0.35480,-0.720900,-0.562000,-0.445600,-0.559500,...,0.445900,0.3751,0.703400,1.330400,1.439700,1.440300,1.114700,1.0065,1.083000,O43264
