# Preprocessing

Pre-processing includes QC filtering, log2 intensity transformation, removal of rows that have insufficient data, and global imputation. (In this notebook the log2 transformation call is commented out, so that step is skipped.)  
In this notebook, we process the IP columns; the N/O/C columns are processed separately in another notebook.

In [5]:
import sys,os
from pathlib import Path
import pandas as pd

# Climb seven directory levels from the notebook's working directory; the
# "script" and "data" folders are looked up in that ancestor directory.
repo_root = Path.cwd()
for _ in range(7):
    repo_root = repo_root.parent

script_path = repo_root / "script"
data_path = repo_root / "data"

# Put the script folder on the import path so `pyseus` can be imported below.
sys.path.append(str(script_path))

from pyseus import basic_processing as ip

## Define input and output

In [6]:
# Date tag used in the name of every file written by this notebook. It is
# hard-coded; the commented-out magic below would restore it from the IPython
# store instead.
#%store -r timestamp
timestamp = "2024-07-27"
print(f"Timestamp: {timestamp}")

# Input table (file name only; it is resolved against data_path when read)
csv = "hyperLOPITU2OS2018.csv"
# Common prefix of all output file names
outprefix = f"{timestamp}_QC_filter_impute"

# Output layout: ./output/preprocessing/ for tables and
# ./output/preprocessing/logs/ for the column-name logs
outdir = Path.cwd() / "output"
preprocessing_out = outdir / "preprocessing"
preprocessing_log = preprocessing_out / "logs"
for folder in (preprocessing_out, preprocessing_log):
    os.makedirs(folder, exist_ok=True)

Timestamp: 2024-07-27


## Import the protein groups table and process the column names


In [7]:
# Import the protein groups table.
# Every later cell needs `pgroups`, so a failed read must stop the notebook
# here: each handler prints a hint and then re-raises, instead of letting the
# run continue and fail later with an unrelated NameError.
pgroups_path = data_path / "external" / csv

try:
    pgroups = pd.read_csv(pgroups_path, index_col=None, low_memory=False, sep=',')
except FileNotFoundError:
    print(f"File {pgroups_path} not found.\nPlease note that this file is 670MB and is not included in the repository.")
    raise
except pd.errors.ParserError:
    print(f"There was an error parsing the CSV file at {pgroups_path}.")
    raise
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    raise

In [8]:
# Substrings that mark a column for exclusion: infected samples,
# harsh-lysis / unsorted preparations, and brefeldin / arsenite treatments.
exclusion_markers = (
    'Infected', '-infected', '_infected',
    'Harsh', "Unsorted",
    'brefeldin', 'arsenite',
)

# A column is excluded as soon as its name contains any marker; the result
# is de-duplicated and sorted alphabetically.
exclusion_cols = sorted(
    {
        col
        for col in pgroups.columns
        if any(marker in col for marker in exclusion_markers)
    }
)

# Log the excluded columns. Only names starting with "LFQ" are written.
# NOTE(review): the column names displayed further down in this notebook do
# not carry an "LFQ" prefix, so this log may end up empty - confirm that the
# prefix filter is still wanted for this dataset.
exclusion_log = preprocessing_log / f"{outprefix}_exclusion_cols.txt"
with open(exclusion_log, 'w') as handle:
    handle.writelines(f"{col}\n" for col in exclusion_cols if col.startswith("LFQ"))

In [9]:
# Use the inverse of the exclusion columns to select the inclusion columns
# (original column order is preserved).
excluded = set(exclusion_cols)
inclusion_cols = [x for x in pgroups.columns if x not in excluded]

# Write the inclusion column names to file for examination.
# All names are written as-is: at this point the columns still have their raw
# names (the "fraction " prefix is only added by the renaming step further
# down), so filtering on that prefix here would leave the log file empty.
with open(preprocessing_log / f"{outprefix}_inclusion_cols.txt", 'w') as f:
    f.writelines(f"{item}\n" for item in inclusion_cols)

In [10]:
# Rename columns based on rules defined in regular expressions.
# Experimental states (e.g. infection) are part of sample names, and therefore
# get a dash instead of an underscore.
# NOTE: the name `re` shadows the standard-library module of the same name; it
# is kept because the RawTables renaming cell further down refers to it.
re = ["infected", '_InfectedOC43', 'WT_HEK', '_Unsorted', '_HarshLysis',  r'(.*)_rep([123])(.*)']
# Replacement strings, in the same order as the patterns above
replacement_re = ["Infected", '-infected', 'WT', '-unsorted', '-HarshLysis',  r'fraction \3\1_\2']

# Dry run: apply the rules (in list order) to the inclusion column names.
# This is solely to check that the renaming works as intended; the table
# itself is not renamed here.
sample_cols = ip.sample_rename(inclusion_cols, RE=re, replacement_RE=replacement_re, repl_search=False)

# Subset pgroups to the inclusion columns. An explicit copy is taken so that
# the column assignments below operate on an independent DataFrame and cannot
# trigger pandas' SettingWithCopyWarning.
pgroups = pgroups[inclusion_cols].copy()

# Create empty QC columns (presumably so that the MaxQuant-style QC filter
# applied later finds the columns it looks for - confirm against pyseus).
for qc_col in ("Potential contaminant", "Reverse", "Only identified by site"):
    pgroups[qc_col] = None

In [11]:
# Inspect the column names after subsetting and adding the empty QC columns
pgroups.columns

Index(['Unnamed: 0', 'X127C_rep1set1', 'X128C_rep1set1', 'X128N_rep1set1',
       'X129C_rep1set1', 'X129N_rep1set1', 'X130C_rep1set1', 'X130N_rep1set1',
       'X131_rep1set1', 'X126_rep1set2', 'X127C_rep1set2', 'X127N_rep1set2',
       'X128C_rep1set2', 'X128N_rep1set2', 'X129C_rep1set2', 'X129N_rep1set2',
       'X130C_rep1set2', 'X130N_rep1set2', 'X131_rep1set2', 'X127C_rep2set1',
       'X127N_rep2set1', 'X128C_rep2set1', 'X128N_rep2set1', 'X129C_rep2set1',
       'X129N_rep2set1', 'X130C_rep2set1', 'X130N_rep2set1', 'X131_rep2set1',
       'X126_rep2set2', 'X127C_rep2set2', 'X127N_rep2set2', 'X128C_rep2set2',
       'X128N_rep2set2', 'X129C_rep2set2', 'X129N_rep2set2', 'X130C_rep2set2',
       'X130N_rep2set2', 'X131_rep2set2', 'X126_rep3set1', 'X127C_rep3set1',
       'X127N_rep3set1', 'X128C_rep3set1', 'X128N_rep3set1', 'X129C_rep3set1',
       'X129N_rep3set1', 'X130C_rep3set1', 'X130N_rep3set1', 'X131_rep3set1',
       'X126_rep3set2', 'X127C_rep3set2', 'X127N_rep3set2', 'X12

In [12]:
# Dry run: sum set1 and set2 for every fraction and replicate.
# Works on a copy of pgroups, so the table used for processing is not altered.
fractions = ['X126', 'X127C', 'X127N', 'X128C', 'X128N', 'X129C', 'X129N', 'X130C', 'X130N', 'X131']
final_cols = []
df = pgroups.copy()
for fraction in fractions:
    for rep in (1, 2, 3):
        set1_col = f'{fraction}_rep{rep}set1'
        set2_col = f'{fraction}_rep{rep}set2'
        # Skip the combination unless both sets are present in the table
        if set1_col not in df.columns or set2_col not in df.columns:
            continue
        merged_col = f'R{rep} {fraction}'
        df[merged_col] = df[set1_col] + df[set2_col]
        final_cols.append(merged_col)

# Only the merged columns, ordered by fraction and then by replicate
fractions_cols = df[final_cols]

In [13]:
# Actual renaming of the table.

# Columns carrying protein identifiers / metadata rather than measurements
meta_cols = ['Unnamed: 0']

# Initiate RawTables with every column of pgroups passed as sample column.
# Note (Duo): type selection can't be used together with sample_cols selection.
preprocessing = ip.RawTables(
    proteingroup=pgroups,
    sample_cols=list(pgroups),
    file_designated=True,
    info_cols=meta_cols,
    intensity_type='fraction',
)

# Rename the columns using the rules defined earlier.
# Open question (Duo): is this a dry run too?
preprocessing.rename_columns(RE=re, replacement_RE=replacement_re, repl_search=False)

# The renamed table and the renamed column names are available through the
# following attributes of the class object.
_ = preprocessing.renamed_table
_ = preprocessing.sample_cols

In [14]:
# Display the table with the renamed sample columns
preprocessing.renamed_table

Unnamed: 0.1,Unnamed: 0,fraction set1X127C_1,fraction set1X128C_1,fraction set1X128N_1,fraction set1X129C_1,fraction set1X129N_1,fraction set1X130C_1,fraction set1X130N_1,fraction set1X131_1,fraction set2X126_1,...,fraction set2X128C_3,fraction set2X128N_3,fraction set2X129C_3,fraction set2X129N_3,fraction set2X130C_3,fraction set2X130N_3,fraction set2X131_3,Potential contaminant,Reverse,Only identified by site
0,P05387,0.039,0.101,0.116,0.143,0.120,0.368,0.046,0.045,0.032,...,0.115,0.111,0.084,0.042,0.464,0.028,0.023,,,
1,P04406,0.097,0.087,0.076,0.077,0.067,0.272,0.061,0.246,0.064,...,0.051,0.059,0.061,0.104,0.109,0.042,0.455,,,
2,P60903,0.122,0.221,0.202,0.172,0.120,0.063,0.053,0.008,0.091,...,0.146,0.119,0.107,0.048,0.083,0.079,0.005,,,
3,P57105,0.050,0.125,0.151,0.319,0.247,0.080,0.018,0.007,0.026,...,0.136,0.295,0.207,0.114,0.096,0.005,0.004,,,
4,P21964-2,0.120,0.217,0.210,0.200,0.149,0.040,0.005,0.028,0.057,...,0.184,0.187,0.130,0.084,0.085,0.003,0.012,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4878,Q5T011-5,0.099,0.126,0.119,0.082,0.075,0.336,0.058,0.041,0.115,...,0.040,0.038,0.073,0.191,0.312,0.065,0.043,,,
4879,Q8IWV7,0.083,0.079,0.090,0.110,0.101,0.062,0.045,0.395,0.034,...,0.067,0.065,0.072,0.102,0.037,0.027,0.451,,,
4880,Q14055,0.093,0.198,0.211,0.201,0.131,0.105,0.018,0.024,0.050,...,0.199,0.159,0.129,0.071,0.092,0.014,0.047,,,
4881,P04196,0.068,0.101,0.114,0.118,0.118,0.301,0.010,0.169,0.052,...,0.042,0.053,0.066,0.077,0.155,0.020,0.511,,,


In [15]:
# Log the renamed sample column names (those starting with "fraction") to a
# text file, one name per line, for examination.
sample_cols_log = preprocessing_log / f"{outprefix}__sample_cols.txt"
with open(sample_cols_log, "w") as handle:
    handle.writelines(
        f"{col}\n" for col in preprocessing.sample_cols if col.startswith("fraction")
    )

## Preprocessing


In [16]:
# These steps reuse the RawTables instance (`preprocessing`) that was created
# for the renaming step.

# Filter the table based on MaxQuant QC; verbose=True makes pyseus report how
# many rows were filtered.
preprocessing.filter_table(select_intensity=True, verbose=True)

# Intensity transformation is left disabled in this notebook, which is why
# pyseus prints a reminder about transform_intensities() when grouping.
# NOTE(review): presumably skipped because the values are already normalised
# fractions rather than raw intensities - confirm.
#preprocessing.transform_intensities()

# Group replicates and remove insufficient rows. The capture group of the
# pattern keeps everything before the trailing "_<digits>" suffix of a column
# name (presumably the replicate number, which makes the rest the group name).
preprocessing.group_replicates(reg_exp=r"(.*)_\d+$")

Filtered 0 of 4883 rows. Now 4883 rows.
Intensity values have not been transformed yet from filtered table,
we recommend using transform_intensities() method before grouping replicates.

Using filtered_table to group replicates.


In [17]:
# require proteins to have at least two valid values in the proteome samples
#preprocessing.remove_invalid_rows_custom(["UnInfected_Proteome"])

In [18]:
# Drop rows without enough valid values. Per the original note, a row is kept
# only if at least one group has fewer than n invalid values across its
# replicates - confirm the exact rule against pyseus.
preprocessing.remove_invalid_rows_custom(n=1)

# Save the table as it stands before imputation
preimpute_csv = preprocessing_out / f"{outprefix}_preimpute_table.csv"
preprocessing.preimpute_table.to_csv(preimpute_csv)

# Impute NaN values. local=False selects global imputation; distance and
# width are passed through to pyseus unchanged.
impute_settings = {"distance": 1.8, "width": 0.3, "local": False}
preprocessing.bait_impute(**impute_settings)

Removing invalid rows for 20 groups
Removed invalid rows. 4883 from 4883 rows remaining.


Save the imputed table

In [19]:
# Pre-processing is complete; the imputed table is available on the class
# object and can be saved or used directly from here on.
imputed_table = preprocessing.bait_imputed_table

# Column names of the imputed table, written to file for examination
imputed_cols_csv = preprocessing_out / f"{outprefix}_imputed_table_cols.csv"
imputed_table.columns.to_frame().to_csv(imputed_cols_csv)

# The preprocessed protein groups table as CSV (required for the next steps)
imputed_csv = preprocessing_out / f"{outprefix}_imputed_table.csv"
imputed_table.to_csv(imputed_csv)

# Optional tab-separated export of the same table (for Perseus), disabled
#preprocessing.bait_imputed_table.to_csv(preprocessing_out / f"{outprefix}_imputed_table.tab", sep='\t')

# The filtered table, written for the NOC processing
filtered_csv = preprocessing_out / f"{outprefix}_filtered_table.csv"
preprocessing.filtered_table.to_csv(filtered_csv)

In [20]:
# Display the imputed table; this is the table written to
# "<outprefix>_imputed_table.csv" in the previous cell
preprocessing.bait_imputed_table

Samples,metadata,set1X126,set1X127C,set1X127C,set1X127C,set1X127N,set1X127N,set1X128C,set1X128C,set1X128C,...,set2X129N,set2X130C,set2X130C,set2X130C,set2X130N,set2X130N,set2X130N,set2X131,set2X131,set2X131
Replicates,Unnamed: 0,set1X126_3,set1X127C_1,set1X127C_2,set1X127C_3,set1X127N_2,set1X127N_3,set1X128C_1,set1X128C_2,set1X128C_3,...,set2X129N_3,set2X130C_1,set2X130C_2,set2X130C_3,set2X130N_1,set2X130N_2,set2X130N_3,set2X131_1,set2X131_2,set2X131_3
0,P05387,0.005,0.039,0.024,0.039,0.011,0.053,0.101,0.058,0.111,...,0.042,0.330,0.360,0.464,0.039,0.034,0.028,0.045,0.025,0.023
1,P04406,0.039,0.097,0.053,0.060,0.026,0.052,0.087,0.042,0.052,...,0.104,0.130,0.141,0.109,0.043,0.038,0.042,0.337,0.327,0.455
2,P60903,0.064,0.122,0.106,0.178,0.062,0.176,0.221,0.189,0.156,...,0.048,0.051,0.100,0.083,0.034,0.041,0.079,0.002,0.008,0.005
3,P57105,0.006,0.050,0.062,0.045,0.033,0.056,0.125,0.091,0.087,...,0.114,0.113,0.111,0.096,0.014,0.009,0.005,0.006,0.006,0.004
4,P21964-2,0.020,0.120,0.089,0.102,0.061,0.119,0.217,0.174,0.183,...,0.084,0.064,0.093,0.085,0.005,0.002,0.003,0.026,0.017,0.012
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4878,Q5T011-5,0.066,0.099,0.041,0.074,0.040,0.056,0.126,0.072,0.039,...,0.191,0.203,0.209,0.312,0.079,0.050,0.065,0.053,0.075,0.043
4879,Q8IWV7,0.220,0.083,0.061,0.237,0.075,0.170,0.079,0.148,0.133,...,0.102,0.226,0.050,0.037,0.132,0.019,0.027,0.207,0.532,0.451
4880,Q14055,0.058,0.093,0.088,0.091,0.085,0.104,0.198,0.142,0.182,...,0.071,0.113,0.099,0.092,0.017,0.029,0.014,0.031,0.058,0.047
4881,P04196,0.011,0.068,0.036,0.034,0.012,0.029,0.101,0.047,0.049,...,0.077,0.226,0.208,0.155,0.008,0.018,0.020,0.205,0.329,0.511
