# Preprocessing

Pre-processing includes QC filtering, Log2 intensity transformation (disabled in this notebook; see the commented-out `transform_intensities()` call below), removing rows that have insufficient data, and global imputation.  
In this notebook, we process the IP columns; the N/O/C columns are processed separately in another notebook.

In [4]:
import sys,os
from pathlib import Path
import pandas as pd

# Climb seven directory levels from the working directory to reach the folder
# that contains both "script" and "data".
repo_root = Path.cwd()
for _level in range(7):
    repo_root = repo_root.parent

script_path = repo_root / "script"
data_path = repo_root / "data"

# Make the modules under "script" importable (pyseus is imported from there)
sys.path.append(str(script_path))

from pyseus import basic_processing as ip

## Define input and output

In [5]:
# Run identifier that prefixes every file written by this notebook.
# It can alternatively be restored from another notebook with:
#%store -r timestamp
timestamp = "2024-07-27"
print(f"Timestamp: {timestamp}")

# Input file name (looked up under the data folder) and common output prefix
csv = "lopitdcU2OS2018.csv"
outprefix = f"{timestamp}_QC_filter_impute"

# Output layout, relative to the working directory:
#   output/preprocessing/       tables
#   output/preprocessing/logs/  column-name logs
outdir = Path.cwd() / "output"
preprocessing_out = outdir / "preprocessing"
preprocessing_log = preprocessing_out / "logs"

# Create both folders (and any missing parents); existing folders are fine
for folder in (preprocessing_out, preprocessing_log):
    folder.mkdir(parents=True, exist_ok=True)

Timestamp: 2024-07-27


## Import proteingroups table and process the column names


In [6]:
# Import the protein groups table.
# `data_path` and `csv` are defined in the setup cells earlier in the notebook.
pgroups_path = data_path / "external" / csv

# Print a readable hint for the two expected failure modes, then re-raise so the
# notebook stops here. Swallowing the error would leave `pgroups` undefined on a
# fresh kernel (NameError on the rename below) or, on a reused kernel, let the
# rest of the notebook silently run on a stale `pgroups` from an earlier run.
# Any other exception propagates unchanged with its full traceback.
try:
    pgroups = pd.read_csv(pgroups_path, index_col=None, low_memory=False, sep=',')
except FileNotFoundError:
    print(f"File {pgroups_path} not found.\nPlease note that this file is 670MB and is not included in the repository.")
    raise
except pd.errors.ParserError:
    print(f"There was an error parsing the CSV file at {pgroups_path}.")
    raise

# Rename the id column ("Unnamed: 0" -> "Protein IDs").
# Returns a new frame instead of mutating in place, so re-running the cell is safe.
pgroups = pgroups.rename(columns={"Unnamed: 0": "Protein IDs"})

In [7]:
def _columns_containing(*fragments):
    """Return the `pgroups` column names that contain any of the given substrings."""
    return [name for name in pgroups.columns if any(frag in name for frag in fragments)]


# Three families of columns to exclude, selected by substrings of the column name
exclusion_cols1 = _columns_containing('Infected', '-infected', '_infected')
exclusion_cols2 = _columns_containing('Harsh', "Unsorted")
exclusion_cols3 = _columns_containing('brefeldin', 'arsenite')

# De-duplicated, alphabetically sorted union of the three families
exclusion_cols = sorted(set(exclusion_cols1) | set(exclusion_cols2) | set(exclusion_cols3))

# Log the excluded column names; only names starting with "LFQ" are written
exclusion_log_path = preprocessing_log / f"{outprefix}_exclusion_cols.txt"
with open(exclusion_log_path, 'w') as f:
    f.writelines(f"{name}\n" for name in exclusion_cols if name.startswith("LFQ"))

In [8]:
# Use the inverse of the exclusion columns to select the inclusion columns.
# A set gives constant-time membership checks instead of scanning the list per column.
excluded_names = set(exclusion_cols)
inclusion_cols = [x for x in pgroups.columns if x not in excluded_names]

# Write the inclusion column names to file for examination.
# At this point the columns still carry their raw names (e.g. "P1rep1"); the
# "fraction " prefix is only introduced by the renaming rules applied later, so
# filtering on that prefix here always produced an empty log file.
# Every retained column name is therefore written.
with open(preprocessing_log / f"{outprefix}_inclusion_cols.txt", 'w') as f:
    f.writelines(f"{item}\n" for item in inclusion_cols)

In [9]:
# Rename columns based on rules defined in regular expressions.
# Experimental states (eg infection) are part of sample names, and therefore get a dash instead of an underscore.
# NOTE: the name `re` shadows the stdlib regex module's usual name; it is kept
# because the later RawTables renaming cell passes this same list on.
re = ["infected", '_InfectedOC43', 'WT_HEK', '_Unsorted', '_HarshLysis',  r'(.*)rep([123])']
# Replacement strings, in the same order as the patterns above
replacement_re = ["Infected", '-infected', 'WT', '-unsorted', '-HarshLysis',  r'fraction \1_\2']

# Dry run: apply the rules, in the order given, to the inclusion column names.
# The result is only for checking that the renaming works as intended.
sample_cols = ip.sample_rename(inclusion_cols, RE=re, replacement_RE=replacement_re, repl_search=False)

# Subset pgroups to the inclusion columns.
# .copy() makes the subset an independent frame, so adding columns below does
# not trigger pandas' SettingWithCopyWarning (assignment on a slice of another frame).
pgroups = pgroups[inclusion_cols].copy()

# Create empty placeholder columns
# (presumably required by the MaxQuant-style QC filter used later -- TODO confirm)
for qc_col in ("Potential contaminant", "Reverse", "Only identified by site"):
    pgroups[qc_col] = None

In [10]:
# Display the current column labels of `pgroups` for a visual check
pgroups.columns

Index(['Protein IDs', 'P1rep1', 'P2rep1', 'P3rep1', 'P4rep1', 'P5rep1',
       'P6rep1', 'P7rep1', 'P8rep1', 'P9rep1', 'SNrep1', 'P1rep2', 'P2rep2',
       'P3rep2', 'P4rep2', 'P5rep2', 'P6rep2', 'P7rep2', 'P8rep2', 'P9rep2',
       'SNrep2', 'P1rep3', 'P2rep3', 'P3rep3', 'P4rep3', 'P5rep3', 'P6rep3',
       'P7rep3', 'P8rep3', 'P9rep3', 'SNrep3', 'Potential contaminant',
       'Reverse', 'Only identified by site'],
      dtype='object')

In [11]:
# Actual renaming of the table.
# Build the RawTables object from `pgroups`: every column of the frame is passed
# as `sample_cols`, and 'Protein IDs' is declared as the only info (metadata) column.
meta_cols = [
    'Protein IDs' ]
preprocessing = ip.RawTables(proteingroup=pgroups, sample_cols=list(pgroups),
    file_designated=True, info_cols=meta_cols, intensity_type='fraction') # NOTE(Duo): type selection can't be used together with sample_cols selection -- TODO confirm against pyseus

# Apply the renaming rules (`re` / `replacement_re` are defined in the dry-run cell above)
preprocessing.rename_columns(RE=re, replacement_RE=replacement_re, repl_search=False) # NOTE(Duo): open question from the author -- is this also a dry run? TODO confirm

# The renamed table and renamed column names are available as attributes of the
# object; the two assignments below only demonstrate the access and discard the values.
_ = preprocessing.renamed_table
_ = preprocessing.sample_cols

In [12]:
# Display the renamed table to check the new column names
preprocessing.renamed_table

Unnamed: 0,Protein IDs,fraction P1_1,fraction P2_1,fraction P3_1,fraction P4_1,fraction P5_1,fraction P6_1,fraction P7_1,fraction P8_1,fraction P9_1,...,fraction P4_3,fraction P5_3,fraction P6_3,fraction P7_3,fraction P8_3,fraction P9_3,fraction SN_3,Potential contaminant,Reverse,Only identified by site
0,P09382,0.084,0.116,0.099,0.090,0.089,0.072,0.057,0.051,0.066,...,0.103,0.076,0.080,0.061,0.058,0.054,0.228,,,
1,P63218,0.112,0.199,0.207,0.163,0.146,0.080,0.064,0.031,0.013,...,0.182,0.078,0.060,0.066,0.033,0.007,0.002,,,
2,P04792,0.054,0.070,0.093,0.100,0.122,0.131,0.078,0.046,0.078,...,0.118,0.129,0.123,0.077,0.060,0.055,0.195,,,
3,P60174-1,0.051,0.069,0.086,0.093,0.105,0.144,0.079,0.045,0.061,...,0.111,0.126,0.118,0.068,0.057,0.061,0.226,,,
4,P00558,0.048,0.058,0.082,0.107,0.113,0.181,0.084,0.055,0.072,...,0.122,0.126,0.117,0.073,0.064,0.059,0.230,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6832,Q86XP1-5,0.081,0.103,0.103,0.118,0.111,0.125,0.068,0.074,0.100,...,0.116,0.116,0.100,0.071,0.094,0.088,0.144,,,
6833,Q01955,0.079,0.186,0.189,0.133,0.150,0.081,0.060,0.054,0.052,...,0.204,0.104,0.079,0.045,0.014,0.000,0.001,,,
6834,Q9BXX2,0.046,0.082,0.088,0.101,0.155,0.152,0.186,0.153,0.035,...,0.117,0.113,0.113,0.222,0.149,0.062,0.007,,,
6835,Q9UKN7,0.047,0.080,0.062,0.067,0.078,0.085,0.127,0.243,0.197,...,0.084,0.089,0.107,0.130,0.144,0.117,0.040,,,


In [13]:
# Log the renamed sample column names for examination.
# Only names beginning with "fraction" are written, one per line.
fraction_col_names = [name for name in preprocessing.sample_cols if name.startswith("fraction")]
with open(preprocessing_log / f"{outprefix}__sample_cols.txt", "w") as f:
    f.write("".join("%s\n" % name for name in fraction_col_names))

## Preprocessing


In [14]:
# Continue with the same RawTables object that was created for the renaming step.
# The calls below are order-dependent: each one works on the table produced by the previous step.

# Filter the table based on the MaxQuant QC columns; verbose=True prints how many rows were filtered
preprocessing.filter_table(select_intensity=True, verbose=True)

# Intensity transformation is intentionally left disabled here; as a result
# group_replicates() prints a recommendation to transform first and falls back
# to the filtered table (see the cell output).
#preprocessing.transform_intensities()

# Group replicate columns. The pattern matches names ending in "_<digits>";
# capture group 1 is everything before that suffix
# (presumably used as the sample/group label -- TODO confirm against pyseus).
preprocessing.group_replicates(reg_exp=r"(.*)_\d+$")

Filtered 0 of 6837 rows. Now 6837 rows.
Intensity values have not been transformed yet from filtered table,
we recommend using transform_intensities() method before grouping replicates.

Using filtered_table to group replicates.


In [15]:
# require proteins to have at least two valid values in the proteome samples
#preprocessing.remove_invalid_rows_custom(["UnInfected_Proteome"])

In [16]:
# Drop rows with insufficient valid data, using threshold n=1.
# NOTE(review): the original comment read "Remove rows that do not have at least
# one group that has less than n invalid values in all replicates"; the exact
# rule is implemented in pyseus -- TODO confirm the precise meaning of `n`.
preprocessing.remove_invalid_rows_custom(n=1)

# Save the table as it is before imputation
preprocessing.preimpute_table.to_csv(preprocessing_out / f"{outprefix}_preimpute_table.csv")

# Impute NaN values. local=False selects global imputation.
# distance / width are the imputation parameters passed to pyseus
# (presumably the downshift and width of the imputation distribution -- TODO confirm).
preprocessing.bait_impute(distance=1.8, width=0.3, local=False)

Removing invalid rows for 10 groups
Removed invalid rows. 6837 from 6837 rows remaining.


Save the imputed table

In [17]:
# Pre-processing is finished: the final table is exposed on the RawTables object
# and can be saved or used directly from there.
imputed_table = preprocessing.bait_imputed_table
_ = imputed_table

# Column names of the imputed table, written out for examination
imputed_cols_path = preprocessing_out / f"{outprefix}_imputed_table_cols.csv"
imputed_table.columns.to_frame().to_csv(imputed_cols_path)

# The pre-processed protein groups table as CSV (this is required for next steps)
imputed_table_path = preprocessing_out / f"{outprefix}_imputed_table.csv"
imputed_table.to_csv(imputed_table_path)

# Optional tab-separated export for Perseus (disabled)
#preprocessing.bait_imputed_table.to_csv(preprocessing_out / f"{outprefix}_imputed_table.tab", sep='\t')

# The filtered table (for NOC processing)
filtered_table_path = preprocessing_out / f"{outprefix}_filtered_table.csv"
preprocessing.filtered_table.to_csv(filtered_table_path)

In [18]:
# Display the imputed table; its columns carry two header levels
# (Samples / Replicates), as shown in the output below
preprocessing.bait_imputed_table

Samples,P1,P1,P1,P2,P2,P2,P3,P3,P3,P4,...,P8,P8,P8,P9,P9,P9,SN,SN,SN,metadata
Replicates,P1_1,P1_2,P1_3,P2_1,P2_2,P2_3,P3_1,P3_2,P3_3,P4_1,...,P8_1,P8_2,P8_3,P9_1,P9_2,P9_3,SN_1,SN_2,SN_3,Protein IDs
0,0.084,0.142,0.117,0.116,0.100,0.103,0.099,0.107,0.106,0.090,...,0.051,0.052,0.058,0.066,0.067,0.054,0.252,0.258,0.228,P09382
1,0.112,0.202,0.230,0.199,0.177,0.152,0.207,0.134,0.177,0.163,...,0.031,0.045,0.033,0.013,0.034,0.007,0.003,0.004,0.002,P63218
2,0.054,0.068,0.056,0.070,0.092,0.071,0.093,0.140,0.113,0.100,...,0.046,0.042,0.060,0.078,0.046,0.055,0.222,0.203,0.195,P04792
3,0.051,0.052,0.049,0.069,0.079,0.066,0.086,0.119,0.105,0.093,...,0.045,0.046,0.057,0.061,0.057,0.061,0.243,0.251,0.226,P60174-1
4,0.048,0.096,0.045,0.058,0.064,0.063,0.082,0.146,0.099,0.107,...,0.055,0.043,0.064,0.072,0.048,0.059,0.166,0.072,0.230,P00558
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6832,0.081,0.110,0.094,0.103,0.091,0.072,0.103,0.124,0.107,0.118,...,0.074,0.059,0.094,0.100,0.095,0.088,0.117,0.150,0.144,Q86XP1-5
6833,0.079,0.153,0.103,0.186,0.268,0.229,0.189,0.159,0.221,0.133,...,0.054,0.015,0.014,0.052,0.005,0.000,0.016,0.004,0.001,Q01955
6834,0.046,0.036,0.048,0.082,0.079,0.078,0.088,0.067,0.091,0.101,...,0.153,0.215,0.149,0.035,0.068,0.062,0.003,0.004,0.007,Q9BXX2
6835,0.047,0.044,0.126,0.080,0.052,0.070,0.062,0.043,0.093,0.067,...,0.243,0.269,0.144,0.197,0.253,0.117,0.014,0.008,0.040,Q9UKN7
