# Preprocessing

Pre-processing includes QC filtering, log2 intensity transformation, removal of rows with insufficient data, and global imputation.  
In this notebook we process the IP columns; the N/O/C columns are processed separately in another notebook.

In [4]:
import sys,os
from pathlib import Path
import pandas as pd

# Climb seven directory levels above the working directory to reach the folder
# that holds both "script" and "data".
project_root = Path.cwd()
for _level in range(7):
    project_root = project_root.parent

script_path = project_root / "script"
data_path = project_root / "data"

# Make the modules under "script" importable.
sys.path.append(str(script_path))

from pyseus import basic_processing as ip

## Define input and output

In [5]:
# The timestamp is hard-coded here; the alternative is to restore it from
# another notebook with:  %store -r timestamp
timestamp = "2024-07-27"
print(f"Timestamp: {timestamp}")

# Input file name and the prefix shared by every file this notebook writes.
# Note: the name `csv` is also the name of a standard-library module.
csv = "dLOPIT2024-DMSO.csv"
outprefix = f"{timestamp}_QC_filter_impute"

# Output layout: ./output/preprocessing/ for tables, ./output/preprocessing/logs/ for logs.
outdir = Path.cwd() / "output"
preprocessing_out = outdir / "preprocessing"
preprocessing_log = preprocessing_out / "logs"

# Create both directories (and any missing parents); existing ones are left alone.
for _directory in (preprocessing_out, preprocessing_log):
    _directory.mkdir(parents=True, exist_ok=True)

Timestamp: 2024-07-27


## Import the protein groups table and process the column names


In [6]:
# Import the protein groups table.
pgroups_path = data_path / "external" / csv

# Print a readable message for the common failure modes, then re-raise.
# Swallowing the error would leave `pgroups` undefined (or, worse, silently
# reuse a stale `pgroups` from an earlier run), and the failure would only
# surface later as a confusing NameError or as wrong results.
try:
    pgroups = pd.read_csv(pgroups_path, index_col=None, low_memory=False, sep=',')
except FileNotFoundError:
    print(f"File {pgroups_path} not found.\n")
    raise
except pd.errors.ParserError:
    print(f"There was an error parsing the CSV file at {pgroups_path}.")
    raise
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    raise

# Rename the id column: an unnamed first CSV column is read in as "Unnamed: 0".
# Reassigning (instead of inplace=True) keeps the cell safe to re-run.
pgroups = pgroups.rename(columns={"Unnamed: 0": "Protein IDs"})

In [7]:
# Collect the columns to leave out, matched by substrings of the column name.
all_columns = list(pgroups.columns)
infection_tags = ('Infected', '-infected', '_infected')
lysis_sort_tags = ('Harsh', "Unsorted")
drug_tags = ('brefeldin', 'arsenite')

exclusion_cols1 = [col for col in all_columns if any(tag in col for tag in infection_tags)]
exclusion_cols2 = [col for col in all_columns if any(tag in col for tag in lysis_sort_tags)]
exclusion_cols3 = [col for col in all_columns if any(tag in col for tag in drug_tags)]

# De-duplicated and alphabetically sorted union of the three groups.
exclusion_cols = sorted(set(exclusion_cols1 + exclusion_cols2 + exclusion_cols3))

# Log the excluded columns for later inspection.
# Only names that start with "LFQ" are written; other excluded names are not logged.
exclusion_log = preprocessing_log / f"{outprefix}_exclusion_cols.txt"
with open(exclusion_log, 'w') as f:
    f.writelines(f"{col}\n" for col in exclusion_cols if col.startswith("LFQ"))

In [8]:
# Every column that was not marked for exclusion is kept, in its original order.
inclusion_cols = [col for col in pgroups.columns if col not in exclusion_cols]

# Log the kept column names for inspection; only names starting with "fraction" are written.
# NOTE(review): these are the names *before* renaming. The column listing shown further
# down in this notebook ('126.Rep1', ...) suggests none of them start with "fraction" yet,
# so this file may come out empty -- confirm that this is intended.
inclusion_log = preprocessing_log / f"{outprefix}_inclusion_cols.txt"
with open(inclusion_log, 'w') as f:
    f.writelines(f"{col}\n" for col in inclusion_cols if col.startswith("fraction"))

In [9]:
# Rename columns based on rules defined in regular expressions.
# Experimental states (e.g. infection) are part of sample names and therefore get a dash
# instead of an underscore.
# Note: the variable name `re` is the same as the standard-library regex module; it is kept
# because later cells refer to it by this name.
re = ["infected", '_InfectedOC43', 'WT_HEK', '_Unsorted', '_HarshLysis',  r'(.*)\.Rep([123])']
# Replacement strings, in the same order as the patterns above.
# The last rule turns e.g. "126.Rep1" into "fraction 126_1".
replacement_re = ["Infected", '-infected', 'WT', '-unsorted', '-HarshLysis',  r'fraction \1_\2']

# Apply the rules, in the order given, to the list of column names only.
# This is solely a check that the renaming works as intended; the table itself is not changed here.
sample_cols = ip.sample_rename(inclusion_cols, RE=re, replacement_RE=replacement_re, repl_search=False)

# Subset pgroups to the inclusion columns.
# .copy() makes the subset an independent DataFrame, so the column assignments below
# write to it unambiguously instead of to a slice of the original table
# (which can trigger pandas' SettingWithCopyWarning).
pgroups = pgroups[inclusion_cols].copy()

# Create empty placeholder columns.
# presumably these are the QC columns that filter_table() looks for -- TODO confirm in pyseus
pgroups["Potential contaminant"] = None
pgroups["Reverse"] = None
pgroups["Only identified by site"] = None

In [10]:
# Inspect the renamed column names produced by the test run of ip.sample_rename
sample_cols

['Protein IDs',
 'fraction 126_1',
 'fraction 127N_1',
 'fraction 127C_1',
 'fraction 128N_1',
 'fraction 128C_1',
 'fraction 129N_1',
 'fraction 129C_1',
 'fraction 130N_1',
 'fraction 126_2',
 'fraction 127N_2',
 'fraction 127C_2',
 'fraction 128N_2',
 'fraction 128C_2',
 'fraction 129N_2',
 'fraction 129C_2',
 'fraction 130N_2',
 'fraction 126_3',
 'fraction 127N_3',
 'fraction 127C_3',
 'fraction 128N_3',
 'fraction 128C_3',
 'fraction 129N_3',
 'fraction 129C_3',
 'fraction 130N_3']

In [11]:
# Inspect the columns of the subsetted table: the original names plus the three placeholder columns
# (the table itself has not been renamed at this point)
pgroups.columns

Index(['Protein IDs', '126.Rep1', '127N.Rep1', '127C.Rep1', '128N.Rep1',
       '128C.Rep1', '129N.Rep1', '129C.Rep1', '130N.Rep1', '126.Rep2',
       '127N.Rep2', '127C.Rep2', '128N.Rep2', '128C.Rep2', '129N.Rep2',
       '129C.Rep2', '130N.Rep2', '126.Rep3', '127N.Rep3', '127C.Rep3',
       '128N.Rep3', '128C.Rep3', '129N.Rep3', '129C.Rep3', '130N.Rep3',
       'Potential contaminant', 'Reverse', 'Only identified by site'],
      dtype='object')

In [12]:
# Actual renaming of the table.
# Columns treated as metadata rather than as samples.
meta_cols = ['Protein IDs']

# Initiate RawTables on the subsetted table, passing every column as a designated sample column.
# Duo: type selection can't be used together with sample_cols selection
preprocessing = ip.RawTables(
    proteingroup=pgroups,
    sample_cols=list(pgroups),
    file_designated=True,
    info_cols=meta_cols,
    intensity_type='fraction',
)

# Apply the same rename rules that were tried out on the column list earlier.
# Duo (open question from the original author): is this a dry run too?
preprocessing.rename_columns(RE=re, replacement_RE=replacement_re, repl_search=False)

# The renamed table and the renamed column names are available as these attributes.
_ = preprocessing.renamed_table
_ = preprocessing.sample_cols

In [13]:
# Inspect the table after renaming (sample columns now read "fraction <channel>_<replicate>")
preprocessing.renamed_table

Unnamed: 0,Protein IDs,fraction 126_1,fraction 127N_1,fraction 127C_1,fraction 128N_1,fraction 128C_1,fraction 129N_1,fraction 129C_1,fraction 130N_1,fraction 126_2,...,fraction 127N_3,fraction 127C_3,fraction 128N_3,fraction 128C_3,fraction 129N_3,fraction 129C_3,fraction 130N_3,Potential contaminant,Reverse,Only identified by site
0,A0AVF1,0.075998,0.052805,0.043911,0.084396,0.166369,0.154534,0.176645,0.245341,0.070218,...,0.067664,0.053600,0.074892,0.145819,0.206967,0.148945,0.246000,,,
1,A0AVT1,0.061398,0.031036,0.054258,0.130859,0.173547,0.150094,0.160187,0.238620,0.041280,...,0.040915,0.057618,0.109563,0.156170,0.179984,0.151770,0.256185,,,
2,A0FGR8,0.477319,0.355670,0.107941,0.027711,0.011299,0.009230,0.006760,0.004068,0.486102,...,0.235214,0.054348,0.023296,0.012997,0.006694,0.004965,0.003504,,,
3,A0MZ66,0.185401,0.072259,0.171417,0.180019,0.105553,0.077828,0.089887,0.117637,0.102738,...,0.064410,0.177310,0.179241,0.121370,0.102615,0.080053,0.100424,,,
4,A0PK00,0.300444,0.418844,0.186354,0.036021,0.013856,0.018953,0.014504,0.011024,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6922,Q9Y3A6,,,,,,,,,,...,0.108242,0.068282,0.031645,0.022640,0.019039,0.017022,0.014944,,,
6923,Q9Y535,,,,,,,,,,...,0.033928,0.048709,0.098621,0.187810,0.249108,0.185306,0.176160,,,
6924,Q9Y576,,,,,,,,,,...,0.081914,0.054064,0.081425,0.096339,0.145391,0.187480,0.267404,,,
6925,Q9Y620,,,,,,,,,,...,0.094341,0.062182,0.095428,0.110213,0.118534,0.155225,0.252392,,,


In [14]:
# Log the renamed sample column names for inspection; only names starting with "fraction" are kept.
sample_cols_log = preprocessing_log / f"{outprefix}__sample_cols.txt"
with open(sample_cols_log, "w") as f:
    f.writelines(f"{name}\n" for name in preprocessing.sample_cols if name.startswith("fraction"))

## Preprocessing


In [15]:
# Continue with the same RawTables object that was created for the renaming step.

# Filter the table on the MaxQuant QC columns
# (the run recorded below reports 0 of 6927 rows filtered).
preprocessing.filter_table(select_intensity=True, verbose=True)

# Intensity transformation is intentionally skipped here (call left commented out);
# pyseus prints a recommendation to transform before grouping, as seen in the output.
#preprocessing.transform_intensities()

# Group replicates: the regular expression captures everything before a trailing
# "_<digits>" as the group name, so columns that differ only in that suffix share a group.
preprocessing.group_replicates(reg_exp=r"(.*)_\d+$")

Filtered 0 of 6927 rows. Now 6927 rows.
Intensity values have not been transformed yet from filtered table,
we recommend using transform_intensities() method before grouping replicates.

Using filtered_table to group replicates.


In [16]:
# require proteins to have at least two valid values in the proteome samples
#preprocessing.remove_invalid_rows_custom(["UnInfected_Proteome"])

In [17]:
# Remove rows with insufficient data (the run recorded below keeps 5314 of 6927 rows).
# NOTE(review): presumably a row is kept when at least one replicate group has enough valid
# values, with n as the threshold -- confirm the exact rule in pyseus.
preprocessing.remove_invalid_rows_custom(n=1) # n: threshold passed to the row filter

# Save the table as it is before imputation
preprocessing.preimpute_table.to_csv(preprocessing_out / f"{outprefix}_preimpute_table.csv")

# Impute NaN values. local=False selects global imputation here.
# NOTE(review): distance and width presumably parameterize the distribution that imputed
# values are drawn from -- confirm in pyseus, and check whether a random seed is needed
# for reproducible output.
preprocessing.bait_impute(distance=1.8, width=0.3, local=False)

Removing invalid rows for 8 groups
Removed invalid rows. 5314 from 6927 rows remaining.


Save the imputed table

In [18]:
# Pre-processing is done; the imputed table is available as the attribute below
# and can be saved or used directly in further steps.
_ = preprocessing.bait_imputed_table

# Write the column index of the imputed table to a file for inspection
preprocessing.bait_imputed_table.columns.to_frame().to_csv(preprocessing_out / f"{outprefix}_imputed_table_cols.csv")

# Write the preprocessed protein groups table to a csv file (required for the next steps)
preprocessing.bait_imputed_table.to_csv(preprocessing_out / f"{outprefix}_imputed_table.csv")

# Optional: write the preprocessed protein groups table as a tab-separated file (for Perseus)
#preprocessing.bait_imputed_table.to_csv(preprocessing_out / f"{outprefix}_imputed_table.tab", sep='\t')

# Write the filtered (pre-grouping, non-imputed) table to file (for NOC processing)
preprocessing.filtered_table.to_csv(preprocessing_out / f"{outprefix}_filtered_table.csv")

In [19]:
# view the bait_imputed_table
# View the imputed table (columns are a two-level index: Samples / Replicates)
preprocessing.bait_imputed_table

Samples,126,126,126,127C,127C,127C,127N,127N,127N,128C,...,129C,129C,129C,129N,129N,129N,130N,130N,130N,metadata
Replicates,126_1,126_2,126_3,127C_1,127C_2,127C_3,127N_1,127N_2,127N_3,128C_1,...,129C_1,129C_2,129C_3,129N_1,129N_2,129N_3,130N_1,130N_2,130N_3,Protein IDs
0,0.0760,0.0702,0.0561,0.0439,0.0544,0.0536,0.0528,0.0674,0.0677,0.1664,...,0.1766,0.1940,0.1489,0.1545,0.2105,0.2070,0.2453,0.1791,0.2460,A0AVF1
1,0.0614,0.0413,0.0478,0.0543,0.0395,0.0576,0.0310,0.0335,0.0409,0.1735,...,0.1602,0.1887,0.1518,0.1501,0.1915,0.1800,0.2386,0.2042,0.2562,A0AVT1
2,0.4773,0.4861,0.6590,0.1079,0.1450,0.0543,0.3557,0.2974,0.2352,0.0113,...,0.0068,0.0056,0.0050,0.0092,0.0085,0.0067,0.0041,0.0073,0.0035,A0FGR8
3,0.1854,0.1027,0.1746,0.1714,0.1278,0.1773,0.0723,0.0576,0.0644,0.1056,...,0.0899,0.0979,0.0801,0.0778,0.1106,0.1026,0.1176,0.1255,0.1004,A0MZ66
4,0.3752,0.3270,0.4915,0.1293,0.1755,0.0931,0.4311,0.3740,0.3159,0.0115,...,0.0051,0.0115,0.0083,0.0081,0.0146,0.0137,0.0044,0.0111,0.0061,A1L0T0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5309,0.2997,0.3207,0.4133,0.0597,0.0845,0.0503,0.1662,0.1654,0.1626,0.0763,...,0.1222,0.1250,0.1114,0.1086,0.0976,0.0768,0.1419,0.1373,0.1279,Q9Y6W5
5310,0.8341,0.7673,0.8770,0.0266,0.0543,0.0187,0.1062,0.1434,0.0776,0.0041,...,0.0081,0.0041,0.0047,0.0077,0.0045,0.0041,0.0056,0.0050,0.0060,Q9Y6X5
5311,0.0497,0.0422,0.0486,0.0598,0.0514,0.0674,0.0602,0.0416,0.0786,0.1655,...,0.1844,0.2376,0.2246,0.1926,0.1768,0.1694,0.1535,0.1571,0.1254,Q9Y6X9
5312,0.0201,0.0860,0.0446,0.0213,0.0633,0.0399,0.0326,0.0855,0.0772,0.1819,...,0.2337,0.2332,0.2081,0.2594,0.2012,0.2468,0.2175,0.1994,0.2403,Q9Y6Y0


In [20]:
# List the (sample, replicate) column pairs of the imputed table
preprocessing.bait_imputed_table.columns

MultiIndex([(     '126',       '126_1'),
            (     '126',       '126_2'),
            (     '126',       '126_3'),
            (    '127C',      '127C_1'),
            (    '127C',      '127C_2'),
            (    '127C',      '127C_3'),
            (    '127N',      '127N_1'),
            (    '127N',      '127N_2'),
            (    '127N',      '127N_3'),
            (    '128C',      '128C_1'),
            (    '128C',      '128C_2'),
            (    '128C',      '128C_3'),
            (    '128N',      '128N_1'),
            (    '128N',      '128N_2'),
            (    '128N',      '128N_3'),
            (    '129C',      '129C_1'),
            (    '129C',      '129C_2'),
            (    '129C',      '129C_3'),
            (    '129N',      '129N_1'),
            (    '129N',      '129N_2'),
            (    '129N',      '129N_3'),
            (    '130N',      '130N_1'),
            (    '130N',      '130N_2'),
            (    '130N',      '130N_3'),
            ('me