# Preprocessing

Pre-processing includes QC filtering, log2 intensity transformation, removal of rows that have insufficient data, and global imputation.  
In this notebook, we process the IP columns; the N/O/C columns are processed separately in another notebook.  
Note: the log2 transformation call is commented out in this notebook, so the fraction values are carried forward untransformed.

In [34]:
import sys,os
from pathlib import Path
import pandas as pd

# Climb seven directory levels up from the notebook's working directory; the
# shared "script" and "data" folders are resolved relative to that ancestor.
repo_root = Path.cwd()
for _level in range(7):
    repo_root = repo_root.parent

script_path = repo_root / "script"
data_path = repo_root / "data"

# Put script/ on the import path before the pyseus import that follows.
sys.path.append(str(script_path))

from pyseus import basic_processing as ip

## Define input and output

In [35]:
# Run identifier used as the prefix of every output file. It is pinned here;
# the commented-out %store line would instead restore a previously stored value.
#%store -r timestamp
timestamp = "2024-07-16"
print(f"Timestamp: {timestamp}")

# Input table file name (read from data/external in the import cell) and the
# common prefix for all files written by this notebook.
csv = "DIA-DOMs-2023.csv"
outprefix = f"{timestamp}_QC_filter_impute"

# Output layout: ./output/preprocessing for tables, ./output/preprocessing/logs
# for the column-name listings.
outdir = Path.cwd() / "output"
preprocessing_out = outdir / "preprocessing"
preprocessing_log = preprocessing_out / "logs"

# Create both directories up front; exist_ok keeps re-runs harmless.
for _out_dir in (preprocessing_out, preprocessing_log):
    os.makedirs(_out_dir, exist_ok=True)

Timestamp: 2024-07-16


## Import the protein groups table and process the column names


In [36]:
# Import the protein groups table from data/external.
pgroups_path = data_path / "external" / csv

# Print a readable message for the common failure modes, then re-raise.
# Every later cell needs `pgroups`; swallowing the error here would only
# resurface as a confusing NameError further down, or worse, let the notebook
# silently continue with a stale `pgroups` left in the kernel by an earlier run.
try:
    pgroups = pd.read_csv(pgroups_path, index_col=None, low_memory=False, sep=',')
except FileNotFoundError:
    print(f"File {pgroups_path} not found.\n")
    raise
except pd.errors.ParserError:
    print(f"There was an error parsing the CSV file at {pgroups_path}.")
    raise
except Exception as e:
    print(f"An unexpected error occurred: {e}")
    raise

In [37]:
# Define exclusion columns: a column is excluded when its name contains any of
# the substrings tested below. The three intermediate lists are kept as
# separate names so anything that reads them still works.
all_columns = list(pgroups.columns)
exclusion_cols1 = [x for x in all_columns if 'Infected' in x or '-infected' in x or '_infected' in x]
exclusion_cols2 = [x for x in all_columns if 'Harsh' in x or "Unsorted" in x]
exclusion_cols3 = [x for x in all_columns if 'brefeldin' in x or 'arsenite' in x]

# De-duplicate (a name can match more than one rule) and sort for a stable log.
exclusion_cols = sorted(set(exclusion_cols1 + exclusion_cols2 + exclusion_cols3))

# Write the excluded intensity columns to file for examination.
# The original filter only kept names starting with "LFQ", but the intensity
# columns of this table are prefixed "Fraction" (the prefix used by the
# inclusion and sample-column logs and by intensity_type='Fraction'), so the
# exclusion log could never list them. Both prefixes are accepted so tables
# with LFQ-prefixed columns are logged exactly as before.
intensity_prefixes = ("LFQ", "Fraction")
with open(preprocessing_log / f"{outprefix}_exclusion_cols.txt", 'w') as f:
    for item in exclusion_cols:
        if item.startswith(intensity_prefixes):
            f.write(f"{item}\n")

In [38]:
# Keep every column that was not excluded, preserving the table's column order.
excluded_names = set(exclusion_cols)
inclusion_cols = [col for col in pgroups.columns if col not in excluded_names]

# Log the kept "Fraction" columns so the selection can be checked by eye.
inclusion_log = preprocessing_log / f"{outprefix}_inclusion_cols.txt"
with open(inclusion_log, 'w') as f:
    f.writelines(f"{col}\n" for col in inclusion_cols if col.startswith("Fraction"))

In [39]:
# Rename columns based on rules defined as regular expressions.
# Experimental states (e.g. infection) are part of the sample name and
# therefore get a dash instead of an underscore.
# NOTE(review): the name `re` would shadow the standard-library `re` module if
# that were ever imported in this notebook; it is kept because the
# RawTables cell reads it under this name.
re = ["infected", '_InfectedOC43', 'WT_HEK', '_Unsorted', '_HarshLysis', '_rep1', '_rep2', '_rep3', r'MAP([123])_(.*)', r'( \d{2})_', 'cyt']
# Replacement strings, in the same order as the patterns above.
replacement_re=["Infected",'-infected', 'WT', '-unsorted', '-HarshLysis', '_1', '_2', '_3', r'\2_\1', r'\1-', 'Cyt']

# The rules are applied in list order. This call only operates on the list of
# column names and is here solely to check that the renaming works as intended.
sample_cols = ip.sample_rename(inclusion_cols, RE=re, replacement_RE=replacement_re, repl_search=False)

# Subset pgroups to the inclusion columns. The explicit .copy() makes the
# result an independent frame, so the column assignments below do not write
# into a slice of the original table (which triggers SettingWithCopyWarning
# and is ambiguous under pandas copy-on-write).
pgroups = pgroups[inclusion_cols].copy()

# Create empty QC columns.
# NOTE(review): presumably these are the MaxQuant QC flags that the later
# filter_table() step looks for -- confirm against pyseus.
for qc_col in ("Potential contaminant", "Reverse", "Only identified by site"):
    pgroups[qc_col] = None

In [40]:
# Display the subset table (now carrying the three empty QC columns) for a visual check.
pgroups

Unnamed: 0,Protein IDs,Compartment,Gene names,Normalized Map profiles,Fraction MAP1_01K,Fraction MAP1_03K,Fraction MAP1_06K,Fraction MAP1_12K,Fraction MAP1_24K,Fraction MAP1_80K,...,Compartment Marker,Gene names.1,PredictionMax probability,Prediction Winner,Multiple Prediction Winners?,Classification,Confidence,Potential contaminant,Reverse,Only identified by site
0,Q92692,Plasma membrane,NECTIN2,,0.108145,0.251613,0.259940,0.235037,0.105921,0.039344,...,Plasma membrane,NECTIN2,0.990,Plasma membrane,no,Plasma membrane,Very High,,,
1,Q969P0,Plasma membrane,IGSF8,,0.104023,0.250967,0.252079,0.244042,0.101174,0.047715,...,Plasma membrane,IGSF8,0.990,Plasma membrane,no,Plasma membrane,Very High,,,
2,P15151,Plasma membrane,PVR,,0.088981,0.275359,0.259767,0.229555,0.109169,0.037170,...,Plasma membrane,PVR,0.989,Plasma membrane,no,Plasma membrane,Very High,,,
3,P15529,Plasma membrane,CD46,,0.116255,0.245252,0.254803,0.223829,0.107583,0.052278,...,Plasma membrane,CD46,0.989,Plasma membrane,no,Plasma membrane,Very High,,,
4,Q9ULF5,undefined,SLC39A10,,0.108257,0.254800,0.268498,0.222301,0.101445,0.044699,...,,SLC39A10,0.989,Plasma membrane,no,Plasma membrane,Very High,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7438,O14641,undefined,DVL2,,0.142566,0.251278,0.208249,0.143421,0.121130,0.133355,...,,DVL2,0.326,Actin binding proteins,no,,Best Guess,,,
7439,P67936,undefined,TPM4,,0.183839,0.229338,0.171813,0.165422,0.155544,0.094043,...,,TPM4,0.321,Actin binding proteins,no,,Best Guess,,,
7440,Q12800,undefined,TFCP2,,0.306088,0.265704,0.186233,0.130615,0.067706,0.043655,...,,TFCP2,0.320,Actin binding proteins,no,,Best Guess,,,
7441,Q9UPS6,undefined,SETD1B,,0.123023,0.304656,0.292279,0.111006,0.110908,0.058128,...,,SETD1B,0.303,Actin binding proteins,no,,Best Guess,,,


In [41]:
# Actual renaming of the table.
# Columns passed to RawTables as info_cols.
meta_cols = ['Protein IDs', 'Compartment', 'Gene names']

# Initiate RawTables; this object is used by the rest of the notebook for
# filtering, grouping and imputation.
# Duo: type selection can't be used together with sample_cols selection
preprocessing = ip.RawTables(
    proteingroup=pgroups,
    sample_cols=list(pgroups),
    file_designated=True,
    info_cols=meta_cols,
    intensity_type='Fraction',
)

# Rename the columns with the same rules that were tested on the column list.
# Duo: this is a dry run too right?  (open question from the original author)
preprocessing.rename_columns(RE=re, replacement_RE=replacement_re, repl_search=False)

# The renamed table and the renamed columns are available as these attributes.
_ = preprocessing.renamed_table
_ = preprocessing.sample_cols

In [42]:
# Log the renamed "Fraction" sample columns so the renaming can be examined.
sample_cols_log = preprocessing_log / f"{outprefix}__sample_cols.txt"
with open(sample_cols_log, "w") as f:
    f.writelines(f"{col}\n" for col in preprocessing.sample_cols if col.startswith("Fraction"))

## Preprocessing


In [43]:
# Continue with the RawTables object that was created for the renaming step.

# Filter the table on the MaxQuant QC columns; verbose prints how many rows
# were dropped.
preprocessing.filter_table(verbose=True, select_intensity=True)

# Intensity transformation is left disabled in this notebook, so replicates
# are grouped from the filtered table.
#preprocessing.transform_intensities()

# Group replicates and remove insufficient rows. The pattern captures
# everything before a trailing "_<digits>" suffix in the column name.
replicate_pattern = r"(.*)_\d+$"
preprocessing.group_replicates(reg_exp=replicate_pattern)

Filtered 0 of 7443 rows. Now 7443 rows.
Intensity values have not been transformed yet from filtered table,
we recommend using transform_intensities() method before grouping replicates.

Using filtered_table to group replicates.


In [44]:
# require proteins to have at least two valid values in the proteome samples
#preprocessing.remove_invalid_rows_custom(["UnInfected_Proteome"])

In [45]:
# Original author's note: remove rows that do not have at least one group that
# has less than n invalid values in all replicates.
preprocessing.remove_invalid_rows_custom(n=1)

# Save the table as it stands before imputation.
preimpute_csv = preprocessing_out / f"{outprefix}_preimpute_table.csv"
preprocessing.preimpute_table.to_csv(preimpute_csv)

# Impute NaN values; local=False selects global imputation.
impute_settings = {"distance": 1.8, "width": 0.3, "local": False}
preprocessing.bait_impute(**impute_settings)

Removing invalid rows for 6 groups
Removed invalid rows. 7443 from 7443 rows remaining.


Save the imputed table

In [46]:
# Pre-processing is done; the result lives on this attribute and can be saved
# or used directly from here on.
_ = preprocessing.bait_imputed_table

# Column names of the imputed table, written out for examination.
imputed_cols_csv = preprocessing_out / f"{outprefix}_imputed_table_cols.csv"
preprocessing.bait_imputed_table.columns.to_frame().to_csv(imputed_cols_csv)

# Preprocessed protein groups table as CSV (this is required for next steps).
imputed_csv = preprocessing_out / f"{outprefix}_imputed_table.csv"
preprocessing.bait_imputed_table.to_csv(imputed_csv)

# Optional tab-separated copy of the preprocessed table (for Perseus).
#preprocessing.bait_imputed_table.to_csv(preprocessing_out / f"{outprefix}_imputed_table.tab", sep='\t')

# Filtered table (for NOC processing).
filtered_csv = preprocessing_out / f"{outprefix}_filtered_table.csv"
preprocessing.filtered_table.to_csv(filtered_csv)

In [47]:
# View the bait_imputed_table: the final output of this notebook, i.e. the same
# frame that was just written to the *_imputed_table.csv file.
preprocessing.bait_imputed_table

Samples,01K,01K,01K,03K,03K,03K,06K,06K,06K,12K,12K,12K,24K,24K,24K,80K,80K,80K,metadata,metadata,metadata
Replicates,01K_1,01K_2,01K_3,03K_1,03K_2,03K_3,06K_1,06K_2,06K_3,12K_1,...,12K_3,24K_1,24K_2,24K_3,80K_1,80K_2,80K_3,Protein IDs,Compartment,Gene names
0,0.1081,0.1064,0.1162,0.2516,0.2792,0.2389,0.2599,0.2499,0.2684,0.2350,...,0.2204,0.1059,0.0995,0.1156,0.0393,0.0421,0.0404,Q92692,Plasma membrane,NECTIN2
1,0.1040,0.1016,0.1029,0.2510,0.2712,0.2425,0.2521,0.2654,0.2592,0.2440,...,0.2372,0.1012,0.0887,0.1117,0.0477,0.0394,0.0465,Q969P0,Plasma membrane,IGSF8
2,0.0890,0.1073,0.1484,0.2754,0.2679,0.2341,0.2598,0.2471,0.2478,0.2296,...,0.2347,0.1092,0.1003,0.0983,0.0372,0.0390,0.0367,P15151,Plasma membrane,PVR
3,0.1163,0.1031,0.1234,0.2453,0.2563,0.2297,0.2548,0.2583,0.2693,0.2238,...,0.2242,0.1076,0.1033,0.1024,0.0523,0.0479,0.0511,P15529,Plasma membrane,CD46
4,0.1083,0.1093,0.1211,0.2548,0.2741,0.2480,0.2685,0.2565,0.2565,0.2223,...,0.2269,0.1014,0.0939,0.1037,0.0447,0.0477,0.0437,Q9ULF5,undefined,SLC39A10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7438,0.1426,0.1393,0.1675,0.2513,0.2405,0.2641,0.2082,0.1767,0.1783,0.1434,...,0.1529,0.1211,0.1837,0.1390,0.1334,0.1076,0.0982,O14641,undefined,DVL2
7439,0.1838,0.1822,0.2030,0.2293,0.2273,0.2682,0.1718,0.1630,0.1456,0.1654,...,0.1481,0.1555,0.1796,0.1314,0.0940,0.0896,0.1037,P67936,undefined,TPM4
7440,0.3061,0.2743,0.2790,0.2657,0.2519,0.2275,0.1862,0.1983,0.2206,0.1306,...,0.1487,0.0677,0.0855,0.0806,0.0437,0.0391,0.0435,Q12800,undefined,TFCP2
7441,0.1230,0.2183,0.2293,0.3047,0.3199,0.2272,0.2923,0.1313,0.1209,0.1110,...,0.0737,0.1109,0.0759,0.2044,0.0581,0.0793,0.1444,Q9UPS6,undefined,SETD1B
