In [1]:
import pickle
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import prospectdataset as prods
import pickle
import os

# Location under the user's Downloads folder where the retention-time data lives.
# Kept as a plain string rather than a Path object.
SAVE_PATH = str(Path.home().joinpath("Downloads", "retention.parquet"))


#### Read data

In [2]:
# Load the TUM third-pool metadata table.
# The file sits inside the SAVE_PATH directory; the path is derived from that
# constant so the data location is configured in exactly one place.
tum_third_pool_df = pd.read_parquet(
    str(Path(SAVE_PATH) / "TUM_third_pool_meta_data.parquet"),
    engine='fastparquet',
)


#### Remove data with high variance


In [4]:
# Upper bound (exclusive in this cell) on the per-peptide variance of the
# indexed retention time.
treshold = 1

# Series of per-peptide iRT variances, restricted to peptides below the bound.
# NOTE: with pandas' default ddof=1, a peptide measured only once has a NaN
# variance, and NaN fails the comparison, so such peptides are dropped as well.
# NOTE(review): the dataset-creation cell further down filters with `<=`
# instead of `<`; confirm which bound is intended.
filtered_data = (
    tum_third_pool_df
    .groupby("modified_sequence")["indexed_retention_time"]
    .var()
    .pipe(lambda variances: variances[variances < treshold])
)


#### Either group by peptide and take the median iRT


In [6]:
# Median indexed retention time over all measurements of each modified sequence.
(
    tum_third_pool_df
    .groupby('modified_sequence')
    ['indexed_retention_time']
    .median()
)


modified_sequence
AAAHGQWELQR                               26.428589
AAAHGQWELQRVHAK                           17.735715
AAEADGPLK                                  8.703936
AAEADGPLKR                                14.834332
AAIRLELFLPPQLK                           104.158475
                                            ...    
YVAIC[UNIMOD:4]NPLRYPVIM[UNIMOD:35]NR     68.012109
YVAIC[UNIMOD:4]SPLRYPVIMSK                74.275623
YVAIC[UNIMOD:4]SPLRYPVIM[UNIMOD:35]SK     72.702211
YVLTSPRSLEAC[UNIMOD:4]AR                  46.175652
YYTRLGNDFHTNK                             25.432464
Name: indexed_retention_time, Length: 4423, dtype: float64

#### Or group by sequence and select the highest ANDROMEDA score


In [7]:
# Highest Andromeda score observed for each modified sequence.
(
    tum_third_pool_df
    .groupby('modified_sequence')
    ['andromeda_score']
    .max()
)


modified_sequence
AAAHGQWELQR                              199.68
AAAHGQWELQRVHAK                          225.29
AAEADGPLK                                138.08
AAEADGPLKR                               127.76
AAIRLELFLPPQLK                           114.24
                                          ...  
YVAIC[UNIMOD:4]NPLRYPVIM[UNIMOD:35]NR    192.18
YVAIC[UNIMOD:4]SPLRYPVIMSK               264.31
YVAIC[UNIMOD:4]SPLRYPVIM[UNIMOD:35]SK    192.40
YVLTSPRSLEAC[UNIMOD:4]AR                 142.39
YYTRLGNDFHTNK                            228.33
Name: andromeda_score, Length: 4423, dtype: float64

#### Small example dataset creation


In [66]:
# Per-peptide variance of the indexed retention time.
# The result is a Series indexed by modified_sequence.
variance_df = (
    tum_third_pool_df
    .groupby("modified_sequence")
    ["indexed_retention_time"]
    .var()
)

In [67]:
# Build a two-column frame (modified_sequence, variance) for easier merging.
# It is derived directly from the source table rather than from the previous
# value of `variance_df`, so re-running this cell keeps working even after
# `variance_df` has already been replaced by a DataFrame.
variance_df = (
    tum_third_pool_df
    .groupby("modified_sequence")["indexed_retention_time"]
    .var()
    .rename("variance")   # name of the value column after reset_index()
    .reset_index()        # group key becomes the 'modified_sequence' column
)

In [71]:
# Attach each peptide's iRT variance to every one of its measurement rows.
# Default inner join on the shared 'modified_sequence' column.
merged_data = pd.merge(
    tum_third_pool_df,
    variance_df,
    on='modified_sequence',
)

In [109]:
# Keep only the rows whose peptide-level variance does not exceed `treshold`.
# Rows with a NaN variance fail the comparison and are dropped too.
treshold = 1
filtered_data = merged_data.loc[merged_data['variance'].le(treshold)]


In [110]:
# One row per modified sequence: the measurement with the highest Andromeda
# score. Sorting descending and keeping the first duplicate selects it; the
# final sort_index() restores the row order of `filtered_data`.
final_data = (
    filtered_data
    .sort_values(by='andromeda_score', ascending=False)
    .drop_duplicates(subset='modified_sequence', keep='first')
    .sort_index()
)


In [113]:
# Export 1: the full filtered table, including the DataFrame index.
final_data.to_csv("third_pool_tresh_1_0.csv")

# Export 2: training file with only sequence and iRT, without the index.
final_data.loc[:, ["modified_sequence", "indexed_retention_time"]].to_csv(
    "third_pool_tresh_1_0_train.csv",
    index=False,
)