# Data processing for further analysis

- Merging PXD004452 data into one data frame
- Sequence integration from PXD004452 and Tailor
- Adding decoy sequences (reversed & shuffled)

In [54]:
import pandas as pd
import os
import re
from tqdm import tqdm
import Levenshtein as lev
import random

In [55]:
# Load the Tailor confidence table (tab-separated) and keep only the rows
# whose 'tdc q-value' is strictly below 0.01.
df = pd.read_csv(
    "data/tailor.assign-confidence.target.txt",
    sep="\t",
).loc[lambda table: table['tdc q-value'] < 0.01]
print(f'Number of annotations: {len(df)}')

Number of annotations: 243230


In [56]:
def merge_tsv_files(directory):
    """Read every ``.tsv`` file in ``directory`` and stack them into one frame.

    Two columns are added to the rows of each file before concatenation:

    * ``file_key`` -- the file name with its ``.tsv`` suffix removed.
    * ``scan`` -- the integer that follows ``scan=`` in the ``TITLE`` column,
      or ``-1`` when ``TITLE`` contains no such token.

    Files are read in sorted name order. ``os.listdir`` returns entries in
    arbitrary order, so sorting makes the row order of the result (and
    anything computed per row afterwards) reproducible.

    Parameters
    ----------
    directory : str
        Folder to scan (not recursive).

    Returns
    -------
    pandas.DataFrame
        All rows of all files with a fresh 0..n-1 index, or an empty frame
        when the folder holds no ``.tsv`` file.
    """
    tsv_names = sorted(name for name in os.listdir(directory) if name.endswith(".tsv"))
    frames = []

    for name in tsv_names:
        frame = pd.read_csv(os.path.join(directory, name), sep='\t')
        frame["file_key"] = name[:-4]  # drop the ".tsv" suffix
        # expand=False yields a Series (not a one-column DataFrame); rows
        # without a "scan=<digits>" token come back as NaN and become -1.
        scan_text = frame["TITLE"].str.extract(r"scan=(\d+)", expand=False)
        frame["scan"] = scan_text.astype(float).fillna(-1).astype(int)
        frames.append(frame)

    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

def merge_with_metadata(merged_df, df):
    """Inner-join ``merged_df`` with ``df`` on ``file_key`` and ``scan``.

    ``df`` is not modified. A ``file_key`` column is derived on a copy from
    its ``file`` column: entries ending in ``.mzML`` are reduced to their
    base name without that suffix, all other entries are used unchanged.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Left frame; must already carry ``file_key`` and ``scan`` columns.
    df : pandas.DataFrame
        Right frame; must carry ``file`` and ``scan`` columns.

    Returns
    -------
    pandas.DataFrame
        Rows present in both frames for the same (``file_key``, ``scan``).
    """
    def _to_file_key(path):
        if path.endswith(".mzML"):
            # strip directories and the 5-character ".mzML" suffix
            return os.path.basename(path)[:-5]
        return path

    annotated = df.copy()
    annotated["file_key"] = annotated["file"].apply(_to_file_key)
    return pd.merge(merged_df, annotated, how="inner", on=["file_key", "scan"])

# Stack every .tsv table found in the folder, then keep only the rows that
# also appear in `df` (inner join on file key and scan number).
folder_path = "data/pepnet_PXD004452"
merged_df = merge_with_metadata(merge_tsv_files(folder_path), df)

print("Success!")

Success!


In [57]:
# Remove every bracketed span ("[" ... "]", shortest match) from the sequence
# (presumably modification annotations -- confirm against the Tailor output).
merged_df['updated sequence'] = merged_df['sequence'].str.replace(r'\[.*?\]', '', regex=True)

# Building blocks shared by the three categories below.
denovo_seq = merged_df['DENOVO']
raw_seq = merged_df['sequence']
stripped_seq = merged_df['updated sequence']
ppm_abs = merged_df['PPM Difference'].abs()
ppm_within_10 = ppm_abs < 10
ppm_beyond_10 = ppm_abs >= 10

# DENOVO equals the bracket-free sequence AND the original sequence did
# contain brackets.
matches_after_stripping = (denovo_seq == stripped_seq) & (stripped_seq != raw_seq)

# correct annotations:
#   either the match-after-stripping case above (any PPM difference),
#   or DENOVO equals the untouched sequence with |PPM difference| < 10
is_correct = matches_after_stripping | ((denovo_seq == raw_seq) & ppm_within_10)
correct_annotations_df = merged_df[is_correct]

# incorrect annotations:
#   |PPM difference| >= 10, excluding the match-after-stripping case
is_incorrect = ppm_beyond_10 & ~matches_after_stripping
incorrect_annotations_df = merged_df[is_incorrect]

# other annotations:
#   |PPM difference| < 10 but DENOVO differs from the bracket-free sequence
is_other = ppm_within_10 & (denovo_seq != stripped_seq)
other_annotations_df = merged_df[is_other]

print(f'Number of correct annotations: {len(correct_annotations_df)}')
print(f'Number of incorrect annotations (based on PPM difference value): {len(incorrect_annotations_df)}')
print(f'Number of other incorrect annotations: {len(other_annotations_df)}')
print(f'Number of all annotations: {len(merged_df)}')

Number of correct annotations: 71469
Number of incorrect annotations (based on PPM difference value): 69125
Number of other incorrect annotations: 102636
Number of all annotations: 243230


In [58]:
def create_reversed_decoy(sequence):
    """Return ``sequence`` with its interior reversed and both ends kept.

    The first and last characters stay in place; everything between them is
    reversed. Sequences shorter than two characters are returned unchanged.
    """
    if len(sequence) >= 2:
        first, interior, last = sequence[0], sequence[1:-1], sequence[-1]
        return first + interior[::-1] + last
    return sequence


def create_random_decoy(sequence, rng=None):
    """Return ``sequence`` with its interior shuffled and both ends kept.

    The first and last characters stay in place; the characters between them
    are randomly permuted. Sequences shorter than two characters are returned
    unchanged. The result may equal the input (e.g. when the interior has
    fewer than two characters or the shuffle happens to be the identity).

    Parameters
    ----------
    sequence : str
        Sequence to derive the decoy from.
    rng : random.Random, optional
        Generator used for the shuffle. When omitted, the module-level
        ``random`` generator is used, so ``random.seed(...)`` controls the
        outcome. Passing a seeded ``random.Random`` gives a reproducible
        result without touching global state.

    Returns
    -------
    str
        The shuffled decoy.
    """
    if len(sequence) < 2:
        return sequence
    shuffler = random if rng is None else rng
    interior = list(sequence[1:-1])
    shuffler.shuffle(interior)
    return sequence[0] + ''.join(interior) + sequence[-1]

# Seed the module-level RNG before shuffling: create_random_decoy draws from
# it, so without a fixed seed the 'decoy (shuffled)' column (and the file
# written below) would differ on every run.
DECOY_SEED = 42
random.seed(DECOY_SEED)

merged_df['decoy (reversed)'] = merged_df['updated sequence'].apply(create_reversed_decoy)
merged_df['decoy (shuffled)'] = merged_df['updated sequence'].apply(create_random_decoy)


def _pairwise_edit_distance(left, right):
    """Levenshtein distance between corresponding entries of two columns.

    Iterating the two columns in lockstep avoids the per-row Series
    construction of ``DataFrame.apply(axis=1)`` and also works on an
    empty frame.
    """
    return [lev.distance(a, b) for a, b in zip(left, right)]


# DENOVO sequence vs. bracket-free sequence
merged_df['edit distance pepnet-tailor'] = _pairwise_edit_distance(
    merged_df['DENOVO'], merged_df['updated sequence']
)

# bracket-free sequence vs. its reversed decoy
merged_df['edit distance tailor-decoy'] = _pairwise_edit_distance(
    merged_df['updated sequence'], merged_df['decoy (reversed)']
)

# bracket-free sequence vs. its shuffled decoy
merged_df['edit distance tailor-decoy-shuffled'] = _pairwise_edit_distance(
    merged_df['updated sequence'], merged_df['decoy (shuffled)']
)

print(merged_df[['updated sequence', 'decoy (reversed)', 'decoy (shuffled)']].head())
merged_df.to_csv('data/merged_df.txt', index=False, sep='\t')

  updated sequence decoy (reversed) decoy (shuffled)
0        RKLESTESR        RSETSELKR        RTKSSEELR
1     KRNQNSQISTEK     KETSIQSNQNRK     KIRENSQTSQNK
2       GHVQDPNDRR       GRDNPDQVHR       GPNDRHVDQR
3         EDHSLEHR         EHELSHDR         EEHSLDHR
4        KSTGGKAPR        KPAKGGTSR        KATKGPSGR
