In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from math import sqrt
from math import acos
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import seaborn as sns
from immuno_ms2rescore_tools.file_utilities import PrositLib
import pickle



In [2]:
def replace_modified_seq(prositlib_mods):
    """Convert Prosit-style modified sequences to the original MaxQuant notation.

    The bracketed tags ``[Carbamidomethyl (C)]`` and ``[Oxidation (O)]`` are
    rewritten to ``(ca)`` and ``(ox)``, after which the part of each string
    enclosed by underscores is extracted.

    Parameters
    ----------
    prositlib_mods : pandas.Series
        String Series of modified sequences, e.g. ``_AAAAVGPAM[Oxidation (O)]_``.

    Returns
    -------
    pandas.DataFrame
        Single-column frame (column label ``0``) holding the converted
        sequences, because ``str.extract`` is called with its default
        ``expand=True``. Entries that do not match the underscore-delimited
        pattern come back as NaN.
    """
    # (Prosit tag, MaxQuant tag) pairs, applied in this order as literal replacements.
    tag_pairs = (
        ("[Carbamidomethyl (C)]", "(ca)"),
        ("[Oxidation (O)]", "(ox)"),
    )

    converted = prositlib_mods
    for prosit_tag, maxquant_tag in tag_pairs:
        converted = converted.str.replace(prosit_tag, maxquant_tag, regex=False)

    # Keep only letters and parentheses found between the flanking underscores.
    return converted.str.extract(r"_([A-Za-z\(\)]*)_")

In [3]:
# Locations of the two PXD005231 CSV inputs.
# NOTE(review): these are absolute paths on the original author's machine.
pxd005231_prosit_csv = "/home/arthur/ms2rescore-immunopeptidomics-manuscript/notebooks/data/evaluation_data/PXD005231/PXD005231_prosit.csv"
pxd005231_library_csv = "/home/arthur/ms2rescore-immunopeptidomics-manuscript/notebooks/data/evaluation_data/PXD005231/myPrositLib_PXD005231.csv"

# `df` is later joined on its `modified_sequence` / `precursor_charge` columns;
# `prosit_lib` holds one row per fragment (see the `Fragment*` columns used below).
df = pd.read_csv(pxd005231_prosit_csv)
prosit_lib = pd.read_csv(pxd005231_library_csv)

In [5]:
# Mean relative intensity per fragment charge state.
prosit_lib.groupby("FragmentCharge")[["RelativeIntensity"]].mean()

Unnamed: 0_level_0,RelativeIntensity
FragmentCharge,Unnamed: 1_level_1
1,0.259549
2,0.162173
3,0.070651


In [4]:
# Library rows whose modified sequence carries the literal "[Oxidation (O)]" tag
# (regex=False, so the brackets and parentheses are matched verbatim).
prosit_lib.loc[prosit_lib["ModifiedPeptide"].str.contains("[Oxidation (O)]", regex=False)]

Unnamed: 0,RelativeIntensity,FragmentMz,ModifiedPeptide,LabeledPeptide,StrippedPeptide,PrecursorCharge,PrecursorMz,iRT,proteotypicity,FragmentNumber,FragmentType,FragmentCharge,FragmentLossType
577,0.012161,166.053238,_AAAAVGPAM[Oxidation (O)]_,AAAAVGPAM,AAAAVGPAM,1.0,774.374176,-1.834412,2.471413,1,y,1,noloss
578,0.035727,237.090347,_AAAAVGPAM[Oxidation (O)]_,AAAAVGPAM,AAAAVGPAM,1.0,774.374176,-1.834412,2.471413,2,y,1,noloss
579,0.530723,334.143127,_AAAAVGPAM[Oxidation (O)]_,AAAAVGPAM,AAAAVGPAM,1.0,774.374176,-1.834412,2.471413,3,y,1,noloss
580,0.029304,214.118622,_AAAAVGPAM[Oxidation (O)]_,AAAAVGPAM,AAAAVGPAM,1.0,774.374176,-1.834412,2.471413,3,b,1,noloss
581,1.000000,391.164581,_AAAAVGPAM[Oxidation (O)]_,AAAAVGPAM,AAAAVGPAM,1.0,774.374176,-1.834412,2.471413,4,y,1,noloss
...,...,...,...,...,...,...,...,...,...,...,...,...,...
836054,0.078581,873.381104,_YYDGKVM[Oxidation (O)]KL_,YYDGKVMKL,YYDGKVMKL,2.0,566.781717,18.562275,-5.093276,7,b,1,noloss
836055,0.263756,969.507385,_YYDGKVM[Oxidation (O)]KL_,YYDGKVMKL,YYDGKVMKL,2.0,566.781717,18.562275,-5.093276,8,y,1,noloss
836056,0.312582,485.257324,_YYDGKVM[Oxidation (O)]KL_,YYDGKVMKL,YYDGKVMKL,2.0,566.781717,18.562275,-5.093276,8,y,2,noloss
836057,0.020425,1001.476074,_YYDGKVM[Oxidation (O)]KL_,YYDGKVMKL,YYDGKVMKL,2.0,566.781717,18.562275,-5.093276,8,b,1,noloss


In [5]:
# Drop every row whose sequence carries a carbamidomethyl tag, then rewrite the
# remaining sequences with replace_modified_seq.
# NOTE(review): this cell rebinds `prosit_lib`; re-running it passes already
# converted sequences (no flanking underscores) through replace_modified_seq,
# whose extraction pattern requires them -- reload the CSV before re-running.
has_carbamidomethyl = prosit_lib["ModifiedPeptide"].str.contains("[Carbamidomethyl (C)]", regex=False)
prosit_lib = prosit_lib[~has_carbamidomethyl]
prosit_lib["ModifiedPeptide"] = replace_modified_seq(prosit_lib["ModifiedPeptide"])

In [6]:
# Sanity check after conversion: rows whose sequence now contains the literal "(ox)" tag.
prosit_lib.loc[prosit_lib["ModifiedPeptide"].str.contains("(ox)", regex=False)]

Unnamed: 0,RelativeIntensity,FragmentMz,ModifiedPeptide,LabeledPeptide,StrippedPeptide,PrecursorCharge,PrecursorMz,iRT,proteotypicity,FragmentNumber,FragmentType,FragmentCharge,FragmentLossType
577,0.012161,166.053238,AAAAVGPAM(ox),AAAAVGPAM,AAAAVGPAM,1.0,774.374176,-1.834412,2.471413,1,y,1,noloss
578,0.035727,237.090347,AAAAVGPAM(ox),AAAAVGPAM,AAAAVGPAM,1.0,774.374176,-1.834412,2.471413,2,y,1,noloss
579,0.530723,334.143127,AAAAVGPAM(ox),AAAAVGPAM,AAAAVGPAM,1.0,774.374176,-1.834412,2.471413,3,y,1,noloss
580,0.029304,214.118622,AAAAVGPAM(ox),AAAAVGPAM,AAAAVGPAM,1.0,774.374176,-1.834412,2.471413,3,b,1,noloss
581,1.000000,391.164581,AAAAVGPAM(ox),AAAAVGPAM,AAAAVGPAM,1.0,774.374176,-1.834412,2.471413,4,y,1,noloss
...,...,...,...,...,...,...,...,...,...,...,...,...,...
836054,0.078581,873.381104,YYDGKVM(ox)KL,YYDGKVMKL,YYDGKVMKL,2.0,566.781717,18.562275,-5.093276,7,b,1,noloss
836055,0.263756,969.507385,YYDGKVM(ox)KL,YYDGKVMKL,YYDGKVMKL,2.0,566.781717,18.562275,-5.093276,8,y,1,noloss
836056,0.312582,485.257324,YYDGKVM(ox)KL,YYDGKVMKL,YYDGKVMKL,2.0,566.781717,18.562275,-5.093276,8,y,2,noloss
836057,0.020425,1001.476074,YYDGKVM(ox)KL,YYDGKVMKL,YYDGKVMKL,2.0,566.781717,18.562275,-5.093276,8,b,1,noloss


In [7]:
# Join library rows to `df` on (modified sequence, precursor charge).
# validate="many_to_one" makes pandas raise if that key pair is not unique in `df`.
join_keys = (
    ("ModifiedPeptide", "modified_sequence"),
    ("PrecursorCharge", "precursor_charge"),
)
test_df = prosit_lib.merge(
    df,
    left_on=[library_col for library_col, _ in join_keys],
    right_on=[df_col for _, df_col in join_keys],
    validate="many_to_one",
)

In [12]:
# Distinct spectrum identifiers attached to one example peptide in the merged frame.
test_df.loc[test_df["ModifiedPeptide"] == "AAAAAAAAR", "spec_id"].unique()

array(['mzspec:PXD005231:20160823_QEh1_LC2_HuPa_SA_HLApI_RA957_1_MG_1:scan:7536',
       'mzspec:PXD005231:20160823_QEh1_LC2_HuPa_SA_HLApI_RA957_2_MG_1:scan:6482'],
      dtype=object)

In [9]:
# Build a PrositLib from the same library CSV that was read into `prosit_lib` above.
# NOTE(review): absolute path on the original author's machine.
test_lib_csv_path = "/home/arthur/ms2rescore-immunopeptidomics-manuscript/notebooks/data/evaluation_data/PXD005231/myPrositLib_PXD005231.csv"
test_lib = PrositLib(test_lib_csv_path)

In [10]:
# Hand the CSV that was loaded as `df` above to PrositLib.merge_spec_ids.
# NOTE(review): presumably attaches spectrum ids to the library rows -- confirm
# against the immuno_ms2rescore_tools implementation.
spec_id_csv_path = "/home/arthur/ms2rescore-immunopeptidomics-manuscript/notebooks/data/evaluation_data/PXD005231/PXD005231_prosit.csv"
test_lib.merge_spec_ids(spec_id_csv_path)

In [13]:
# Pass the "pred_and_emp" CSV path to PrositLib.create_pred_and_emp_csv and keep its result.
# The result is filtered like a DataFrame with a `spec_id` column further down.
# NOTE(review): whether this path is read or written is not visible here -- confirm.
pred_and_emp_csv_path = "/home/arthur/ms2rescore-immunopeptidomics-manuscript/notebooks/data/evaluation_data/PXD005231/spec_lib_PXD005231_HCD_pred_and_emp.csv"
pred_emp = test_lib.create_pred_and_emp_csv(pred_and_emp_csv_path)

<class 'numpy.int64'> <class 'numpy.float64'>
<class 'numpy.int64'> <class 'numpy.int64'>


In [15]:
# All rows for a single spectrum, to compare `prediction` against `target` per ion.
pred_emp.loc[pred_emp["spec_id"] == 'mzspec:PXD005231:20160823_QEh1_LC2_HuPa_SA_HLApI_RA957_2_MG_1:scan:6482']

Unnamed: 0,spec_id,prediction,FragmentMz,charge,ionnumber,ion,FragmentCharge,mz,target
763840,mzspec:PXD005231:20160823_QEh1_LC2_HuPa_SA_HLA...,-9.965784,0.0,1,1,B,,72.04435,-9.965784
763841,mzspec:PXD005231:20160823_QEh1_LC2_HuPa_SA_HLA...,-9.965784,0.0,1,2,B,,143.08147,-8.534998
763842,mzspec:PXD005231:20160823_QEh1_LC2_HuPa_SA_HLA...,-4.04231,214.118622,1,3,B,1.0,214.11859,-6.241658
763843,mzspec:PXD005231:20160823_QEh1_LC2_HuPa_SA_HLA...,-2.262998,285.155731,1,4,B,1.0,285.1557,-5.275553
763844,mzspec:PXD005231:20160823_QEh1_LC2_HuPa_SA_HLA...,-0.986638,356.192841,1,5,B,1.0,356.1928,-4.71223
763845,mzspec:PXD005231:20160823_QEh1_LC2_HuPa_SA_HLA...,-0.172082,427.22995,1,6,B,1.0,427.22992,-4.483859
763846,mzspec:PXD005231:20160823_QEh1_LC2_HuPa_SA_HLA...,-0.214228,498.267059,1,7,B,1.0,498.26703,-5.528564
763847,mzspec:PXD005231:20160823_QEh1_LC2_HuPa_SA_HLA...,-1.923464,569.304199,1,8,B,1.0,569.3042,-6.988497
763848,mzspec:PXD005231:20160823_QEh1_LC2_HuPa_SA_HLA...,0.001442,175.118958,1,1,Y,1.0,175.11891,-4.591251
763849,mzspec:PXD005231:20160823_QEh1_LC2_HuPa_SA_HLA...,-3.897951,246.156067,1,2,Y,1.0,246.15602,-8.105649


In [16]:
# Second dataset (PXD008034); the path is relative to the notebook's working directory.
pxd008034_prosit_csv = "data/evaluation_data/PXD008034/PXD008034_prosit.csv"
pxd008034 = pd.read_csv(pxd008034_prosit_csv)