In [91]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import plotly.express as px
import zipfile
!pip -q install itables
from itables import init_notebook_mode
init_notebook_mode(all_interactive=True)
import itables.options as opt
opt.maxBytes = 0
opt.classes = ["display", "nowrap","compact","hover"]
opt.showIndex = False
opt.style = "max-width:6000px"
pd.set_option('display.max_colwidth', 20)

#set option to not use coeluting matches

<IPython.core.display.Javascript object>

This notebook provides a detailed analysis of one or more samples at the peptide, modification and protein level.

For each sample, the `ionbot.twbx` result file needs to be put into a separate folder whose name is the unique sample name.  

We will analyse the ionbot search results of three samples and have created one folder for each sample:

```
PXD000561
├── PXD000561_closed
│   └── ionbot.twbx
├── PXD000561_full
│   └── ionbot.twbx
├── PXD000561_nocorr_nort
│   └── ionbot.twbx
```

We can specify this as follows:

In [92]:
# Top-level experiment folder, and the per-sample sub-folders (one ionbot.twbx each) inside it.
experiment = "PXD000561"
samples = [
    "PXD000561_closed",
    "PXD000561_full",
    "PXD000561_nocorr_nort",
]

In [93]:
# Unpack the `Data/` members of every sample's ionbot.twbx archive next to the archive.
for sample in samples:
    folder = "%s/%s/"%(experiment,sample)
    # The context manager closes the archive handle even if extraction fails.
    with zipfile.ZipFile("%sionbot.twbx"%folder) as archive:
        data_members = [file for file in archive.namelist() if file.startswith('Data/')]
        archive.extractall(folder, members=data_members)

In [110]:
# Count, for one sample, how many distinct (peptide, unexpected modification) pairs
# were confidently identified per leading protein.
sample = "PXD000561_full"
folder = "%s/%s/Data/ionbot_result/"%(experiment,sample)
ionbot_first = pd.read_csv("%s/ionbot.first.csv"%folder)
ionbot_first["rank"] = "first"
ionbot_lower = pd.read_csv("%s/ionbot.lower.csv"%folder)
ionbot_lower["rank"] = "lower"
ionbot = pd.concat([ionbot_first,ionbot_lower])
# Keep target matches at q-value <= 0.01; .copy() so the column assignments below
# write to an independent frame instead of a slice of the concatenated one.
ionbot = ionbot[(ionbot["database"]=="T")&(ionbot["q-value"]<=0.01)].copy()
# Assign the filled column back: a chained `.fillna(..., inplace=True)` on a column
# selection does not update the frame under pandas Copy-on-Write.
ionbot["unexpected_modification"] = ionbot["unexpected_modification"].fillna("-")
ionbot = ionbot[ionbot["unexpected_modification"]!="-"].copy()
# Text before the first "((" of the `proteins` string.
ionbot["prot"] = ionbot["proteins"].apply(lambda x: x.split("((")[0])
tmp = (
    ionbot.drop_duplicates(["matched_peptide","unexpected_modification"])["prot"]
    .value_counts()
    .reset_index(level=0)
)
tmp.columns = ["proteins","#peptidoforms"]
tmp

proteins,#peptidoforms
Loading... (need help?),


In [119]:
# Export the matches assigned to TRYP_PIG, one row per distinct unexpected modification.
tmp2 = ionbot[ionbot["prot"]=="TRYP_PIG"]
# drop_duplicates returns a new frame; the original statement discarded that result,
# so the exported file was never de-duplicated. `tmp2` itself is left unfiltered.
tryp_pig_unique = tmp2.drop_duplicates("unexpected_modification")
tryp_pig_unique[["matched_peptide","modifications","unexpected_modification","proteins"]].to_csv("TRYP_PIG.csv",index=False)

In [116]:
# Select the matches of this ACTG/ACTB shared peptide and write them to CSV.
# to_csv() returns None when given a path, so the selection is bound to `tmp2` first
# and exported in a separate statement.
tmp2 = ionbot[ionbot["proteins"]=="ACTG_HUMAN((148-178))((P63261))||ACTB_HUMAN((148-178))((P60709))"]
tmp2.to_csv("ACTG_HUMAN_TTGIVMDSGDGVTHTVPIYEGYAIPHAIIR.csv",index=False)

In [115]:
# First row of the current `tmp2` selection for each distinct `modifications` value.
tmp2.loc[
    :, ["ionbot_match_id","matched_peptide","modifications","unexpected_modification"]
].drop_duplicates(subset="modifications")

In [95]:
# Show the first 100 rows of `proteins`.
# NOTE(review): `proteins` is not assigned in any cell visible here; it presumably comes
# from a cell that was removed or lies outside this view — confirm before a fresh run.
proteins.head(100)



ionbot_match_id,is_shared_peptide,protein_group,protein_group_q-value,protein_group_PEP,protein,position_in_protein,uniprot_id,protein_length,protein_description,modifications_delta,best_tag_rank,corrected_retention_time,unexpected_modification,database,psm_score,q-value,PEP,proteins,rank
Loading... (need help?),,,,,,,,,,,,,,,,,,,


In [96]:
# Rows of `proteins` that carry an unexpected modification (non-missing value).
tmp = proteins.dropna(subset=["unexpected_modification"])
tmp.loc[:, ["ionbot_match_id","unexpected_modification"]]

ionbot_match_id,unexpected_modification
Loading... (need help?),


In [97]:
# Number of distinct (peptide, unexpected modification) pairs per protein group.
# The selection is rebuilt from `proteins` instead of overwriting `tmp` with a value
# derived from itself, so the cell gives the same result when it is run again.
tmp = (
    proteins[proteins["unexpected_modification"].notna()]
    .drop_duplicates(["matched_peptide","unexpected_modification"])["protein_group"]
    .value_counts()
    .reset_index(level=0)
)
tmp.columns = ["protein_group","#peptidoforms"]
tmp

protein_group,#peptidoforms
Loading... (need help?),


In [99]:
# Matches assigned to MYH9_HUMAN. `proteins` strings carry position/accession
# annotations ("NAME((start-end))((accession))", alternatives joined by "||"), so an
# exact comparison with the bare name selects nothing; match the name as a literal substring.
ionbot[ionbot["proteins"].str.contains("MYH9_HUMAN", regex=False, na=False)]




ionbot_match_id,spectrum_title,scan,spectrum_file,precursor_mass,peptide_mass,observed_retention_time,charge,database_peptide,matched_peptide,modifications_delta,best_tag_rank,corrected_retention_time,unexpected_modification,database,psm_score,q-value,PEP,proteins,rank
Loading... (need help?),,,,,,,,,,,,,,,,,,,


In [80]:
# List the column names of `proteins`.
proteins.columns

Index(['ionbot_match_id', 'is_shared_peptide', 'protein_group',
       'protein_group_q-value', 'protein_group_PEP', 'protein',
       'position_in_protein', 'uniprot_id', 'protein_length',
       'protein_description', 'spectrum_title', 'scan', 'spectrum_file',
       'precursor_mass', 'peptide_mass', 'observed_retention_time', 'charge',
       'database_peptide', 'matched_peptide', 'modifications',
       'modifications_delta', 'best_tag_rank', 'corrected_retention_time',
       'unexpected_modification', 'database', 'psm_score', 'q-value', 'PEP',
       'proteins', 'rank', '#peptidoforms'],
      dtype='object')

In [87]:
# Largest value found in the `#peptidoforms` column of `proteins`.
print(proteins["#peptidoforms"].max())

701.0


In [88]:
# Export the rows whose `#peptidoforms` equals the column maximum to dd.csv.
# "matched_peptide" was listed twice, which wrote the same column twice; the second
# entry is replaced by "modifications" (NOTE(review): presumed intent — confirm).
cols_to_use = ["protein_group","position_in_protein","peptide_mass","matched_peptide","modifications","unexpected_modification"]

# Compare against the computed maximum rather than a value copied from an earlier output.
proteins.loc[proteins["#peptidoforms"]==proteins["#peptidoforms"].max(), cols_to_use].to_csv("dd.csv",index=False)

Prepare the result files:

The result files are written to the folder `Data/ionbot_result`:

In [35]:
# Build one peptide-level summary table across all samples: for every sample, load the
# first- and lower-ranked matches, keep confident target matches, attach the feature
# table, and reduce to one row per matched peptide carrying per-peptide aggregates.
cols_to_use = ['sample','matched_peptide','peptide_mass','#peptidoforms',
       'max_psm_score', 'max_all-count', 'max_all-explained',
       'max_by-intensity-pattern-correlation', 'min_rt-pred-error']

tmp = []
for sample in samples:
    print(sample)
    folder = "%s/%s/Data/ionbot_result/"%(experiment,sample)
    ionbot_first = pd.read_csv("%s/ionbot.first.csv"%folder)
    ionbot_first["rank"] = "first"
    ionbot_lower = pd.read_csv("%s/ionbot.lower.csv"%folder)
    ionbot_lower["rank"] = "lower"
    ionbot = pd.concat([ionbot_first,ionbot_lower])
    # Target matches at q-value <= 0.01; .copy() so the assignment below does not
    # write into a slice of the concatenated frame.
    ionbot = ionbot[(ionbot["database"]=="T")&(ionbot["q-value"]<=0.01)].copy()
    # Assign the filled column back: a chained in-place fillna on a column selection
    # is a no-op under pandas Copy-on-Write, which would leave NaN (ignored by the
    # per-peptide aggregation below) for unmodified matches.
    ionbot["modifications"] = ionbot["modifications"].fillna("-")
    ionbot_features = pd.read_csv("%s/ionbot.features.csv"%folder)
    ionbot = ionbot.merge(ionbot_features,on="ionbot_match_id",how="left")
    ionbot["sample"] = sample
    # Number of peptidoforms per peptide = number of DISTINCT modification strings
    # ('count' would give the number of PSMs instead).
    ionbot["#peptidoforms"] = ionbot.groupby('matched_peptide')['modifications'].transform('nunique')
    # Per-peptide maximum of the score/feature columns.
    cols = ["psm_score","all-count","all-explained","by-intensity-pattern-correlation"]
    for col in cols:
        ionbot["max_"+col] = ionbot.groupby('matched_peptide')[col].transform('max')
    # Per-peptide minimum of rt-pred-error.
    ionbot["min_rt-pred-error"] = ionbot.groupby('matched_peptide')['rt-pred-error'].transform('min')
    # One row per peptide; reassign instead of mutating a column slice in place.
    ionbot = ionbot[cols_to_use].drop_duplicates(["matched_peptide"])
    tmp.append(ionbot)
ionbot = pd.concat(tmp)

PXD000561_closed
PXD000561_full
PXD000561_nocorr_nort


In [36]:
# List the column names of the current `ionbot` frame.
ionbot.columns

Index(['sample', 'matched_peptide', 'peptide_mass', '#peptidoforms',
       'max_psm_score', 'max_all-count', 'max_all-explained',
       'max_by-intensity-pattern-correlation', 'min_rt-pred-error'],
      dtype='object')

In [37]:
# Show the first 1000 rows of the current `ionbot` frame.
ionbot.head(1000)

sample,matched_peptide,peptide_mass,#peptidoforms,max_psm_score,max_all-count,max_all-explained,max_by-intensity-pattern-correlation,min_rt-pred-error
Loading... (need help?),,,,,,,,


In [38]:
# Rows per sample, drawn as a horizontal bar chart ordered by bar length.
tmp = (
    ionbot["sample"]
    .value_counts()
    .rename_axis("sample")
    .reset_index(name="#ids")
)
fig = px.bar(tmp, x='#ids', y='sample', orientation='h')
fig.update_layout(yaxis=dict(categoryorder='total ascending'))
fig.show()

In [39]:
# Map every sample name to the list of unique peptides found in that sample.
p = {
    name: list(ionbot.loc[ionbot["sample"] == name, "matched_peptide"].unique())
    for name in samples
}

In [40]:
# Pairwise overlap matrix: entry (a, b) is the number of peptides shared by samples a and b.
peptide_sets = {name: set(peptides) for name, peptides in p.items()}
d = [
    [len(peptide_sets[row] & peptide_sets[col]) for col in p]
    for row in p
]

intersect = pd.DataFrame(d, index=p, columns=p)

In [41]:
# Display the overlap matrix with a blue background gradient.
intersect.style.background_gradient(cmap='Blues')

Unnamed: 0,PXD000561_closed,PXD000561_full,PXD000561_nocorr_nort
PXD000561_closed,64630,59578,54835
PXD000561_full,59578,75915,65157
PXD000561_nocorr_nort,54835,65157,68366


In [42]:
# Intended: distinct (peptide, modifications) pairs per protein group for PXD000561_full.
# NOTE(review): the recorded output of this cell is a KeyError on 'modifications'. The
# `ionbot` frame produced by the per-sample summary loop keeps only `cols_to_use`, which
# contains neither 'modifications' nor 'protein_group', so this cell cannot run against
# that frame — decide which table it should read from.
tmp = ionbot[ionbot["sample"]=="PXD000561_full"].drop_duplicates(["matched_peptide","modifications"])["protein_group"].value_counts().reset_index(level=0)
tmp.columns = ["protein_group","#peptidoforms"]
tmp

KeyError: Index(['modifications'], dtype='object')

The content of the result files is described [here](https://ionbot.cloud/help).

First, we load the result file that contains the first ranked matches for each MS2 spectrum:

In [None]:
# Load the first-ranked matches; this replaces the `ionbot` frame built in earlier cells.
# NOTE(review): `result_folder` is not assigned in any cell visible here — define it
# (presumably a sample's Data/ionbot_result folder) before running; confirm.
ionbot = pd.read_csv("%s/ionbot.first.csv"%result_folder)

These are the column names:

In [None]:
# Print every column name of `ionbot` on its own line.
print(*ionbot.columns, sep="\n")

Let's print some columns and explain the content:

In [None]:
# Identifier, peptide and modification columns discussed in the text.
cols_to_use = [
    "ionbot_match_id",
    "database_peptide",
    "matched_peptide",
    "modifications",
    "modifications_delta",
    "unexpected_modification",
]
ionbot.loc[:, cols_to_use]

The column `database` is `T` if the PSM matched the target database, it is `D` otherwise.

We can see that the result file contains all matches with FDR<1%:

In [None]:
# Number of matches per value of the `database` column.
print(ionbot["database"].value_counts())

The column `psm_score` contains the SVM (Percolator 3.0) score (i.e. the PSM score) for the matched spectra:

In [None]:
# Histogram of `psm_score` in 50 bins, coloured by the `database` column.
fig = px.histogram(ionbot, x="psm_score", color="database", nbins=50)
fig.show()

Next, we load the result file that contains the lower ranked (co-eluting) matches for each MS2 spectrum and add these to the search results:

In [None]:
# Tag the rows already loaded as first-ranked, then append the lower-ranked matches
# (tagged accordingly) read from ionbot.lower.csv.
ionbot["rank"] = "first"
tmp = pd.read_csv("%s/ionbot.lower.csv"%result_folder)
tmp["rank"] = "lower"
ionbot = pd.concat([ionbot, tmp])

For the remainder, we remove the matches against the decoy database:

In [None]:
# Keep only the rows whose `database` value is "T".
ionbot = ionbot.loc[ionbot["database"].eq("T")]

While adding the lower ranked matches we created a column `rank` that contains 'first' if the match was ranked first based on the psm_score, and 'lower' otherwise:

In [None]:
# Number of matches per value of the `rank` column.
print(ionbot["rank"].value_counts())

To reconstruct the LC-MS for matched MS2 spectra we can use the `observed_retention_time` and `precursor_mass` columns: 

In [None]:
# Scatter plot of observed retention time against precursor mass, one point per match.
fig = px.scatter(ionbot, x="observed_retention_time", y="precursor_mass")
# Small markers keep dense regions readable.
fig.update_traces(marker=dict(size=2))
fig.show()

In [None]:
# Number of rows (matches) per matched peptide, as a table.
ionbot["matched_peptide"].value_counts().reset_index(level=0)

In [None]:
# Distribution of the number of PSMs per peptide: how many peptides have 1, 2, 3, ... matches.
# The two columns are named explicitly because the default names produced by
# value_counts()/reset_index() differ between pandas versions (on pandas >= 2 both the
# index and the values are called "count" and reset_index() fails).
tmp = (
    ionbot["matched_peptide"]
    .value_counts()
    .value_counts()
    .rename_axis("#PSMs")
    .reset_index(name="#peptides")
)
fig = px.pie(tmp, values='#peptides', names='#PSMs', title="#PSMs for each peptide")
fig.show()

In [None]:
# Pie chart of how many peptides have each number of matches.
ionbot["matched_peptide"].value_counts().value_counts().plot(kind="pie")

In [None]:
# Number of matches per value of `unexpected_modification` (missing values are not counted).
ionbot["unexpected_modification"].value_counts()

In [None]:
# Pie chart of how many unexpected modifications occur each number of times.
ionbot["unexpected_modification"].value_counts().value_counts().plot(kind="pie")

In [None]:
# Number of matches per distinct `proteins` string.
ionbot["proteins"].value_counts()

In [None]:
# Read the per-match feature table, show its columns, and left-join it onto the
# matches by `ionbot_match_id`.
features = pd.read_csv("%s/ionbot.features.csv"%result_folder)
print(features.columns)
ionbot = pd.merge(ionbot, features, how="left", on="ionbot_match_id")

In [None]:
# Box plots of the `by-count` and `all-count` feature columns.
ionbot.boxplot(column=["by-count","all-count"])

In [None]:
# Box plots of the `by-explained` and `all-explained` feature columns.
ionbot.boxplot(column=["by-explained","all-explained"])

In [None]:
# Box plot of the `by-intensity-pattern-correlation` feature column.
ionbot.boxplot(column=["by-intensity-pattern-correlation"])

In [None]:
# Box plot of the `rt-pred-error` feature column.
ionbot.boxplot(column=["rt-pred-error"])

In [None]:
# Observed against predicted retention time, one point per match.
fig = px.scatter(ionbot, x="observed_retention_time", y="predicted_retention_time")
fig.update_traces(marker=dict(size=2))
fig.show()

In [None]:
# Corrected against predicted retention time, one point per match.
fig = px.scatter(ionbot, x="corrected_retention_time", y="predicted_retention_time")
fig.update_traces(marker=dict(size=2))
fig.show()

**TODO:** Are these outliers specific modifications?
**TODO:** Can we show that other modifications (like the ones Robbin presented) are predicted correctly?

In [None]:
dataset = "PXD000561"

def get_universal_link(x):
    """Build an HTML link to the ProteomeCentral USI viewer for one match.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide "spectrum_file", "scan", "charge", "matched_peptide",
        "modifications" and "modifications_delta". "modifications_delta" is read as
        "pos|delta|pos|delta|...", where pos 0 is the N-terminus, pos 1..len are the
        residues (1-based) and pos len+1 is the C-terminus.

    Returns
    -------
    str
        An ``<a>`` element whose href points at the USI of the spectrum, with each
        mass delta written in brackets after the residue it belongs to. Terminal
        deltas are written ProForma-style as ``[delta]-PEPTIDE`` / ``PEPTIDE-[delta]``.
    """
    # Run name = file name without its last extension; rsplit keeps a name that has
    # no extension intact instead of reducing it to an empty string.
    run_name = x["spectrum_file"].rsplit('.', 1)[0]
    residues = list(x["matched_peptide"])
    # One annotation slot per position: 0 = N-term, 1..len = residues, len+1 = C-term.
    # Collecting tags per slot (instead of inserting into the sequence one by one)
    # keeps every position valid no matter how many modifications precede it.
    tags = [""] * (len(residues) + 2)
    if str(x["modifications"]) != "nan":
        fields = x["modifications_delta"].split("|")
        for i in range(0, len(fields), 2):
            # Out-of-range positions are clamped to the nearest terminus.
            pos = min(max(int(fields[i]), 0), len(residues) + 1)
            delta = fields[i+1]
            if not delta.startswith('-'):
                delta = '%2B' + delta  # URL-encoded '+'
            tags[pos] += "[%s]" % delta
    peptidoform = ''.join(residue + tag for residue, tag in zip(residues, tags[1:-1]))
    if tags[0]:
        peptidoform = tags[0] + "-" + peptidoform
    if tags[-1]:
        peptidoform = peptidoform + "-" + tags[-1]
    link = "http://proteomecentral.proteomexchange.org/usi/?usi=mzspec:%s:%s:scan:%i:%s/%i"%(dataset,run_name,x["scan"],peptidoform,x["charge"])
    return '<a target="_blank" href="%s">click</a>'%link
 

In [None]:
# Apply get_universal_link to every row and store the resulting HTML link in column `USI`.
ionbot["USI"] = ionbot.apply(get_universal_link,axis=1)

In [None]:
# Show the identifier, peptide and modification columns next to the generated `USI` link.
ionbot[["ionbot_match_id","matched_peptide","modifications","modifications_delta","USI"]]