# Multi-sample ionbot result analysis

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import plotly.express as px
import zipfile
!pip -q install itables
from itables import init_notebook_mode
init_notebook_mode(all_interactive=True)
import itables.options as opt
opt.maxBytes = 0
opt.classes = ["display", "nowrap","compact","hover"]
opt.showIndex = False
opt.style = "max-width:6000px"
pd.set_option('display.max_colwidth', 20)

This notebook analyses one or more samples at the peptide and modification level.

For each sample, the `ionbot.twbx` result file needs to be put into a separate folder with the folder name corresponding to a unique sample name.  

In this case we will analyse the ionbot search results of three samples, with one folder created for each sample:

```
PXD000561
├── PXD000561_closed
│   └── ionbot.twbx
├── PXD000561_full
│   └── ionbot.twbx
├── PXD000561_nocorr_nort
│   └── ionbot.twbx
```

We can specify this as follows:

In [None]:
# Top-level folder of the experiment; it holds one sub-folder per sample.
experiment = "PXD000561"

# Sub-folder names; each one doubles as the unique sample name.
samples = [
    "PXD000561_closed",
    "PXD000561_full",
    "PXD000561_nocorr_nort",
]

The following code will download the ionbot search results:

In [None]:
! wget http://genesis.ugent.be/uvpublicdata/workshop-ml-proteomics/PXD000561.zip
! unzip PXD000561.zip

Next we unpack the result files:

In [None]:
# Each ionbot.twbx is opened as a zip archive; only its 'Data/' members are
# extracted, into the sample's own folder.
for sample in samples:
    folder = "%s/%s/"%(experiment,sample)
    # The context manager closes the archive handle after extraction, also on error.
    with zipfile.ZipFile("%s/ionbot.twbx"%folder) as archive:
        for file in archive.namelist():
            if file.startswith('Data/'):
                archive.extract(file, folder)

We parse the result files and create one Pandas DataFrame that contains all the results:

In [None]:
# Build one filtered PSM table per sample and stack them into `ionbot`.
tmp = []
for sample in samples:
    print(sample)
    folder = "%s/%s/Data/ionbot_result/"%(experiment,sample)
    # First-ranked and lower-ranked matches come from separate files; the "rank"
    # column records which file each row was read from.
    ionbot_first = pd.read_csv("%s/ionbot.first.csv"%folder)
    ionbot_first["rank"] = "first"
    ionbot_lower = pd.read_csv("%s/ionbot.lower.csv"%folder)
    ionbot_lower["rank"] = "lower"
    ionbot = pd.concat([ionbot_first,ionbot_lower])
    # Keep rows with database == "T" (presumably target hits -- confirm) and
    # q-value <= 0.01. The .copy() makes the filtered frame independent, so the
    # column assignment below is a plain write to its own data.
    ionbot = ionbot[(ionbot["database"]=="T")&(ionbot["q-value"]<=0.01)].copy()
    # Explicit assignment instead of a chained `fillna(..., inplace=True)`, which
    # only updates a temporary Series under pandas Copy-on-Write.
    ionbot["modifications"] = ionbot["modifications"].fillna("-")
    ionbot_features = pd.read_csv("%s/ionbot.features.csv"%folder)
    # Left join: every retained PSM is kept, with feature columns added where available.
    ionbot = ionbot.merge(ionbot_features,on="ionbot_match_id",how="left")
    ionbot["sample"] = sample
    tmp.append(ionbot)
ionbot = pd.concat(tmp)

In [None]:
# Show the column names of the combined result table.
ionbot.columns

Count the number of PSMs for each sample:

In [None]:
# Rows per sample (one row = one PSM), turned into a two-column frame for plotting.
rows_per_sample = ionbot["sample"].value_counts()
tmp = rows_per_sample.reset_index(level=0)
tmp.columns = ["sample", "#ids"]

# Horizontal bar chart, bars ordered by their total.
fig = px.bar(tmp, x="#ids", y="sample", orientation="h")
fig.update_layout(yaxis=dict(categoryorder="total ascending"))
fig.show()

Count the number of unique peptidoforms for each sample:

In [None]:
# A peptidoform is a distinct (matched_peptide, modifications) pair within a sample;
# keep the first row of each and count the rows per sample.
peptidoform_key = ["sample", "matched_peptide", "modifications"]
tmp = ionbot.drop_duplicates(peptidoform_key, keep="first")
tmp = tmp["sample"].value_counts().reset_index(level=0)
tmp.columns = ["sample", "#ids"]

# Horizontal bar chart, bars ordered by their total.
fig = px.bar(tmp, x="#ids", y="sample", orientation="h")
fig.update_layout(yaxis=dict(categoryorder="total ascending"))
fig.show()

Count the number of unique peptides for each sample:

In [None]:
# Distinct matched_peptide values per sample: keep the first row of each
# (sample, matched_peptide) pair and count the rows per sample.
peptide_key = ["sample", "matched_peptide"]
tmp = ionbot.drop_duplicates(peptide_key, keep="first")
tmp = tmp["sample"].value_counts().reset_index(level=0)
tmp.columns = ["sample", "#ids"]

# Horizontal bar chart, bars ordered by their total.
fig = px.bar(tmp, x="#ids", y="sample", orientation="h")
fig.update_layout(yaxis=dict(categoryorder="total ascending"))
fig.show()

Here we look at the overlap in identified peptides between the samples:

In [None]:
# Map each sample name to the list of its unique matched_peptide values.
p = {
    sample: list(ionbot.loc[ionbot["sample"] == sample, "matched_peptide"].unique())
    for sample in samples
}

In [None]:
# Pairwise overlap: entry (row, column) is the number of peptides present in both
# samples; the diagonal is each sample's own unique-peptide count.
peptide_sets = {name: set(peptides) for name, peptides in p.items()}
d = [
    [len(peptide_sets[sample1] & peptide_sets[sample2]) for sample2 in p]
    for sample1 in p
]

intersect = pd.DataFrame(d,columns=p,index=p)

In [None]:
# Display the overlap matrix with a blue background gradient on its cells.
intersect.style.background_gradient(cmap='Blues')

Next, we load the protein group results:

In [None]:
# Per sample: read the co-eluting protein table, keep the rows where
# is_shared_peptide is False, keep one row per ionbot_match_id, and tag the rows
# with the sample name.
tmp = []
for sample in samples:
    folder = "%s/%s/Data/ionbot_result/"%(experiment,sample)
    proteins = (
        pd.read_csv("%s/ionbot.coeluting.proteins.csv"%folder)
        .loc[lambda frame: frame["is_shared_peptide"]==False]
        .drop_duplicates("ionbot_match_id")
    )
    proteins["sample"] = sample
    tmp.append(proteins)

# Stack all samples, then attach the PSM columns; the inner join drops protein
# rows whose (sample, ionbot_match_id) is absent from `ionbot`.
proteins = pd.concat(tmp).merge(ionbot,on=["sample","ionbot_match_id"],how="inner")

In [None]:
# Preview the first 200 rows of the merged protein/PSM table.
proteins.head(200)

We count the number of protein groups identified in each sample:

In [None]:
# One [sample, number of distinct protein_group values] record per sample.
tmp = [
    [name, len(proteins.loc[proteins["sample"] == name, "protein_group"].unique())]
    for name in samples
]
group_counts = pd.DataFrame(tmp, columns=["sample", "#protein-groups"])

# Horizontal bar chart, bars ordered by their total.
fig = px.bar(group_counts, x="#protein-groups", y="sample", orientation="h")
fig.update_layout(yaxis=dict(categoryorder="total ascending"))
fig.show()


Some proteins are identified with many peptidoforms:

In [None]:
# Protein rows of the "PXD000561_full" sample only.
# .copy() makes this an independent frame: a later cell adds a "#unexpected" column
# to `sample`, and writing to a filtered slice of `proteins` would raise
# SettingWithCopyWarning.
sample = proteins[proteins["sample"]=="PXD000561_full"].copy()

In [None]:
# Count non-null unexpected_modification values per protein_group, taken over the
# distinct (matched_peptide, unexpected_modification) pairs. The assignment aligns
# on the index, so rows removed as duplicates receive NaN in "#unexpected".
distinct_pairs = sample.drop_duplicates(["matched_peptide", "unexpected_modification"])
per_group_count = distinct_pairs.groupby("protein_group")["unexpected_modification"].transform("count")
sample["#unexpected"] = per_group_count

In [None]:
# One row per protein group (its first row is kept) with the "#unexpected" value.
sample[["protein_group","#unexpected"]].drop_duplicates("protein_group")

In [None]:
# Protein group to inspect.
protein_group = "H4_HUMAN"

cols_to_use = ["protein_group","unexpected_modification","position_in_protein"]
# Query the `sample` frame (the PXD000561_full protein rows). `tmp` is a plain
# Python list at this point and cannot be indexed by column name.
# One row is shown per distinct unexpected_modification of the chosen group.
sample[sample["protein_group"]==protein_group].drop_duplicates("unexpected_modification")[cols_to_use]