In [1]:
import pandas as pd
import numpy as np

# Easypqp 2 step spectral library conversion: Convert and Library

# 1. Convert(*.pep.xml) -> ( *.psmpkl, *.peakpklls)

## *.pep.xml is an iProphet output

### Run easypqp convert

In [2]:
# Convert the iProphet pepXML plus the matching mzML spectra into the PSM
# (--psms) and peak (--peaks) pickle files that are later passed to `easypqp library`.
# NOTE(review): --exclude-range presumably sets a massdiff window that is not
# mapped to UniMod -- confirm against `easypqp convert --help`.
!easypqp convert --pepxml "./data/interact-Sample1.pep.xml" \
--spectra ./data/Sample1.mzML \
--exclude-range -1.5,3.5 \
--psms ./data/Sample1.psmpkl \
--peaks ./data/Sample1.peakpklls



Info: Converting ./data/interact-Sample1.pep.xml.
Info: Parsing run Sample1.
Info: Parsing pepXML.
Info: Generate theoretical spectra.
Info: Processing spectra from file ./data/Sample1.mzML.
Progress of 'loading spectra list':
-- done [took 02:19 m (CPU), 17.56 s (Wall)] -- 
Progress of 'loading chromatogram list':
-- done [took 0.02 s (CPU), 0.00 s (Wall)] -- 
Info: PSMs successfully converted and stored in ./data/Sample1.psmpkl.
Info: Peaks successfully converted and stored in ./data/Sample1.peakpklls.


## Check psms output-file

In [3]:
# Load the PSM table written by `easypqp convert`.
# Unpickling can execute code embedded in the file, so only load pickles you produced yourself.
psms = pd.read_pickle("./data/Sample1.psmpkl")

# Check which features are really needed in psms.

In [4]:
# Show the first PSM transposed, so each column appears as its own row.
psms.head(n=1).transpose()

Unnamed: 0,0
run_id,Sample1
scan_id,2134
hit_rank,1
massdiff,1.95874
precursor_charge,2
retention_time,221.552
ion_mobility,
peptide_sequence,ATPAPVSIAIVNDGLARR
modifications,M
nterm_modification,


In [24]:
# Columns to blank out, to test whether `easypqp library` really needs them.
exlude = [
    "run_id",
    "hit_rank",
    "massdiff",
    "ion_mobility",
    "var_hyperscore",
    "var_nextscore",
    "var_expect",
]

In [25]:
# Overwrite every excluded column with NaN.
# NOTE: this mutates `psms` in place; reload the pickle to get the original values back.
psms[exlude] = np.nan

In [26]:
# Confirm that the excluded columns are now NaN in the first PSM.
psms.head(n=1).transpose()

Unnamed: 0,0
run_id,
scan_id,2134
hit_rank,
massdiff,
precursor_charge,2
retention_time,221.552
ion_mobility,
peptide_sequence,ATPAPVSIAIVNDGLARR
modifications,M
nterm_modification,


In [27]:
# Persist the NaN-filled PSM table under a separate name for the comparison run.
psms.to_pickle(path="./data/Sample1_with_nan.psmpkl")

In [28]:
# Columns that were NOT blanked out, kept in the DataFrame's own column order.
# (A set difference would return them in arbitrary, run-dependent order.)
remaining = [column for column in psms.columns if column not in exlude]
remaining

['modified_peptide',
 'nterm_modification',
 'modifications',
 'decoy',
 'precursor_charge',
 'protein_id',
 'cterm_modification',
 'num_tot_proteins',
 'gene_id',
 'group_id',
 'scan_id',
 'retention_time',
 'peptide_sequence',
 'pep']

# Mapping from MSFragger output to remaining columns:

retention_time_sec -> retention_time

assumed_charge -> precursor_charge

protein -> protein_id

protein -> decoy

protein -> num_tot_proteins

protein -> gene_id

modifications & len(peptide) -> modifications (N-/C-terminal modifications are excluded here and stored in separate features, i.e. nterm_modification / cterm_modification)

modifications & len(peptide) -> nterm_modification

modifications & len(peptide) -> cterm_modification

spectrum -> run_id

start_scan -> scan_id

probability -> pep

peptide -> peptide_sequence

pepxmlfile (find "_rank" in name, else set "") & previous parsed values -> group_id


modified_peptide* (Build from parsed values)

In [29]:
# Load the fragment-peak table written by `easypqp convert`.
# Unpickling can execute code embedded in the file, so only load pickles you produced yourself.
peaks = pd.read_pickle("./data/Sample1.peakpklls")

In [30]:
# Show the first peak record transposed, so each column appears as its own row.
peaks.head(n=1).transpose()

Unnamed: 0,0
scan_id,2134
modified_peptide,ATPAPVSIAIVNDGLARR
precursor_charge,2
precursor_mz,911.021
fragment,b10^1
product_mz,921.54
intensity,2857.14


# PEAKS is only dependent on 'scan_id','modified_peptide','precursor_charge' i.e., remaining columns

# 2. library(*.psmpkl, *.peakpklls, psm.tsv, peptide.tsv) -> (library.tsv)

## *.psmpkl and *.peakpklls are easypqp convert output and psm.tsv and peptide.tsv are iProphet output

### What is needed for psm.tsv

In [31]:
# Load the tab-separated psm.tsv that is passed to `easypqp library --psmtsv`.
psm_tsv = pd.read_csv(
    "./data/psm.tsv",
    sep="\t",
)

### Columns used in easypqp library

In [32]:
# Columns of interest for the psm.tsv preview.
used_columns_psm = [
    "Peptide",
    "Gene",
    "Protein ID",
]

In [33]:
#Protein -> Gene, Protein ID
#Peptide -> Peptide

#No new features

In [34]:
# Preview the first three rows of the selected psm.tsv columns.
psm_tsv[used_columns_psm].head(n=3)

Unnamed: 0,Peptide,Gene,Protein ID
0,GRGEQGGSDGDPVDQQSEPR,rne,P71905
1,TDGNQKPDGNSGEQVTVTDK,grpE,P9WMT5
2,NFYDADPLAKAASGGGNGYSLR,mce1A,Q79FZ9


### What is needed for peptide.tsv

In [35]:
# Load the tab-separated peptide.tsv that is passed to `easypqp library --peptidetsv`.
peptide_tsv = pd.read_csv(
    "./data/peptide.tsv",
    sep="\t",
)


### Columns used in easypqp library

In [36]:
# Second set of columns of interest (spectrum identifier, source file, peptide).
used_columns_peptide = [
    "Spectrum",
    "Spectrum File",
    "Peptide",
]

In [37]:
# Preview the first three rows of the selected columns.
# NOTE(review): this indexes `psm_tsv`, although the column list is named for
# peptide.tsv and `peptide_tsv` is never used -- confirm which table was intended.
psm_tsv[used_columns_peptide].head(3)

Unnamed: 0,Spectrum,Spectrum File,Peptide
0,Sample1.10719.10719.2,interact-Sample1.pep.xml,GRGEQGGSDGDPVDQQSEPR
1,Sample1.11465.11465.3,interact-Sample1.pep.xml,TDGNQKPDGNSGEQVTVTDK
2,Sample1.11954.11954.4,interact-Sample1.pep.xml,NFYDADPLAKAASGGGNGYSLR


In [38]:
#Spectrum -> spectrum
#Peptide -> Peptide
#Spectrum File -> _rank
#No new features

### Run easypqp with normal easypqp convert output

In [39]:
!easypqp library --psmtsv ./data/psm.tsv \
--peptidetsv ./data/peptide.tsv \
--out ./data/library_complete.tsv \
--rt_lowess_fraction 0.01\
./data/Sample1.psmpkl ./data/Sample1.peakpklls

Info: There are psm.tsv and peptide.tsv. Will ignore --psm_fdr_threshold, --peptide_fdr_threshold, --protein_fdr_threshold, --pi0_lambda, --proteotypic, and --no-proteotypic.
Info: Reading file ./data/Sample1.psmpkl.
Info: 922 redundant PSMs identified after filtering with ./data/psm.tsv and ./data/peptide.tsv
  base_name  modified_peptide
0   Sample1               139
Info: easypqp_rt_alignment_Sample1; Peptide overlap between run and reference: 139.
Info: Parsing file ./data/Sample1.peakpklls.
Info: Library successfully generated.


### Run easypqp with NaN-imputed easypqp convert output

In [40]:
# Same options as the baseline run, but with the NaN-filled PSM pickle and a separate --out file.
# NOTE(review): the output recorded for this cell ends in a traceback -- re-run and
# confirm that library_with_nan.tsv is actually written before trusting any comparison.
!easypqp library --psmtsv ./data/psm.tsv \
--peptidetsv ./data/peptide.tsv \
--out ./data/library_with_nan.tsv \
--rt_lowess_fraction 0.01 \
./data/Sample1_with_nan.psmpkl ./data/Sample1.peakpklls


Info: There are psm.tsv and peptide.tsv. Will ignore --psm_fdr_threshold, --peptide_fdr_threshold, --protein_fdr_threshold, --pi0_lambda, --proteotypic, and --no-proteotypic.
Info: Reading file ./data/Sample1_with_nan.psmpkl.
Traceback (most recent call last):
  File "/home/ekvall/anaconda3/envs/terran3/bin/easypqp", line 8, in <module>
    sys.exit(cli())
  File "/home/ekvall/anaconda3/envs/terran3/lib/python3.6/site-packages/click/core.py", line 764, in __call__
    return self.main(*args, **kwargs)
  File "/home/ekvall/anaconda3/envs/terran3/lib/python3.6/site-packages/click/core.py", line 717, in main
    rv = self.invoke(ctx)
  File "/home/ekvall/anaconda3/envs/terran3/lib/python3.6/site-packages/click/core.py", line 1163, in invoke
    rv.append(sub_ctx.command.invoke(sub_ctx))
  File "/home/ekvall/anaconda3/envs/terran3/lib/python3.6/site-packages/click/core.py", line 956, in invoke
    return ctx.invoke(self.callback, **ctx.params)
  File "/home/ekvall/anaconda3/en

In [22]:
# Load both generated libraries (baseline and NaN-filled input) for comparison.
library_complete = pd.read_csv("./data/library_complete.tsv", sep="\t")
library_complete_with_nan = pd.read_csv("./data/library_with_nan.tsv", sep="\t")

In [23]:
# Check whether the two libraries contain the same elements (DataFrame.equals).
# NOTE(review): this cell's execution count (23) is lower than the library runs (39, 40),
# and the recorded NaN run ended in a traceback, so the result may come from stale files.
# Re-run top to bottom to confirm.
library_complete_with_nan.equals(library_complete)

True

# Recap: We need 

### retention_time_sec, assumed_charge, protein, modifications, spectrum, start_scan, probability, peptide and _rank(?)