u #1

Merged: 29 commits, merged Jan 31, 2023

Commits:
aa09520
Major changes in to normalization tool.
ypriverol Jan 5, 2023
50279ef
Major changes in to normalization tool.
ypriverol Jan 5, 2023
cf0e770
Major changes in to normalization tool.
ypriverol Jan 5, 2023
c25fa49
Major changes in to normalization tool.
ypriverol Jan 5, 2023
66ac7c3
Major changes in to normalization tool.
ypriverol Jan 5, 2023
f903cd2
Major changes in to normalization tool.
ypriverol Jan 5, 2023
0e907db
Major changes in to normalization tool.
ypriverol Jan 5, 2023
a72fbe4
Major changes in to normalization tool.
ypriverol Jan 5, 2023
a30319b
Major changes in to normalization tool.
ypriverol Jan 5, 2023
80b5e5d
Major changes in to normalization tool.
ypriverol Jan 5, 2023
a445436
Major changes in to normalization tool.
ypriverol Jan 5, 2023
02ec17f
Major changes in to normalization tool.
ypriverol Jan 6, 2023
a069084
Major changes in to normalization tool.
ypriverol Jan 6, 2023
ff48c4a
Major changes in to normalization tool.
ypriverol Jan 8, 2023
cfc2d4d
heart proteome generated.
ypriverol Jan 8, 2023
298a95b
Create python-package.yml
ypriverol Jan 10, 2023
73aaac9
Update python-package.yml
ypriverol Jan 10, 2023
a161c11
heart proteome generated.
ypriverol Jan 10, 2023
b4e7287
Merge remote-tracking branch 'origin/master'
ypriverol Jan 10, 2023
b5842a1
heart proteome generated.
ypriverol Jan 10, 2023
ac646f1
heart proteome generated.
ypriverol Jan 10, 2023
fa67e4f
some structure for the package.
ypriverol Jan 10, 2023
0a17cc5
some structure for the package.
ypriverol Jan 11, 2023
ac89ae6
some structure for the package.
ypriverol Jan 11, 2023
e835d7a
some structure for the package.
ypriverol Jan 11, 2023
31de8ab
some structure for the package.
ypriverol Jan 11, 2023
26c4417
some structure for the package.
ypriverol Jan 11, 2023
03d3583
some structure for the package.
ypriverol Jan 11, 2023
a8db7bf
some structure for the package.
ypriverol Jan 11, 2023
5 changes: 3 additions & 2 deletions .github/workflows/python-app.yml
@@ -28,6 +28,7 @@ jobs:
python -m pip install --upgrade pip
pip install flake8 pytest
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
python setup.py install
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
@@ -36,8 +37,8 @@ jobs:
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Test peptide file generation
run: |
python peptide_file_generation.py --mztab data/PXD020192-heart.mzTab.gz --msstats data/PXD020192-heart-msstats.tsv.gz --triqler data/PXD020192-heart-triqler.tsv.gz --sdrf data/PXD020192-heart.sdrf.tsv.gz --output data/PXD020192-Peptide-Intensities.tsv --compress
python bin/peptide_file_generation.py --help
- name: Test with normalization
run: |
python peptide_normalization.py --log2 --peptides ./data/heart-grouped-Intensities.tsv --contaminants contaminants_ids.tsv --routliers --output data/heart-grouped-Intensities-Norm.tsv --verbose --nmethod qnorm
python bin/peptide_normalization.py --help

45 changes: 45 additions & 0 deletions .github/workflows/python-package.yml
@@ -0,0 +1,45 @@
# This workflow will install Python dependencies, run tests and lint with a variety of Python versions
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python

name: Python package

on:
push:
branches: [ "master" ]
pull_request:
branches: [ "master" ]

jobs:
build:

runs-on: ubuntu-latest
strategy:
fail-fast: false
matrix:
python-version: ["3.7", "3.8"]

steps:
- uses: actions/checkout@v3
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v3
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install flake8 pytest
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
python setup.py install
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
- name: Test peptide file generation
run: |
python bin/peptide_file_generation.py --help
- name: Test with normalization
run: |
python bin/peptide_normalization.py --help

107 changes: 83 additions & 24 deletions README.md
@@ -1,49 +1,108 @@
# ibaqpy

IBAQPy computes IBAQ values for Triqler outputs of sample-specific datasets. Additional tasks:
[![Python application](https://github.com/bigbio/ibaqpy/actions/workflows/python-app.yml/badge.svg)](https://github.com/bigbio/ibaqpy/actions/workflows/python-app.yml)

- Remove contaminants, decoys and proteins with non-tryptic peptides
- Normalize IBAQ values for each protein.
iBAQ (intensity-Based Absolute Quantification) determines the abundance of a protein by dividing the total precursor intensities by the number of theoretically observable peptides of the protein. **ibaqpy** computes IBAQ values for proteins starting from an msstats input file and an SDRF experimental design file. This package provides multiple tools:

- `peptide_file_generation.py`: generate a peptide file from an msstats input file and an SDRF experimental design file.

- `peptide_normalization.py`: normalize the output file of the script `peptide_file_generation.py`. It includes multiple steps such as peptidoform normalization, peptidoform-to-peptide summarization, peptide intensity normalization, and imputation.

- `compute_ibaq.py`: compute IBAQ values from the output file of the script `peptide_normalization.py`. (A pipeline sketch follows this list.)
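
Taken together, the three tools form a pipeline; a minimal sketch follows (file names are placeholders, and exact flags may differ between versions; see the per-tool help below):

```asciidoc
python bin/peptide_file_generation.py --msstats PXD000561_msstats_in.csv --sdrf PXD000561.sdrf.tsv --output PXD000561-peptides.csv
python bin/peptide_normalization.py --peptides PXD000561-peptides.csv --contaminants data/contaminants.txt --nmethod qnorm --output PXD000561-peptides-norm.csv
python bin/compute_ibaq.py --fasta Homo_sapiens.fasta --peptides PXD000561-peptides-norm.csv --enzyme Trypsin --normalize --output PXD000561-ibaq.csv
```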

### Collecting intensity files

Absolute quantification files have been stored at the following URLs:

```
http://ftp.pride.ebi.ac.uk/pride/data/proteomes/absolute-expression/
http://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/proteomes/absolute-expression/
```

This FTP site structures the information in the following form:
Inside each project reanalysis folder, the folder `proteomicslfq` contains the msstats input file, named `{Name of the project}_msstats_in.csv`.

- Project (Project reanalyzed)
- Project Sample results (this can be one folder or multiple, depending on whether the samples are analyzed together or split)
E.g. http://ftp.pride.ebi.ac.uk/pub/databases/pride/resources/proteomes/absolute-expression/PXD000561/proteomicslfq/PXD000561.sdrf_openms_design_msstats_in.csv

All the sample intensities for all the projects can be collected into the folder `all-data` using the following scripts:
### Generate Peptidoform Intensity file - peptide_file_generation.py

Moving triqler:

```bash
find ./ -name "out_triqler.tsv" | while read line; do echo $line ; project="$(basename "$(dirname "$(dirname "$line")")")"; echo $project; cp -v $line all-data/${project}-triqler.tsv; done
```asciidoc
python peptide_file_generation.py --msstats PXD000561.sdrf_openms_design_msstats_in.csv --sdrf PXD000561.sdrf.tsv --output PXD000561-peptides.csv
```

Moving mzTab:
The command provides an additional flag, `--compress`, for the case where the msstats and SDRF files are compressed.

```bash
find ./ -name "out.mzTab" | while read line; do echo $line ; project="$(basename "$(dirname "$(dirname "$line")")")"; echo $project; cp -v $line all-data/${project}.mzTab; done
```asciidoc
python peptide_file_generation.py --help
Usage: peptide_file_generation.py [OPTIONS]

Conversion of peptide intensity information into a file that contains
peptide intensities but also the metadata around the conditions, the
retention time, charge states, etc.

:param msstats: MsStats file import generated by quantms
:param sdrf: SDRF file import generated by quantms
:param compress: Read all files compressed
:param output: Peptide intensity file including all other properties for normalization

Options:
-m, --msstats TEXT MsStats file import generated by quantms [required]
-s, --sdrf TEXT SDRF file import generated by quantms [required]
--compress Read all files compressed
-o, --output TEXT Peptide intensity file including all other properties
for normalization
--help Show this message and exit.
```
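
For example, a run on compressed inputs might look like this (file names are placeholders):

```asciidoc
python peptide_file_generation.py --msstats PXD000561_msstats_in.csv.gz --sdrf PXD000561.sdrf.tsv.gz --compress --output PXD000561-peptides.csv
```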

Moving msstats:
### Peptide Normalization - peptide_normalization.py

```bash
find ./ -name "out_msstats.csv" | while read line; do echo $line ; project="$(basename "$(dirname "$(dirname "$line")")")"; echo $project; cp -v $line all-data/${project}-msstats.tsv; done
```
Peptide normalization starts from the output file of the script `peptide_file_generation.py`. The input contains the following columns:

- ProteinName: Protein name
- PeptideSequence: Peptide sequence including post-translational modifications `(e.g. .(Acetyl)ASPDWGYDDKN(Deamidated)GPEQWSK)`
- PrecursorCharge: Precursor charge
- FragmentIon: Fragment ion
- ProductCharge: Product charge
- IsotopeLabelType: Isotope label type
- Condition: Condition label `(e.g. heart)`
- BioReplicate: Biological replicate index `(e.g. 1)`
- Run: Run index `(e.g. 1)`
- Fraction: Fraction index `(e.g. 1)`
- Intensity: Peptide intensity
- Reference: Name of the RAW file containing the peptide intensity `(e.g. Adult_Heart_Gel_Elite_54_f16)`
- SampleID: Sample ID `(e.g. PXD000561-Sample-54)`
- StudyID: Study ID `(e.g. PXD000561)`. In most cases the study ID is the same as the ProteomeXchange ID.

#### Removing Contaminants and Decoys

The first step is to remove contaminants and decoys from the input file. The script `peptide_normalization.py` provides a parameter `--contaminants` for the user to provide a file with a list of protein accessions, one for each contaminant. An example file can be seen in `data/contaminants.txt`. In addition to the provided protein accessions, the tool removes all proteins with the following prefixes: `CONTAMINANT` and `DECOY`.
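
A minimal sketch of this filtering step with pandas (the helper mirrors the `remove_contaminants_decoys` function used elsewhere in this PR, but the body shown here is an illustration, not the package's exact implementation):

```python
import pandas as pd

def remove_contaminants_decoys(df: pd.DataFrame, contaminants_file: str,
                               protein_field: str = "ProteinName") -> pd.DataFrame:
    """Drop rows whose protein accession matches a known contaminant or decoy."""
    with open(contaminants_file) as handle:
        accessions = [line.strip() for line in handle if line.strip()]
    # Remove listed accessions plus anything tagged CONTAMINANT or DECOY.
    pattern = "|".join(accessions + ["CONTAMINANT", "DECOY"])
    mask = df[protein_field].str.contains(pattern, regex=True, na=False)
    return df[~mask]

# Usage sketch:
# peptides = pd.read_csv("PXD000561-peptides.csv")
# peptides = remove_contaminants_decoys(peptides, "data/contaminants.txt")
```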

#### Peptidoform Normalization

A peptidoform is a combination of a `PeptideSequence(Modifications) + Charge + BioReplicate + Fraction`. In the current version of the file, each row corresponds to one peptidoform.

The current version of the tool uses the package [qnorm](https://pypi.org/project/qnorm/) to normalize the intensities for each peptidoform. **qnorm** implements a quantile normalization method.
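
A minimal sketch of quantile normalization with **qnorm** (the small wide matrix below is illustrative):

```python
import pandas as pd
import qnorm

# Rows are peptidoforms, columns are samples (illustrative values).
intensities = pd.DataFrame({
    "Sample-1": [4.0, 2.0, 3.0],
    "Sample-2": [5.0, 1.0, 4.0],
})
# Quantile normalization makes every sample share the same intensity distribution.
normalized = qnorm.quantile_normalize(intensities)
print(normalized)
```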

#### Peptidoform to Peptide Summarization

For each peptidoform, a canonical peptide sequence without modifications is generated. The intensities of all peptides are grouped by biological replicate and aggregated with `sum`.

Then, the intensities of the peptides across different biological replicates are summarized using the function `median`.

At the end of this step, for each peptide, the corresponding protein and the intensity of the peptide are stored.
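
A pandas sketch of this two-step summarization, using the column names from the input table above (illustrative, not the package's exact code):

```python
import pandas as pd

def summarize_peptidoforms(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse peptidoforms to canonical peptides: sum within each
    biological replicate, then take the median across replicates."""
    df = df.copy()
    # Strip modifications (and a leading dot) to get the canonical sequence.
    df["CanonicalSequence"] = df["PeptideSequence"].str.replace(
        r"\([^)]*\)|^\.", "", regex=True)
    # Step 1: sum peptidoform intensities within each biological replicate.
    per_replicate = df.groupby(
        ["ProteinName", "CanonicalSequence", "SampleID", "BioReplicate"],
        as_index=False)["Intensity"].sum()
    # Step 2: median across the biological replicates of each sample.
    return per_replicate.groupby(
        ["ProteinName", "CanonicalSequence", "SampleID"],
        as_index=False)["Intensity"].median()
```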

#### Peptide Intensity Imputation and Normalization

Before the final two steps (peptide normalization and imputation), the algorithm removes all peptides that are a significant source of missing values: peptides with more than 80% missing values and peptides that do not appear in more than one sample.
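
A sketch of this filter on a wide peptide-by-sample matrix, using the thresholds stated above (the matrix layout is an assumption):

```python
import pandas as pd

def filter_missing(matrix: pd.DataFrame, max_missing: float = 0.8) -> pd.DataFrame:
    """Rows are peptides, columns are samples; NaN marks a missing value."""
    missing_fraction = matrix.isna().mean(axis=1)  # fraction of missing samples
    observed_samples = matrix.notna().sum(axis=1)  # samples with an intensity
    # Keep peptides with <= 80% missing values that appear in more than 1 sample.
    return matrix[(missing_fraction <= max_missing) & (observed_samples > 1)]
```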

Finally, two extra steps are performed:

- ``peptide intensity imputation``: Imputation is performed using the package [missingpy](https://pypi.org/project/missingpy/), which uses a Random Forest algorithm to perform the imputation (a combined sketch follows this list).
- ``peptide intensity normalization``: Similar to the normalization of the peptidoform intensities, the peptide intensities are normalized using the package [qnorm](https://pypi.org/project/qnorm/).
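
A minimal sketch of these two final steps on the same wide peptide-by-sample matrix as above (assuming `missingpy`'s `MissForest` imputer and `qnorm`; this is an illustration, not the package's exact code):

```python
import pandas as pd
import qnorm
from missingpy import MissForest

def impute_and_normalize(matrix: pd.DataFrame) -> pd.DataFrame:
    """Impute missing intensities with a Random Forest (MissForest),
    then quantile-normalize the samples."""
    imputer = MissForest()
    imputed = pd.DataFrame(imputer.fit_transform(matrix),
                           index=matrix.index, columns=matrix.columns)
    return qnorm.quantile_normalize(imputed)
```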

### Compute IBAQ
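
A minimal invocation sketch, based on the options declared in `bin/compute_ibaq.py` in this PR (the FASTA file name and input/output names are placeholders):

```asciidoc
python bin/compute_ibaq.py --fasta Homo_sapiens.fasta --peptides PXD000561-peptides-norm.csv --enzyme Trypsin --normalize --min_aa 7 --max_aa 30 --output PXD000561-ibaq.csv
```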

### Generate the peptide intensity files

```asciidoc
python peptide_file_generation.py --mztab data/PXD004682-out.mzTab.gz --msstats data/PXD004682-out_msstats.csv.gz --triqler data/PXD004682-out_triqler.tsv.gz --sdrf data/PXD004682.sdrf.tsv.gz --output data/PXD004682-Peptide-Intensities.tsv --compression
```

### Credits

Julianus and Yasset Perez-Riverol
- Julianus Pfeuffer
- Yasset Perez-Riverol
Empty file added bin/__init__.py
Empty file.
79 changes: 54 additions & 25 deletions compute_ibaq.py → bin/compute_ibaq.py
@@ -1,11 +1,16 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import math

import click
import pandas as pd
from pandas import DataFrame, Series
from pyopenms import *

from ibaqpy_commons import remove_contaminants_decoys, PROTEIN_NAME, INTENSITY, CONDITION, IBAQ, IBAQ_LOG, IBAQ_PPB
from ibaq.ibaqpy_commons import PROTEIN_NAME, IBAQ, IBAQ_LOG, IBAQ_PPB, NORM_INTENSITY, SAMPLE_ID, IBAQ_NORMALIZED, CONDITION
from ibaq.ibaqpy_commons import plot_distributions, plot_box_plot


def print_help_msg(command):
"""
@@ -16,6 +21,11 @@ def print_help_msg(command):
with click.Context(command) as ctx:
click.echo(command.get_help(ctx))


def normalize(group):
group[IBAQ_NORMALIZED] = group[IBAQ] / group[IBAQ].sum()
return group

def normalize_ibaq(res: DataFrame) -> DataFrame:
"""
Normalize the ibaq values using the total ibaq of the sample. The resulted
@@ -25,32 +35,46 @@ def normalize_ibaq(res: DataFrame) -> DataFrame:
:return:
"""

# Get the total intensity for the sample.
total_ibaq = res[IBAQ].sum()
res = res.groupby([SAMPLE_ID, CONDITION]).apply(normalize)

# Normalization method used by Proteomics DB 10 + log10(ibaq/sum(ibaq))
res[IBAQ_LOG] = res[IBAQ].apply(lambda x: 100 + math.log2(x / total_ibaq))
res[IBAQ_LOG] = res[IBAQ_NORMALIZED].apply(lambda x: (math.log10(x) + 10) if x > 0 else 0)

# Normalization used by PRIDE Team (no log transformation) (ibaq/total_ibaq) * 100'000'000
res[IBAQ_PPB] = res[IBAQ].apply(lambda x: (x / total_ibaq) * 100000000)
res[IBAQ_PPB] = res[IBAQ_NORMALIZED].apply(lambda x: (x) * 100000000)

return res


def parse_uniprot_name(identifier: str) -> str:
"""
Parse the uniprot name from the identifier (e.g. sp|P12345|PROT_NAME)
:param identifier: Uniprot identifier
:return:
"""
return identifier.split('|')[2]


@click.command()
@click.option("-f", "--fasta", help="Protein database to compute IBAQ values")
@click.option("-p", "--peptides", help="Peptide identifications with intensities following the peptide intensity output")
@click.option("-e", "--enzyme", help="Enzyme used during the analysis of the dataset (default: Trypsin)",
default="Trypsin")
@click.option("-n", "--normalize", help="Normalize IBAQ values using by using the total IBAQ of the experiment",
is_flag=True)
@click.option("-c", "--contaminants", help="Contaminants protein accession", default="contaminants_ids.tsv")
@click.option("--min_aa", help="Minimum number of amino acids to consider a peptide", default=7)
@click.option("--max_aa", help="Maximum number of amino acids to consider a peptide", default=30)
@click.option("-o", "--output", help="Output file with the proteins and ibaq values")
def ibaq_compute(fasta: str, peptides: str, enzyme: str, normalize: bool, contaminants_file: str, output: str) -> None:
def ibaq_compute(fasta: str, peptides: str, enzyme: str, normalize: bool, min_aa: int, max_aa: int, output: str) -> None:
"""
This command computes the IBAQ values for a file output of peptides with the format described in
peptide_contaminants_file_generation.py.
:param min_aa: Minimum number of amino acids to consider a peptide
:param max_aa: Maximum number of amino acids to consider a peptide
:param fasta: Fasta file used to perform the peptide identification
:param peptides: Peptide intensity file
:param enzyme: Enzyme used to digest the protein sample
:param normalize: use some basic normalization steps.
:param contaminants_file: Contaminant data file
:param output: output format containing the ibaq values.
:return:
"""
@@ -61,48 +85,53 @@ def ibaq_compute(fasta: str, peptides: str, enzyme: str, normalize: bool, contam
fasta_proteins = list() # type: list[FASTAEntry]
FASTAFile().load(fasta, fasta_proteins)
uniquepepcounts = dict() # type: dict[str, int]
MINLEN = 6
MAXLEN = 30
ENZYMENAME = enzyme
digestor = ProteaseDigestion()
digestor.setEnzyme(ENZYMENAME)
digestor.setEnzyme(enzyme)

def get_average_nr_peptides_unique_bygroup(pdrow: Series) -> Series:
"""
Get the average intensity for protein gorups
Get the average intensity for protein groups
:param pdrow: peptide row
:return: average intensity
"""
proteins = pdrow.name.split(';')
proteins = pdrow.name[0].split(';')
summ = 0
for prot in proteins:
summ += uniquepepcounts[prot]
return pdrow.intensity / (summ / len(proteins))
if len(proteins) > 0 and summ > 0:
return pdrow.NormIntensity / (summ / len(proteins))
# If there is no protein in the group, return np nan
return np.nan # type: ignore

for entry in fasta_proteins:
digest = list() # type: list[str]
digestor.digest(AASequence().fromString(entry.sequence), digest, MINLEN, MAXLEN)
digestor.digest(AASequence().fromString(entry.sequence), digest, min_aa, max_aa)
digestuniq = set(digest)
uniquepepcounts[entry.identifier.decode('utf-8')] = len(digestuniq)
uniquepepcounts[parse_uniprot_name(entry.identifier)] = len(digestuniq)

data = pd.read_csv(peptides, sep="\t")
data = pd.read_csv(peptides, sep=",")
print(data.head())
# next line assumes unique peptides only (at least per indistinguishable group)

res = pd.DataFrame(data.groupby(PROTEIN_NAME)[INTENSITY].sum()).apply(get_average_nr_peptides_unique_bygroup, 1)
res = pd.DataFrame(data.groupby([PROTEIN_NAME, SAMPLE_ID, CONDITION])[NORM_INTENSITY].sum()).apply(get_average_nr_peptides_unique_bygroup, 1)
res = res.sort_values(ascending=False)
res = res.to_frame()
res[PROTEIN_NAME] = res.index
res = res.reset_index()
res = res.rename(columns={0: IBAQ})
res = res[[PROTEIN_NAME, IBAQ]]

res = remove_contaminants_decoys(res, contaminants_file, protein_field=PROTEIN_NAME)
if normalize:
res = normalize_ibaq(res)

# For absolute expression the relation is one sample + one condition
condition = data[CONDITION].unique()[0]
res[CONDITION] = condition.lower()
# Remove IBAQ_NORMALIZED NAN values
res = res.dropna(subset=[IBAQ_NORMALIZED])

plot_distributions(res, IBAQ_PPB, SAMPLE_ID, log2=True)
plot_box_plot(res, IBAQ_PPB, SAMPLE_ID, log2=True,
title="IBAQ Distribution", violin=False)

# # For absolute expression the relation is one sample + one condition
# condition = data[CONDITION].unique()[0]
# res[CONDITION] = condition.lower()

res.to_csv(output, index=False)

14 changes: 7 additions & 7 deletions merge_condition_generation.py → bin/merge_condition_files.py
@@ -1,13 +1,13 @@
import gzip
#!/usr/bin/env python
# -*- coding: utf-8 -*-


import os
import re
import shutil

import click
import pandas as pd
from typing_extensions import OrderedDict

from ibaqpy_commons import *
from ibaq.ibaqpy_commons import *

def print_help_msg(command) -> None:
"""
@@ -33,13 +33,13 @@ def merge_condition_generation(input: str, output: str, pattern: str) -> None:
"""

files = [f for f in os.listdir(input) if pattern in f]
df_from_each_file = (pd.read_csv(input+"/"+f, sep="\t") for f in files)
df_from_each_file = (pd.read_csv(input+"/"+f) for f in files)
concatenated_df = pd.concat(df_from_each_file, ignore_index=True)
concatenated_df[CONDITION] = concatenated_df[CONDITION].str.lower()
print(concatenated_df.head())

for k, g in concatenated_df.groupby([CONDITION]):
g.to_csv(f'{output}/{k}-grouped-Intensities.tsv', index=False, sep='\t') # '{}.csv'.format(k)
g.to_csv(f'{output}/{k}-grouped-Intensities.csv', index=False) # '{}.csv'.format(k)


if __name__ == '__main__':