V0.0.2 (#133)

* [FEATURE] hmmbuild_and_search: uses hmmbuild and hmmsearch binaries in place of jackhmmer binary * [FIX] deleted .swp files * [FIX] hmmbuild_and_search doesn't modify the kwargs * o2 config file changed * [COMMENT] comment for search_pdb_by_alignment * [COMMENT] hmmbuild_and_search comment * [FIX] hmmbuild_and_search returning its own namedtuple * [FIX] hmmbuildandsearch returning the correct namedtuple? * [FIX] threshold part of kwargs doesn't get modified * [INTERNAL,API] removed run_hmmbuild_and_search, other minor fixes * [INTERNAL]: fixed mapping from HMMbuild results to query * [NOP] PEP8 compliance fixes * [INTERNAL][BUGFIX] incorrect inputs to OrderedDict - does not preserve order * [BUGFIX] Typo in name of Coupling Scores inter file * [INTERNAL][BUGFIX] Paralogs not properly identified in complex/similarity * [API] Added: couplings stage outputs inter ECs file * [NOP] Pep8 compliance * [FEATURE] hmmbuild_and_search: uses hmmbuild and hmmsearch binaries in place of jackhmmer binary * [FIX] hmmbuild_and_search doesn't modify the kwargs * o2 config file changed * [COMMENT] comment for search_pdb_by_alignment * [COMMENT] hmmbuild_and_search comment * [FIX] hmmbuild_and_search returning its own namedtuple * [FIX] hmmbuildandsearch returning the correct namedtuple? 
* [FIX] threshold part of kwargs doesn't get modified * [INTERNAL,API] removed run_hmmbuild_and_search, other minor fixes * [INTERNAL]: fixed mapping from HMMbuild results to query * [NOP] PEP8 compliance fixes * [FIX]: getting query sequence from input alignment, handling cases of no structures found, ensuring that input alignment is in focus mode * [FIX]: testing of Boolean * [FIX]: requested changes * [API]: inter ECs output file * [API]: added fasta formatted alignment with query sequence in align stage * [CONFIG] restored correct config files * Version bump * [API] changed parameter name for comparing to PBD Seqres using HMMSearch * [BUGFIX]: removed resize tool in bokeh visualization * Merge branch 'master', remote branch 'origin' * [DOC] clarified Python package requirements. Closes #100 * [DOC] clarified documentation on SIFTS mapping table paths * [FIX] Minor corrections to hmmsearch config files * [FIX] Remove print statements * [DOC] Improved comments on SIFTS file path in configs * [DOC] Improved comments on atom filter * [FIX] Regularization issues with mean-field models. Fixes #104 * [INTERNAL] More explicit handling of empty structures in fold stage (related to #108) * [COMMENT] Removed outdated part of docstring * [FIX] Standard alignment protocol now passes through first index. Fixes #108 * [DOC]: complex config compare stage fixed - 'complex' instead of 'complex_compare' * [DOC]: added paths to download complex database files * Update README.md * [FIX] fixed relative path of DB symlinks * change db model to reflect webserver Change model for ComputeJob using correct type for foreign key constraint * [FIX]: monomer couplings protocol not adding probability column to CouplingScores.csv file * [BUGFIX]: hmmsearch comparison to PDB failing for complexes due to incorrect handling of monomer alignment names * [BUGFIX]: paralogs not properly identified for best reciprocal hit stage * [NOP]: PEP8 compliance * update travis keys * update travis keys
debbiemarkslab · Feb 21, 2018 · 154a91c · 154a91c
1 parent cf97426
commit 154a91c
Show file tree

Hide file tree

Showing 19 changed files with 827 additions and 128 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -1,38 +1,30 @@
 language: python
 python:
-  - '3.5'
-  - '3.6'
+- '3.5'
+- '3.6'
 before_install:
-  # https://conda.io/docs/travis.html
-  - sudo apt-get update
-  # We do this conditionally because it saves us some downloading if the
-  # version is the same.
-  - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then
-      wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh -O miniconda.sh;
-    else
-      wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh;
-    fi
-  - bash miniconda.sh -b -p $HOME/miniconda
-  - export PATH="$HOME/miniconda/bin:$PATH"
-  - hash -r
-  - conda config --set always_yes yes --set changeps1 no
-  - conda update -q conda
-  # Useful for debugging any issues with conda
-  - conda info -a
-
-  # Replace dep1 dep2 ... with your dependencies
-  - conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION numpy scipy numba pandas matplotlib
-  - source activate test-environment
+- sudo apt-get update
+- if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh
+  -O miniconda.sh; else wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
+  -O miniconda.sh; fi
+- bash miniconda.sh -b -p $HOME/miniconda
+- export PATH="$HOME/miniconda/bin:$PATH"
+- hash -r
+- conda config --set always_yes yes --set changeps1 no
+- conda update -q conda
+- conda info -a
+- conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION numpy scipy numba
+  pandas matplotlib
+- source activate test-environment
 install:
-  - python setup.py sdist --formats=zip -k
-  - find ./dist -iname "*.zip" -print0 | xargs -0 pip install
+- python setup.py sdist --formats=zip -k
+- find ./dist -iname "*.zip" -print0 | xargs -0 pip install
 script:
-  - python -m unittest discover -s test -p "Test*.py"
+- python -m unittest discover -s test -p "Test*.py"
 deploy:
   provider: pypi
   user: debbiemarkslab
   password:
-    secure: dNIatUYDKukXsDbak9r++FV3IAmRVVN3O4233Hx9BkT/1bmZumTYkZE+NpC8ooFCvSJf2eH0imZ2GebXm1eknF/NRZ4EWC1K7sjyBWLWxoLaYF3mWWSArsFe+kH/hQt9nNh0tp1+jcWW7TM6v21FtI8P88G3J4Rk5exuxyZC89r7Y33sF+bcata7aei4XXBeFgyvjCotAMUzwEwUM+0gETKlMbe5nZRZ8ZRPqAEP/Q4bBMVkexs8BhTt8RpgHQmJH4dTYmcM5/5Hkor2kcwPwrVhGdj+ayLqAGvNjyWF3h41x+ooBP+iLheGx9sW09sWr/dCzNwtr6KU5zNU1ibkppTUomwfKXWQOzvFXubjBuOraoc0PsA53RQ20N9d2HAzg25iPt4u+3mGHOFHTb5yHwLUOtPWem8Oal4LsKjVb/GGxzF0iYClT3j6NHc9RviLZp7l7smptY3Q9dRxdmQG+vAytM54qCvzl6Bydh1Fe25l3O97Je8ZIvaKfMMtgx4TrWyJODDtqdJVK0sg/rLHIQOlFR5Pc+XFqlJIedwGIBD9oHgCvHKyqmMokM5RINiQb0255DxNkaJ/tP1l5TN8Bxl00Xj6V4EpPbYA0FAh0+ZZhQSfrJwhKFrlt2rC4JdYQWpr0DMChbHRzqbH/PNRbKOOAYJduRazK///RABjkjU=
+    secure: sgND7tkfN3MDI/lxfcLVl6lekFb0GqxEQZ1cixL/iQGRb7M/pZiPMegWGhBQ3pMCaR7peXCgiHbUIsuP1muLsHTKzsFneeRaY2tBwd0+9y0WLDP/aDEn4lt/CJf7at4HSbvQ8x8DTCjcnBGC7SeUkuJAeJRrnWnngDVlWuw93DVTiOoWSiE9HhVzo3H1wmS8AOjDzebxeHSi5kOdJ0QBW2QJtGxrVrQT0XzW37RdfogC6BoOi3HG620h49TRWAQurgtnprNGQ4RqP6vx9KvzL3HWbuDmI+MLRU2ZvIV86xE7sDMKy1nRqBmrRRC2KAuQB+UWIYuuprLoW4wuibcG/uiQOBOhjeOyUu3fL8Nb0K2+p4YySiTbASqFrWqfHPD83EwfPXHw+aNeuHJyu4h7ogzBYjZsz+nY8AhBl9v7CzNT6VuYQcnJMBt9Ca9Ab9Ats+9yVinmFD6UvFfKfWUjyn7pbWHtIxT1iLSIq9faD7bW6uSAainMGB2KlxgJp5O9ePM0/LBJiGVYyYZphLyWPABHGasfMcxn6LG2c9mWICFpelB5B0P6aSfvRFElzYTcd2hLMyLYdscVOw8NSFYkM9QzGH9Py8ptsA3l34BUeCA0YsUj4PZL7W8wuwOXBhypIkmnHbD8rN0DY1vaVTvjbII/KUbEX0FwaiNRLKlW7Ms=
   on:
     tags: true
-
diff --git a/README.md b/README.md
@@ -11,7 +11,7 @@ If you are simply interested in using EVcouplings as a library, installing the P
 
 #### Requirements
 
-EVcouplings requires a Python >= 3.5 installation. Since it depends on some packages from the scientific Python stack that can be tricky to install using pip (numba, numpy), we recommend using the [Anaconda Python distribution](https://www.continuum.io/downloads).
+EVcouplings requires a Python >= 3.5 installation. Since it depends on some packages that can be tricky to install using pip (numba, numpy, ...), we recommend using the [Anaconda Python distribution](https://www.continuum.io/downloads). If you are creating a new conda environment or using Miniconda, please make sure to run `conda install anaconda` before running pip; otherwise the required packages will not be present.
 
 #### Installation
 
@@ -88,6 +88,16 @@ Please see
 
 for how to download the respective databases. Note that this may take a while, especially the generation of post-processed SIFTS mapping files. 
 
+#### Sequence databases for EVcomplex
+Running the EVcouplings pipeline for protein complexes (aka EVcomplex) requires two pre-computed databases. You can download these databases here:
+
+- ena_genome_location_table: https://marks.hms.harvard.edu/evcomplex_databases/cds_pro_2017_02.txt
+- uniprot_to_embl_table: https://marks.hms.harvard.edu/evcomplex_databases/idmapping_uniprot_embl_2017_02.txt
+
+Save these databases in your local environment, and then add the paths to the local copies of these databases to your config file for the complex pipeline. 
+
+In future releases these databases will be generated automatically. 
+
 #### Other sequence databases
 
 You can however use any sequence database of your choice in FASTA format if you prefer to. The database for any particular job needs to be defined in the job configuration file ("databases" section) and set as the input database in the "alignment" section.
@@ -97,7 +107,7 @@ You can however use any sequence database of your choice in FASTA format if you
 Relevant PDB structures for comparison of ECs and 3D structure predictions will be automatically fetched from the web in the new compressed MMTF format on a per-job basis. You can however also pre-download the entire PDB and place the structures in a directory if you want to (and set pdb_mmtf_dir in your job configuration).
 
 Uniprot to PDB index mapping files will be automatically generated by EVcouplings based on the SIFTS database.
-You can either generate the files by running *evcouplings_dbupdate* (see above), or by pointing the sifts_mapping_table and sifts_sequence_db configuration parameters to file paths in a valid directory, and if the files do not yet exist, they will be created by fetching and integrating data from the web (this may take a while) when the pipeline is first run.
+You can either generate the files by running *evcouplings_dbupdate* (see above, preferred), or by pointing the sifts_mapping_table and sifts_sequence_db configuration parameters to file paths inside an already existing directory. If these files do not yet exist, they will be created by fetching and integrating data from the web (this may take a while) when the pipeline is first run and saved under the given file paths. 
 
 ## Documentation and tutorials
 

diff --git a/config/sample_config_complex.txt b/config/sample_config_complex.txt
@@ -319,8 +319,8 @@ couplings:
 
 # Compare ECs to known 3D structures
 compare:
-    # Current options: standard, complex_compare
-    protocol: complex_compare
+    # Current options: standard, complex
+    protocol: complex
 
     # Following parameters will usually be overridden by global settings / output of previous stage
     prefix:
@@ -334,6 +334,12 @@ compare:
     # sequence_id and SIFTS database (sequence_id must be UniProt AC/ID in this case)
     first_by_alignment: True
     second_by_alignment: True
+    # Alignment method to use to search the PDB Seqres database. Options: jackhmmer, hmmsearch
+    # Set to jackhmmer to search the PDB Seqres database using jackhmmer from the target sequence only (more stringent). 
+    # Set to hmmsearch to search the PDB Seqres database using an HMM built from the output monomer alignment (less stringent).
+    # Warning: searching by HMM may result in crystal structures from very distant homologs or even unrelated sequences. 
+    first_pdb_search_method: jackhmmer
+    second_pdb_search_method: jackhmmer
 
     # Leave this parameter empty to use all PDB structures for given sequence_id, otherwise
     # will be limited to the given IDs (single value or list). Important: note that this acts only as a filter on the
@@ -353,7 +359,7 @@ compare:
 
     # compare to multimer contacts (if multiple chains of the same sequence or its homologs are present in a structure)
     first_compare_multimer: True
-    second_compare_multimer:
+    second_compare_multimer: True
 
     # settings for sequence alignment against PDB sequences using jackhmmer
     # (additional settings like iterations possible, compare to align stage)
@@ -378,7 +384,10 @@ compare:
     # Return an error if we fail to automatically retrieve information about a given pdb id
     raise_missing: False
 
-    # Set atom_filter to "CA" to compute C_alpha distances instead of minimum atom distances. If blank, will compute minimum atom distance
+    # Filter that defines which atoms will be used for distance calculations. If empty/None, no filter will be
+    # applied (resulting in the computation of minimum atom distances between all pairs of atoms). If setting to any
+    # particular PDB atom type, only these atoms will be used for the computation (e.g. CA will give C_alpha distances,
+    # CB will give C_beta distances, etc.)
     atom_filter:
 
     # Distance cutoff (Angstrom) for a true positive pair
@@ -407,6 +416,7 @@ compare:
     # draw secondary structure on contact map plots
     draw_secondary_structure: True
 
+
 # These settings allow job status tracking using a database, and result collection in an archive
 management:
     # URI of database
@@ -458,8 +468,9 @@ databases:
     # Directory with PDB MMTF structures (leave blank to fetch structures from web)
     pdb_mmtf_dir:
 
-    # SIFTS mapping information. Point to valid directory, and if these files do not exist, they will be automatically generated
-    # (this may take a while). Periodically delete these files to more recent versions of SIFTS are used.
+    # SIFTS mapping information. Point to file paths in an existing directory, and if these files do not exist, they will be
+    # automatically generated and saved at the given file path (this may take a while).
+    # Periodically delete these files so that more recent versions of SIFTS are used.
     sifts_mapping_table: /groups/marks/databases/SIFTS/pdb_chain_uniprot_plus_2017_07_03.csv
     sifts_sequence_db: /groups/marks/databases/SIFTS/pdb_chain_uniprot_plus_2017_07_03.fa
 
@@ -471,5 +482,11 @@ tools:
     psipred: /groups/marks/software/runpsipred
     cns: /groups/marks/pipelines/evcouplings/software/cns_solve_1.21/intel-x86_64bit-linux/bin/cns
     maxcluster: /groups/marks/pipelines/evcouplings/software/maxcluster64bit
-    uniprot_to_embl_table : /groups/marks/databases/complexes/idmapping/idmapping_uniprot_embl_2017_02.txt
-    ena_genome_location_table : /groups/marks/databases/complexes/ena/2017_02/cds_pro.txt
+
+    # The following two databases are exclusive to EVcomplex and need to be manually downloaded and saved locally;
+    # then set the paths below to your local copies of the databases.
+    # Download URLs:
+    # ena_genome_location_table: https://marks.hms.harvard.edu/evcomplex_databases/cds_pro_2017_02.txt
+    # uniprot_to_embl_table: https://marks.hms.harvard.edu/evcomplex_databases/idmapping_uniprot_embl_2017_02.txt 
+    uniprot_to_embl_table: /groups/marks/databases/complexes/idmapping/idmapping_uniprot_embl_2017_02.txt
+    ena_genome_location_table: /groups/marks/databases/complexes/ena/2017_02/cds_pro.txt
diff --git a/config/sample_config_monomer.txt b/config/sample_config_monomer.txt
@@ -219,7 +219,10 @@ compare:
 
     # Comparison and plotting settings
 
-    # Set atom_filter to "CA" to compute C_alpha distances instead of minimum atom distances. If blank, will compute minimum atom distance
+    # Filter that defines which atoms will be used for distance calculations. If empty/None, no filter will be
+    # applied (resulting in the computation of minimum atom distances between all pairs of atoms). If setting to any
+    # particular PDB atom type, only these atoms will be used for the computation (e.g. CA will give C_alpha distances,
+    # CB will give C_beta distances, etc.)
     atom_filter:
 
     # Distance cutoff (Angstrom) for a true positive pair
@@ -246,6 +249,12 @@ compare:
     # draw secondary structure on contact map plots
     draw_secondary_structure: True
 
+    # Alignment method to use to search the PDB Seqres database. Options: jackhmmer, hmmsearch
+    # Set to jackhmmer to search the PDB Seqres database using jackhmmer from the target sequence only (more stringent). 
+    # Set to hmmsearch to search the PDB Seqres database using an HMM built from the output monomer alignment (less stringent).
+    # Warning: searching by HMM may result in crystal structures from very distant homologs or even unrelated sequences. 
+    pdb_search_method: jackhmmer
+
 # Settings for Mutation effect predictions
 mutate:
     # Options: standard
@@ -364,16 +373,19 @@ databases:
     # Directory with PDB MMTF structures (leave blank to fetch structures from web)
     pdb_mmtf_dir:
 
-    # SIFTS mapping information. Point to valid directory, and if these files do not exist, they will be automatically generated
-    # (this may take a while). Periodically delete these files to more recent versions of SIFTS are used.
+    # SIFTS mapping information. Point to file paths in an existing directory, and if these files do not exist, they will be
+    # automatically generated and saved at the given file path (this may take a while).
+    # Periodically delete these files so that more recent versions of SIFTS are used.
     sifts_mapping_table: /groups/marks/databases/SIFTS/pdb_chain_uniprot_plus_current.csv
     sifts_sequence_db: /groups/marks/databases/SIFTS/pdb_chain_uniprot_plus_current.fasta
 
 # Paths to external tools used by evcouplings. Please refer to README.md for installation instructions and which tools are required.
 tools:
     jackhmmer: /groups/marks/pipelines/evcouplings/software/hmmer-3.1b2-linux-intel-x86_64/binaries/jackhmmer
     plmc: /groups/marks/pipelines/evcouplings/software/plmc/bin/plmc
+    hmmbuild: /groups/marks/pipelines/evcouplings/software/hmmer-3.1b2-linux-intel-x86_64/binaries/hmmbuild
+    hmmsearch: /groups/marks/pipelines/evcouplings/software/hmmer-3.1b2-linux-intel-x86_64/binaries/hmmsearch
     hhfilter: /groups/marks/pipelines/evcouplings/software/hh-suite/bin/hhfilter
-    psipred: /groups/marks/software/runpsipred
+    psipred: /groups/marks/software/runpsipred_o2
     cns: /groups/marks/pipelines/evcouplings/software/cns_solve_1.21/intel-x86_64bit-linux/bin/cns
     maxcluster: /groups/marks/pipelines/evcouplings/software/maxcluster64bit
diff --git a/config/sample_config_monomer_o2.txt b/config/sample_config_monomer_o2.txt
@@ -219,7 +219,10 @@ compare:
 
     # Comparison and plotting settings
 
-    # Set atom_filter to "CA" to compute C_alpha distances instead of minimum atom distances. If blank, will compute minimum atom distance
+    # Filter that defines which atoms will be used for distance calculations. If empty/None, no filter will be
+    # applied (resulting in the computation of minimum atom distances between all pairs of atoms). If setting to any
+    # particular PDB atom type, only these atoms will be used for the computation (e.g. CA will give C_alpha distances,
+    # CB will give C_beta distances, etc.)
     atom_filter:
 
     # Distance cutoff (Angstrom) for a true positive pair
@@ -246,6 +249,12 @@ compare:
     # draw secondary structure on contact map plots
     draw_secondary_structure: True
 
+    # Alignment method to use to search the PDB Seqres database. Options: jackhmmer, hmmsearch
+    # Set to jackhmmer to search the PDB Seqres database using jackhmmer from the target sequence only (more stringent). 
+    # Set to hmmsearch to search the PDB Seqres database using an HMM built from the output monomer alignment (less stringent).
+    # Warning: searching by HMM may result in crystal structures from very distant homologs or even unrelated sequences. 
+    pdb_search_method: jackhmmer
+
 # Settings for Mutation effect predictions
 mutate:
     # Options: standard
@@ -364,15 +373,18 @@ databases:
     # Directory with PDB MMTF structures (leave blank to fetch structures from web)
     pdb_mmtf_dir:
 
-    # SIFTS mapping information. Point to valid directory, and if these files do not exist, they will be automatically generated
-    # (this may take a while). Periodically delete these files to more recent versions of SIFTS are used.
+    # SIFTS mapping information. Point to file paths in an existing directory, and if these files do not exist, they will be
+    # automatically generated and saved at the given file path (this may take a while).
+    # Periodically delete these files so that more recent versions of SIFTS are used.
     sifts_mapping_table: /n/groups/marks/databases/SIFTS/pdb_chain_uniprot_plus_current.csv
     sifts_sequence_db: /n/groups/marks/databases/SIFTS/pdb_chain_uniprot_plus_current.fa
 
 # Paths to external tools used by evcouplings. Please refer to README.md for installation instructions and which tools are required.
 tools:
     jackhmmer: /n/groups/marks/pipelines/evcouplings/software/hmmer-3.1b2-linux-intel-x86_64/binaries/jackhmmer
     plmc: /n/groups/marks/pipelines/evcouplings/software/plmc/bin/plmc
+    hmmbuild: /n/groups/marks/pipelines/evcouplings/software/hmmer-3.1b2-linux-intel-x86_64/binaries/hmmbuild
+    hmmsearch: /n/groups/marks/pipelines/evcouplings/software/hmmer-3.1b2-linux-intel-x86_64/binaries/hmmsearch
     hhfilter: /n/groups/marks/pipelines/evcouplings/software/hh-suite/bin/hhfilter
     psipred: /n/groups/marks/software/runpsipred_o2
     cns: /n/groups/marks/pipelines/evcouplings/software/cns_solve_1.21/intel-x86_64bit-linux/bin/cns