diff --git a/.travis.yml b/.travis.yml index e06beb67..197673ab 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,38 +1,30 @@ language: python python: - - '3.5' - - '3.6' +- '3.5' +- '3.6' before_install: - # https://conda.io/docs/travis.html - - sudo apt-get update - # We do this conditionally because it saves us some downloading if the - # version is the same. - - if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then - wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh -O miniconda.sh; - else - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh; - fi - - bash miniconda.sh -b -p $HOME/miniconda - - export PATH="$HOME/miniconda/bin:$PATH" - - hash -r - - conda config --set always_yes yes --set changeps1 no - - conda update -q conda - # Useful for debugging any issues with conda - - conda info -a - - # Replace dep1 dep2 ... with your dependencies - - conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION numpy scipy numba pandas matplotlib - - source activate test-environment +- sudo apt-get update +- if [[ "$TRAVIS_PYTHON_VERSION" == "2.7" ]]; then wget https://repo.continuum.io/miniconda/Miniconda2-latest-Linux-x86_64.sh + -O miniconda.sh; else wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh + -O miniconda.sh; fi +- bash miniconda.sh -b -p $HOME/miniconda +- export PATH="$HOME/miniconda/bin:$PATH" +- hash -r +- conda config --set always_yes yes --set changeps1 no +- conda update -q conda +- conda info -a +- conda create -q -n test-environment python=$TRAVIS_PYTHON_VERSION numpy scipy numba + pandas matplotlib +- source activate test-environment install: - - python setup.py sdist --formats=zip -k - - find ./dist -iname "*.zip" -print0 | xargs -0 pip install +- python setup.py sdist --formats=zip -k +- find ./dist -iname "*.zip" -print0 | xargs -0 pip install script: - - python -m unittest discover -s test -p "Test*.py" +- python -m unittest discover -s test -p "Test*.py" deploy: provider: pypi user: debbiemarkslab password: - secure: dNIatUYDKukXsDbak9r++FV3IAmRVVN3O4233Hx9BkT/1bmZumTYkZE+NpC8ooFCvSJf2eH0imZ2GebXm1eknF/NRZ4EWC1K7sjyBWLWxoLaYF3mWWSArsFe+kH/hQt9nNh0tp1+jcWW7TM6v21FtI8P88G3J4Rk5exuxyZC89r7Y33sF+bcata7aei4XXBeFgyvjCotAMUzwEwUM+0gETKlMbe5nZRZ8ZRPqAEP/Q4bBMVkexs8BhTt8RpgHQmJH4dTYmcM5/5Hkor2kcwPwrVhGdj+ayLqAGvNjyWF3h41x+ooBP+iLheGx9sW09sWr/dCzNwtr6KU5zNU1ibkppTUomwfKXWQOzvFXubjBuOraoc0PsA53RQ20N9d2HAzg25iPt4u+3mGHOFHTb5yHwLUOtPWem8Oal4LsKjVb/GGxzF0iYClT3j6NHc9RviLZp7l7smptY3Q9dRxdmQG+vAytM54qCvzl6Bydh1Fe25l3O97Je8ZIvaKfMMtgx4TrWyJODDtqdJVK0sg/rLHIQOlFR5Pc+XFqlJIedwGIBD9oHgCvHKyqmMokM5RINiQb0255DxNkaJ/tP1l5TN8Bxl00Xj6V4EpPbYA0FAh0+ZZhQSfrJwhKFrlt2rC4JdYQWpr0DMChbHRzqbH/PNRbKOOAYJduRazK///RABjkjU= + secure: sgND7tkfN3MDI/lxfcLVl6lekFb0GqxEQZ1cixL/iQGRb7M/pZiPMegWGhBQ3pMCaR7peXCgiHbUIsuP1muLsHTKzsFneeRaY2tBwd0+9y0WLDP/aDEn4lt/CJf7at4HSbvQ8x8DTCjcnBGC7SeUkuJAeJRrnWnngDVlWuw93DVTiOoWSiE9HhVzo3H1wmS8AOjDzebxeHSi5kOdJ0QBW2QJtGxrVrQT0XzW37RdfogC6BoOi3HG620h49TRWAQurgtnprNGQ4RqP6vx9KvzL3HWbuDmI+MLRU2ZvIV86xE7sDMKy1nRqBmrRRC2KAuQB+UWIYuuprLoW4wuibcG/uiQOBOhjeOyUu3fL8Nb0K2+p4YySiTbASqFrWqfHPD83EwfPXHw+aNeuHJyu4h7ogzBYjZsz+nY8AhBl9v7CzNT6VuYQcnJMBt9Ca9Ab9Ats+9yVinmFD6UvFfKfWUjyn7pbWHtIxT1iLSIq9faD7bW6uSAainMGB2KlxgJp5O9ePM0/LBJiGVYyYZphLyWPABHGasfMcxn6LG2c9mWICFpelB5B0P6aSfvRFElzYTcd2hLMyLYdscVOw8NSFYkM9QzGH9Py8ptsA3l34BUeCA0YsUj4PZL7W8wuwOXBhypIkmnHbD8rN0DY1vaVTvjbII/KUbEX0FwaiNRLKlW7Ms= on: tags: true - diff --git a/README.md b/README.md index 61e249d9..82592429 100644 --- a/README.md +++ 
b/README.md @@ -11,7 +11,7 @@ If you are simply interested in using EVcouplings as a library, installing the P #### Requirements -EVcouplings requires a Python >= 3.5 installation. Since it depends on some packages from the scientific Python stack that can be tricky to install using pip (numba, numpy), we recommend using the [Anaconda Python distribution](https://www.continuum.io/downloads). +EVcouplings requires a Python >= 3.5 installation. Since it depends on some packages that can be tricky to install using pip (numba, numpy, ...), we recommend using the [Anaconda Python distribution](https://www.continuum.io/downloads). In case you are creating a new conda environment or using miniconda, please make sure to run `conda install anaconda` before running pip; otherwise the required packages will not be present. #### Installation @@ -88,6 +88,16 @@ Please see for how to download the respective databases. Note that this may take a while, especially the generation of post-processed SIFTS mapping files. +#### Sequence databases for EVcomplex +Running the EVcouplings pipeline for protein complexes (aka EVcomplex) requires two pre-computed databases. You can download these databases here: + +ena_genome_location_table: https://marks.hms.harvard.edu/evcomplex_databases/cds_pro_2017_02.txt +uniprot_to_embl_table: https://marks.hms.harvard.edu/evcomplex_databases/idmapping_uniprot_embl_2017_02.txt + +Save these databases locally, then add the paths to your local copies to your config file for the complex pipeline. + +In future releases these databases will be generated automatically. + #### Other sequence databases You can however use any sequence database of your choice in FASTA format if you prefer to. The database for any particular job needs to be defined in the job configuration file ("databases" section) and set as the input database in the "alignment" section. @@ -97,7 +107,7 @@ You can however use any sequence database of your choice in FASTA format if you Relevant PDB structures for comparison of ECs and 3D structure predictions will be automatically fetched from the web in the new compressed MMTF format on a per-job basis. You can however also pre-download the entire PDB and place the structures in a directory if you want to (and set pdb_mmtf_dir in your job configuration). Uniprot to PDB index mapping files will be automatically generated by EVcouplings based on the SIFTS database. -You can either generate the files by running *evcouplings_dbupdate* (see above), or by pointing the sifts_mapping_table and sifts_sequence_db configuration parameters to file paths in a valid directory, and if the files do not yet exist, they will be created by fetching and integrating data from the web (this may take a while) when the pipeline is first run. +You can either generate the files by running *evcouplings_dbupdate* (see above, preferred), or by pointing the sifts_mapping_table and sifts_sequence_db configuration parameters to file paths inside an already existing directory. If these files do not yet exist, they will be created by fetching and integrating data from the web (this may take a while) when the pipeline is first run and saved under the given file paths.
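For example, a "databases" section along the following lines is enough to trigger automatic generation of both files on the first run (the paths are illustrative placeholders, not shipped defaults):

```yaml
databases:
  # if these two files do not exist yet, they will be created by fetching
  # and integrating SIFTS data from the web on the first pipeline run
  sifts_mapping_table: /path/to/databases/SIFTS/pdb_chain_uniprot_plus_current.csv
  sifts_sequence_db: /path/to/databases/SIFTS/pdb_chain_uniprot_plus_current.fasta
```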
## Documentation and tutorials diff --git a/config/sample_config_complex.txt b/config/sample_config_complex.txt index 638169d2..0ca8d734 100644 --- a/config/sample_config_complex.txt +++ b/config/sample_config_complex.txt @@ -319,8 +319,8 @@ couplings: # Compare ECs to known 3D structures compare: - # Current options: standard, complex_compare - protocol: complex_compare + # Current options: standard, complex + protocol: complex # Following parameters will be usually overriden by global settings / output of previous stage prefix: @@ -334,6 +334,12 @@ compare: # sequence_id and SIFTS database (sequence_id must be UniProt AC/ID in this case) first_by_alignment: True second_by_alignment: True + # Alignment method to use to search the PDB seqres database. Options: jackhmmer, hmmsearch + # Set to jackhmmer to search the PDB seqres database using jackhmmer from the target sequence only (more stringent). + # Set to hmmsearch to search the PDB seqres database using an HMM built from the output monomer alignment (less stringent). + # Warning: searching by HMM may result in crystal structures from very distant homologs or even unrelated sequences. + first_pdb_search_method: jackhmmer + second_pdb_search_method: jackhmmer # Leave this parameter empty to use all PDB structures for given sequence_id, otherwise # will be limited to the given IDs (single value or list). Important: note that this acts only as a filter on the @@ -353,7 +359,7 @@ compare: # compare to multimer contacts (if multiple chains of the same sequence or its homologs are present in a structure) first_compare_multimer: True - second_compare_multimer: + second_compare_multimer: True # settings for sequence alignment against PDB sequences using jackhmmer # (additional settings like iterations possible, compare to align stage) @@ -378,7 +384,10 @@ compare: # Return an error if we fail to automatically retrieve information about a given pdb id raise_missing: False - # Set atom_filter to "CA" to compute C_alpha distances instead of minimum atom distances. If blank, will compute minimum atom distance + # Filter that defines which atoms will be used for distance calculations. If empty/None, no filter will be + # applied (resulting in the computation of minimum atom distances between all pairs of atoms). If set to any + # particular PDB atom type, only these atoms will be used for the computation (e.g. CA will give C_alpha distances, + # CB will give C_beta distances, etc.) atom_filter: # Distance cutoff (Angstrom) for a true positive pair @@ -407,6 +416,7 @@ compare: # draw secondary structure on contact map plots draw_secondary_structure: True + # These settings allow job status tracking using a database, and result collection in an archive management: # URI of database @@ -458,8 +468,9 @@ databases: # Directory with PDB MMTF structures (leave blank to fetch structures from web) pdb_mmtf_dir: - # SIFTS mapping information. Point to valid directory, and if these files do not exist, they will be automatically generated - # (this may take a while). Periodically delete these files to more recent versions of SIFTS are used. + # SIFTS mapping information. Point to file paths in an existing directory, and if these files do not exist, they will be + # automatically generated and saved at the given file path (this may take a while). + # Periodically delete these files so that more recent versions of SIFTS are used.
sifts_mapping_table: /groups/marks/databases/SIFTS/pdb_chain_uniprot_plus_2017_07_03.csv sifts_sequence_db: /groups/marks/databases/SIFTS/pdb_chain_uniprot_plus_2017_07_03.fa @@ -471,5 +482,11 @@ tools: psipred: /groups/marks/software/runpsipred cns: /groups/marks/pipelines/evcouplings/software/cns_solve_1.21/intel-x86_64bit-linux/bin/cns maxcluster: /groups/marks/pipelines/evcouplings/software/maxcluster64bit - uniprot_to_embl_table : /groups/marks/databases/complexes/idmapping/idmapping_uniprot_embl_2017_02.txt - ena_genome_location_table : /groups/marks/databases/complexes/ena/2017_02/cds_pro.txt \ No newline at end of file + + # the following two databases are exclusive to EVcomplex and need to be manually downloaded and saved locally + # then add the paths to your local copies of the databases + # Download URLs: + # ena_genome_location_table: https://marks.hms.harvard.edu/evcomplex_databases/cds_pro_2017_02.txt + # uniprot_to_embl_table: https://marks.hms.harvard.edu/evcomplex_databases/idmapping_uniprot_embl_2017_02.txt + uniprot_to_embl_table: /groups/marks/databases/complexes/idmapping/idmapping_uniprot_embl_2017_02.txt + ena_genome_location_table: /groups/marks/databases/complexes/ena/2017_02/cds_pro.txt diff --git a/config/sample_config_monomer.txt b/config/sample_config_monomer.txt index de6fa0f3..0cbe4891 100644 --- a/config/sample_config_monomer.txt +++ b/config/sample_config_monomer.txt @@ -219,7 +219,10 @@ compare: # Comparison and plotting settings - # Set atom_filter to "CA" to compute C_alpha distances instead of minimum atom distances. If blank, will compute minimum atom distance + # Filter that defines which atoms will be used for distance calculations. If empty/None, no filter will be + # applied (resulting in the computation of minimum atom distances between all pairs of atoms). If set to any + # particular PDB atom type, only these atoms will be used for the computation (e.g. CA will give C_alpha distances, + # CB will give C_beta distances, etc.) atom_filter: # Distance cutoff (Angstrom) for a true positive pair @@ -246,6 +249,12 @@ compare: # draw secondary structure on contact map plots draw_secondary_structure: True + # Alignment method to use to search the PDB seqres database. Options: jackhmmer, hmmsearch + # Set to jackhmmer to search the PDB seqres database using jackhmmer from the target sequence only (more stringent). + # Set to hmmsearch to search the PDB seqres database using an HMM built from the output monomer alignment (less stringent). + # Warning: searching by HMM may result in crystal structures from very distant homologs or even unrelated sequences. + pdb_search_method: jackhmmer + # Settings for Mutation effect predictions mutate: # Options: standard @@ -364,8 +373,9 @@ databases: # Directory with PDB MMTF structures (leave blank to fetch structures from web) pdb_mmtf_dir: - # SIFTS mapping information. Point to valid directory, and if these files do not exist, they will be automatically generated - # (this may take a while). Periodically delete these files to more recent versions of SIFTS are used. + # SIFTS mapping information. Point to file paths in an existing directory, and if these files do not exist, they will be + # automatically generated and saved at the given file path (this may take a while). + # Periodically delete these files so that more recent versions of SIFTS are used.
sifts_mapping_table: /groups/marks/databases/SIFTS/pdb_chain_uniprot_plus_current.csv sifts_sequence_db: /groups/marks/databases/SIFTS/pdb_chain_uniprot_plus_current.fasta @@ -373,7 +383,9 @@ databases: tools: jackhmmer: /groups/marks/pipelines/evcouplings/software/hmmer-3.1b2-linux-intel-x86_64/binaries/jackhmmer plmc: /groups/marks/pipelines/evcouplings/software/plmc/bin/plmc + hmmbuild: /groups/marks/pipelines/evcouplings/software/hmmer-3.1b2-linux-intel-x86_64/binaries/hmmbuild + hmmsearch: /groups/marks/pipelines/evcouplings/software/hmmer-3.1b2-linux-intel-x86_64/binaries/hmmsearch hhfilter: /groups/marks/pipelines/evcouplings/software/hh-suite/bin/hhfilter - psipred: /groups/marks/software/runpsipred + psipred: /groups/marks/software/runpsipred_o2 cns: /groups/marks/pipelines/evcouplings/software/cns_solve_1.21/intel-x86_64bit-linux/bin/cns maxcluster: /groups/marks/pipelines/evcouplings/software/maxcluster64bit diff --git a/config/sample_config_monomer_o2.txt b/config/sample_config_monomer_o2.txt index 69afc600..e5406de4 100644 --- a/config/sample_config_monomer_o2.txt +++ b/config/sample_config_monomer_o2.txt @@ -219,7 +219,10 @@ compare: # Comparison and plotting settings - # Set atom_filter to "CA" to compute C_alpha distances instead of minimum atom distances. If blank, will compute minimum atom distance + # Filter that defines which atoms will be used for distance calculations. If empty/None, no filter will be + # applied (resulting in the computation of minimum atom distances between all pairs of atoms). If set to any + # particular PDB atom type, only these atoms will be used for the computation (e.g. CA will give C_alpha distances, + # CB will give C_beta distances, etc.) atom_filter: # Distance cutoff (Angstrom) for a true positive pair @@ -246,6 +249,12 @@ compare: # draw secondary structure on contact map plots draw_secondary_structure: True + # Alignment method to use to search the PDB seqres database. Options: jackhmmer, hmmsearch + # Set to jackhmmer to search the PDB seqres database using jackhmmer from the target sequence only (more stringent). + # Set to hmmsearch to search the PDB seqres database using an HMM built from the output monomer alignment (less stringent). + # Warning: searching by HMM may result in crystal structures from very distant homologs or even unrelated sequences. + pdb_search_method: jackhmmer + # Settings for Mutation effect predictions mutate: # Options: standard @@ -364,8 +373,9 @@ databases: # Directory with PDB MMTF structures (leave blank to fetch structures from web) pdb_mmtf_dir: - # SIFTS mapping information. Point to valid directory, and if these files do not exist, they will be automatically generated - # (this may take a while). Periodically delete these files to more recent versions of SIFTS are used. + # SIFTS mapping information. Point to file paths in an existing directory, and if these files do not exist, they will be + # automatically generated and saved at the given file path (this may take a while). + # Periodically delete these files so that more recent versions of SIFTS are used.
sifts_mapping_table: /n/groups/marks/databases/SIFTS/pdb_chain_uniprot_plus_current.csv sifts_sequence_db: /n/groups/marks/databases/SIFTS/pdb_chain_uniprot_plus_current.fa @@ -373,6 +383,8 @@ databases: tools: jackhmmer: /n/groups/marks/pipelines/evcouplings/software/hmmer-3.1b2-linux-intel-x86_64/binaries/jackhmmer plmc: /n/groups/marks/pipelines/evcouplings/software/plmc/bin/plmc + hmmbuild: /n/groups/marks/pipelines/evcouplings/software/hmmer-3.1b2-linux-intel-x86_64/binaries/hmmbuild + hmmsearch: /n/groups/marks/pipelines/evcouplings/software/hmmer-3.1b2-linux-intel-x86_64/binaries/hmmsearch hhfilter: /n/groups/marks/pipelines/evcouplings/software/hh-suite/bin/hhfilter psipred: /n/groups/marks/software/runpsipred_o2 cns: /n/groups/marks/pipelines/evcouplings/software/cns_solve_1.21/intel-x86_64bit-linux/bin/cns diff --git a/evcouplings/align/protocol.py b/evcouplings/align/protocol.py index 178668e2..4c4fe9ee 100644 --- a/evcouplings/align/protocol.py +++ b/evcouplings/align/protocol.py @@ -3,7 +3,9 @@ Authors: Thomas A. Hopf - Anna G. Green (complex protocol) + Anna G. Green - complex protocol, hmmbuild_and_search + Chan Kang - hmmbuild_and_search + """ from collections import OrderedDict, Iterable @@ -39,6 +41,104 @@ ) + +def _make_hmmsearch_raw_fasta(alignment_result, prefix): + """ + HMMsearch results do not contain the query sequence + so we must construct a raw_fasta file with the query + sequence as the first hit, to ensure proper numbering. + The search result is filtered to only contain the columns with + match states to the HMM, which have a one-to-one mapping to the + query sequence. + + Parameters + ---------- + alignment_result : dict + Alignment result dictionary, output by run_hmmsearch + prefix : str + Prefix for file creation + + Returns + ------- + str + path to raw focus alignment file + + """ + def _add_gaps_to_query(query_sequence_ali, ali): + + # get the index of columns that do not contain match states (indicated by an x) + gap_index = [ + i for i, x in enumerate(ali.annotation["GC"]["RF"]) if x != "x" + ] + # get the index of columns that contain match states (indicated by an x) + match_index = [ + i for i, x in enumerate(ali.annotation["GC"]["RF"]) if x == "x" + ] + + # ensure that the length of the match states + # matches the length of the sequence + if len(match_index) != query_sequence_ali.L: + raise ValueError( + "HMMsearch result {} does not have a one-to-one" + " mapping to the query sequence columns".format(alignment_result["raw_alignment_file"]) + ) + + gapped_query_sequence = "" + seq = list(query_sequence_ali.matrix[0, :]) + + # loop through every position in the HMMsearch hits + for i in range(len(ali.annotation["GC"]["RF"])): + # if that position should be a gap, add a gap + if i in gap_index: + gapped_query_sequence += "-" + # if that position should be a letter, pop the next + # letter in the query sequence + else: + gapped_query_sequence += seq.pop(0) + + new_sequence_ali = Alignment.from_dict({ + query_sequence_ali.ids[0]: gapped_query_sequence + }) + return new_sequence_ali + + # open the sequence file + with open(alignment_result["target_sequence_file"]) as a: + query_sequence_ali = Alignment.from_file(a, format="fasta") + + # if the provided alignment is empty, just return the target sequence + raw_focus_alignment_file = prefix + "_raw.fasta" + if not valid_file(alignment_result["raw_alignment_file"]): + # write the query sequence to a fasta file + with open(raw_focus_alignment_file, "w") as of: + query_sequence_ali.write(of) + + # return as an alignment
object + return raw_focus_alignment_file + + # else, open the HMM search result + with open(alignment_result["raw_alignment_file"]) as a: + ali = Alignment.from_file(a, format="stockholm") + + # make sure that the stockholm alignment contains the match annotation + if not ("GC" in ali.annotation and "RF" in ali.annotation["GC"]): + raise ValueError( + "Stockholm alignment {} missing RF" + " annotation of match states".format(alignment_result["raw_alignment_file"]) + ) + + # add insertions to the query sequence in order to preserve correct + # numbering of match sequences + gapped_sequence_ali = _add_gaps_to_query(query_sequence_ali, ali) + + # write a new alignment file with the query sequence as + # the first entry + + with open(raw_focus_alignment_file, "w") as of: + gapped_sequence_ali.write(of) + ali.write(of) + + return raw_focus_alignment_file + + def fetch_sequence(sequence_id, sequence_file, sequence_download_url, out_file): """ @@ -494,16 +594,6 @@ def existing(**kwargs): Mandatory kwargs arguments: See list below in code where calling check_required - .. todo:: - explain meaning of parameters in detail. - - If skip is given and True, the workflow will only return - the output configuration (outcfg) and ali will be None. - - If callback is given, the function will be called at the - end of the workflow with the kwargs arguments updated with - the outcfg results. - Returns ------- outcfg : dict @@ -885,6 +975,7 @@ def jackhmmer_search(**kwargs): the following fields: * sequence_id (passed through from input) + * first_index (passed through from input) * target_sequence_file * sequence_file * raw_alignment_file @@ -992,11 +1083,271 @@ def jackhmmer_search(**kwargs): "sequence_id": kwargs["sequence_id"], "target_sequence_file": target_sequence_file, "sequence_file": full_sequence_file, + "first_index": kwargs["first_index"], + "focus_mode": True, + "raw_alignment_file": ali["alignment"], + "hittable_file": ali["domtblout"], + } + + # define a single protein segment based on target sequence + outcfg["segments"] = [ + Segment( + "aa", kwargs["sequence_id"], + region_start, region_end, + range(region_start, region_end + 1) + ).to_list() + ] + + outcfg["focus_sequence"] = "{}/{}-{}".format( + kwargs["sequence_id"], region_start, region_end + ) + + return outcfg + + +def hmmbuild_and_search(**kwargs): + """ + Protocol: + + Build HMM from sequence alignment using hmmbuild and + search against a sequence database using hmmsearch. 
+ + Parameters + ---------- + Mandatory kwargs arguments: + See list below in code where calling check_required + + Returns + ------- + outcfg : dict + Output configuration of the protocol, including + the following fields: + + * target_sequence_file + * sequence_file + * raw_alignment_file + * hittable_file + * focus_mode + * focus_sequence + * segments + """ + + def _format_alignment_for_hmmbuild(input_alignment_file, **kwargs): + # this file is starting point of pipeline; + # check if input alignment actually exists + + verify_resources( + "Input alignment does not exist", + input_alignment_file + ) + + # first try to autodetect format of alignment + with open(input_alignment_file) as f: + format = detect_format(f) + if format is None: + raise InvalidParameterError( + "Format of input alignment {} could not be " + "automatically detected.".format( + input_alignment_file + ) + ) + + with open(input_alignment_file) as f: + ali_raw = Alignment.from_file(f, format) + + # Target sequence of alignment + sequence_id = kwargs["sequence_id"] + + if sequence_id is None: + raise InvalidParameterError( + "Parameter sequence_id must be defined" + ) + + # First, find focus sequence in alignment + focus_index = None + for i, id_ in enumerate(ali_raw.ids): + if id_.startswith(sequence_id): + focus_index = i + break + + # if we didn't find it, cannot continue + if focus_index is None: + raise InvalidParameterError( + "Target sequence {} could not be found in alignment" + .format(sequence_id) + ) + + # identify what columns (non-gap) to keep for focus + # this should be all columns in the raw_focus_alignment_file + # but checking anyway + focus_seq = ali_raw[focus_index] + focus_cols = np.array( + [c not in [ali_raw._match_gap, ali_raw._insert_gap] for c in focus_seq] + ) + + # extract focus alignment + focus_ali = ali_raw.select(columns=focus_cols) + focus_seq_nogap = "".join(focus_ali[focus_index]) + + # determine region of sequence. If first_index is given, + # use that in any case, otherwise try to autodetect + full_focus_header = ali_raw.ids[focus_index] + focus_id = full_focus_header.split()[0] + + # try to extract region from sequence header + id_, region_start, region_end = parse_header(focus_id) + + # override with first_index if given + if kwargs["first_index"] is not None: + region_start = kwargs["first_index"] + region_end = region_start + len(focus_seq_nogap) - 1 + + if region_start is None or region_end is None: + raise InvalidParameterError( + "Could not extract region information " + + "from sequence header {} ".format(full_focus_header) + + "and first_index parameter is not given." 
+ ) + + # resubstitute full sequence ID from identifier + # and region information + header = "{}/{}-{}".format( + id_, region_start, region_end + ) + + focus_ali.ids[focus_index] = header + + # write target sequence to file + target_sequence_file = prefix + ".fa" + with open(target_sequence_file, "w") as f: + write_fasta( + [(header, focus_seq_nogap)], f + ) + + # swap target sequence to first position if it is not + # the first sequence in alignment; + # this is particularly important for hhfilter run + # because target sequence might otherwise be filtered out + if focus_index != 0: + indices = np.arange(0, len(focus_ali)) + indices[0] = focus_index + indices[focus_index] = 0 + focus_index = 0 + focus_ali = focus_ali.select(sequences=indices) + + # write the raw focus alignment for hmmbuild + focus_fasta_file = prefix + "_raw_focus_input.fasta" + with open(focus_fasta_file, "w") as f: + focus_ali.write(f, "fasta") + + return focus_fasta_file, target_sequence_file, region_start, region_end + + + # define the gap threshold for match state inclusion in HMMs built by hmmbuild + SYMFRAC_HMMBUILD = 0.0 + + # check for required options + check_required( + kwargs, + [ + "prefix", "sequence_id", "alignment_file", + "use_bitscores", "domain_threshold", "sequence_threshold", + "database", "cpu", "nobias", "reuse_alignment", + "hmmbuild", "hmmsearch" + ] + ) + prefix = kwargs["prefix"] + + # make sure output directory exists + create_prefix_folders(prefix) + + # prepare input alignment for hmmbuild + focus_fasta_file, target_sequence_file, region_start, region_end = \ + _format_alignment_for_hmmbuild( + kwargs["alignment_file"], **kwargs + ) + + # run hmmbuild_and_search... allow reuse of a pre-existing + # Stockholm alignment file here + ali_outcfg_file = prefix + ".align_hmmbuild_and_search.outcfg" + + # determine whether to rerun; only possible if previous results + # were stored in ali_outcfg_file + if kwargs["reuse_alignment"] and valid_file(ali_outcfg_file): + ali = read_config_file(ali_outcfg_file) + + # check if the alignment file itself is also there + verify_resources( + "Tried to reuse alignment, but it is empty or " + "does not exist", + ali["alignment"], ali["domtblout"] + ) + else: + # otherwise, we have to run the alignment + # modify search thresholds to be suitable for hmmsearch + sequence_length = region_end - region_start + 1 + seq_threshold, domain_threshold = search_thresholds( + kwargs["use_bitscores"], + kwargs["sequence_threshold"], + kwargs["domain_threshold"], + sequence_length + ) + + # create the hmm + hmmbuild_result = at.run_hmmbuild( + alignment_file=focus_fasta_file, + prefix=prefix, + symfrac=SYMFRAC_HMMBUILD, + cpu=kwargs["cpu"], + binary=kwargs["hmmbuild"], + ) + hmmfile = hmmbuild_result.hmmfile + + # run the alignment from the hmm + ali = at.run_hmmsearch( + hmmfile=hmmfile, + database=kwargs[kwargs["database"]], + prefix=prefix, + use_bitscores=kwargs["use_bitscores"], + domain_threshold=domain_threshold, + seq_threshold=seq_threshold, + nobias=kwargs["nobias"], + cpu=kwargs["cpu"], + binary=kwargs["hmmsearch"], + ) + + # get rid of huge stdout log file immediately + try: + os.remove(ali.output) + except OSError: + pass + + # turn namedtuple into dictionary to make + # restarting code nicer + ali = dict(ali._asdict()) + # also store the hmmfile created by hmmbuild + ali["hmmfile"] = hmmfile + + # save results of search for possible restart + write_config_file(ali_outcfg_file, ali) + + # prepare output dictionary with result files + outcfg = { + "sequence_file":
target_sequence_file, + "first_index": region_start, + "input_raw_focus_alignment": focus_fasta_file, + "target_sequence_file": target_sequence_file, "focus_mode": True, "raw_alignment_file": ali["alignment"], "hittable_file": ali["domtblout"], } + + # convert the raw output alignment to fasta format + # and add the appropriate query sequence + raw_focus_alignment_file = _make_hmmsearch_raw_fasta(outcfg, prefix) + outcfg["raw_focus_alignment_file"] = raw_focus_alignment_file + # define a single protein segment based on target sequence outcfg["segments"] = [ Segment( @@ -1027,17 +1378,6 @@ def standard(**kwargs): Mandatory kwargs arguments: See list below in code where calling check_required - .. todo:: - - explain meaning of parameters in detail. - - If skip is given and True, the workflow will only return - the output configuration (outcfg) and ali will be None. - - If callback is given, the function will be called at the - end of the workflow with the kwargs arguments updated with - the outcfg results. - Returns ------- outcfg : dict @@ -1045,6 +1385,7 @@ the following fields: * sequence_id (passed through from input) + * first_index (passed through from input) * alignment_file * raw_alignment_file * raw_focus_alignment_file diff --git a/evcouplings/align/tools.py b/evcouplings/align/tools.py index fcc1d30a..8c86a665 100644 --- a/evcouplings/align/tools.py +++ b/evcouplings/align/tools.py @@ -3,6 +3,8 @@ Authors: Thomas A. Hopf + Anna G. Green - run_hmmbuild, run_hmmsearch + Chan Kang - run_hmmbuild, run_hmmsearch """ from collections import namedtuple @@ -10,6 +12,223 @@ from evcouplings.utils.system import ( run, create_prefix_folders, verify_resources, temp ) +from evcouplings.utils.config import check_required + + +# output fields for storing results of a hmmbuild run +# (returned by run_hmmbuild) +HmmbuildResult = namedtuple( + "HmmbuildResult", + ["prefix", "hmmfile", "output"] +) + + +def run_hmmbuild(alignment_file, prefix, cpu=None, + stdout_redirect=None, symfrac=None, + binary="hmmbuild"): + """ + Profile HMM construction from multiple sequence alignments. + Refer to HMMER documentation for details. + + http://eddylab.org/software/hmmer3/3.1b2/Userguide.pdf + + Parameters + ---------- + alignment_file : str + File containing the multiple sequence alignment. Can be in + Stockholm, a2m, or clustal formats, or any other format + recognized by hmmer. Please note that ALL POSITIONS + above the symfrac cutoff will be used in HMM + construction (if the alignment contains columns that are + insertions relative to the query sequence, this may be + problematic for structure comparison) + prefix : str + Prefix path for output files. Folder structure in + the prefix will be created if not existing. + cpu : int, optional (default: None) + Number of CPUs to use for search. Uses all if None. + stdout_redirect : str, optional (default: None) + Redirect bulky stdout instead of storing + with rest of results (use "/dev/null" to dispose) + symfrac : float, optional (default: None) + range 0.0 - 1.0; hmmbuild will assign a match state to + columns in which the fraction of non-gap residues is at + least symfrac. If None, hmmbuild's internal default of 0.5 is used.
+ (Note: the residue fraction is calculated + after hmmbuild's internal sequence + weighting is applied) + binary : str (default: "hmmbuild") + Path to hmmbuild binary (put in PATH for + default to work) + + Returns + ------- + HmmbuildResult + namedtuple with fields corresponding to the different + output files (prefix, hmmfile, output) + + Raises + ------ + ExternalToolError, ResourceError + """ + verify_resources( + "Input file does not exist or is empty", + alignment_file + ) + + create_prefix_folders(prefix) + + # store filenames of all individual results; + # these will be returned as result of the + # function. + result = HmmbuildResult( + prefix, + prefix + ".hmm", + prefix + ".output" if stdout_redirect is None else stdout_redirect, + ) + + cmd = [ + binary, + "-o", result.output, + ] + + # number of CPUs + if cpu is not None: + cmd += ["--cpu", str(cpu)] + + if symfrac is not None: + cmd += ["--symfrac", str(symfrac)] + + cmd += [result.hmmfile, alignment_file] + + return_code, stdout, stderr = run(cmd) + + # check that we actually created an HMM profile + verify_resources( + "hmmbuild returned empty HMM profile: " + "stdout={} stderr={} file={}".format( + stdout, stderr, result.hmmfile + ), + result.hmmfile + ) + + return result + + +# output fields for storing results of a hmmsearch run +# (returned by run_hmmsearch) +HmmsearchResult = namedtuple( + "HmmsearchResult", + ["prefix", "alignment", "output", "tblout", "domtblout"] +) + + +def run_hmmsearch(hmmfile, database, prefix, + use_bitscores, domain_threshold, seq_threshold, + nobias=False, cpu=None, + stdout_redirect=None, binary="hmmsearch"): + """ + Search profile(s) against a sequence database. + Refer to HMMER documentation for details. + + http://eddylab.org/software/hmmer3/3.1b2/Userguide.pdf + + Parameters + ---------- + hmmfile : str + File containing the profile(s) + database : str + File containing sequence database + prefix : str + Prefix path for output files. Folder structure in + the prefix will be created if not existing. + use_bitscores : bool + Use bitscore inclusion thresholds rather than E-values. + domain_threshold : int or float or str + Inclusion threshold applied on the domain level + (e.g. "1E-03" or 0.001 or 50) + seq_threshold : int or float or str + Inclusion threshold applied on the sequence level + (e.g. "1E-03" or 0.001 or 50) + nobias : bool, optional (default: False) + Turn off bias correction + cpu : int, optional (default: None) + Number of CPUs to use for search. Uses all if None. + stdout_redirect : str, optional (default: None) + Redirect bulky stdout instead of storing + with rest of results (use "/dev/null" to dispose) + binary : str (default: "hmmsearch") + Path to hmmsearch binary (put in PATH for + default to work) + + Returns + ------- + HmmsearchResult + namedtuple with fields corresponding to the different + output files (prefix, alignment, output, tblout, domtblout) + + Raises + ------ + ExternalToolError, ResourceError + """ + verify_resources( + "Input file does not exist or is empty", + hmmfile, database + ) + + create_prefix_folders(prefix) + + # store filenames of all individual results; + # these will be returned as result of the + # function.
+ result = HmmsearchResult( + prefix, + prefix + ".sto", + prefix + ".output" if stdout_redirect is None else stdout_redirect, + prefix + ".tblout", + prefix + ".domtblout" + ) + + cmd = [ + binary, + "-o", result.output, + "-A", result.alignment, + "--tblout", result.tblout, + "--domtblout", result.domtblout, + "--noali", + "--notextw" + ] + + # reporting thresholds are set according to the + # inclusion thresholds to reduce memory footprint + if use_bitscores: + cmd += [ + "-T", str(seq_threshold), + "--domT", str(domain_threshold), + "--incT", str(seq_threshold), + "--incdomT", str(domain_threshold) + ] + else: + cmd += [ + "-E", str(seq_threshold), + "--domE", str(domain_threshold), + "--incE", str(seq_threshold), + "--incdomE", str(domain_threshold) + ] + + # number of CPUs + if cpu is not None: + cmd += ["--cpu", str(cpu)] + + # bias correction filter + if nobias: + cmd += ["--nobias"] + + cmd += [hmmfile, database] + + return_code, stdout, stderr = run(cmd) + + return result + # output fields for storing results of a jackhmmer run # (returned by run_jackhmmer) diff --git a/evcouplings/compare/protocol.py b/evcouplings/compare/protocol.py index d9f6ce16..159e44b9 100644 --- a/evcouplings/compare/protocol.py +++ b/evcouplings/compare/protocol.py @@ -63,10 +63,11 @@ def _filter_by_id(x, id_list): "max_num_hits", "max_num_structures", "pdb_mmtf_dir", "sifts_mapping_table", "sifts_sequence_db", - "by_alignment", "alignment_min_overlap", + "by_alignment", "pdb_alignment_method", + "alignment_min_overlap", "sequence_id", "sequence_file", "region", "use_bitscores", "domain_threshold", - "sequence_threshold", "jackhmmer", + "sequence_threshold" ] ) # get SIFTS mapping object/sequence DB @@ -81,6 +82,19 @@ # by sequence search or just fetching # based on Uniprot/PDB identifier if kwargs["by_alignment"]: + + # if searching by alignment, verify that + # user selected jackhmmer or hmmsearch + SEARCH_METHODS = ["jackhmmer", "hmmsearch"] + + if kwargs["pdb_alignment_method"] not in SEARCH_METHODS: + raise InvalidParameterError( + "Invalid pdb search method: " + + "{}.
Valid selections are: {}".format( kwargs["pdb_alignment_method"], ", ".join(SEARCH_METHODS) ) ) + sifts_map = s.by_alignment( reduce_chains=reduce_chains, min_overlap=kwargs["alignment_min_overlap"], @@ -665,7 +679,7 @@ def complex(**kwargs): # initialize output EC files "ec_compared_all_file": prefix + "_CouplingScoresCompared_all.csv", "ec_compared_longrange_file": prefix + "_CouplingScoresCompared_longrange.csv", - "ec_compared_inter_file": prefix + "_CouplingsScoresCompared_inter.csv", + "ec_compared_inter_file": prefix + "_CouplingScoresCompared_inter.csv", # initialize output inter distancemap files "distmap_inter": prefix + "_distmap_inter", @@ -699,8 +713,16 @@ aux_prefix = insert_dir(prefix, "aux", rootname_subdir=False) create_prefix_folders(aux_prefix) + # store auxiliary files here (too much for average user) + first_aux_prefix = insert_dir(aux_prefix, "first_monomer", rootname_subdir=False) + create_prefix_folders(first_aux_prefix) + + # store auxiliary files here (too much for average user) + second_aux_prefix = insert_dir(aux_prefix, "second_monomer", rootname_subdir=False) + create_prefix_folders(second_aux_prefix) + # Step 1: Identify 3D structures for comparison - def _identify_monomer_structures(name_prefix, outcfg): + def _identify_monomer_structures(name_prefix, outcfg, aux_prefix): # create a dictionary with kwargs for just the current monomer # remove the "prefix" kwargs so that we can replace with the # aux prefix when calling _identify_structures @@ -709,6 +731,10 @@ k.replace(name_prefix + "_", "", 1): v for k, v in kwargs.items() if "prefix" not in k } + # this field needs to be set explicitly, else it gets overwritten by the concatenated file + monomer_kwargs["alignment_file"] = kwargs[name_prefix + "_alignment_file"] + monomer_kwargs["raw_focus_alignment_file"] = kwargs[name_prefix + "_raw_focus_alignment_file"] + # identify structures for that monomer sifts_map, sifts_map_full = _identify_structures( **monomer_kwargs, @@ -726,8 +752,8 @@ ) return outcfg, sifts_map - outcfg, first_sifts_map = _identify_monomer_structures("first", outcfg) - outcfg, second_sifts_map = _identify_monomer_structures("second", outcfg) + outcfg, first_sifts_map = _identify_monomer_structures("first", outcfg, first_aux_prefix) + outcfg, second_sifts_map = _identify_monomer_structures("second", outcfg, second_aux_prefix) # get the segment names from the kwargs segment_list = kwargs["segments"] diff --git a/evcouplings/compare/sifts.py b/evcouplings/compare/sifts.py index 541a29ed..6bf476d7 100644 --- a/evcouplings/compare/sifts.py +++ b/evcouplings/compare/sifts.py @@ -9,26 +9,35 @@ Authors: Thomas A. Hopf + Anna G.
Green (find_homologs) + Chan Kang (find_homologs) """ from os import path import json from collections import OrderedDict +from copy import deepcopy import pandas as pd import requests +import numpy as np from evcouplings.align.alignment import ( Alignment, read_fasta, parse_header ) -from evcouplings.align.protocol import jackhmmer_search +from evcouplings.align.protocol import ( + jackhmmer_search, hmmbuild_and_search, _make_hmmsearch_raw_fasta +) + from evcouplings.align.tools import read_hmmer_domtbl from evcouplings.compare.mapping import alignment_index_mapping, map_indices from evcouplings.utils.system import ( get_urllib, ResourceError, valid_file, tempdir ) -from evcouplings.utils.config import parse_config +from evcouplings.utils.config import ( + parse_config, check_required +) from evcouplings.utils.helpers import range_overlap UNIPROT_MAPPING_URL = "http://www.uniprot.org/mapping/" @@ -106,14 +115,14 @@ def fetch_uniprot_mapping(ids, from_="ACC", to="ACC", format="fasta"): return r.text -def find_homologs_jackhmmer(**kwargs): +def find_homologs(**kwargs): """ - Identify homologs using jackhmmer + Identify homologs using jackhmmer or hmmbuild/hmmsearch Parameters ---------- **kwargs - Passed into jackhmmer_search protocol + Passed into jackhmmer / hmmbuild_and_search protocol (see documentation for available options) Returns @@ -124,6 +133,7 @@ def find_homologs_jackhmmer(**kwargs): hits : pandas.DataFrame Tabular representation of hits """ + # load default configuration config = parse_config(JACKHMMER_CONFIG) @@ -137,16 +147,35 @@ def find_homologs_jackhmmer(**kwargs): if config["prefix"] is None: config["prefix"] = path.join(tempdir(), "compare") + check_required( + config, ["prefix"] + ) + + # run hmmsearch (possibly preceded by hmmbuild) + if kwargs["pdb_alignment_method"] == "hmmsearch": + # set up config to run hmmbuild_and_search on the unfiltered alignment file + updated_config = deepcopy(config) + updated_config["alignment_file"] = config["raw_focus_alignment_file"] + ar = hmmbuild_and_search(**updated_config) + + # For hmmbuild and search, we have to read the raw focus alignment file + # to guarantee that the query sequence is present + with open(ar["raw_focus_alignment_file"]) as a: + ali = Alignment.from_file(a, "fasta") + # run jackhmmer against sequence database - ar = jackhmmer_search(**config) + # at this point we have already checked to ensure + # that the input is either jackhmmer or hmmsearch + else: + ar = jackhmmer_search(**config) - with open(ar["raw_alignment_file"]) as a: - ali = Alignment.from_file(a, "stockholm") + with open(ar["raw_alignment_file"]) as a: + ali = Alignment.from_file(a, "stockholm") - # write alignment as FASTA file for easier checking by hand, - # if necessary - with open(config["prefix"] + "_raw.fasta", "w") as f: - ali.write(f) + # write alignment as FASTA file for easier checking by hand, + # if necessary + with open(config["prefix"] + "_raw.fasta", "w") as f: + ali.write(f) # read hmmer hittable and simplify hits = read_hmmer_domtbl(ar["hittable_file"]) @@ -160,6 +189,8 @@ def find_homologs_jackhmmer(**kwargs): "domain_i_Evalue": "e_value", "ali_from": "alignment_start", "ali_to": "alignment_end", + "hmm_from": "hmm_start", + "hmm_to": "hmm_end", } ) @@ -644,8 +675,9 @@ def _create_mapping(r): "method or constructor." 
) - ali, hits = find_homologs_jackhmmer( - sequence_database=self.sequence_file, **kwargs + ali, hits = find_homologs( + sequence_database=self.sequence_file, + **kwargs ) # merge with internal table to identify overlap of diff --git a/evcouplings/complex/alignment.py b/evcouplings/complex/alignment.py index 401016a5..d381d124 100644 --- a/evcouplings/complex/alignment.py +++ b/evcouplings/complex/alignment.py @@ -119,17 +119,17 @@ def _prepare_header(id1, id2): ) # concatenate strings - sequences_full = OrderedDict({ - header: np.concatenate([seq1, seq2]) for header, seq1, seq2 in sequences_to_write - }) + sequences_full = OrderedDict([ + (header, np.concatenate([seq1, seq2])) for header, seq1, seq2 in sequences_to_write + ]) - sequences_monomer_1 = OrderedDict({ - header: seq1 for header, seq1, seq2 in sequences_to_write - }) + sequences_monomer_1 = OrderedDict([ + (header, seq1) for header, seq1, seq2 in sequences_to_write + ]) - sequences_monomer_2 = OrderedDict({ - header: seq2 for header, seq1, seq2 in sequences_to_write - }) + sequences_monomer_2 = OrderedDict([ + (header, seq2) for header, seq1, seq2 in sequences_to_write + ]) full_ali = Alignment.from_dict(sequences_full) monomer_ali_1 = Alignment.from_dict(sequences_monomer_1) diff --git a/evcouplings/complex/protocol.py b/evcouplings/complex/protocol.py index 720cc1e8..3bcf7195 100644 --- a/evcouplings/complex/protocol.py +++ b/evcouplings/complex/protocol.py @@ -428,7 +428,7 @@ def _load_monomer_info(annotations_file, identities_file, print(most_similar_in_species.head()) if use_best_reciprocal: paralogs = find_paralogs( - target_sequence, most_similar_in_species, + target_sequence, annotation_table, similarities, identity_threshold ) diff --git a/evcouplings/complex/similarity.py b/evcouplings/complex/similarity.py index e34c7f6e..cc8cdcd7 100644 --- a/evcouplings/complex/similarity.py +++ b/evcouplings/complex/similarity.py @@ -32,8 +32,7 @@ def read_species_annotation_table(annotation_file): Returns ------- pd.DataFrame - the annotation dataframe with an additional column - called species. + an annotation dataframe with columns id, name, and species data = pd.read_csv(annotation_file, dtype=str) @@ -42,18 +41,18 @@ # Determine whether to extract based on the "OS" field # or the "Tax" field.
Generally, OS is for Uniprot - and "Tax" is for Uniref + annotation_column = SPECIES_ANNOTATION_COLUMNS[0] + for column in SPECIES_ANNOTATION_COLUMNS: - # if this column contains non-null values - if data[column].notnull().any(): + # if this column contains more non-null values + if sum(data[column].notnull()) > sum(data[annotation_column].notnull()): # use that column to extract data annotation_column = column - break # creates a new column called species with the species annotations data.loc[:, "species"] = data.loc[:, annotation_column] - return data + return data[["id", "name", "species"]] def most_similar_by_organism(similarities, id_to_organism): @@ -84,11 +83,12 @@ # find the most similar in every organism most_similar_in_species = data.sort_values(by="identity_to_query").groupby("species").last() most_similar_in_species["species"] = most_similar_in_species.index + most_similar_in_species = most_similar_in_species.reset_index(drop=True) return most_similar_in_species -def find_paralogs(target_id, annotation_data, identity_threshold): +def find_paralogs(target_id, id_to_organism, similarities, identity_threshold): """ Finds all the sequences in the alignment that originate from the same species as the target_id, if those sequences @@ -99,10 +99,10 @@ ---------- target_id : str Full identifier of the query sequence - annotation_data : pd.DataFrame - With columns id, species, identity_to_query. - The column species contains the species annotation to use. - The column identity_to_query contains the prcent identity to the target id. + id_to_organism : pd.DataFrame + The contents of annotation.csv (mapping of sequence id to species annotation) + similarities : pd.DataFrame + The contents of identities.csv (percent identity of each hit to the query) identity_threshold : float Sequences above this identity to the query are not considered paralogs @@ -113,13 +113,14 @@ Entries are paralogs found in the same genome as the query id """ - base_query = parse_header(target_id) + # output of parse_header is (ID, region_start, region_end) + base_query_id, _, _ = parse_header(target_id) # get all the rows that have an id that contains the # query id. This includes the focus sequence and its hit to # itself in the database. - - contains_annotation = [ True if base_query in x else False for x in annotation_data.id.str ] + annotation_data = similarities.merge(id_to_organism, on="id") + contains_annotation = [base_query_id in x for x in annotation_data.id] query_hits = annotation_data.loc[contains_annotation, :] # get the species annotation for the query sequence query_species = list(query_hits.species.dropna()) diff --git a/evcouplings/couplings/model.py b/evcouplings/couplings/model.py index 6769c038..c82a5bd2 100755 --- a/evcouplings/couplings/model.py +++ b/evcouplings/couplings/model.py @@ -20,6 +20,12 @@ HAMILTONIAN_COMPONENTS = [FULL, COUPLINGS, FIELDS] = [0, 1, 2] NUM_COMPONENTS = len(HAMILTONIAN_COMPONENTS) +# fallback is used when model does not have a defined regularization +# weight for fields (e.g. plmc_v1 format or mean-field couplings) for +# inferring the independent model; note that this model may not +# be directly comparable to the pairwise model (e.g.
if it was +# regularized with pseudocounts during mean-field inference) +LAMBDA_H_FALLBACK = 0.01 # Methods for fast calculations (moved outside of class for numba jit) @@ -328,6 +334,18 @@ def __read_plmc_v2(self, filename, precision): np.fromfile(f, precision, 5) ) + # if model was inferred using mean-field, these parameters are meaningless + # and set to -1 in the model file. To prohibit any calculations with these + # values, set to None + if self.lambda_h < 0: + self.lambda_h = None + + if self.lambda_J < 0: + self.lambda_J = None + + if self.lambda_group < 0: + self.lambda_group = None + # Read alphabet (make sure we get proper unicode rather than byte string) self.alphabet = np.fromfile( f, "S1", self.num_symbols @@ -865,13 +883,6 @@ def to_independent_model(self): Estimate parameters of a single-site model using Gaussian prior/L2 regularization. - Parameters - ---------- - N : float - Effective (reweighted) number of sequences - lambda_h : float - Strength of L2 regularization on h_i parameters - Returns ------- CouplingsModel @@ -898,11 +909,20 @@ def _gradient(x, *args): h_i = np.zeros((self.L, self.num_symbols)) + # temporary fix for cases where lambda_h is not defined; + # ultimately different inference should be used when this + # is because of mean-field model that has pseudocounts instead + # of l2 regularization (and therefore no lambda_h or lambda_J) + if self.lambda_h is not None: + lambda_h = self.lambda_h + else: + lambda_h = LAMBDA_H_FALLBACK + for i in range(self.L): - x0 = np.zeros((self.num_symbols)) + x0 = np.zeros(self.num_symbols) h_i[i] = fmin_bfgs( _log_post, x0, _gradient, - args=(self.f_i[i], self.lambda_h, self.N_eff), + args=(self.f_i[i], lambda_h, self.N_eff), disp=False ) diff --git a/evcouplings/couplings/protocol.py b/evcouplings/couplings/protocol.py index d77f39db..e453bd20 100644 --- a/evcouplings/couplings/protocol.py +++ b/evcouplings/couplings/protocol.py @@ -297,6 +297,9 @@ def standard(**kwargs): outcfg, ecs, segments = infer_plmc(**kwargs) model = CouplingsModel(outcfg["model_file"]) + # add mixture model probability + ecs = pairs.add_mixture_probability(ecs) + # following computations are mostly specific to monomer pipeline is_single_segment = segments is None or len(segments) == 1 outcfg = { @@ -447,6 +450,14 @@ def complex(**kwargs): chain=chain_mapping ) } + + # save just the inter protein ECs + ## TODO: eventually have this accomplished by _postprocess_inference + ## right now avoiding a second call with a different ec_filter + ecs = pd.read_csv(outcfg["ec_file"]) + outcfg["inter_ec_file"] = prefix + "_CouplingScores_inter.csv" + inter_ecs = ecs.query("segment_i != segment_j") + inter_ecs.to_csv(outcfg["inter_ec_file"], index=False) # dump output config to YAML file for debugging/logging write_config_file(prefix + ".couplings_complex.outcfg", outcfg) diff --git a/evcouplings/fold/protocol.py b/evcouplings/fold/protocol.py index 8ddac07f..3e17461c 100644 --- a/evcouplings/fold/protocol.py +++ b/evcouplings/fold/protocol.py @@ -167,7 +167,13 @@ def compare_models_maxcluster(experiments, predictions, norm_by_intersection=Tru # structures, so we check that here and fail otherwise) def _determine_pos(filename): structure = ClassicPDB.from_file(filename) - if len(structure.model_to_chains) != 1: + if len(structure.model_to_chains) == 0: + raise InvalidParameterError( + "Structure contains no model (is empty): " + + filename + + " - please verify that no problems occurred during structure mapping" + ) + elif len(structure.model_to_chains) > 1: 
raise InvalidParameterError( "Structure contains more than one model: " + filename diff --git a/evcouplings/utils/database.py b/evcouplings/utils/database.py index 4278c8be..bedc6993 100644 --- a/evcouplings/utils/database.py +++ b/evcouplings/utils/database.py @@ -50,10 +50,10 @@ class ComputeJob(Base): # be used to point to cloud locations, etc. location = Column(String(1024)) - # job group this job is associated with + # the job_hash this job is associated with # (foreign key in group key if this table # is present, e.g. in webserver). - group_id = Column(Integer) + group_id = Column(String(32)) # job status ("pending", "running", "finished", # "failed", "terminated") diff --git a/evcouplings/utils/update_database.py b/evcouplings/utils/update_database.py index ddc9cca5..c27b5acc 100644 --- a/evcouplings/utils/update_database.py +++ b/evcouplings/utils/update_database.py @@ -119,7 +119,7 @@ def run(**kwargs): if verbose: print("Updating SIFTS") - SIFTS_dir = kwargs.get("sifts", os.path.realpath(__file__)) + SIFTS_dir = os.path.abspath(kwargs.get("sifts", os.path.realpath(__file__))) # create directory if it does not exist # ignores if directory on the way already exist dir = Path(SIFTS_dir) @@ -137,7 +137,7 @@ def run(**kwargs): symlink_force(sifts_fasta, sifts_curr.format(extension="fasta")) # update uniref - db_path = kwargs.get("db", os.path.realpath(__file__)) + db_path = os.path.abspath(kwargs.get("db", os.path.realpath(__file__))) for db_type in ["uniprot", "uniref100", "uniref90" ]: if verbose: diff --git a/evcouplings/visualize/mutations.py b/evcouplings/visualize/mutations.py index 8f8719cf..feccb509 100644 --- a/evcouplings/visualize/mutations.py +++ b/evcouplings/visualize/mutations.py @@ -333,7 +333,7 @@ def matrix_base_bokeh(matrix, positions, substitutions, ) ) - TOOLS = "resize,hover" + TOOLS = "hover" height_factor = 12 width_factor = 10 diff --git a/setup.py b/setup.py index a6f50cf9..d3db73af 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ name='evcouplings', # Version: - version='0.0.1', + version='0.0.2', description='A Framework for evolutionary couplings analysis', long_description=readme, @@ -95,4 +95,4 @@ 'biopython', 'sqlalchemy', 'seaborn', ], -) \ No newline at end of file +)
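Reviewer note: the two new wrappers in evcouplings.align.tools compose the same way the hmmbuild_and_search protocol uses them. Below is a minimal sketch, assuming the HMMER binaries are on PATH; the file paths, the seqres database location, and the bitscore thresholds are illustrative assumptions, not values from this PR:

```python
from evcouplings.align import tools as at

# build a profile HMM from a monomer focus alignment;
# symfrac=0.0 keeps all alignment columns as match states,
# as in the hmmbuild_and_search protocol
build_result = at.run_hmmbuild(
    alignment_file="output/example_raw_focus_input.fasta",  # placeholder path
    prefix="output/example",
    symfrac=0.0,
    binary="hmmbuild",
)

# search the profile against a PDB seqres file (placeholder path)
# using bitscore inclusion thresholds on sequence and domain level
search_result = at.run_hmmsearch(
    hmmfile=build_result.hmmfile,
    database="databases/pdb_seqres.fa",
    prefix="output/example",
    use_bitscores=True,
    domain_threshold=50,
    seq_threshold=50,
    binary="hmmsearch",
)

# Stockholm alignment of hits and the domain table consumed by the compare stage
print(search_result.alignment, search_result.domtblout)
```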