Merge pull request #2896 from tonydavis629/seq_feat

Position Frequency Matrix Featurizer
deepchem · Apr 22, 2022 · 1e522e8 · 1e522e8
2 parents c606d52 + 76c45df
commit 1e522e8
Show file tree

Hide file tree

Showing 7 changed files with 191 additions and 2 deletions.
diff --git a/deepchem/feat/sequence_featurizers/__init__.py b/deepchem/feat/sequence_featurizers/__init__.py
@@ -0,0 +1,2 @@
+# flake8: noqa
+from deepchem.feat.sequence_featurizers.position_frequency_matrix_featurizer import PFMFeaturizer
diff --git a/deepchem/feat/sequence_featurizers/position_frequency_matrix_featurizer.py b/deepchem/feat/sequence_featurizers/position_frequency_matrix_featurizer.py
@@ -0,0 +1,93 @@
+import numpy as np
+from deepchem.feat.molecule_featurizers import OneHotFeaturizer
+from deepchem.feat.base_classes import Featurizer
+from typing import List, Optional
+
+CHARSET = [
+    'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R',
+    'S', 'T', 'V', 'W', 'Y', 'X', 'Z', 'B', 'U', 'O'
+]
+
+
+class PFMFeaturizer(Featurizer):
+  """
+    Encodes a list position frequency matrices for a given list of multiple sequence alignments
+
+    The default character set is 25 amino acids. If you want to use a different character set, such as nucleotides, simply pass in
+    a list of character strings in the featurizer constructor.
+
+    The max_length parameter is the maximum length of the sequences to be featurized. If you want to featurize longer sequences, modify the
+    max_length parameter in the featurizer constructor.
+
+    The final row in the position frequency matrix is the unknown set, if there are any characters which are not included in the charset.
+
+    Examples
+    --------
+    >>> from deepchem.feat.sequence_featurizers import PFMFeaturizer
+    >>> from deepchem.data import NumpyDataset
+    >>> msa = NumpyDataset(X=[['ABC','BCD'],['AAA','AAB']], ids=[['seq01','seq02'],['seq11','seq12']])
+    >>> seqs = msa.X
+    >>> featurizer = PFMFeaturizer()
+    >>> pfm = featurizer.featurize(seqs)
+    >>> pfm.shape
+    (2, 26, 100)
+
+    """
+
+  def __init__(self,
+               charset: List[str] = CHARSET,
+               max_length: Optional[int] = 100):
+    """Initialize featurizer.
+
+        Parameters
+        ----------
+        charset: List[str] (default CHARSET)
+            A list of strings, where each string is length 1 and unique.
+        max_length: int, optional (default 25)
+            Maximum length of sequences to be featurized.
+        """
+    if len(charset) != len(set(charset)):
+      raise ValueError("All values in charset must be unique.")
+    self.charset = charset
+    self.max_length = max_length
+    self.ohe = OneHotFeaturizer(charset=CHARSET, max_length=max_length)
+
+  def _featurize(self, datapoint):
+    """Featurize a multisequence alignment into a position frequency matrix
+
+        Use dc.utils.sequence_utils.hhblits or dc.utils.sequence_utils.hhsearch to create a multiple sequence alignment from a fasta file.
+
+        Parameters
+        ----------
+        datapoint: np.ndarray
+            MSA to featurize. A list of sequences which have been aligned and padded to the same length.
+
+        Returns
+        -------
+        pfm: np.ndarray
+            Position frequency matrix for the set of sequences with the rows corresponding to the unique characters and the columns corresponding to the position in the alignment.
+
+        """
+
+    seq_one_hot = self.ohe.featurize(datapoint)
+
+    seq_one_hot_array = np.transpose(
+        np.array(seq_one_hot), (0, 2, 1)
+    )  # swap rows and columns to make rows the characters, columns the positions
+
+    pfm = np.sum(seq_one_hot_array, axis=0)
+
+    return pfm
+
+
+def PFM_to_PPM(pfm):
+  """
+    Calculate position probability matrix from a position frequency matrix
+    """
+  ppm = pfm.copy()
+  for col in range(ppm.shape[1]):
+    total_count = np.sum(ppm[:, col])
+    if total_count > 0:
+      # Calculate frequency
+      ppm[:, col] = ppm[:, col] / total_count
+  return ppm
diff --git a/deepchem/feat/tests/test_position_frequency_matrix_featurizer.py b/deepchem/feat/tests/test_position_frequency_matrix_featurizer.py
@@ -0,0 +1,34 @@
+import unittest
+import numpy as np
+from deepchem.feat.sequence_featurizers.position_frequency_matrix_featurizer import PFMFeaturizer, CHARSET, PFM_to_PPM
+
+
+class TestPFMFeaturizer(unittest.TestCase):
+  """
+  Test PFMFeaturizer.
+  """
+
+  def setUp(self):
+    """
+    Set up test.
+    """
+    self.msa = np.array([['ABC', 'BCD'], ['AAA', 'AAB']])
+    self.featurizer = PFMFeaturizer()
+    self.max_length = 100
+
+  def test_PFMFeaturizer_arbitrary(self):
+    """
+    Test PFM featurizer for simple MSA.
+    """
+    pfm = self.featurizer.featurize(self.msa)
+    assert pfm.shape == (2, len(CHARSET) + 1, self.max_length)
+    assert pfm[0][0][0] == 1
+
+  def test_PFM_to_PPM(self):
+    """
+    Test PFM_to_PPM.
+    """
+    pfm = self.featurizer.featurize(self.msa)
+    ppm = PFM_to_PPM(pfm[0])
+    assert ppm.shape == (len(CHARSET) + 1, self.max_length)
+    assert ppm[0][0] == .5
diff --git a/deepchem/utils/sequence_utils.py b/deepchem/utils/sequence_utils.py
@@ -51,7 +51,7 @@ def hhblits(dataset_path,
   Examples
   --------
   >>> from deepchem.utils.sequence_utils import hhblits
-  >>> hhblits('test/data/example.fasta', database='example_db', data_dir='test/data/', evalue=0.001, num_iterations=2, num_threads=4)
+  >>> msa_path = hhblits('test/data/example.fasta', database='example_db', data_dir='test/data/', evalue=0.001, num_iterations=2, num_threads=4)
 
   """
 
@@ -89,6 +89,10 @@ def hhblits(dataset_path,
 
   system_call(command)
 
+  msa_path = os.path.join(save_dir, 'results.a3m')
+
+  return msa_path
+
 
 def hhsearch(dataset_path,
              database=None,
@@ -110,7 +114,7 @@ def hhsearch(dataset_path,
   Examples
   --------
   >>> from deepchem.utils.sequence_utils import hhsearch
-  >>> hhsearch('test/data/example.fasta', database='example_db', data_dir='test/data/', evalue=0.001, num_iterations=2, num_threads=4)
+  >>> msa_path = hhsearch('test/data/example.fasta', database='example_db', data_dir='test/data/', evalue=0.001, num_iterations=2, num_threads=4)
 
   Parameters
   ----------
@@ -167,3 +171,28 @@ def hhsearch(dataset_path,
     raiseExceptions('Unsupported file type')
 
   system_call(command)
+
+  msa_path = os.path.join(save_dir, 'results.a3m')
+
+  return msa_path
+
+
+def MSA_to_dataset(msa_path):
+  """
+  Convert a multiple sequence alignment to a NumpyDataset object.
+  """
+
+  from deepchem.data.datasets import NumpyDataset  # NumpyDataset depends on utils, so imported here to prevent circular import
+  from Bio import SeqIO
+
+  with open(msa_path, 'r') as f:
+    ids = []
+    sequences = []
+    for record in SeqIO.parse(f, 'fasta'):
+      ids.append(record.id)
+      seq = []
+      for res in record:
+        seq.append(res)
+      sequences.append(seq)
+    dataset = NumpyDataset(X=sequences, ids=ids)
+    return dataset
diff --git a/deepchem/utils/test/test_sequence_utils.py b/deepchem/utils/test/test_sequence_utils.py
@@ -50,3 +50,18 @@ def test_hhblits(self):
     assert resultsline[0:5] == '>seq0'
     os.remove(results_file)
     os.remove(hhr_file)
+
+  def test_MSA_to_dataset(self):
+    seq_utils.hhsearch(self.dataset_file,
+                       database=self.database_name,
+                       data_dir=self.data_dir)
+    results_file = os.path.join(self.data_dir, 'results.a3m')
+    msa_path = results_file
+    dataset = seq_utils.MSA_to_dataset(msa_path)
+    print(dataset.ids[0])
+    print(dataset.X)
+    assert dataset.ids[0] == 'seq0'
+    assert dataset.ids[1] == 'seq1'
+    bool_arr = dataset.X[0] == ['X', 'Y']
+    assert bool_arr.all()
+    os.remove(results_file)
diff --git a/docs/source/api_reference/featurizers.rst b/docs/source/api_reference/featurizers.rst
@@ -400,6 +400,19 @@ References:
 .. _`RXN Mapper: Unsupervised Attention-Guided Atom-Mapping`: https://chemrxiv.org/articles/Unsupervised_Attention-Guided_Atom-Mapping/12298559
 .. _`Molecular Transformer: Unsupervised Attention-Guided Atom-Mapping`: https://pubs.acs.org/doi/10.1021/acscentsci.9b00576
 
+Sequence Featurizers
+---------------------
+
+PFMFeaturizer
+^^^^^^^^^^^^^
+
+The :code:`dc.feat.PFMFeaturizer` module implements a featurizer for position frequency matrices. 
+This takes in a list of multisequence alignments and returns a list of position frequency matrices.
+
+.. autoclass:: deepchem.feat.sequence_featurizers.PFMFeaturizer
+  :members:
+
+
 Other Featurizers
 -----------------
 
@@ -494,3 +507,4 @@ This featurizer can take a pair of PDB or SDF files which contain ligand molecul
 
 .. autoclass:: deepchem.feat.ComplexFeaturizer
   :members:
+
diff --git a/docs/source/api_reference/utils.rst b/docs/source/api_reference/utils.rst
@@ -123,6 +123,8 @@ Genomic Utilities
 
 .. autofunction:: deepchem.utils.sequence_utils.hhsearch
 
+.. autofunction:: deepchem.utils.sequence_utils.MSA_to_dataset
+
 
 Geometry Utilities
 ------------------