Skip to content

Commit

Permalink
Merge pull request #2896 from tonydavis629/seq_feat
Browse files Browse the repository at this point in the history
Position Frequency Matrix Featurizer
  • Loading branch information
rbharath committed Apr 22, 2022
2 parents c606d52 + 76c45df commit 1e522e8
Show file tree
Hide file tree
Showing 7 changed files with 191 additions and 2 deletions.
2 changes: 2 additions & 0 deletions deepchem/feat/sequence_featurizers/__init__.py
@@ -0,0 +1,2 @@
# flake8: noqa
from deepchem.feat.sequence_featurizers.position_frequency_matrix_featurizer import PFMFeaturizer
@@ -0,0 +1,93 @@
import numpy as np
from deepchem.feat.molecule_featurizers import OneHotFeaturizer
from deepchem.feat.base_classes import Featurizer
from typing import List, Optional

CHARSET = [
'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R',
'S', 'T', 'V', 'W', 'Y', 'X', 'Z', 'B', 'U', 'O'
]


class PFMFeaturizer(Featurizer):
"""
Encodes a list position frequency matrices for a given list of multiple sequence alignments
The default character set is 25 amino acids. If you want to use a different character set, such as nucleotides, simply pass in
a list of character strings in the featurizer constructor.
The max_length parameter is the maximum length of the sequences to be featurized. If you want to featurize longer sequences, modify the
max_length parameter in the featurizer constructor.
The final row in the position frequency matrix is the unknown set, if there are any characters which are not included in the charset.
Examples
--------
>>> from deepchem.feat.sequence_featurizers import PFMFeaturizer
>>> from deepchem.data import NumpyDataset
>>> msa = NumpyDataset(X=[['ABC','BCD'],['AAA','AAB']], ids=[['seq01','seq02'],['seq11','seq12']])
>>> seqs = msa.X
>>> featurizer = PFMFeaturizer()
>>> pfm = featurizer.featurize(seqs)
>>> pfm.shape
(2, 26, 100)
"""

def __init__(self,
charset: List[str] = CHARSET,
max_length: Optional[int] = 100):
"""Initialize featurizer.
Parameters
----------
charset: List[str] (default CHARSET)
A list of strings, where each string is length 1 and unique.
max_length: int, optional (default 25)
Maximum length of sequences to be featurized.
"""
if len(charset) != len(set(charset)):
raise ValueError("All values in charset must be unique.")
self.charset = charset
self.max_length = max_length
self.ohe = OneHotFeaturizer(charset=CHARSET, max_length=max_length)

def _featurize(self, datapoint):
"""Featurize a multisequence alignment into a position frequency matrix
Use dc.utils.sequence_utils.hhblits or dc.utils.sequence_utils.hhsearch to create a multiple sequence alignment from a fasta file.
Parameters
----------
datapoint: np.ndarray
MSA to featurize. A list of sequences which have been aligned and padded to the same length.
Returns
-------
pfm: np.ndarray
Position frequency matrix for the set of sequences with the rows corresponding to the unique characters and the columns corresponding to the position in the alignment.
"""

seq_one_hot = self.ohe.featurize(datapoint)

seq_one_hot_array = np.transpose(
np.array(seq_one_hot), (0, 2, 1)
) # swap rows and columns to make rows the characters, columns the positions

pfm = np.sum(seq_one_hot_array, axis=0)

return pfm


def PFM_to_PPM(pfm):
"""
Calculate position probability matrix from a position frequency matrix
"""
ppm = pfm.copy()
for col in range(ppm.shape[1]):
total_count = np.sum(ppm[:, col])
if total_count > 0:
# Calculate frequency
ppm[:, col] = ppm[:, col] / total_count
return ppm
34 changes: 34 additions & 0 deletions deepchem/feat/tests/test_position_frequency_matrix_featurizer.py
@@ -0,0 +1,34 @@
import unittest
import numpy as np
from deepchem.feat.sequence_featurizers.position_frequency_matrix_featurizer import PFMFeaturizer, CHARSET, PFM_to_PPM


class TestPFMFeaturizer(unittest.TestCase):
"""
Test PFMFeaturizer.
"""

def setUp(self):
"""
Set up test.
"""
self.msa = np.array([['ABC', 'BCD'], ['AAA', 'AAB']])
self.featurizer = PFMFeaturizer()
self.max_length = 100

def test_PFMFeaturizer_arbitrary(self):
"""
Test PFM featurizer for simple MSA.
"""
pfm = self.featurizer.featurize(self.msa)
assert pfm.shape == (2, len(CHARSET) + 1, self.max_length)
assert pfm[0][0][0] == 1

def test_PFM_to_PPM(self):
"""
Test PFM_to_PPM.
"""
pfm = self.featurizer.featurize(self.msa)
ppm = PFM_to_PPM(pfm[0])
assert ppm.shape == (len(CHARSET) + 1, self.max_length)
assert ppm[0][0] == .5
33 changes: 31 additions & 2 deletions deepchem/utils/sequence_utils.py
Expand Up @@ -51,7 +51,7 @@ def hhblits(dataset_path,
Examples
--------
>>> from deepchem.utils.sequence_utils import hhblits
>>> hhblits('test/data/example.fasta', database='example_db', data_dir='test/data/', evalue=0.001, num_iterations=2, num_threads=4)
>>> msa_path = hhblits('test/data/example.fasta', database='example_db', data_dir='test/data/', evalue=0.001, num_iterations=2, num_threads=4)
"""

Expand Down Expand Up @@ -89,6 +89,10 @@ def hhblits(dataset_path,

system_call(command)

msa_path = os.path.join(save_dir, 'results.a3m')

return msa_path


def hhsearch(dataset_path,
database=None,
Expand All @@ -110,7 +114,7 @@ def hhsearch(dataset_path,
Examples
--------
>>> from deepchem.utils.sequence_utils import hhsearch
>>> hhsearch('test/data/example.fasta', database='example_db', data_dir='test/data/', evalue=0.001, num_iterations=2, num_threads=4)
>>> msa_path = hhsearch('test/data/example.fasta', database='example_db', data_dir='test/data/', evalue=0.001, num_iterations=2, num_threads=4)
Parameters
----------
Expand Down Expand Up @@ -167,3 +171,28 @@ def hhsearch(dataset_path,
raiseExceptions('Unsupported file type')

system_call(command)

msa_path = os.path.join(save_dir, 'results.a3m')

return msa_path


def MSA_to_dataset(msa_path):
"""
Convert a multiple sequence alignment to a NumpyDataset object.
"""

from deepchem.data.datasets import NumpyDataset # NumpyDataset depends on utils, so imported here to prevent circular import
from Bio import SeqIO

with open(msa_path, 'r') as f:
ids = []
sequences = []
for record in SeqIO.parse(f, 'fasta'):
ids.append(record.id)
seq = []
for res in record:
seq.append(res)
sequences.append(seq)
dataset = NumpyDataset(X=sequences, ids=ids)
return dataset
15 changes: 15 additions & 0 deletions deepchem/utils/test/test_sequence_utils.py
Expand Up @@ -50,3 +50,18 @@ def test_hhblits(self):
assert resultsline[0:5] == '>seq0'
os.remove(results_file)
os.remove(hhr_file)

def test_MSA_to_dataset(self):
seq_utils.hhsearch(self.dataset_file,
database=self.database_name,
data_dir=self.data_dir)
results_file = os.path.join(self.data_dir, 'results.a3m')
msa_path = results_file
dataset = seq_utils.MSA_to_dataset(msa_path)
print(dataset.ids[0])
print(dataset.X)
assert dataset.ids[0] == 'seq0'
assert dataset.ids[1] == 'seq1'
bool_arr = dataset.X[0] == ['X', 'Y']
assert bool_arr.all()
os.remove(results_file)
14 changes: 14 additions & 0 deletions docs/source/api_reference/featurizers.rst
Expand Up @@ -400,6 +400,19 @@ References:
.. _`RXN Mapper: Unsupervised Attention-Guided Atom-Mapping`: https://chemrxiv.org/articles/Unsupervised_Attention-Guided_Atom-Mapping/12298559
.. _`Molecular Transformer: Unsupervised Attention-Guided Atom-Mapping`: https://pubs.acs.org/doi/10.1021/acscentsci.9b00576

Sequence Featurizers
---------------------

PFMFeaturizer
^^^^^^^^^^^^^

The :code:`dc.feat.PFMFeaturizer` module implements a featurizer for position frequency matrices.
This takes in a list of multisequence alignments and returns a list of position frequency matrices.

.. autoclass:: deepchem.feat.sequence_featurizers.PFMFeaturizer
:members:


Other Featurizers
-----------------

Expand Down Expand Up @@ -494,3 +507,4 @@ This featurizer can take a pair of PDB or SDF files which contain ligand molecul

.. autoclass:: deepchem.feat.ComplexFeaturizer
:members:

2 changes: 2 additions & 0 deletions docs/source/api_reference/utils.rst
Expand Up @@ -123,6 +123,8 @@ Genomic Utilities

.. autofunction:: deepchem.utils.sequence_utils.hhsearch

.. autofunction:: deepchem.utils.sequence_utils.MSA_to_dataset


Geometry Utilities
------------------
Expand Down

0 comments on commit 1e522e8

Please sign in to comment.