Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Position Frequency Matrix Featurizer #2896

Merged
merged 27 commits into from Apr 22, 2022
Merged
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
726565a
hhblits/hhsearch wrapper, tests, tutorial, docs
tonydavis629 Feb 25, 2022
389a150
add missing example_db files
tonydavis629 Feb 25, 2022
a507a40
fixed test
tonydavis629 Mar 3, 2022
cddd32d
Merge branch 'master' into hhsuite
tonydavis629 Mar 3, 2022
df148c7
fix test
tonydavis629 Mar 4, 2022
302e463
Merge branch 'master' into hhsuite
tonydavis629 Mar 4, 2022
34d3285
fix doc test
tonydavis629 Mar 4, 2022
f998486
fix doctest
tonydavis629 Mar 8, 2022
29b0add
Merge branch 'master' into hhsuite
tonydavis629 Mar 8, 2022
fc03218
msa to dataset added
tonydavis629 Mar 9, 2022
90a58f3
seq_feat
tonydavis629 Mar 10, 2022
a1797ac
seq_feat
tonydavis629 Mar 11, 2022
2153025
Merge branch 'master' into seq_feat
tonydavis629 Mar 28, 2022
3152943
pfm featurizer
tonydavis629 Mar 28, 2022
d0b1be5
sequence featurizers group
tonydavis629 Mar 28, 2022
aa33db7
one_hot
tonydavis629 Apr 8, 2022
ab6a0e8
position freq matrix
tonydavis629 Apr 8, 2022
a3f0f17
pfm to ppm
tonydavis629 Apr 8, 2022
9896692
msa_to_dataset test complete
tonydavis629 Apr 11, 2022
4d9f505
move pfm_to_ppm to test_pfm
tonydavis629 Apr 11, 2022
feb8cd1
updated docs for PFM
tonydavis629 Apr 11, 2022
b4858fd
yapf, flake8, doctest
tonydavis629 Apr 11, 2022
52a6668
Merge branch 'master' into seq_feat
tonydavis629 Apr 11, 2022
c2277b9
remove bioconda from requirements channels
tonydavis629 Apr 15, 2022
d7c9d5b
lowecase bio
tonydavis629 Apr 18, 2022
d6cf7a3
move bio.seq import to function
tonydavis629 Apr 22, 2022
76c45df
fix base_classes.py formatting
tonydavis629 Apr 22, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions deepchem/feat/base_classes.py
Expand Up @@ -498,3 +498,4 @@ def featurize(self,
the datapoints.
"""
return np.asarray(datapoints)

2 changes: 2 additions & 0 deletions deepchem/feat/sequence_featurizers/__init__.py
@@ -0,0 +1,2 @@
# flake8: noqa
from deepchem.feat.sequence_featurizers.position_frequency_matrix_featurizer import PFMFeaturizer
@@ -0,0 +1,93 @@
import numpy as np
from deepchem.feat.molecule_featurizers import OneHotFeaturizer
from deepchem.feat.base_classes import Featurizer
from typing import List, Optional

CHARSET = [
'A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R',
'S', 'T', 'V', 'W', 'Y', 'X', 'Z', 'B', 'U', 'O'
]


class PFMFeaturizer(Featurizer):
"""
Encodes a list position frequency matrices for a given list of multiple sequence alignments

The default character set is 25 amino acids. If you want to use a different character set, such as nucleotides, simply pass in
a list of character strings in the featurizer constructor.

The max_length parameter is the maximum length of the sequences to be featurized. If you want to featurize longer sequences, modify the
max_length parameter in the featurizer constructor.

The final row in the position frequency matrix is the unknown set, if there are any characters which are not included in the charset.

Examples
--------
>>> from deepchem.feat.sequence_featurizers import PFMFeaturizer
>>> from deepchem.data import NumpyDataset
>>> msa = NumpyDataset(X=[['ABC','BCD'],['AAA','AAB']], ids=[['seq01','seq02'],['seq11','seq12']])
>>> seqs = msa.X
>>> featurizer = PFMFeaturizer()
>>> pfm = featurizer.featurize(seqs)
>>> pfm.shape
(2, 26, 100)

"""

def __init__(self,
charset: List[str] = CHARSET,
max_length: Optional[int] = 100):
"""Initialize featurizer.

Parameters
----------
charset: List[str] (default CHARSET)
A list of strings, where each string is length 1 and unique.
max_length: int, optional (default 25)
Maximum length of sequences to be featurized.
"""
if len(charset) != len(set(charset)):
raise ValueError("All values in charset must be unique.")
self.charset = charset
self.max_length = max_length
self.ohe = OneHotFeaturizer(charset=CHARSET, max_length=max_length)

def _featurize(self, datapoint):
"""Featurize a multisequence alignment into a position frequency matrix

Use dc.utils.sequence_utils.hhblits or dc.utils.sequence_utils.hhsearch to create a multiple sequence alignment from a fasta file.

Parameters
----------
datapoint: np.ndarray
MSA to featurize. A list of sequences which have been aligned and padded to the same length.

Returns
-------
pfm: np.ndarray
Position frequency matrix for the set of sequences with the rows corresponding to the unique characters and the columns corresponding to the position in the alignment.

"""

seq_one_hot = self.ohe.featurize(datapoint)

seq_one_hot_array = np.transpose(
np.array(seq_one_hot), (0, 2, 1)
) # swap rows and columns to make rows the characters, columns the positions

pfm = np.sum(seq_one_hot_array, axis=0)

return pfm


def PFM_to_PPM(pfm):
"""
Calculate position probability matrix from a position frequency matrix
"""
ppm = pfm.copy()
for col in range(ppm.shape[1]):
total_count = np.sum(ppm[:, col])
if total_count > 0:
# Calculate frequency
ppm[:, col] = ppm[:, col] / total_count
return ppm
34 changes: 34 additions & 0 deletions deepchem/feat/tests/test_position_frequency_matrix_featurizer.py
@@ -0,0 +1,34 @@
import unittest
import numpy as np
from deepchem.feat.sequence_featurizers.position_frequency_matrix_featurizer import PFMFeaturizer, CHARSET, PFM_to_PPM


class TestPFMFeaturizer(unittest.TestCase):
"""
Test PFMFeaturizer.
"""

def setUp(self):
"""
Set up test.
"""
self.msa = np.array([['ABC', 'BCD'], ['AAA', 'AAB']])
self.featurizer = PFMFeaturizer()
self.max_length = 100

def test_PFMFeaturizer_arbitrary(self):
"""
Test PFM featurizer for simple MSA.
"""
pfm = self.featurizer.featurize(self.msa)
assert pfm.shape == (2, len(CHARSET) + 1, self.max_length)
assert pfm[0][0][0] == 1

def test_PFM_to_PPM(self):
"""
Test PFM_to_PPM.
"""
pfm = self.featurizer.featurize(self.msa)
ppm = PFM_to_PPM(pfm[0])
assert ppm.shape == (len(CHARSET) + 1, self.max_length)
assert ppm[0][0] == .5
33 changes: 31 additions & 2 deletions deepchem/utils/sequence_utils.py
@@ -1,6 +1,7 @@
from logging import raiseExceptions
import os
import subprocess
from bio import SeqIO
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should be in a try except block or SeqIO should be imported locally in a function where it is used



def system_call(command):
Expand Down Expand Up @@ -51,7 +52,7 @@ def hhblits(dataset_path,
Examples
--------
>>> from deepchem.utils.sequence_utils import hhblits
>>> hhblits('test/data/example.fasta', database='example_db', data_dir='test/data/', evalue=0.001, num_iterations=2, num_threads=4)
>>> msa_path = hhblits('test/data/example.fasta', database='example_db', data_dir='test/data/', evalue=0.001, num_iterations=2, num_threads=4)

"""

Expand Down Expand Up @@ -89,6 +90,10 @@ def hhblits(dataset_path,

system_call(command)

msa_path = os.path.join(save_dir, 'results.a3m')

return msa_path


def hhsearch(dataset_path,
database=None,
Expand All @@ -110,7 +115,7 @@ def hhsearch(dataset_path,
Examples
--------
>>> from deepchem.utils.sequence_utils import hhsearch
>>> hhsearch('test/data/example.fasta', database='example_db', data_dir='test/data/', evalue=0.001, num_iterations=2, num_threads=4)
>>> msa_path = hhsearch('test/data/example.fasta', database='example_db', data_dir='test/data/', evalue=0.001, num_iterations=2, num_threads=4)

Parameters
----------
Expand Down Expand Up @@ -167,3 +172,27 @@ def hhsearch(dataset_path,
raiseExceptions('Unsupported file type')

system_call(command)

msa_path = os.path.join(save_dir, 'results.a3m')

return msa_path


def MSA_to_dataset(msa_path):
"""
Convert a multiple sequence alignment to a NumpyDataset object.
"""

from deepchem.data.datasets import NumpyDataset # NumpyDataset depends on utils, so imported here to prevent circular import

with open(msa_path, 'r') as f:
ids = []
sequences = []
for record in SeqIO.parse(f, 'fasta'):
ids.append(record.id)
seq = []
for res in record:
seq.append(res)
sequences.append(seq)
dataset = NumpyDataset(X=sequences, ids=ids)
return dataset
15 changes: 15 additions & 0 deletions deepchem/utils/test/test_sequence_utils.py
Expand Up @@ -50,3 +50,18 @@ def test_hhblits(self):
assert resultsline[0:5] == '>seq0'
os.remove(results_file)
os.remove(hhr_file)

def test_MSA_to_dataset(self):
seq_utils.hhsearch(self.dataset_file,
database=self.database_name,
data_dir=self.data_dir)
results_file = os.path.join(self.data_dir, 'results.a3m')
msa_path = results_file
dataset = seq_utils.MSA_to_dataset(msa_path)
print(dataset.ids[0])
print(dataset.X)
assert dataset.ids[0] == 'seq0'
assert dataset.ids[1] == 'seq1'
bool_arr = dataset.X[0] == ['X', 'Y']
assert bool_arr.all()
os.remove(results_file)
14 changes: 14 additions & 0 deletions docs/source/api_reference/featurizers.rst
Expand Up @@ -400,6 +400,19 @@ References:
.. _`RXN Mapper: Unsupervised Attention-Guided Atom-Mapping`: https://chemrxiv.org/articles/Unsupervised_Attention-Guided_Atom-Mapping/12298559
.. _`Molecular Transformer: Unsupervised Attention-Guided Atom-Mapping`: https://pubs.acs.org/doi/10.1021/acscentsci.9b00576

Sequence Featurizers
---------------------

PFMFeaturizer
^^^^^^^^^^^^^

The :code:`dc.feat.PFMFeaturizer` module implements a featurizer for position frequency matrices.
This takes in a list of multisequence alignments and returns a list of position frequency matrices.

.. autoclass:: deepchem.feat.sequence_featurizers.PFMFeaturizer
:members:


Other Featurizers
-----------------

Expand Down Expand Up @@ -494,3 +507,4 @@ This featurizer can take a pair of PDB or SDF files which contain ligand molecul

.. autoclass:: deepchem.feat.ComplexFeaturizer
:members:

2 changes: 2 additions & 0 deletions docs/source/api_reference/utils.rst
Expand Up @@ -123,6 +123,8 @@ Genomic Utilities

.. autofunction:: deepchem.utils.sequence_utils.hhsearch

.. autofunction:: deepchem.utils.sequence_utils.MSA_to_dataset


Geometry Utilities
------------------
Expand Down