-
Notifications
You must be signed in to change notification settings - Fork 2.2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Showing
30 changed files
with
894 additions
and
498 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
9b18d6a
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
#@title 4. Search against genetic databases
#@markdown Once this cell has been executed, you will see
#@markdown statistics about the multiple sequence alignment
#@markdown (MSA) that will be used by AlphaFold. In particular,
#@markdown you’ll see how well each residue is covered by similar
#@markdown sequences in the MSA.

# Track cell execution to ensure cells are run in the correct order.
notebook_utils.check_cell_execution_order(executed_cells, 4)
# --- Python imports ---
import collections
import copy
from concurrent import futures
import json
import random
import shutil
from urllib import request
from google.colab import files
from matplotlib import gridspec
import matplotlib.pyplot as plt
import numpy as np
import py3Dmol
from alphafold.model import model
from alphafold.model import config
from alphafold.model import data
from alphafold.data import feature_processing
from alphafold.data import msa_pairing
from alphafold.data import pipeline
from alphafold.data import pipeline_multimer
from alphafold.data.tools import jackhmmer
from alphafold.common import protein
from alphafold.relax import relax
from alphafold.relax import utils
from IPython import display
from ipywidgets import GridspecLayout
from ipywidgets import Output
# Color bands for visualizing pLDDT: (lower bound, upper bound, hex color).
PLDDT_BANDS = [(0, 50, '#FF7D45'),
               (50, 70, '#FFDB13'),
               (70, 90, '#65CBF3'),
               (90, 100, '#0053D6')]
# --- Find the closest source ---
# Probe each mirror of the genetic databases concurrently and keep whichever
# answers first; that mirror is then used for all subsequent downloads.
test_url_pattern = 'https://storage.googleapis.com/alphafold-colab{:s}/latest/uniref90_2022_01.fasta.1'
ex = futures.ThreadPoolExecutor(3)

def fetch(source):
  """Downloads the probe file from the given mirror suffix; returns the suffix."""
  request.urlretrieve(test_url_pattern.format(source))
  return source

fs = [ex.submit(fetch, source) for source in ['', '-europe', '-asia']]
source = None
for f in futures.as_completed(fs):
  # First mirror to finish wins; cancel/ignore the rest.
  source = f.result()
  ex.shutdown()
  break

JACKHMMER_BINARY_PATH = '/usr/bin/jackhmmer'
DB_ROOT_PATH = f'https://storage.googleapis.com/alphafold-colab{source}/latest/'
# The z_value is the number of sequences in a database.
MSA_DATABASES = [
    {'db_name': 'uniref90',
     'db_path': f'{DB_ROOT_PATH}uniref90_2022_01.fasta',
     'num_streamed_chunks': 62,
     'z_value': 144_113_457},
    {'db_name': 'smallbfd',
     'db_path': f'{DB_ROOT_PATH}bfd-first_non_consensus_sequences.fasta',
     'num_streamed_chunks': 17,
     'z_value': 65_984_053},
    {'db_name': 'mgnify',
     'db_path': f'{DB_ROOT_PATH}mgy_clusters_2022_05.fasta',
     'num_streamed_chunks': 120,
     'z_value': 623_796_864},
]

# Search UniProt and construct the all_seq features only for heteromers, not homomers.
if model_type_to_use == ModelType.MULTIMER and len(set(sequences)) > 1:
  MSA_DATABASES.extend([
      # Swiss-Prot and TrEMBL are concatenated together as UniProt.
      {'db_name': 'uniprot',
       'db_path': f'{DB_ROOT_PATH}uniprot_2021_04.fasta',
       'num_streamed_chunks': 101,
       'z_value': 225_013_025 + 565_928},
  ])

# Total chunk count drives the progress bar in get_msa below.
TOTAL_JACKHMMER_CHUNKS = sum(cfg['num_streamed_chunks'] for cfg in MSA_DATABASES)

# Per-database cap on the number of hits kept from the merged MSA.
MAX_HITS = {
    'uniref90': 10_000,
    'smallbfd': 5_000,
    'mgnify': 501,
    'uniprot': 50_000,
}
def get_msa(sequences):
  """Searches for MSA for given sequences using chunked Jackhmmer search.

  Args:
    sequences: A list of sequences to search against all databases.

  Returns:
    A dictionary mapping unique sequences to dictionaries mapping each database
    to a list of results, one for each chunk of the database.
  """
  sequence_to_fasta_path = {}
  # Deduplicate to not do redundant work for multiple copies of the same chain
  # in homomers.
  for sequence_index, sequence in enumerate(sorted(set(sequences)), 1):
    fasta_path = f'target_{sequence_index:02d}.fasta'
    with open(fasta_path, 'wt') as f:
      f.write(f'>query\n{sequence}')
    sequence_to_fasta_path[sequence] = fasta_path

  # Run the search against chunks of genetic databases (since the genetic
  # databases don't fit in Colab disk).
  raw_msa_results = {sequence: {} for sequence in sequence_to_fasta_path.keys()}
  print('\nGetting MSA for all sequences')
  with tqdm.notebook.tqdm(total=TOTAL_JACKHMMER_CHUNKS, bar_format=TQDM_BAR_FORMAT) as pbar:
    def jackhmmer_chunk_callback(i):
      # Advance the progress bar once per streamed database chunk.
      pbar.update(n=1)
    # NOTE(review): the chunked Jackhmmer invocation that populates
    # raw_msa_results appears to be missing from this excerpt — as written,
    # every per-sequence dict is returned empty. Restore the per-database
    # search loop (using JACKHMMER_BINARY_PATH and
    # streaming_callback=jackhmmer_chunk_callback) before running.

  return raw_msa_results
features_for_chain = {}
raw_msa_results_for_sequence = get_msa(sequences)
for sequence_index, sequence in enumerate(sequences, start=1):
  # Deep-copy so identical chains (homomers) don't share mutable results.
  raw_msa_results = copy.deepcopy(raw_msa_results_for_sequence[sequence])

  # Extract the MSAs from the Stockholm files.
  # NB: deduplication happens later in pipeline.make_msa_features.
  single_chain_msas = []
  uniprot_msa = None
  for db_name, db_results in raw_msa_results.items():
    merged_msa = notebook_utils.merge_chunked_msa(
        results=db_results, max_hits=MAX_HITS.get(db_name))
    if merged_msa.sequences and db_name != 'uniprot':
      single_chain_msas.append(merged_msa)
      msa_size = len(set(merged_msa.sequences))
      print(f'{msa_size} unique sequences found in {db_name} for sequence {sequence_index}')
    elif merged_msa.sequences and db_name == 'uniprot':
      # UniProt hits are kept separate: they feed the paired (all_seq)
      # features used only by the multimer model below.
      uniprot_msa = merged_msa

  notebook_utils.show_msa_info(single_chain_msas=single_chain_msas, sequence_index=sequence_index)

  # Turn the raw data into model features.
  feature_dict = {}
  feature_dict.update(pipeline.make_sequence_features(
      sequence=sequence, description='query', num_res=len(sequence)))
  feature_dict.update(pipeline.make_msa_features(msas=single_chain_msas))
  # We don't use templates in AlphaFold Colab notebook, add only empty placeholder features.
  feature_dict.update(notebook_utils.empty_placeholder_template_features(
      num_templates=0, num_res=len(sequence)))

  # Construct the all_seq features only for heteromers, not homomers.
  if model_type_to_use == ModelType.MULTIMER and len(set(sequences)) > 1:
    valid_feats = msa_pairing.MSA_FEATURES + (
        'msa_species_identifiers',
    )
    all_seq_features = {
        f'{k}_all_seq': v for k, v in pipeline.make_msa_features([uniprot_msa]).items()
        if k in valid_feats}
    feature_dict.update(all_seq_features)

  features_for_chain[protein.PDB_CHAIN_IDS[sequence_index - 1]] = feature_dict

# Do further feature post-processing depending on the model type.
if model_type_to_use == ModelType.MONOMER:
  np_example = features_for_chain[protein.PDB_CHAIN_IDS[0]]
elif model_type_to_use == ModelType.MULTIMER:
  all_chain_features = {}
  for chain_id, chain_features in features_for_chain.items():
    all_chain_features[chain_id] = pipeline_multimer.convert_monomer_features(
        chain_features, chain_id)
  all_chain_features = pipeline_multimer.add_assembly_features(all_chain_features)
  np_example = feature_processing.pair_and_merge(
      all_chain_features=all_chain_features)
  # Pad MSA to avoid zero-sized extra_msa.
  np_example = pipeline_multimer.pad_msa(np_example, min_num_seq=512)

executed_cells.add(4)