In [1]:
# Run once to install the ESM-1b model: https://github.com/facebookresearch/esm
!pip install git+https://github.com/facebookresearch/esm.git

Collecting git+https://github.com/facebookresearch/esm.git
  Cloning https://github.com/facebookresearch/esm.git to /private/var/folders/kp/nj27yjxx3n3dqz4hw_jw1vrh0000gr/T/pip-req-build-ncgljbdk
  Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/esm.git /private/var/folders/kp/nj27yjxx3n3dqz4hw_jw1vrh0000gr/T/pip-req-build-ncgljbdk
  Resolved https://github.com/facebookresearch/esm.git to commit 2b369911bb5b4b0dda914521b9475cad1656b2ac
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25h

In [2]:
import torch

In [3]:
# https://www.uniprot.org/uniprotkb/P04637/entry
wt_seq = 'MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELPPGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPGGSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD'
print(len(wt_seq), 'residues in sequence')

mutations = [
    # TP53 mutation known to destabilize the protein: https://www.pnas.org/doi/10.1073/pnas.0805326105
    'Y220C',
    # Two ClinVar mutations classified as 'benign': https://www.ncbi.nlm.nih.gov/clinvar/?term=Li-Fraumeni+syndrome
    'E298S',
    'Q354K',
]
print(len(mutations), 'mutations')

393 residues in sequence
3 mutations


In [4]:
with open('sequences.fasta', 'w') as fh:
    for mut in mutations:
        aa_pos = int(mut[1:-1])
        aa_ref = mut[0]
        aa_alt = mut[-1]
        print(aa_pos, aa_ref, aa_alt)
        mut_seq = wt_seq[:aa_pos - 1] + aa_alt + wt_seq[aa_pos:]

        print(wt_seq)
        print(mut_seq)

        assert wt_seq[aa_pos - 1] == aa_ref
        assert mut_seq[aa_pos - 1] == aa_alt


        print(f'>{mut}', file=fh)
        print(mut_seq, file=fh)

220 Y C
MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPYEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELPPGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPGGSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD
MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIRVEGNLRVEYLDDRNTFRHSVVVPCEPPEVGSDCTTIHYNYMCNSSCMGGMNRRPILTIITLEDSSGNLLGRNSFEVRVCACPGRDRRTEEENLRKKGEPHHELPPGSTKRALPNNTSSSPQPKKKPLDGEYFTLQIRGRERFEMFRELNEALELKDAQAGKEPGGSRAHSSHLKSKKGQSTSRHKKLMFKTEGPDSD
298 E S
MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLSPDDIEQWFTEDPGPDEAPRMPEAAPPVAPAPAAPTPAAPAPAPSWPLSSSVPSQKTYQGSYGFRLGFLHSGTAKSVTCTYSPALNKMFCQLAKTCPVQLWVDSTPPPGTRVRAMAIYKQSQHMTEVVRRCPHHERCSDSDGLAPPQHLIR

In [5]:
!python $CONDA_PREFIX/lib/python3.10/site-packages/esm/scripts/extract.py esm1_t6_43M_UR50S sequences.fasta embeddings --include mean

Read sequences.fasta with 3 sequences
Processing 1 of 1 batches (3 sequences)


In [6]:
# Check shape of arbitrary embedding
torch.load('embeddings/E298S.pt')['mean_representations'][6].shape

torch.Size([768])

In [7]:
# Check shape of arbitrary embedding from validation set
torch.load('project_data/mega_val_embeddings/1I6C.pdb_A26D.pt')['mean_representations'][6].shape

torch.Size([768])