In [29]:
from fold import fold

variant = fold("MVTRLEIHYTGEIPVRYNLKADFEGSRYTVEGKGTVNPATGKLTLRLVCTTGDLPVYWPTLVTTFGYGLQCFAEEQKGNRIYPFMGSWGPRKKVLTRHITDGKDIVDATFAFEGNVLVTDVNLYADKGAINGAIMRKLLKKQERPYLHHWRYDPERQGFMGAQRVFQHLKNGKEAEVLEAIEIVKTDNFGHGRPSEYVTKYTSYLGHHADLLEDAIEIEVALEQFGADSNGLIARLGSD" + "RPAANDENYAASV")

In [30]:
wt_seq = 'MVSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFGYGLQCFARYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSYQSALSKDPNEKRDHMVLLEFVTAAGITLGMDELYK' + 'RPAANDENYAASV'
wt = fold(wt_seq)

In [None]:
import torch
import numpy as np
from typing import List, Tuple
import py3Dmol

from esm.pretrained import ESM3_sm_open_v0
from esm.utils.structure.protein_chain import ProteinChain
from esm.utils.structure.protein_structure import compute_affine_and_rmsd
from esm.utils.structure.aligner import Aligner
from esm.sdk.api import ESMProtein, GenerationConfig
from esm.tokenization import EsmSequenceTokenizer
from esm.utils.constants.esm3 import SEQUENCE_MASK_TOKEN
from esm.models.esm3 import ESM3
from esm.utils.generation import iterative_sampling_raw
from esm.sdk.api import ESM3InferenceClient, ESMProtein, GenerationConfig, SamplingConfig, SamplingTrackConfig

import pandas as pd
import os
from huggingface_hub import snapshot_download
from pathlib import Path
from tqdm import tqdm
import os
from pathlib import Path
import sys
from transformers import EsmTokenizer, EsmForSequenceClassification
import torch
from peft import PeftModelForSequenceClassification
import seaborn as sns
import pandas as pd
%set_env TOKENIZERS_PARALLELISM=false

# Visualize original YFP structure and generated variant
def visualize_structures(template: ProteinChain, variant: ProteinChain, highlight_residues: List[int] = None):
    view = py3Dmol.view(width=800, height=400, viewergrid=(1, 2))
    
    # Template structure
    template_pdb = template.to_pdb_string()
    view.addModel(template_pdb, "pdb", viewer=(0,0))
    view.setStyle({"cartoon": {"color": "lightgrey"}}, viewer=(0,0))
    if highlight_residues:
        view.addStyle({"resi": highlight_residues}, {"cartoon": {"color": "red"}}, viewer=(0,0))
    
    # Variant structure
    variant_pdb = variant.to_pdb_string()
    view.addModel(variant_pdb, "pdb", viewer=(0,1))
    view.setStyle({"cartoon": {"color": "lightblue"}}, viewer=(0,1))
    if highlight_residues:
        view.addStyle({"resi": highlight_residues}, {"cartoon": {"color": "red"}}, viewer=(0,1))
    
    view.zoomTo()
    return view

print("Visualizing original YFP structure (left) and generated variant (right) with key residues highlighted in red:")
# all_key_residues = list(set(sequence_fixed_indices + structure_fixed_indices))



In [21]:
ESMProtein.from_pdb('/home/naka/Downloads/AF-A0A059PIR9-F1-model_v4.pdb').sequence

'MRKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATNGKLTLKFICTTGKLPVPWPTLVTTLTYGVQCFARYPDHMKQHDFFKSAMPEGYVQERTISFKDDGTYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNFNSHNVYITADKQKNGIKANFKIRHNVEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSYQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYKRPAANDENYAASV'

In [23]:
from_pdb_seq = 'MRKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATNGKLTLKFICTTGKLPVPWPTLVTTLTYGVQCFARYPDHMKQHDFFKSAMPEGYVQERTISFKDDGTYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNFNSHNVYITADKQKNGIKANFKIRHNVEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSYQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYKRPAANDENYAASV'
from_pdb = fold(from_pdb_seq)

In [28]:
len(wt_seq)

239

In [27]:
len(from_pdb_seq)

251

In [31]:
ref_chain = wt.to_protein_chain()

In [32]:
variant_chain = variant.to_protein_chain()

In [33]:
from_pdb_chain = from_pdb.to_protein_chain()

In [36]:
visualize_structures(ref_chain, from_pdb_chain, ).show()

In [40]:
model: ESM3InferenceClient = ESM3.from_pretrained("esm3_sm_open_v1").to("cuda") 
tokenizer = EsmSequenceTokenizer()

Fetching 22 files:   0%|          | 0/22 [00:00<?, ?it/s]

In [41]:
structure_generation_config = GenerationConfig(
    track="structure", 
    num_steps=250, 
    # temperature=0.1
    )
wt_esm = model.generate(
    ESMProtein(sequence=wt_seq), 
    structure_generation_config
    )

# Convert ESMProtein to ProteinChain for easier handling
wt_esm = wt_esm.to_protein_chain()

100%|██████████| 250/250 [00:15<00:00, 15.88it/s]


In [44]:
len(wt_seq)

252

In [42]:
visualize_structures(ref_chain, wt_esm, ).show()

In [45]:


# set up saprot models
yfp_adapter_input = "SaProtHub/Model-EYFP-650M"
base_model_name = "westlake-repl/SaProt_650M_AF2"
fluor_adapter_input = 'SaProtHub/Model-Fluorescence-650M'

yfp_adapter_path = snapshot_download(repo_id=yfp_adapter_input, repo_type="model")
fluor_adapter_path = snapshot_download(repo_id=fluor_adapter_input, repo_type="model")
base_model = EsmForSequenceClassification.from_pretrained(base_model_name, num_labels=1,)
saprot_yfp_model = PeftModelForSequenceClassification.from_pretrained(
    base_model,
    yfp_adapter_path,
)

base_model_fluor = EsmForSequenceClassification.from_pretrained(base_model_name, num_labels=1,)
saprot_fluor_model = PeftModelForSequenceClassification.from_pretrained(
    base_model_fluor,
    fluor_adapter_path,
)

tokenizer = EsmTokenizer.from_pretrained(base_model_name)
device = "cuda" if torch.cuda.is_available() else "cpu"
saprot_yfp_model.to(device);
saprot_fluor_model.to(device);


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

Some weights of EsmForSequenceClassification were not initialized from the model checkpoint at westlake-repl/SaProt_650M_AF2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'esm.contact_head.regression.bias', 'esm.contact_head.regression.weight', 'esm.embeddings.position_embeddings.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of EsmForSequenceClassification were not initialized from the model checkpoint at westlake-repl/SaProt_650M_AF2 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'esm.contact_head.regression.bias', 'esm.contact_head.regression.weight', 'esm.embeddings.position_embeddings.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [64]:
sequences = {
    'wt_seq': 'MVSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFGYGLQCFARYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSYQSALSKDPNEKRDHMVLLEFVTAAGITLGMDELYK',
    'extended': 'MVSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFGYGLQCFARYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSYQSALSKDPNEKRDHMVLLEFVTAAGITLGMDELYKRPAANDENYAASV',
    'alt_seq': 'MRKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFGYGLQCFARYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSYQSALSKDPNEKRDHMVLLEFVTAAGITLGMDELYK',
    'alt_extended': 'MRKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFGYGLQCFARYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSYQSALSKDPNEKRDHMVLLEFVTAAGITLGMDELYKRPAANDENYAASV',
    'check': 'MRKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFGYGLQCFARYPDHMKQGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSYQSALSKDPNEKRDHMVLLEFVTAAGITLGMDELYK'
}


In [70]:
aa_seqs = [
    {'name': 'eYFP', 'sequence': 'MVSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFGYGLQCFARYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSYQSALSKDPNEKRDHMVLLEFVTAAGITLGMDELYK',},
    {'name': 'Citrine', 'sequence': 'MVSKGEELFT GVVPILVELD GDVNGHKFSV SGEGEGDATY GKLTLKFICT TGKLPVPWPT LVTTFGYGLM CFARYPDHMK QHDFFKSAMP EGYVQERTIF FKDDGNYKTR AEVKFEGDTL VNRIELKGID FKEDGNILGH KLEYNYNSHN VYIMADKQKN GIKVNFKIRH NIEDGSVQLA DHYQQNTPIG DGPVLLPDNH YLSYQSALSK DPNEKRDHMV LLEFVTAAGI TLGMDELYK'.replace(' ', '')},
    {'name': 'mCitrine', 'sequence': 'MVSKGEELFT GVVPILVELD GDVNGHKFSV SGEGEGDATY GKLTLKFICT TGKLPVPWPT LVTTFGYGLM CFARYPDHMK QHDFFKSAMP EGYVQERTIF FKDDGNYKTR AEVKFEGDTL VNRIELKGID FKEDGNILGH KLEYNYNSHN VYIMADKQKN GIKVNFKIRH NIEDGSVQLA DHYQQNTPIG DGPVLLPDNH YLSYQSKLSK DPNEKRDHMV LLEFVTAAGI TLGMDELYK'.replace(' ', '')},
    {'name': 'Citrine2', 'sequence': 'MVSKGEELFT GVVPILVELD GDVNGHKFSV TGEGEGDATY GKLTLKFICT TGKLPVPWPT LVTTFGYGLT CFARYPDHMK QHDFFKSAMP EGYVQERTIF FKDDGNYKTR AEVKFEGDTL VNRIELKGID FKEDGNILGH KLEYNHNSHY VYIMADKQKN GIKANFKIRH NIEDGSVQLA DHYQQNTPIG DGPVLLPDNH YLSYQSQLSK DPNEERDHTV LLEFVTAAGI TLGMGELYK'.replace(' ', '')},
    {'name': 'Venus', 'sequence': 'MVSKGEELFT GVVPILVELD GDVNGHKFSV SGEGEGDATY GKLTLKLICT TGKLPVPWPT LVTTLGYGLQ CFARYPDHMK QHDFFKSAMP EGYVQERTIF FKDDGNYKTR AEVKFEGDTL VNRIELKGID FKEDGNILGH KLEYNYNSHN VYITADKQKN GIKANFKIRH NIEDGGVQLA DHYQQNTPIG DGPVLLPDNH YLSYQSALSK DPNEKRDHMV LLEFVTAAGI TLGMDELYK'.replace(' ', '')},
    {'name': 'mVenus', 'sequence': 'MVSKGEELFT GVVPILVELD GDVNGHKFSV SGEGEGDATY GKLTLKLICT TGKLPVPWPT LVTTLGYGLQ CFARYPDHMK QHDFFKSAMP EGYVQERTIF FKDDGNYKTR AEVKFEGDTL VNRIELKGID FKEDGNILGH KLEYNYNSHN VYITADKQKN GIKANFKIRH NIEDGGVQLA DHYQQNTPIG DGPVLLPDNH YLSYQSKLSK DPNEKRDHMV LLEFVTAAGI TLGMDELYK'.replace(' ', '')},
    {'name': 'mTurquoise', 'sequence': 'MVSKGEELFT GVVPILVELD GDVNGHKFSV SGEGEGDATY GKLTLKFICT TGKLPVPWPT LVTTLSWGVQ CFARYPDHMK QHDFFKSAMP EGYVQERTIF FKDDGNYKTR AEVKFEGDTL VNRIELKGID FKEDGNILGH KLEYNYISDN VYITADKQKN GIKANFKIRH NIEDGGVQLA DHYQQNTPIG DGPVLLPDNH YLSTQSKLSK DPNEKRDHMV LLEFVTAAGI TLGMDELYK'.replace(' ', '')},
    {'name': 'mEmerald', 'sequence': 'MVSKGEELFT GVVPILVELD GDVNGHKFSV SGEGEGDATY GKLTLKFICT TGKLPVPWPT LVTTLTYGVQ CFARYPDHMK QHDFFKSAMP EGYVQERTIF FKDDGNYKTR AEVKFEGDTL VNRIELKGID FKEDGNILGH KLEYNYNSHK VYITADKQKN GIKVNFKTRH NIEDGSVQLA DHYQQNTPIG DGPVLLPDNH YLSTQSKLSK DPNEKRDHMV LLEFVTAAGI TLGMDELYK'.replace(' ', '')},
    {'name': 'mRuby3', 'sequence': 'MVSKGEELIK ENMRMKVVME GSVNGHQFKC TGEGEGRPYE GVQTMRIKVI EGGPLPFAFD ILATSFMYGS RTFIKYPADI PDFFKQSFPE GFTWERVTRY EDGGVVTVTQ DTSLEDGELV YNVKVRGVNF PSNGPVMQKK TKGWEPNTEM MYPADGGLRG YTDIALKVDG GGHLHCNFVT TYRSKKTVGN IKMPGVHAVD HRLERIEESD NETYVVQREV AVAKYSNLGG GMDELYK'.replace(' ', '')},
    {'name': 'mStayGold2', 'sequence': 'MVSTGEELFT GVVPFKFQLK GTINGKSFTV EGEGEGNSHE GSHKGKYVCT SGKLPMSWAA LGTSFGYGMK YYTKYPSGLK NWFHEVMPEG FTYDRHIQYK GDGSIHAKHQ HFMKNGTYHN IVEFTGQDFK ENSPVLTGDM DVSLPNEVQH IPRDDGVECT VTLTYPLLSD ESKCVEAYQN TIIKPLHNQP APDVPYHWIR KQYTQSKDDT EERDHIIQSE TLEAHLYSRT KLE'.replace(' ', '')},
    ]

In [65]:
def AA_to_SA(aa_seq):
    sa_seq = ''
    for aa in aa_seq:
        sa_seq += aa + '#'
    return sa_seq

In [68]:
scores = {}
for key, value in sa_seqs.items():
    scores[key] = saprot_yfp_model(tokenizer(value, return_tensors='pt').input_ids.to(device))

In [69]:
scores

{'wt_seq': SequenceClassifierOutput(loss=None, logits=tensor([[0.9814]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None),
 'extended': SequenceClassifierOutput(loss=None, logits=tensor([[0.9788]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None),
 'alt_seq': SequenceClassifierOutput(loss=None, logits=tensor([[0.8731]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None),
 'alt_extended': SequenceClassifierOutput(loss=None, logits=tensor([[0.8259]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None),
 'check': SequenceClassifierOutput(loss=None, logits=tensor([[0.0690]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)}

SequenceClassifierOutput(loss=None, logits=tensor([[0.3708]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [54]:
saprot_yfp_model(tokenizer(wt_seq, return_tensors='pt').input_ids.to(device))


SequenceClassifierOutput(loss=None, logits=tensor([[0.3708]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [55]:
saprot_yfp_model(tokenizer(alt_seq, return_tensors='pt').input_ids.to(device))

SequenceClassifierOutput(loss=None, logits=tensor([[0.3708]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [56]:
saprot_yfp_model(tokenizer(alt_extended, return_tensors='pt').input_ids.to(device))

SequenceClassifierOutput(loss=None, logits=tensor([[0.3708]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [58]:
saprot_yfp_model(tokenizer(check, return_tensors='pt').input_ids.to(device))

SequenceClassifierOutput(loss=None, logits=tensor([[0.3708]], device='cuda:0', grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [75]:

aa_seqs = [
    {'name': 'eYFP', 'sequence': 'MVSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTFGYGLQCFARYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSYQSALSKDPNEKRDHMVLLEFVTAAGITLGMDELYK',},
    {'name': 'Citrine', 'sequence': 'MVSKGEELFT GVVPILVELD GDVNGHKFSV SGEGEGDATY GKLTLKFICT TGKLPVPWPT LVTTFGYGLM CFARYPDHMK QHDFFKSAMP EGYVQERTIF FKDDGNYKTR AEVKFEGDTL VNRIELKGID FKEDGNILGH KLEYNYNSHN VYIMADKQKN GIKVNFKIRH NIEDGSVQLA DHYQQNTPIG DGPVLLPDNH YLSYQSALSK DPNEKRDHMV LLEFVTAAGI TLGMDELYK'.replace(' ', '')},
    {'name': 'mCitrine', 'sequence': 'MVSKGEELFT GVVPILVELD GDVNGHKFSV SGEGEGDATY GKLTLKFICT TGKLPVPWPT LVTTFGYGLM CFARYPDHMK QHDFFKSAMP EGYVQERTIF FKDDGNYKTR AEVKFEGDTL VNRIELKGID FKEDGNILGH KLEYNYNSHN VYIMADKQKN GIKVNFKIRH NIEDGSVQLA DHYQQNTPIG DGPVLLPDNH YLSYQSKLSK DPNEKRDHMV LLEFVTAAGI TLGMDELYK'.replace(' ', '')},
    {'name': 'Citrine2', 'sequence': 'MVSKGEELFT GVVPILVELD GDVNGHKFSV TGEGEGDATY GKLTLKFICT TGKLPVPWPT LVTTFGYGLT CFARYPDHMK QHDFFKSAMP EGYVQERTIF FKDDGNYKTR AEVKFEGDTL VNRIELKGID FKEDGNILGH KLEYNHNSHY VYIMADKQKN GIKANFKIRH NIEDGSVQLA DHYQQNTPIG DGPVLLPDNH YLSYQSQLSK DPNEERDHTV LLEFVTAAGI TLGMGELYK'.replace(' ', '')},
    {'name': 'Venus', 'sequence': 'MVSKGEELFT GVVPILVELD GDVNGHKFSV SGEGEGDATY GKLTLKLICT TGKLPVPWPT LVTTLGYGLQ CFARYPDHMK QHDFFKSAMP EGYVQERTIF FKDDGNYKTR AEVKFEGDTL VNRIELKGID FKEDGNILGH KLEYNYNSHN VYITADKQKN GIKANFKIRH NIEDGGVQLA DHYQQNTPIG DGPVLLPDNH YLSYQSALSK DPNEKRDHMV LLEFVTAAGI TLGMDELYK'.replace(' ', '')},
    {'name': 'mVenus', 'sequence': 'MVSKGEELFT GVVPILVELD GDVNGHKFSV SGEGEGDATY GKLTLKLICT TGKLPVPWPT LVTTLGYGLQ CFARYPDHMK QHDFFKSAMP EGYVQERTIF FKDDGNYKTR AEVKFEGDTL VNRIELKGID FKEDGNILGH KLEYNYNSHN VYITADKQKN GIKANFKIRH NIEDGGVQLA DHYQQNTPIG DGPVLLPDNH YLSYQSKLSK DPNEKRDHMV LLEFVTAAGI TLGMDELYK'.replace(' ', '')},
    {'name': 'mTurquoise', 'sequence': 'MVSKGEELFT GVVPILVELD GDVNGHKFSV SGEGEGDATY GKLTLKFICT TGKLPVPWPT LVTTLSWGVQ CFARYPDHMK QHDFFKSAMP EGYVQERTIF FKDDGNYKTR AEVKFEGDTL VNRIELKGID FKEDGNILGH KLEYNYISDN VYITADKQKN GIKANFKIRH NIEDGGVQLA DHYQQNTPIG DGPVLLPDNH YLSTQSKLSK DPNEKRDHMV LLEFVTAAGI TLGMDELYK'.replace(' ', '')},
    {'name': 'mEmerald', 'sequence': 'MVSKGEELFT GVVPILVELD GDVNGHKFSV SGEGEGDATY GKLTLKFICT TGKLPVPWPT LVTTLTYGVQ CFARYPDHMK QHDFFKSAMP EGYVQERTIF FKDDGNYKTR AEVKFEGDTL VNRIELKGID FKEDGNILGH KLEYNYNSHK VYITADKQKN GIKVNFKTRH NIEDGSVQLA DHYQQNTPIG DGPVLLPDNH YLSTQSKLSK DPNEKRDHMV LLEFVTAAGI TLGMDELYK'.replace(' ', '')},
    {'name': 'mRuby3', 'sequence': 'MVSKGEELIK ENMRMKVVME GSVNGHQFKC TGEGEGRPYE GVQTMRIKVI EGGPLPFAFD ILATSFMYGS RTFIKYPADI PDFFKQSFPE GFTWERVTRY EDGGVVTVTQ DTSLEDGELV YNVKVRGVNF PSNGPVMQKK TKGWEPNTEM MYPADGGLRG YTDIALKVDG GGHLHCNFVT TYRSKKTVGN IKMPGVHAVD HRLERIEESD NETYVVQREV AVAKYSNLGG GMDELYK'.replace(' ', '')},
    {'name': 'mStayGold2', 'sequence': 'MVSTGEELFT GVVPFKFQLK GTINGKSFTV EGEGEGNSHE GSHKGKYVCT SGKLPMSWAA LGTSFGYGMK YYTKYPSGLK NWFHEVMPEG FTYDRHIQYK GDGSIHAKHQ HFMKNGTYHN IVEFTGQDFK ENSPVLTGDM DVSLPNEVQH IPRDDGVECT VTLTYPLLSD ESKCVEAYQN TIIKPLHNQP APDVPYHWIR KQYTQSKDDT EERDHIIQSE TLEAHLYSRT KLE'.replace(' ', '')},
    {'name': 'mGold', 'sequence': 'MVSKGEELFT GVVPILVELD GDVNGHKFSV SGEGEGDATY GKLTLKFICT TGKLPVPWPT LVTSLGYGLQ CFARYPDHMK QHDFFKSAMP EGYVQERTIF FKDDGNYKTR AEVKFEGDTL VNRIELKGID FKEDGNILGH KLEYNYNSHN VYITADKQKN GIKANFKIRH NIEDGGVQLA DHYQQNTPIG DGPVLLPDNH YLSYQSKLSK DPNEKRDHMV LLEFVTAAGI TLGMDELYK'.replace(' ', '')}
    ]





sa_seqs = [{'name': aa_seq['name'], 'sa_sequence': AA_to_SA(aa_seq['sequence']), 'length': len(aa_seq['sequence'])} for aa_seq in aa_seqs]


df = pd.DataFrame(sa_seqs)


yfp_outputs_list = []
fluor_outputs_list = []
for index in tqdm(range(len(df))):
    seq = df['sa_sequence'].iloc[index]
    inputs = tokenizer(seq, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad(): yfp_outputs = saprot_yfp_model(**inputs)
    yfp_outputs_list.append(float(yfp_outputs.logits.detach().cpu().numpy()[0][0]))
    
    with torch.no_grad(): fluor_outputs = saprot_fluor_model(**inputs)
    fluor_outputs_list.append(float(fluor_outputs.logits.detach().cpu().numpy()[0][0]))
    
df['yfp_model_score'] = yfp_outputs_list
df['fluor_model_score'] = fluor_outputs_list

100%|██████████| 11/11 [00:00<00:00, 20.16it/s]
