## Frame2seq: structure-conditioned masked language modeling for protein sequence design

Official notebook for [Frame2seq](https://doi.org/10.1101/2023.12.15.571823): Structure-conditioned masked language models for protein sequence design generalize beyond the native sequence space.

<a target="_blank" href="https://colab.research.google.com/github/dakpinaroglu/Frame2seq/blob/main/Frame2seq.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

<img src="https://raw.githubusercontent.com/dakpinaroglu/Frame2seq/main/.github/frame2seq_net_arc.png" align="center" style="height:240px">

In [None]:
#@title Install dependencies
%%capture
!git clone https://github.com/drumyerscough/Frame2seq
!cd Frame2seq
!pip install frame2seq

import os
from google.colab import files


def get_pdb(pdb_code=""):
  """Return a local path to a PDB file for `pdb_code`, fetching it if needed.

  Resolution order:
    1. ``None`` or ``""``      -> open the Colab upload prompt and save the
                                 first uploaded file as ``tmp.pdb``.
    2. path of an existing file -> returned unchanged.
    3. four-character string    -> downloaded from files.rcsb.org.
    4. anything else            -> used as the accession in an
                                 alphafold.ebi.ac.uk ``AF-<code>-F1-model_v3``
                                 download.

  Args:
    pdb_code: empty/None, a local file path, a 4-character code, or an
      accession string (see above).

  Returns:
    Path (str) of the PDB file in the current working directory, or
    `pdb_code` itself when it already names an existing file.

  Raises:
    ValueError: the upload prompt returned no file.
    FileNotFoundError: the download did not produce the expected file.
  """
  # code credit: https://github.com/sokrypton/ColabDesign notebooks
  import subprocess  # function-scope import keeps this block self-contained

  if pdb_code is None or pdb_code == "":
    upload_dict = files.upload()
    if not upload_dict:
      # Previously this surfaced as an opaque IndexError on the empty dict.
      raise ValueError("No file was uploaded; a PDB file is required.")
    pdb_string = next(iter(upload_dict.values()))
    with open("tmp.pdb", "wb") as out:
      out.write(pdb_string)
    return "tmp.pdb"

  if os.path.isfile(pdb_code):
    return pdb_code

  if len(pdb_code) == 4:
    pdb_file = f"{pdb_code}.pdb"
    url = f"https://files.rcsb.org/view/{pdb_file}"
  else:
    pdb_file = f"AF-{pdb_code}-F1-model_v3.pdb"
    url = f"https://alphafold.ebi.ac.uk/files/{pdb_file}"

  # Pass an argument list instead of a shell string so that characters in
  # `pdb_code` are never interpreted by a shell. `-nc` keeps an already
  # downloaded file; wget's exit status is deliberately ignored and success is
  # judged by whether the file exists afterwards.
  subprocess.run(["wget", "-qnc", url], check=False)
  if not os.path.isfile(pdb_file):
    raise FileNotFoundError(
      f"Could not download '{url}'. Check that '{pdb_code}' is a valid "
      "PDB code, AlphaFold DB accession, or local file path."
    )
  return pdb_file

In [None]:
#@title Frame2seq Runner
%%capture
# Import the runner class from the installed `frame2seq` package; the output of
# this cell is captured rather than displayed.
from frame2seq import Frame2seqRunner


# Create the single `runner` object that the design and scoring cells call
# (`runner.design(...)` / `runner.score(...)`), so this cell has to run first.
# NOTE(review): presumably the constructor loads the model -- confirm in the
# frame2seq documentation.
runner = Frame2seqRunner()

In [None]:
#@title Design sequences with Frame2seq (sampled sequences saved to .fasta)

# Hide FutureWarning messages for the remainder of the session.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


# USER OPTIONS
# The `#@param` / `#@markdown` annotations below are Colab form directives;
# each assignment is an ordinary Python variable passed to `runner.design`.
#@markdown #### Input options for fixed-backbone design
pdb='2FRA' #@param {type:"string"}
#@markdown - leave blank to get an upload prompt
chain_id = "A" #@param {type:"string"}
temperature = 1.0 #@param {type:"raw"}
num_samples = 10 #@param {type:"raw"}
omit_AA = "C" #@param {type:"string"}
#@markdown - amino acid to omit from design
fixed_positions = 1,3,11 #@param {type:"raw"}
#@markdown - fixed positions are used as input
#@markdown - positions are 1-indexed
save_indiv_seqs = False #@param {type:"boolean"}
#@markdown - saves sequences to individual fasta files
save_indiv_neg_pll = False #@param {type:"boolean"}
#@markdown - saves per-position negative pseudo-log-likelihoods to csv
verbose = True #@param {type:"boolean"}

# Resolve the `pdb` entry to a local file: upload prompt when blank, an existing
# path as-is, otherwise a download (see `get_pdb`).
pdb_path = get_pdb(pdb)

# run Frame2seq
# Requires `runner` from the "Frame2seq Runner" cell. Arguments are passed
# positionally, so their order must match `runner.design`'s signature.
# `fixed_positions = 1,3,11` evaluates to the tuple (1, 3, 11); a single value
# such as `5` would be a plain int.
# NOTE(review): confirm `runner.design` accepts both forms.
runner.design(pdb_path, chain_id, temperature, num_samples, omit_AA, fixed_positions, save_indiv_seqs, save_indiv_neg_pll, verbose)

In [None]:
#@title Score sequences with Frame2seq (scores saved to .csv)

# Hide FutureWarning messages for the remainder of the session.
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)


# USER OPTIONS
# The `#@param` / `#@markdown` annotations below are Colab form directives;
# each assignment is an ordinary Python variable passed to `runner.score`.
#@markdown #### Input options for scoring
pdb='2FRA' #@param {type:"string"}
#@markdown - leave blank to get an upload prompt
chain_id = "A" #@param {type:"string"}
save_indiv_neg_pll = False #@param {type:"boolean"}
#@markdown - saves per-position negative pseudo-log-likelihoods to csv
verbose = True #@param {type:"boolean"}

# Resolve the `pdb` entry to a local file: upload prompt when blank, an existing
# path as-is, otherwise a download (see `get_pdb`).
pdb_path = get_pdb(pdb)

# No FASTA file is supplied by this cell.
# NOTE(review): presumably `None` makes the runner score the sequence found in
# the PDB chain itself -- confirm in the frame2seq documentation; set this to a
# FASTA path to score other sequences.
fasta_file = None

# run Frame2seq
# Requires `runner` from the "Frame2seq Runner" cell. Arguments are passed
# positionally, so their order must match `runner.score`'s signature.
runner.score(pdb_path, chain_id, fasta_file, save_indiv_neg_pll, verbose)