In [21]:
# Available here: https://mygithub.gsk.com/dmf32881/quantum_codon_opt
from qodon.src.classical_ga import CodonOptimization
from qodon.src.codon_bqm import DWaveBQM
from qodon.src.scoring import SeqScorer
from rna_folding.rna_fold import RNAFold
import tensorflow as tf, numpy as np
import tensorflow_probability as tfp
from Bio.Seq import Seq
from Bio import SeqIO

In [23]:
# Load the single record in the FASTA file and keep its sequence as a plain string.
seq = str(SeqIO.read('examples/spike_trim.fasta', format='fasta').seq)

# Build the CodonOptimization object for this sequence. lazy=True is passed so
# that only the class's preprocessing runs and the Genetic Algorithm (GA)
# itself is skipped.
co = CodonOptimization(seq, lazy=True)

# The preprocessing above populated co.population. Take element [1] of every
# entry (presumably the per-position codon indices -- confirm against
# CodonOptimization) and pack them into a float32 TF tensor.
initial_members = tf.convert_to_tensor(
    [member[1] for member in co.population],
    dtype=np.float32,
)


def get_nc(res):
    '''Return the number of codons listed in co.code_map for amino acid `res`.'''
    return len(co.code_map[res]['codons'])

In [30]:
# Display the sequence string that was read from the FASTA file.
seq

'MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSSVLHSTQDLFLPFFSNVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIRGWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLGVYYHKNNKSWMESEFRVYSSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPQGFSALEPLVDLPIGINITRF'

In [29]:
# Display the first row of the initial population tensor (one value per
# position of `seq`).
initial_members[0]

<tf.Tensor: shape=(238,), dtype=float32, numpy=
array([0., 0., 3., 1., 2., 3., 4., 1., 0., 2., 1., 2., 1., 1., 1., 3., 0.,
       0., 2., 1., 3., 2., 1., 5., 2., 2., 2., 1., 1., 1., 2., 0., 1., 0.,
       1., 2., 0., 1., 2., 0., 1., 3., 1., 5., 0., 4., 3., 2., 0., 4., 2.,
       0., 1., 4., 1., 2., 0., 1., 0., 3., 0., 2., 0., 0., 1., 1., 2., 2.,
       0., 0., 5., 1., 0., 0., 2., 1., 0., 3., 1., 1., 1., 2., 1., 2., 2.,
       1., 0., 1., 3., 2., 0., 0., 1., 2., 1., 0., 0., 3., 0., 1., 1., 3.,
       3., 0., 2., 0., 3., 3., 3., 1., 0., 0., 0., 3., 1., 3., 3., 2., 2.,
       1., 1., 0., 3., 0., 0., 2., 2., 1., 0., 2., 1., 0., 0., 1., 1., 1.,
       0., 1., 2., 0., 2., 2., 3., 1., 1., 1., 0., 0., 1., 1., 5., 0., 0.,
       0., 0., 0., 0., 5., 2., 0., 1., 0., 0., 1., 0., 1., 2., 1., 0., 1.,
       1., 4., 0., 2., 1., 1., 0., 1., 3., 0., 2., 0., 1., 3., 1., 1., 0.,
       1., 2., 5., 1., 0., 0., 0., 0., 1., 2., 1., 1., 1., 0., 0., 2., 0.,
       1., 0., 1., 3., 0., 2., 1., 2., 2., 1., 1., 5

In [27]:
# Display the lookup table built by CodonOptimization: one entry per amino-acid
# letter, each holding 'scores', 'codons', 'log_scores' and 'probs' lists.
co.code_map

{'A': {'scores': [0.21, 0.27, 0.36, 0.16],
  'codons': ['GCA', 'GCC', 'GCG', 'GCT'],
  'log_scores': [0.5389965007326869,
   0.2876820724517808,
   0.0,
   0.8109302162163287],
  'probs': [0.21, 0.48, 0.84, 1.0]},
 'C': {'scores': [0.56, 0.44],
  'codons': ['TGC', 'TGT'],
  'log_scores': [0.0, 0.2411620568168881],
  'probs': [0.56, 1.0]},
 'D': {'scores': [0.37, 0.63],
  'codons': ['GAC', 'GAT'],
  'log_scores': [0.5322168137473082, 0.0],
  'probs': [0.37, 1.0]},
 'E': {'scores': [0.69, 0.31],
  'codons': ['GAA', 'GAG'],
  'log_scores': [0.0, 0.8001193001121131],
  'probs': [0.69, 1.0]},
 'F': {'scores': [0.43, 0.57],
  'codons': ['TTC', 'TTT'],
  'log_scores': [0.2818511521409877, 0.0],
  'probs': [0.43, 1.0]},
 'G': {'scores': [0.11, 0.41, 0.15, 0.34],
  'codons': ['GGA', 'GGC', 'GGG', 'GGT'],
  'log_scores': [1.3156767939059373,
   0.0,
   1.0055218656020977,
   0.18721154208814633],
  'probs': [0.11, 0.52, 0.67, 1.01]},
 'H': {'scores': [0.43, 0.57],
  'codons': ['CAC', 'CAT'],
  '

In [3]:
def tf_fold(nseq):
    '''
    Score one nucleotide sequence by its folding energy.

    An RNAFold object is created for `nseq` (minimum stem length 4, minimum
    loop length 4), its compute_dwave_sa() routine is run, and the `energy`
    of the `first` entry of the returned results is handed back.

    NOTE(review): `first` is presumably the lowest-energy sample -- confirm
    against the RNAFold / sampler documentation.
    '''
    folder = RNAFold(nseq, min_stem_len=4, min_loop_len=4)
    return folder.compute_dwave_sa().first.energy

In [4]:
def convert_to_nseqs(members):
    '''
    Map a population of real-valued vectors onto nucleotide sequences.

    Each row of `members` is read as one candidate, with one value per
    position of the global protein sequence `seq`. Returns a list with one
    codon string per row.

    This is a deliberate hack: the optimizer works on continuous values while
    codon choices are discrete and finite, so the values are coerced rather
    than interpreted rigorously.
    '''
    # Coerce to discrete, non-negative indices: cast to int, then drop the sign.
    index_rows = np.absolute(np.array(members).astype(int))

    def row_to_sequence(row):
        # An index may exceed the number of codons available for a residue, so
        # it is wrapped with a modulus -- effectively a hash into the valid
        # range. Not rigorous, but sufficient here.
        picked = []
        for pos, res in enumerate(seq):
            choices = co.code_map[res]['codons']
            picked.append(choices[row[pos] % get_nc(res)])
        return ''.join(picked)

    return [row_to_sequence(row) for row in index_rows]

def objective(members):
    '''
    Objective function handed to the TF optimizer for minimization.

    `members` is the current population; the return value is a float32
    tensor holding one score per member, in the same order.

    NOTE: the optimizer is designed for continuous-valued functions, whereas
    this objective is evaluated on a discretized version of its input. That
    is not mathematically sound -- it is a hack that works well enough here.
    '''
    # Discretize the population into one nucleotide sequence per member and
    # score each with the folding energy. (SeqScorer(s).score is the
    # alternative scoring function that was used here previously.)
    energies = []
    for nucleotide_seq in convert_to_nseqs(members):
        energies.append(tf_fold(nucleotide_seq))

    # Hand the scores back as a TF object.
    return tf.cast(energies, np.float32)

In [5]:
%%time
# More tricks. 
# Differential_weight: controls strength of mutations. We basically want to turn this off.
# Crossover_prob: set this low. Need to think more about why this helps.
optim_results = tfp.optimizer.differential_evolution_minimize(
    objective,
    initial_population=initial_members,
    max_iterations=10,
    differential_weight=0.01,
    crossover_prob=0.1,
)
# Translate "best" result back to protein sequence to verify it is valid
nseq = convert_to_nseqs(optim_results.final_population)[np.argmin(optim_results.final_objective_values)]
print('TF with 100 iterations:',np.min(optim_results.final_objective_values))

TF with 100 iterations: -1821.0
CPU times: user 53.5 s, sys: 1.05 s, total: 54.5 s
Wall time: 52.9 s


In [9]:
# Alphabet of 20 one-letter amino-acid codes, used for membership checks.
aas = 'ACDEFGHIKLMNPQRSTVWY'

In [16]:
# Scratch check: is any character of 'AAC' missing from the `aas` alphabet?
# NOTE(review): with `aas` as assigned in this notebook, both 'A' and 'C' are
# present, so a fresh run yields False. A saved output of True would indicate
# stale kernel state -- re-run to confirm.
any(ch not in aas for ch in 'AAC')

True

In [20]:
# Scratch check: every character of 'ACG' belongs to the alphabet 'ACGU'.
set('ACG') <= set('ACGU')

True