In [1]:
# Mount Google Drive at /content/drive so that later cells can copy model
# checkpoints to and from "My Drive". Running this cell prompts for an
# authorization code.
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
! cp /content/drive/My\ Drive/model.ckpt-600000* /tmp/t2t/output/

In [0]:
% cd /content/

In [0]:
# Upload the training data archive into the current directory. The next cell
# unzips prot_lig_binding_human_ds5.zip, so that is presumably the file to
# select here.
from google.colab import files
files.upload()

In [0]:
!unzip prot_lig_binding_human_ds5.zip

In [0]:
# TensorFlow / Tensor2Tensor setup: turn on eager execution and fix the seed.
# NOTE(review): tf.enable_eager_execution looks like a TF 1.x API -- confirm
# the TensorFlow version of the runtime this notebook is run on.
import tensorflow as tf

# Enable Eager execution - useful for seeing the generated data.
tf.enable_eager_execution()

from tensor2tensor.utils import trainer_lib

# Set a seed so that we have deterministic outputs.
RANDOM_SEED = 301
trainer_lib.set_random_seed(RANDOM_SEED)

In [0]:
import os

# Working directories for Tensor2Tensor. The data, output and tmp paths are
# the same ones passed to the t2t-* commands further down.
DATA_DIR = os.path.expanduser("/tmp/t2t/data")
OUTPUT_DIR = os.path.expanduser("/tmp/t2t/output")
TMP_DIR = os.path.expanduser("/tmp/t2t/tmp")
USR_DIR = os.path.expanduser("/tmp/t2t/usr")

# Make sure each of them exists, in the order they were declared.
for _t2t_dir in (DATA_DIR, OUTPUT_DIR, TMP_DIR, USR_DIR):
  tf.gfile.MakeDirs(_t2t_dir)

In [0]:
! mkdir transformer_scratch

In [0]:
% cd /tmp/t2t/data

In [0]:
# Upload the vocabulary file into the current directory (/tmp/t2t/data after
# the preceding %cd).
from google.colab import files
files.upload()

In [0]:
% cd /content

In [0]:
%%writefile transformer_scratch/prob.py

import re

  
  
# `Problem` is the base class for any dataset that we want to add to T2T -- it
# unifies the specification of the problem for generating training data,
# training, evaluation and inference.
#
# All its methods (except `generate_data`) have reasonable default
# implementations.
#
# A sub-class must implement `generate_data(data_dir, tmp_dir)` -- this method
# is called by t2t-trainer or t2t-datagen to actually generate TFRecord dataset
# files on disk.
from tensor2tensor.data_generators import problem

# Certain categories of problems are very common, like where either the input or
# output is text, for such problems we define an (abstract) sub-class of
# `Problem` called `Text2TextProblem` -- this implements `generate_data` in
# terms of another function `generate_samples`. Sub-classes must override
# `generate_samples` and `is_generate_per_split`.
from tensor2tensor.data_generators import text_problems

# Every non-abstract problem sub-class (as well as models and hyperparameter
# sets) must be registered with T2T so that T2T knows about it and can look it
# up when you specify your problem on the commandline to t2t-trainer or
# t2t-datagen.
#
# One uses:
# `register_problem` for a new Problem sub-class.
# `register_model` for a new T2TModel sub-class.
# `register_hparams` for a new hyperparameter set. All hyperparameter sets
# typically extend `common_hparams.basic_params1` (directly or indirectly).
from tensor2tensor.utils import registry


# By default, when you register a problem (or model or hyperparameter set) the
# name with which it gets registered is the 'snake case' version -- so here
# the Problem class `ProteinSpecificLigandGeneration` will be registered with
# the name `protein_specific_ligand_generation`.
#
# One can override this default by actually assigning a name as follows:
# `@registry.register_problem("my_awesome_problem")`
#
# The registered name is specified to the t2t-trainer or t2t-datagen using the
# commandline flag `--problem`.
@registry.register_problem('protein_specific_ligand_generation')
class ProteinSpecificLigandGeneration(text_problems.Text2TextProblem):
  """Text-to-text problem pairing protein sequences with ligand sequences.

  Inherits from `Text2TextProblem`, overriding only the pieces below. Inputs
  and targets come from two parallel text files (see `generate_samples`); each
  line, stripped of surrounding whitespace, is one example.

  The problem is registered as `protein_specific_ligand_generation`, which is
  the value to pass as `--problem` on the t2t command line.
  """

  # START: Methods we should override.

  # The methods that need to be overridden from `Text2TextProblem` are:
  # `is_generate_per_split` and
  # `generate_samples`.

  @property
  def is_generate_per_split(self):
    """Returns False: the data is not pre-split into train/eval/test."""
    # If we had pre-existing data splits for (train, eval, test) we would set
    # this to True, which would have generate_samples be called for each of
    # the dataset_splits.
    #
    # Since we do not, this is False: generate_samples is called just once and
    # the samples are distributed over the shards listed in `dataset_splits`.
    return False

  def generate_samples(self, data_dir, tmp_dir, dataset_split):
    """Yields one example per aligned line of the protein and ligand files.

    Line N of the protein file is the input for line N of the ligand file.

    Args:
      data_dir: unused here.
      tmp_dir: unused; the source files are read from fixed /content paths.
      dataset_split: unused, since `is_generate_per_split` is False.

    Yields:
      dict with the stripped protein line under "inputs" and the stripped
      ligand line under "targets".

    Raises:
      ValueError: if the two files do not have the same number of lines. A
        plain `zip` would silently drop the unmatched tail instead.
    """
    del data_dir, tmp_dir, dataset_split

    inputs_path = "/content/proteins_train.space_sep_seq"
    targets_path = "/content/ligands_train.space_sep_seq"

    with open(inputs_path) as inputs_file, open(targets_path) as targets_file:
      targets_lines = iter(targets_file)
      pair_count = 0
      for inputs_line in inputs_file:
        # `None` marks exhaustion; a real line always contains at least "\n"
        # or some text, so it can never be mistaken for the sentinel.
        targets_line = next(targets_lines, None)
        if targets_line is None:
          raise ValueError(
              "%s has more lines than %s (only %d aligned pairs)" %
              (inputs_path, targets_path, pair_count))
        pair_count += 1
        yield {"inputs": inputs_line.strip(), "targets": targets_line.strip()}

      # The inputs are exhausted; any remaining target line is unmatched.
      if next(targets_lines, None) is not None:
        raise ValueError(
            "%s has more lines than %s (only %d aligned pairs)" %
            (targets_path, inputs_path, pair_count))

  # END: Methods we should override.

  # START: Overridable methods.

  @property
  def vocab_type(self):
    """Returns `VocabType.TOKEN`."""
    # We can use different types of vocabularies, `VocabType.CHARACTER`,
    # `VocabType.SUBWORD` and `VocabType.TOKEN`.
    #
    # NOTE(review): the notebook uploads a vocab file into the data directory
    # before running t2t-datagen; presumably that file is the token vocabulary
    # this setting relies on -- confirm.
    return text_problems.VocabType.TOKEN

  @property
  def dataset_splits(self):
    """Returns the shard layout: every shard goes to TRAIN, none to EVAL/TEST."""
    # NOTE(review): 196504 looks like an example count rather than a shard
    # count -- confirm that this many training shards is intended.
    return [{
        "split": problem.DatasetSplit.TRAIN,
        "shards": 196504,
    }, {
        "split": problem.DatasetSplit.EVAL,
        "shards": 0,
    }, {
        "split": problem.DatasetSplit.TEST,
        "shards": 0,
    }]

  # END: Overridable methods.


In [0]:
%%writefile transformer_scratch/__init__.py

from transformer_scratch import prob

In [0]:
# Run t2t-datagen for the registered problem, with /tmp/t2t/data as the data
# directory and the problem definition loaded from /content/transformer_scratch.
! t2t-datagen \
  --problem=protein_specific_ligand_generation \
  --data_dir=/tmp/t2t/data \
  --tmp_dir=/tmp/t2t/tmp \
  --t2t_usr_dir=/content/transformer_scratch

In [0]:
# Train the `transformer` model with the `transformer_tiny` hparams set plus
# the overrides given in --hparams, using /tmp/t2t/output as the output
# directory. Only the `train` schedule is run.
! t2t-trainer \
  --model=transformer \
  --hparams_set=transformer_tiny \
  --hparams="batch_size=4096,max_length=0,label_smoothing=0,learning_rate_decay_scheme='noam',num_hidden_layers=4" \
  --problem=protein_specific_ligand_generation \
  --worker_gpu=1 \
  --train_steps=5000000 \
  --schedule=train \
  --data_dir=/tmp/t2t/data \
  --output_dir=/tmp/t2t/output \
  --t2t_usr_dir=/content/transformer_scratch

In [0]:
# Move into the data directory so the archive uploaded next lands there.
%cd /tmp/t2t/data

/tmp/t2t/data


In [0]:
# Upload the test-set archive into the current directory. The next cell
# unzips proteins_test.zip, so that is presumably the file to select here.
from google.colab import files
files.upload()

In [0]:
!unzip proteins_test.zip

In [0]:
% cd /content

/content


In [0]:
from tensor2tensor.bin import t2t_decoder

In [0]:
# Decode the proteins listed in proteins_test5.space_sep_seq_unique with beam
# size 4 and write the output to protein.seq.decode.results_bs4 in the
# current directory. Model, problem and --hparams match the training cell.
! t2t-decoder \
  --hparams_set=transformer_tiny \
  --hparams="batch_size=4096,max_length=0,label_smoothing=0,learning_rate_decay_scheme='noam',num_hidden_layers=4" \
  --decode_hparams="beam_size=4,alpha=0.6,batch_size=4" \
  --model=transformer \
  --problem=protein_specific_ligand_generation \
  --data_dir=/tmp/t2t/data \
  --output_dir=/tmp/t2t/output \
  --t2t_usr_dir=/content/transformer_scratch \
  --decode_from_file=/tmp/t2t/data/proteins_test5.space_sep_seq_unique \
  --decode_to_file=protein.seq.decode.results_bs4

In [0]:
# Second decoding run over the same input file, this time with beam_size=10
# and return_beams=True, written to protein.seq.decode.results_bs4_1000.
# NOTE(review): the output name says "bs4_1000" while beam_size is 10 --
# confirm the file naming is intended.
! t2t-decoder \
  --hparams_set=transformer_tiny \
  --hparams="batch_size=4096,max_length=0,label_smoothing=0,learning_rate_decay_scheme='noam',num_hidden_layers=4" \
  --decode_hparams="beam_size=10,alpha=0.6,batch_size=4,write_beam_scores=False, return_beams=True" \
  --model=transformer \
  --problem=protein_specific_ligand_generation \
  --data_dir=/tmp/t2t/data \
  --output_dir=/tmp/t2t/output \
  --t2t_usr_dir=/content/transformer_scratch \
  --decode_from_file=/tmp/t2t/data/proteins_test5.space_sep_seq_unique \
  --decode_to_file=protein.seq.decode.results_bs4_1000

In [0]:
# Switch to the training output directory; the cells below use relative paths.
%cd /tmp/t2t/output

In [0]:
# List the contents of the current directory (/tmp/t2t/output after the
# preceding %cd).
!ls

In [0]:
# Delete the file named `checkpoint` from the current directory.
# NOTE(review): the reason for removing it is not evident from the notebook --
# confirm this step is intended before re-running.
!rm checkpoint

In [0]:
# Back up the step-600000 checkpoint files from the current directory to
# Google Drive. Requires Drive to be mounted at /content/drive.
! cp model.ckpt-600000.* /content/drive/My\ Drive/

In [0]:
# Open the upload widget once more.
# NOTE(review): nothing in the notebook shows which file is expected here --
# confirm the purpose of this cell or remove it.
from google.colab import files
files.upload()