In [2]:
import os
import multiprocessing
from absl import flags
import common.model.utils_ori as utils
import numpy as np
import tensorflow as tf
from common.bio.amino_acid import numpy_seqs_to_fasta

from common.bio.constants import get_lesk_color_mapping
from gan.documentation import add_image_grid
from gan.protein.custom_scalars import add_custom_scalar
from gan.protein.helpers import convert_to_acid_ids, REAL_PROTEINS, ACID_EMBEDDINGS_SCOPE, ACID_EMBEDDINGS, \
    FAKE_PROTEINS, CLASS_MAPPING, SEQ_LENGTH, get_shape, LABELS, NUM_AMINO_ACIDS, get_file
from common.model import ops
from common.model.ops import pad_up_to, gelu


from gan.parameters import get_flags

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
def extract_seq_and_label(record, args):
    """Extracts and preprocesses the sequences and label from the record.

    Args:
      record: tfrecord from file
      args: list of arguments

    Returns:
      Sequence and label as tensors

    """
    features = tf.parse_single_example(
        serialized=record,
        features={
            'label': tf.FixedLenFeature([1], tf.int64),
            'sequence': tf.FixedLenSequenceFeature([], dtype=tf.int64, allow_missing=True)
        },
    )
    seq = tf.cast(features['sequence'], tf.int32, name="seq")
    labels = tf.cast(features['label'], tf.int32, name="labels")

    seq = pad_up_to(seq, args[0], dynamic_padding=args[1])
    return seq, labels

In [4]:
def get_upsampling_factor(full_path):
    """
    Function that parses the file name to know how much upsampling is required.
    Args:
        full_path: a path of the tfrecords file

    Returns:
        return upsampling factor. If file is not in the right format return -1 (infinite upsampling)
    """
    filename = os.path.splitext(os.path.basename(full_path))[0]
    parts = filename.split("_")
    if len(parts) == 3 or len(parts) == 2:
        return int(parts[-1])*int(parts[-2])
    else:
        return -1

In [None]:
def get_batches(fn, data_dir, batch_size, shuffle_buffer_size=100000, running_mode='train', args=None,
                balance=True):
    """

    Args:
      fn: function that tells how to process tfrecord
      data_dir: The directory to read data from.
      batch_size: The number of elements in a single minibatch.
      cycle_length: The number of input elements to process concurrently in the dataset loader. (Default: 1)
      shuffle_buffer_size: The number of records to load before shuffling. (Default: 100000)
      running_mode: string that is used to determine from where the data should be loaded (Default: train)
      args: list of arguments that will be passed into function that processes tfrecord (Default: None)

    Returns:
      A batch worth of data.

    """

    print("Loading files from {}".format(data_dir))
    filenames = tf.gfile.Glob(os.path.join(data_dir, running_mode, "*.tfrecords"))
    print("Found {} file(s)".format(len(filenames)))
    upsampling_factor = [ get_upsampling_factor(fn) for fn in filenames]
    print(upsampling_factor)
    filename_dataset = tf.data.Dataset.from_tensor_slices((filenames, upsampling_factor))
    print(list(filename_dataset))
    filename_dataset = filename_dataset.shuffle(len(filenames))
    prefetch = max(int(batch_size / len(filenames)), 1)

    # Repeat data in the file for unlimited number. This solves class imbalance problem.
    def get_tfrecord_dataset(filename, upsampling_factor):
        tfrecord_dataset = tf.data.TFRecordDataset(filename).prefetch(prefetch)
        if balance:
            tfrecord_dataset = tfrecord_dataset.repeat(tf.cast(upsampling_factor, dtype=tf.int64))
        return tfrecord_dataset

    dataset = filename_dataset.interleave(
        lambda filename, upsampling_factor: get_tfrecord_dataset(filename, upsampling_factor),
        cycle_length=len(filenames))
    print(list(dataset))
    print("Loading process will use {} CPUs".format(multiprocessing.cpu_count()))
    dataset = dataset.shuffle(buffer_size=shuffle_buffer_size, reshuffle_each_iteration=True)
    dataset = dataset.map(lambda x: fn(x, args), num_parallel_calls=multiprocessing.cpu_count()).prefetch(batch_size)
    dataset = dataset.batch(batch_size, drop_remainder=True).repeat()

    iterator = dataset.make_one_shot_iterator()
    return iterator.get_next()


In [6]:
def get_batch(batch_size, data_dir, dataset, shuffle_buffer_size, running_mode, dynamic_padding):
    path = os.path.join(data_dir, dataset.replace("\\", os.sep))
    extract_fn = extract_seq_and_label
    batches = get_batches(extract_fn, path, batch_size, shuffle_buffer_size=shuffle_buffer_size,
                                running_mode=running_mode, args=[[512], dynamic_padding])
    return batches

In [7]:
def prepare_real_data(real_x, labels, data_dir, dataset):
    real_x = tf.reshape(real_x, [10, 512], name=REAL_PROTEINS)
    #reactions = get_reactions(labels, data_dir, dataset)
    labels = tf.identity(tf.squeeze(labels), name=LABELS)
    real_x = convert_real_to_one_hot(real_x)
    real_x = tf.expand_dims(real_x, 1)
    return real_x, labels

In [8]:
def get_reactions(labels, data_dir, dataset):
    reaction_tensors = []
    for reaction in get_reaction_data(data_dir, dataset):
        # label = tf.convert_to_tensor(reaction[0])
        r = [pad_up_to(tf.convert_to_tensor(reaction[i]), [128], True) for
             i in range(1, 5)]
        reaction_tensors.append(r)
    reaction_tensors = tf.stack(reaction_tensors)
    return tf.gather_nd(reaction_tensors, tf.expand_dims(labels, axis=1))

In [9]:
def convert_real_to_one_hot(real_x): # Label smoothing?
    real_to_display = tf.one_hot(real_x, 21, axis=1)
    real_to_display = tf.transpose(real_to_display, perm=[0, 2, 1])
    return real_to_display

In [10]:
def get_reaction_data(data_dir, dataset):
    filename = "train" + "_reactions.npy"
    reaction_path = os.path.join(data_dir, dataset.replace("\\", os.sep), filename)
    try:
        reactions = np.load(reaction_path)
    except Exception as e:
        tf.logging.warn("Reaction file could not be loaded: " + e.__str__())
        reactions = []
    return reactions

In [11]:
CURRENT_DIRECTORY = "/home/evgeny/code/AAVGAN/src/gan"
DATASET = 'protein'

In [20]:
data_dir = os.path.join(CURRENT_DIRECTORY, '..\..\data'.replace("\\", os.sep))
dataset = DATASET + "/testing_cgan_up_150"
already_embedded = False
shuffle_buffer_size = 100000
running_mode = "train"
dynamic_padding = True
sess = tf.InteractiveSession()
batch = get_batch(10, data_dir, dataset, shuffle_buffer_size, running_mode, dynamic_padding)
real_x, labels = batch[0], batch[1]
#print(real_x.eval())
#print(labels.eval())
real_x, labels = prepare_real_data(real_x, labels, data_dir, dataset)
print(labels)

2022-06-13 15:22:43.110176: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:1005] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-13 15:22:43.110314: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1640] Found device 0 with properties: 
name: NVIDIA GeForce RTX 2070 major: 7 minor: 5 memoryClockRate(GHz): 1.62
pciBusID: 0000:01:00.0
2022-06-13 15:22:43.110351: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcudart.so.10.0
2022-06-13 15:22:43.110361: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcublas.so.10.0
2022-06-13 15:22:43.110370: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcufft.so.10.0
2022-06-13 15:22:43.110379: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic librar

Loading files from /home/evgeny/code/AAVGAN/src/gan/../../data/protein/testing_cgan_up_150
Found 14 file(s)
[30, 90, 50, 450, 20, 5, 1, 2, 80, 40, 3, 10, 70, 60]


KeyboardInterrupt: 