In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
from tqdm import tqdm

In [2]:
## check version in case of version conflict
# import keras
# keras.__version__
# tf.__version__

# !pip install --upgrade tensorflow==2.8.0
# !pip install --upgrade keras

# !pip install tensorflow_addons==0.16.1

In [3]:
# import os
# os.chdir("/content/drive/MyDrive/DREAME_COMPETITION_2022/saved_model/transformer_model")

# def r_square(y_true, y_pred):
#    SS_res =  K.sum(K.square(y_true - y_pred)) 
#    SS_tot = K.sum(K.square(y_true - K.mean(y_true))) 
#    return ( 1 - SS_res/(SS_tot + K.epsilon()) )

import tensorflow_addons as tfa

# R-squared metric instance handed to Keras as the custom object named 'r_square'
# when the saved model is deserialized.
r_square = tfa.metrics.r_square.RSquare(
    dtype=tf.float32,
    y_shape=(1,),
)

# Location of the exported model and the custom objects needed to rebuild it.
saved_model_dir = '../saved_model/tpu_trans_unet_v0.54'
custom_objects = {'r_square': r_square}

# NOTE: when loading on a TPU runtime, pass
#   options=tf.saved_model.LoadOptions(experimental_io_device='/job:localhost')
# to load_model as an additional keyword argument.
model = tf.keras.models.load_model(saved_model_dir, custom_objects=custom_objects)
model.summary()

2022-08-07 15:12:46.849864: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-08-07 15:12:47.373782: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 30989 MB memory:  -> device: 0, name: Tesla V100-SXM2-32GB, pci bus id: 0000:b2:00.0, compute capability: 7.0


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 112, 5)]     0           []                               
                                                                                                  
 conv1d (Conv1D)                (None, 112, 64)      3584        ['input_1[0][0]']                
                                                                                                  
 ecb_1__0 (SqueezeExcitation1DL  (None, 112, 64)     4192        ['conv1d[0][0]']                 
 ayer)                                                                                            
                                                                                                  
 batch_normalization (BatchNorm  (None, 112, 64)     256         ['ecb_1__0[0][0]']           

In [4]:
# verify saved model's loss function: 'huber_loss'
# model.loss.name

In [5]:
# plot model
# tf.keras.utils.plot_model(model)

In [7]:
# Load the tab-separated test file. The DataFrame is kept for the final
# submission; the line dataset below is parsed into plain Python lists.
data_path="../data/test_sequences.txt"
df = pd.read_csv(data_path, sep="\t", header=None, names=["Seq", "Expression"])

dataset = tf.data.TextLineDataset(data_path)

# seq_list: one list of single characters per sequence
# exp_list: the float parsed from the second column of the same line
seq_list, exp_list = [], []

# Wrap the line iterator in tqdm so the bar advances once per parsed line.
pbar = tqdm(dataset.as_numpy_iterator(), total=len(df))
for raw_line in pbar:
  sequence, expression = raw_line.decode("utf-8").split("\t")
  seq_list.append(list(sequence))
  exp_list.append(float(expression))

pbar.close()

100%|██████████| 71103/71103 [00:04<00:00, 17273.15it/s]


In [8]:
# "padding" the test data so as to make sure each batch contains 50 examples
# Number of examples per batch that the padded list must be a multiple of.
BATCH_SIZE = 50

# Derive the pad count from the real number of test rows instead of hard-coding
# it: (-n) % BATCH_SIZE is the smallest count that rounds n up to a full batch
# (0 when n is already a multiple).
n_samples = len(df)
n_pad = (-n_samples) % BATCH_SIZE

# Trim back to the real rows first so that re-running this cell does not keep
# appending more padding, then recycle the leading examples as filler.
seq_list = seq_list[:n_samples] + seq_list[:n_pad]
exp_list = exp_list[:n_samples] + exp_list[:n_pad]
print(len(seq_list))

71150


In [9]:
# Right-pad (or right-truncate) every sequence to 112 positions for trans_unet,
# using "N" as the filler character.
pad_seq_list = tf.keras.preprocessing.sequence.pad_sequences(
    seq_list,
    maxlen=112,
    padding="post",
    truncating='post',
    dtype="str",
    value="N",
)

# One-hot encoder over the nucleotide alphabet.
# NOTE(review): the model input has 5 channels while the vocabulary has 4 entries;
# presumably the extra channel is the layer's out-of-vocabulary slot, which is
# where the "N" filler would land - confirm against the training pipeline.
vocab = ['A','C','G','T']
lookup = tf.keras.layers.StringLookup(vocabulary=vocab, output_mode='one_hot')


def _encode_example(sequence, expression):
  """One-hot encode a padded sequence as float32; pass the label through unchanged."""
  return tf.cast(lookup(sequence), dtype=tf.float32), expression


# Build the input pipeline: (sequence, label) pairs -> encoded -> fixed-size batches.
new_dataset = (
    tf.data.Dataset.from_tensor_slices((pad_seq_list, exp_list))
    .map(_encode_example)
    .batch(50, drop_remainder=True)
)

In [10]:
# Run the loaded model over the batched test dataset. The result still includes
# rows for the filler examples appended earlier to complete the last batch.
pred = model.predict(new_dataset)

2022-08-07 15:13:31.946480: I tensorflow/stream_executor/cuda/cuda_dnn.cc:368] Loaded cuDNN version 8202


In [11]:
# Discard the "padding" samples: only the first len(df) predictions correspond to
# real test rows. Using len(df) instead of a literal row count keeps this correct
# if the test file changes size.
df["Expression"] = pred[:len(df)]

In [12]:
# Inspect summary statistics (count, mean, std, quantiles) of the predictions now
# stored in the "Expression" column, as a quick sanity check of their range.
df.describe()

Unnamed: 0,Expression
count,71103.0
mean,11.690157
std,2.230596
min,-2.536114
25%,9.924775
50%,11.687795
75%,13.600866
max,16.312073


In [13]:
# Previous submission requirement: a tab-separated text file with the columns of
# df, written without a header row and without the index column.
df.to_csv('../submission/submission.txt', sep="\t", header=False, index=False)

### To generate the submission file, run the following code. `pred.json` is saved under the `../submission` folder.

In [14]:
# Y_pred = np.array(df['Expression'])
# Y_pred.shape

(71103,)

In [16]:
import json
from collections import OrderedDict


def dump_predictions(prediction_dict, prediction_file):
  """Serialize the {index: prediction} mapping to `prediction_file` as JSON."""
  with open(prediction_file, 'w') as f:
    json.dump(prediction_dict, f)


# Y_pred must be defined in this cell: it was previously only assigned in
# commented-out code, so a fresh "Restart & Run All" failed with NameError.
# It is a 1-D NumPy array holding one prediction per test sequence.
Y_pred = np.array(df['Expression'])

# The sample submission supplies the keys (test-sequence indices) to fill in.
# file available at https://github.com/de-Boer-Lab/DREAM-2022/blob/main/sample_submission.json
with open('../submission/sample_submission.json', 'r') as f:
  ground = json.load(f)

indices = np.array([int(indice) for indice in ground.keys()])

# Map each index (as a string key, matching the sample file) to a plain Python
# float, because NumPy scalars are not JSON serializable.
PRED_DATA = OrderedDict()
for i in indices:
  PRED_DATA[str(i)] = float(Y_pred[i])

dump_predictions(PRED_DATA, '../submission/pred.json')