# Install necessary packages

In [None]:
# !pip install --user pandas

# Imports

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer

# Project hyper-parameters

In [2]:
# Hyper-parameters
LR = 1e-2
EPOCHS = 200
BATCH_SIZE = 256
EMBED_DIM = 107
N_HIDDEN_LAYERS = 3
HIDDEN_DIM = 128
DROPOUT = .5
SP_DROPOUT = .2
TRAIN_SEQUENCE_LENGTH = 107
TEST_SEQUENCE_LENGTH = 103
MODEL_VERSION = "v0.1.3"

# Load and preprocess data

## Load data

In [3]:
train_df = pd.read_json("data/train.json", lines=True)

In [4]:
test_df = pd.read_json("data/test.json", lines=True)
public_test_df = test_df.query("seq_length == 107")
private_test_df = test_df.query("seq_length == 130")

In [5]:
sample_submission_df = pd.read_csv("data/sample_submission.csv")

## Preprocess data

In [6]:
symbols = "().ACGUBEHIMSX"
feat_cols = ['sequence', 'structure', 'predicted_loop_type']
pred_cols = ['reactivity', 'deg_Mg_pH10', 'deg_Mg_50C', 'deg_pH10', 'deg_50C']

In [7]:
def process_features(df):
    df = df.copy()
    
    sequence_sentences = np.array(df["sequence"].values.tolist())
    structure_sentences = np.array(df["structure"].values.tolist())
    loop_sentences = np.array(df["predicted_loop_type"].values.tolist())
    
    tokenizer = Tokenizer(char_level=True, filters="")
    tokenizer.fit_on_texts(symbols)
    
    sequence_tokens = tokenizer.texts_to_sequences(sequence_sentences)
    structure_tokens = tokenizer.texts_to_sequences(structure_sentences)
    loop_tokens = tokenizer.texts_to_sequences(loop_sentences)
    
    sequences = np.stack((sequence_tokens, structure_tokens, loop_tokens), axis=1)
    
    return np.transpose(sequences, (0, 2, 1)), len(tokenizer.word_index)+1

def process_labels(df):
    df = df.copy()
    
    labels = np.array(df[pred_cols].values.tolist())
    labels = np.transpose(labels, (0, 2, 1))
    
    return labels

In [8]:
x_train, vocab_size = process_features(train_df)
y_train = process_labels(train_df)

# Define and train the model

In [9]:
def lstm_layer(hidden_dim, dropout):
    return tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(hidden_dim, dropout=dropout, return_sequences=True,)
    )

In [10]:
def build_model(vocab_size, seq_length=TRAIN_SEQUENCE_LENGTH, pred_len=68,
                embed_dim=EMBED_DIM,
                n_hidden_layers=N_HIDDEN_LAYERS,
                hidden_dim=HIDDEN_DIM, dropout=DROPOUT, sp_dropout=SP_DROPOUT):
    inputs = tf.keras.layers.Input(shape=(seq_length, 3))

    embed = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)(inputs)
    
    hidden = tf.reshape(
        embed, shape=(-1, embed.shape[1],  embed.shape[2] * embed.shape[3])
    )
    
    hidden = tf.keras.layers.SpatialDropout1D(sp_dropout)(hidden)
    
    for x in range(n_hidden_layers):
        hidden = lstm_layer(hidden_dim, dropout)(hidden)
    
    truncated = hidden[:, :pred_len]
    
    out = tf.keras.layers.Dense(5, activation='linear')(truncated)
    
    model = tf.keras.Model(inputs=inputs, outputs=out)
    
    return model

In [11]:
model = build_model(vocab_size)

In [12]:
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 107, 3)]          0         
_________________________________________________________________
embedding (Embedding)        (None, 107, 3, 107)       1605      
_________________________________________________________________
tf_op_layer_Reshape (TensorF [(None, 107, 321)]        0         
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 107, 321)          0         
_________________________________________________________________
bidirectional (Bidirectional (None, 107, 256)          460800    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 107, 256)          394240    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 107, 256)         

In [13]:
class MeanColumnwiseRMSE(tf.keras.losses.Loss):
    def __init__(self, name='MeanColumnwiseRMSE'):
        super().__init__(name=name)

    def call(self, y_true, y_pred):
        colwise_mse = tf.reduce_mean(tf.square(y_true - y_pred), axis=1)
        return tf.reduce_mean(tf.sqrt(colwise_mse), axis=1)

In [14]:
model.compile(tf.optimizers.Adam(learning_rate=LR), loss=MeanColumnwiseRMSE())

In [15]:
def scheduler(epoch, lr):
    if epoch < 10:
        return lr
    else:
        return lr * tf.math.exp(-0.01)

history = model.fit(x_train, y_train, validation_split=.1,
          batch_size=BATCH_SIZE, epochs=EPOCHS, verbose=1,
          callbacks=[
              tf.keras.callbacks.LearningRateScheduler(scheduler),
              tf.keras.callbacks.ModelCheckpoint(f"model_{MODEL_VERSION}.h5", save_best_only=True)
          ])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

In [22]:
model.save('saved_model')

INFO:tensorflow:Assets written to: saved_model/assets


# Fine tune the model

In [26]:
model = tf.keras.models.load_model("saved_model", compile=False)

In [27]:
model.compile(loss=MeanColumnwiseRMSE(),
              optimizer=tf.keras.optimizers.Adam(learning_rate=1e-4))

In [28]:
ft_history = model.fit(x_train, y_train, validation_split=.1,
                       batch_size=BATCH_SIZE, epochs=100, verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

## Evaluate the model

In [29]:
public_test_data, _ = process_features(public_test_df)
private_test_data, _ = process_features(private_test_df)

In [30]:
model_public = build_model(vocab_size, seq_length=107, pred_len=107)
model_private = build_model(vocab_size, seq_length=130, pred_len=130)

model_public.load_weights(f"model_{MODEL_VERSION}.h5")
model_private.load_weights(f"model_{MODEL_VERSION}.h5")

In [31]:
public_preds = model_public.predict(public_test_data)
private_preds = model_private.predict(private_test_data)

# Submission

In [32]:
preds_ls = []

for df, preds in [(public_test_df, public_preds), (private_test_df, private_preds)]:
    for i, uid in enumerate(df.id):
        single_pred = preds[i]

        single_df = pd.DataFrame(single_pred, columns=pred_cols)
        single_df['id_seqpos'] = [f'{uid}_{x}' for x in range(single_df.shape[0])]

        preds_ls.append(single_df)

preds_df = pd.concat(preds_ls)
preds_df.head()

Unnamed: 0,reactivity,deg_Mg_pH10,deg_Mg_50C,deg_pH10,deg_50C,id_seqpos
0,0.585464,0.598366,0.518347,1.969975,0.706725,id_00073f8be_0
1,2.00408,3.021512,3.253051,4.197938,2.865593,id_00073f8be_1
2,1.352085,0.45449,0.557595,0.554804,0.668549,id_00073f8be_2
3,1.140306,0.992853,1.636934,1.106833,1.573088,id_00073f8be_3
4,0.843897,0.560205,0.888484,0.568688,0.876356,id_00073f8be_4


In [33]:
submission = sample_submission_df[['id_seqpos']].merge(preds_df, on=['id_seqpos'])
submission.to_csv('submission.csv', index=False)