# Install necessary packages

In [None]:
# !pip install --user pandas

# Imports

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer

# Project hyper-parameters

In [None]:
# Training and architecture settings shared by the cells below.
LR = 0.01                    # initial learning rate handed to the Adam optimizer
EPOCHS = 200                 # number of epochs passed to model.fit
BATCH_SIZE = 256             # mini-batch size passed to model.fit
EMBED_DIM = 107              # default embedding output size in build_model
N_HIDDEN_LAYERS = 3          # default number of stacked bidirectional LSTM layers
HIDDEN_DIM = 128             # default LSTM units per direction
DROPOUT = 0.5                # default dropout inside each LSTM layer
SP_DROPOUT = 0.2             # default SpatialDropout1D rate after the embedding
TRAIN_SEQUENCE_LENGTH = 107  # default input sequence length in build_model
TEST_SEQUENCE_LENGTH = 103   # NOTE(review): not referenced by the visible cells
MODEL_VERSION = "v0.1.3"     # embedded in the checkpoint file name

# Load and preprocess data

## Load data

In [None]:
# Load the training set; lines=True reads the file as one JSON record per line.
train_df = pd.read_json("data/train.json", lines=True)

In [None]:
# Load the test set (one JSON record per line) and split it by the value of
# its seq_length column, because each length needs its own inference model.
test_df = pd.read_json("data/test.json", lines=True)
public_test_df = test_df.loc[test_df["seq_length"] == 107]
private_test_df = test_df.loc[test_df["seq_length"] == 130]

In [None]:
# Template submission; its id_seqpos column is used later to select and order
# the predicted rows.
sample_submission_df = pd.read_csv("data/sample_submission.csv")

## Preprocess data

In [None]:
# Every character the tokenizer is fitted on in process_features.
symbols = "().ACGUBEHIMSX"

# Input text columns of the data frames.
feat_cols = [
    "sequence",
    "structure",
    "predicted_loop_type",
]

# Target columns; also used as the prediction column names in the submission.
pred_cols = [
    "reactivity",
    "deg_Mg_pH10",
    "deg_Mg_50C",
    "deg_pH10",
    "deg_50C",
]

In [None]:
def process_features(df):
    """Turn the three text columns of ``df`` into one integer token array.

    A character-level tokenizer is fitted on the module-level ``symbols``
    string and applied to the ``sequence``, ``structure`` and
    ``predicted_loop_type`` columns.

    Returns:
        A tuple ``(tokens, vocab_size)``. ``tokens`` holds the three encoded
        columns stacked on a new axis and then moved to the last axis.
        ``vocab_size`` is the number of tokenizer entries plus one.

    All rows are assumed to contain strings of equal length so that the
    encoded columns can be stacked -- TODO confirm against callers.
    """
    frame = df.copy()

    tokenizer = Tokenizer(char_level=True, filters="")
    tokenizer.fit_on_texts(symbols)

    # One list of token-id rows per text column, in a fixed column order.
    token_channels = [
        tokenizer.texts_to_sequences(np.array(frame[column].values.tolist()))
        for column in ("sequence", "structure", "predicted_loop_type")
    ]

    # Stack as (rows, columns, positions), then put the columns axis last.
    stacked = np.stack(token_channels, axis=1)
    vocab_size = len(tokenizer.word_index) + 1

    return stacked.transpose(0, 2, 1), vocab_size

def process_labels(df):
    """Collect the ``pred_cols`` target columns of ``df`` into one array.

    The per-row target lists are converted to a single array and its last two
    axes are swapped, so the target-column axis ends up last.

    Every cell is assumed to hold an equal-length list of numbers -- TODO
    confirm against the training data.
    """
    frame = df.copy()
    targets = np.array(frame[pred_cols].values.tolist())
    return targets.transpose(0, 2, 1)

In [None]:
# Encode the training inputs (also yields the vocabulary size used to size the
# embedding) and gather the matching targets.
x_train, vocab_size = process_features(train_df)
y_train = process_labels(train_df)

# Define and train the model

In [None]:
def lstm_layer(hidden_dim, dropout):
    """Create a bidirectional LSTM layer that returns an output per time step.

    Args:
        hidden_dim: Number of units of the wrapped LSTM.
        dropout: Dropout rate passed to the wrapped LSTM.
    """
    recurrent = tf.keras.layers.LSTM(
        units=hidden_dim,
        return_sequences=True,
        dropout=dropout,
    )
    return tf.keras.layers.Bidirectional(recurrent)

In [None]:
def build_model(vocab_size, seq_length=TRAIN_SEQUENCE_LENGTH, pred_len=68,
                embed_dim=EMBED_DIM,
                n_hidden_layers=N_HIDDEN_LAYERS,
                hidden_dim=HIDDEN_DIM, dropout=DROPOUT, sp_dropout=SP_DROPOUT):
    """Build the embedding + stacked bidirectional LSTM regression model.

    Args:
        vocab_size: Size of the embedding vocabulary.
        seq_length: Number of positions in each input sequence.
        pred_len: Number of leading positions for which outputs are produced.
        embed_dim: Embedding size per token.
        n_hidden_layers: Number of stacked bidirectional LSTM layers.
        hidden_dim: LSTM units per direction.
        dropout: Dropout rate inside each LSTM layer.
        sp_dropout: SpatialDropout1D rate applied to the merged embeddings.

    Returns:
        An uncompiled ``tf.keras.Model`` taking ``(seq_length, 3)`` token ids
        and emitting 5 linear outputs for each of the first ``pred_len``
        positions.
    """
    n_features = 3  # token channels per position (see the Input shape)

    inputs = tf.keras.layers.Input(shape=(seq_length, n_features))

    embed = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)(inputs)

    # Merge the per-channel embeddings of each position into one vector.
    # A Reshape layer is used instead of a raw tf.reshape so the step is a
    # proper Keras layer; it has no weights, so saved checkpoints still load.
    hidden = tf.keras.layers.Reshape((seq_length, n_features * embed_dim))(embed)

    hidden = tf.keras.layers.SpatialDropout1D(sp_dropout)(hidden)

    for _ in range(n_hidden_layers):
        hidden = lstm_layer(hidden_dim, dropout)(hidden)

    # Only the first pred_len positions are scored.
    truncated = hidden[:, :pred_len]

    out = tf.keras.layers.Dense(5, activation='linear')(truncated)

    return tf.keras.Model(inputs=inputs, outputs=out)

In [None]:
# Training model, built with the default sequence and prediction lengths.
model = build_model(vocab_size)

In [None]:
# Print the layer-by-layer overview of the training model.
model.summary()

In [None]:
class MeanColumnwiseRMSE(tf.keras.losses.Loss):
    """Loss that averages per-column root-mean-square errors.

    The squared error is averaged over axis 1, square-rooted, and the result
    is averaged over its remaining non-batch axis. With the label layout
    produced by process_labels this presumably means: RMSE over positions for
    each target column, then the mean over columns -- TODO confirm.
    """

    def __init__(self, name='MeanColumnwiseRMSE'):
        super().__init__(name=name)

    def call(self, y_true, y_pred):
        """Return one loss value per sample."""
        residual = y_true - y_pred
        per_column_mse = tf.reduce_mean(tf.square(residual), axis=1)
        per_column_rmse = tf.sqrt(per_column_mse)
        return tf.reduce_mean(per_column_rmse, axis=1)

In [None]:
# Attach the Adam optimizer (initial rate LR) and the column-wise RMSE loss.
model.compile(
    optimizer=tf.optimizers.Adam(learning_rate=LR),
    loss=MeanColumnwiseRMSE(),
)

In [None]:
def scheduler(epoch, lr):
    """Learning-rate schedule: hold for 10 epochs, then decay exponentially.

    Args:
        epoch: Zero-based index of the epoch about to start.
        lr: Learning rate currently in use.

    Returns:
        ``lr`` unchanged while ``epoch < 10``; afterwards ``lr * exp(-0.01)``
        as a plain Python float. A float (not a tensor) is returned because
        that is what the LearningRateScheduler callback expects.
    """
    if epoch < 10:
        return lr
    return float(lr * np.exp(-0.01))

# Train with 10% of the data held out for validation. The learning rate
# follows `scheduler`, and the checkpoint callback only overwrites the file
# when its monitored metric improves.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=1,
    validation_split=.1,
    callbacks=[
        tf.keras.callbacks.LearningRateScheduler(scheduler),
        tf.keras.callbacks.ModelCheckpoint(
            f"model_{MODEL_VERSION}.h5",
            save_best_only=True,
        ),
    ],
)

## Evaluate the model

In [None]:
# Persist the model as it stands after the last epoch (distinct from the
# best-only checkpoint file written during training).
model.save('saved_model')

In [None]:
# Encode both test splits; the returned vocabulary size is discarded because
# the value computed from the training data is reused.
public_test_data, _ = process_features(public_test_df)
private_test_data, _ = process_features(private_test_df)

In [None]:
# Build one inference model per test sequence length, predicting every
# position (pred_len equals seq_length) rather than only the first 68.
model_public = build_model(vocab_size, seq_length=107, pred_len=107)
model_private = build_model(vocab_size, seq_length=130, pred_len=130)

# Both models load the checkpoint written during training.
# NOTE(review): this relies on the layer weights not depending on the
# sequence length -- confirm if the architecture changes.
model_public.load_weights(f"model_{MODEL_VERSION}.h5")
model_private.load_weights(f"model_{MODEL_VERSION}.h5")

In [None]:
# Run inference on each split with the model of the matching sequence length.
public_preds = model_public.predict(public_test_data)
private_preds = model_private.predict(private_test_data)

# Submission

In [None]:
# Build one small frame per test sample: a row per sequence position, the
# prediction columns, and an "<id>_<position>" key for the submission merge.
preds_ls = []

for df, preds in ((public_test_df, public_preds), (private_test_df, private_preds)):
    for i, uid in enumerate(df["id"]):
        position_ids = [f'{uid}_{pos}' for pos in range(len(preds[i]))]
        preds_ls.append(
            pd.DataFrame(preds[i], columns=pred_cols).assign(id_seqpos=position_ids)
        )

preds_df = pd.concat(preds_ls)
preds_df.head()

In [None]:
# Keep only the id_seqpos keys from the template and attach the predictions
# to them (default inner join), then write the submission file without the
# index column.
submission = pd.merge(
    sample_submission_df[['id_seqpos']],
    preds_df,
    on=['id_seqpos'],
)
submission.to_csv('submission.csv', index=False)