# IMPORT LIBRARY

In [18]:
import json

import pandas as pd
import numpy as np
import plotly.express as px
import tensorflow.keras.layers as L
import tensorflow as tf
from sklearn.model_selection import train_test_split

# CODE

In [19]:
tf.random.set_seed(2020)
np.random.seed(2020)

In [20]:
# This will tell us the columns we are predicting
pred_cols = ['reactivity', 'deg_Mg_pH10', 'deg_Mg_50C', 'deg_pH10', 'deg_50C']

In [21]:
y_true = tf.random.normal((32, 68, 3))
y_pred = tf.random.normal((32, 68, 3))

In [22]:
def MCRMSE(y_true, y_pred):
    colwise_mse = tf.reduce_mean(tf.square(y_true - y_pred), axis=1)
    return tf.reduce_mean(tf.sqrt(colwise_mse), axis=1)

In [23]:
def gru_layer(hidden_dim, dropout):
    return L.Bidirectional(L.GRU(
        hidden_dim, dropout=dropout, return_sequences=True, kernel_initializer='orthogonal'))

In [24]:
def build_model(embed_size, seq_len=107, pred_len=68, dropout=0.5, 
                sp_dropout=0.2, embed_dim=200, hidden_dim=256, n_layers=3):
    inputs = L.Input(shape=(seq_len, 3))
    embed = L.Embedding(input_dim=embed_size, output_dim=embed_dim)(inputs)
    
    reshaped = tf.reshape(
        embed, shape=(-1, embed.shape[1],  embed.shape[2] * embed.shape[3])
    )
    hidden = L.SpatialDropout1D(sp_dropout)(reshaped)
    
    for x in range(n_layers):
        hidden = gru_layer(hidden_dim, dropout)(hidden)
    
    # Since we are only making predictions on the first part of each sequence, 
    # we have to truncate it
    truncated = hidden[:, :pred_len]
    out = L.Dense(5, activation='linear')(truncated)
    
    model = tf.keras.Model(inputs=inputs, outputs=out)
    model.compile(tf.optimizers.Adam(), loss=MCRMSE)
    
    return model

In [25]:
def pandas_list_to_array(df):
    """
    Input: dataframe of shape (x, y), containing list of length l
    Return: np.array of shape (x, l, y)
    """
    
    return np.transpose(
        np.array(df.values.tolist()),
        (0, 2, 1)
    )

In [26]:
def preprocess_inputs(df, token2int, cols=['sequence', 'structure', 'predicted_loop_type']):
    return pandas_list_to_array(
        df[cols].applymap(lambda seq: [token2int[x] for x in seq])
    )

In [27]:
data_dir = 'dataset/'
train = pd.read_json(data_dir + 'train.json', lines=True)
test = pd.read_json(data_dir + 'test.json', lines=True)
sample_df = pd.read_csv(data_dir + 'sample_submission.csv')

In [28]:
train = train.query("signal_to_noise >= 1")

In [29]:
# We will use this dictionary to map each character to an integer
# so that it can be used as an input in keras
token2int = {x:i for i, x in enumerate('().ACGUBEHIMSX')}

train_inputs = preprocess_inputs(train, token2int)
train_labels = pandas_list_to_array(train[pred_cols])

In [30]:
x_train, x_val, y_train, y_val = train_test_split(
    train_inputs, train_labels, test_size=.1, random_state=34, stratify=train.SN_filter)

In [31]:
public_df = test.query("seq_length == 107")
private_df = test.query("seq_length == 130")

public_inputs = preprocess_inputs(public_df, token2int)
private_inputs = preprocess_inputs(private_df, token2int)

In [32]:
model = build_model(embed_size=len(token2int))
model.summary()

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 107, 3)]          0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 107, 3, 200)       2800      
_________________________________________________________________
tf.reshape_1 (TFOpLambda)    (None, 107, 600)          0         
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 107, 600)          0         
_________________________________________________________________
bidirectional_3 (Bidirection (None, 107, 512)          1317888   
_________________________________________________________________
bidirectional_4 (Bidirection (None, 107, 512)          1182720   
_________________________________________________________________
bidirectional_5 (Bidirection (None, 107, 512)          1182

In [33]:
history = model.fit(
    x_train, y_train,
    validation_data=(x_val, y_val),
    batch_size=64,
    epochs=75,
    verbose=2,
    callbacks=[
        tf.keras.callbacks.ReduceLROnPlateau(patience=5),
        tf.keras.callbacks.ModelCheckpoint('model.h5')
    ]
)

Epoch 1/75
30/30 - 166s - loss: 0.4564 - val_loss: 0.3878
Epoch 2/75
30/30 - 118s - loss: 0.3894 - val_loss: 0.3589
Epoch 3/75
30/30 - 133s - loss: 0.3632 - val_loss: 0.3428
Epoch 4/75
30/30 - 168s - loss: 0.3503 - val_loss: 0.3268
Epoch 5/75
30/30 - 178s - loss: 0.3392 - val_loss: 0.3210
Epoch 6/75
30/30 - 187s - loss: 0.3316 - val_loss: 0.3178
Epoch 7/75
30/30 - 186s - loss: 0.3245 - val_loss: 0.3170
Epoch 8/75
30/30 - 190s - loss: 0.3171 - val_loss: 0.3012
Epoch 9/75
30/30 - 189s - loss: 0.3096 - val_loss: 0.2959
Epoch 10/75
30/30 - 178s - loss: 0.3017 - val_loss: 0.2855
Epoch 11/75
30/30 - 191s - loss: 0.2937 - val_loss: 0.2824
Epoch 12/75
30/30 - 193s - loss: 0.2886 - val_loss: 0.2746
Epoch 13/75
30/30 - 198s - loss: 0.2815 - val_loss: 0.2701
Epoch 14/75
30/30 - 198s - loss: 0.2751 - val_loss: 0.2628
Epoch 15/75
30/30 - 196s - loss: 0.2684 - val_loss: 0.2567
Epoch 16/75
30/30 - 173s - loss: 0.2619 - val_loss: 0.2536
Epoch 17/75
30/30 - 181s - loss: 0.2592 - val_loss: 0.2575
Epoch 

In [34]:
fig = px.line(
    history.history, y=['loss', 'val_loss'],
    labels={'index': 'epoch', 'value': 'MCRMSE'}, 
    title='Training History')
fig.show()

In [35]:
# Caveat: The prediction format requires the output to be the same length as the input,
# although it's not the case for the training data.
model_public = build_model(seq_len=107, pred_len=107, embed_size=len(token2int))
model_private = build_model(seq_len=130, pred_len=130, embed_size=len(token2int))

model_public.load_weights('model.h5')
model_private.load_weights('model.h5')

In [36]:
public_preds = model_public.predict(public_inputs)
private_preds = model_private.predict(private_inputs)

In [37]:
preds_ls = []

for df, preds in [(public_df, public_preds), (private_df, private_preds)]:
    for i, uid in enumerate(df.id):
        single_pred = preds[i]

        single_df = pd.DataFrame(single_pred, columns=pred_cols)
        single_df['id_seqpos'] = [f'{uid}_{x}' for x in range(single_df.shape[0])]

        preds_ls.append(single_df)

preds_df = pd.concat(preds_ls)
preds_df.head()

Unnamed: 0,reactivity,deg_Mg_pH10,deg_Mg_50C,deg_pH10,deg_50C,id_seqpos
0,0.639359,0.62336,0.535205,2.100565,0.778556,id_00073f8be_0
1,1.977642,2.882958,3.149884,4.095637,2.873847,id_00073f8be_1
2,1.472459,0.582111,0.627631,0.60384,0.689328,id_00073f8be_2
3,1.277874,1.155778,1.657048,1.201191,1.790207,id_00073f8be_3
4,0.842316,0.661764,0.878112,0.564686,0.91311,id_00073f8be_4


In [38]:
submission = sample_df[['id_seqpos']].merge(preds_df, on=['id_seqpos'])
submission.to_csv('submission.csv', index=False)