In [None]:
from default_variables import build_model

# Import classes.
from default_variables import (
    rc_Conv1D,
    MultiHeadAttention,
    FeedForward,
    LayerNormalization
)


import copy
import datetime
import os
import random
import glob

import keras
import numpy as np
import pandas as pd
import scipy
import tensorflow as tf
import tensorflow.keras as keras
from scipy.stats import *
from sklearn.metrics import *
from sklearn.model_selection import train_test_split
from tensorflow.keras import Input
from tensorflow.keras import backend as K
from tensorflow.keras.backend import conv1d
from tensorflow.keras.layers import (LSTM, Activation, Add, BatchNormalization,
                                     Bidirectional, Concatenate, Conv1D,
                                     Conv2D, Dense, Dropout, Flatten, Lambda,
                                     LeakyReLU, MaxPooling1D, MaxPooling2D,
                                     Permute, Reshape, UpSampling2D)
from tensorflow.keras.models import Model
from tensorflow.keras.regularizers import l1_l2
from tensorflow.python.client import device_lib
from tensorflow.python.keras.utils import conv_utils
from tqdm import tqdm_notebook as tqdm



# Connect to the TPU addressed as "local" and build the distribution strategy
# whose scope the training and inference cells run under.
tf.config.list_logical_devices('TPU')
tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect("local")
# `tf.distribute.experimental.TPUStrategy` is a deprecated alias of the stable
# `tf.distribute.TPUStrategy`, so the stable class is used directly.
tpu_strategy = tf.distribute.TPUStrategy(tpu)

# Required input: set these paths before running the notebook

In [None]:
# Tab-separated file with a sequence column and an expression column; read by
# `_get_augmented_datasets` to build the train/validation/test splits.
path_to_training_sequences = ""
# Tab-separated file read with the same two column names in the testing cell.
path_to_test_sequences = ""

# File that the final predictions are written to as JSON.
output_path_of_final_predictions = ""

# Load, split, augment, and encode the datasets

In [None]:
def _preprocess_x(list_of_sequences):
    def seq2feature(data,mapper,worddim):
        transformed = np.zeros([data.shape[0],1,len(data[0]),4] , dtype=np.bool )
        for i in tqdm(range(data.shape[0])):
            for j,k in enumerate(data[i]):
                #print j,k
                transformed[i,0,j] = mapper[k] 
                #print mapper[k]
        return transformed

    # Add 'N' to sequences that are not of length 110.
    for i in range(0,len(list_of_sequences)) : 
        if (len(list_of_sequences[i]) > 110) :
            list_of_sequences[i] = list_of_sequences[i][-110:]
        if (len(list_of_sequences[i]) < 110) : 
            while (len(list_of_sequences[i]) < 110) :
                list_of_sequences[i] = 'N'+list_of_sequences[i]
                
    A_onehot = np.array([1,0,0,0] ,  dtype=np.bool)
    C_onehot = np.array([0,1,0,0] ,  dtype=np.bool)
    G_onehot = np.array([0,0,1,0] ,  dtype=np.bool)
    T_onehot = np.array([0,0,0,1] ,  dtype=np.bool)
    N_onehot = np.array([0,0,0,0] ,  dtype=np.bool)
    
    mapper = {'A':A_onehot,'C':C_onehot,'G':G_onehot,'T':T_onehot,'N':N_onehot}
    worddim = len(mapper['A'])
    seqdata = np.asarray(list_of_sequences)
    
    seqdata_transformed = seq2feature(seqdata, mapper, worddim)
    
    return np.squeeze(seqdata_transformed)

def _preprocess_y(list_of_expressions):
    return np.asarray(list_of_expressions).astype('float')

def _get_augmented_datasets(path_to_train_sequences: str, random_state: int=420, normal_dist_variance: float=0.3):
    """Load the training file, split it, add label noise, and encode the splits.

    Steps:
      1. Read the tab-separated file as ``sequence`` / ``expression`` columns
         and drop the trailing rows that do not fill a whole batch of 1024.
      2. Hold out 1500 batches as the test split and 50 batches as the
         validation split; the rest is the training split.
      3. Replace every training label that is exactly an integer in 0..17
         with a draw from a normal distribution centred on that integer.
      4. One-hot encode the sequences and convert the labels to float arrays.

    Args:
        path_to_train_sequences: Path of the tab-separated training file.
        random_state: Seed for both splits and for the label noise, so the
            whole function is reproducible.
        normal_dist_variance: Spread of the label noise. Despite its name the
            value is passed as the ``scale`` (standard deviation) argument of
            the normal distribution.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test, x_val, y_val)`` of NumPy
        arrays.

    Raises:
        ValueError: If the file holds too few rows for the fixed test and
            validation split sizes.
    """
    batch_size = 1024
    n_test_rows = 1500 * batch_size
    n_valid_rows = 50 * batch_size

    data = pd.read_csv(
        path_to_train_sequences,
        delimiter="\t",
        names=["sequence", "expression"]
    )

    # Keep only whole batches.
    data = data.iloc[:data.shape[0] - data.shape[0] % batch_size]

    if data.shape[0] <= n_test_rows + n_valid_rows:
        raise ValueError(
            f"Need more than {n_test_rows + n_valid_rows} rows after truncating "
            f"to whole batches, got {data.shape[0]}."
        )

    # Absolute row counts instead of float fractions: a fraction goes through
    # ceil/floor inside scikit-learn and can come out one row off.
    X_train, X_test, y_train, y_test = train_test_split(
        data["sequence"],
        data["expression"],
        test_size=n_test_rows,
        random_state=random_state
    )

    X_train, X_valid, y_train, y_valid = train_test_split(
        X_train,
        y_train,
        train_size=X_train.shape[0] - n_valid_rows,
        random_state=random_state
    )

    # A dedicated, seeded generator makes the label noise reproducible; the
    # global `np.random` state is left alone.
    rng = np.random.default_rng(random_state)

    y_train_df = pd.DataFrame({"expression": y_train})
    # Cast up front so float noise can be written even if all labels are ints.
    y_train_df["new_expression"] = y_train_df["expression"].astype(float)
    for integer_expression in tqdm(range(18)):
        is_level = y_train_df["expression"] == integer_expression
        y_train_df.loc[is_level, "new_expression"] = rng.normal(
            integer_expression,
            normal_dist_variance,
            size=int(is_level.sum())
        )

    y_train = y_train_df["new_expression"]

    preprocessed_xtrain = _preprocess_x(list(X_train))
    print('...done with xtrain')
    preprocessed_ytrain = _preprocess_y(list(y_train))
    print('...done with ytrain')

    preprocessed_xtest = _preprocess_x(list(X_test))
    print('...done with xtest')
    preprocessed_ytest = _preprocess_y(list(y_test))
    print('...done with ytest')

    preprocessed_xval = _preprocess_x(list(X_valid))
    print('...done with xvalid')
    preprocessed_yval = _preprocess_y(list(y_valid))
    print('...done with yvalid')

    return (
        preprocessed_xtrain,
        preprocessed_ytrain,
        preprocessed_xtest,
        preprocessed_ytest,
        preprocessed_xval,
        preprocessed_yval
    )

In [None]:
# Build all three splits in one pass over the training file.
(
    preprocessed_xtrain,
    preprocessed_ytrain,
    preprocessed_xtest,
    preprocessed_ytest,
    preprocessed_xval,
    preprocessed_yval,
) = _get_augmented_datasets(
    path_to_training_sequences,
    random_state=420,
)

# Train the model.

In [None]:
# Hyper-parameters handed to `build_model`. "epochs" and "batch_size" are also
# read directly by the `model.fit` call in the training cell, and the whole
# dict is dumped to `configuration_file.json` next to the saved weights.
model_params = {
    "n_val_epoch": 1000,
    "epochs": 30,
    "batch_size": 1024,
    "l1_weight": 0,
    "l2_weight": 0,
    "motif_conv_hidden": 256,
    "conv_hidden": 64,
    "n_hidden": 64,
    "n_heads": 8,
    "conv_width_motif": 30,
    "dropout_rate": 0.1,
    "lr": 0.001,
    "add_cooperativity_layer": True,
    "n_aux_layers": 1,
    "n_attention_layers": 2,
    "attention_dropout_rate": 0.1,
    # NOTE(review): the notebook runs under a TPU strategy, yet this says
    # "gpu" — confirm how `build_model` interprets the value.
    "device_type": "gpu",
    # NOTE(review): the first entry looks like a training-set row count
    # (5151744 == 5031 * 1024) — confirm it matches the data actually loaded.
    "input_shape": (5151744, 110, 4),
    "loss": "mean_squared_error",
    "optimizer": "Nadam",
    # Per-layer flags grouped by block; presumably they control which layers
    # `build_model` leaves trainable — verify against `build_model`.
    "trainable_layers": {
        "block1": {
            "rc_conv1d": True,
            "conv2d": True,
            "conv1d": True
        },
        "block2": {
            "multiheadattention": True,
            "lstm": True
        }
    }
}

In [None]:
def _create_weights_folder(
    output_folder: str="~/",
    model_name: str="DreamTeam",
    additional_info: str=""
):
    path_to_model_weights_folder = os.path.join(
        output_folder,
        model_name,
        
    )

    now = datetime.datetime.now().strftime("%Y-%m-%d_%Hh%M")
    checkpoint_path = os.path.join(
        path_to_model_weights_folder,
        f"{additional_info}_" + now
    )

    if not os.path.exists(path_to_model_weights_folder):
        os.mkdir(path_to_model_weights_folder)

    if not os.path.exists(checkpoint_path):
        os.mkdir(checkpoint_path)
        
    return checkpoint_path

def _create_folders_and_callbacks(
    output_folder: str,
    model_params: dict,
    model_name: str="DreamTeam",
    additional_info: str="",
):
    """Prepare the run folder and return the Keras callbacks for training.

    Creates the run folder (via ``_create_weights_folder``) with a ``logs``
    and a ``best_weights`` sub-folder, writes ``model_params`` to
    ``configuration_file.json`` inside it, and builds three callbacks:
    a checkpoint written every epoch, a weights-only checkpoint that keeps the
    epoch with the lowest validation loss, and a TensorBoard logger.

    Args:
        output_folder: Root folder under which the run folder is created.
        model_params: Model configuration; must be JSON-serialisable.
        model_name: Name of the sub-folder that groups all runs of one model.
        additional_info: Free-text prefix of the run folder name.

    Returns:
        List with the three callbacks, ready to pass to ``model.fit``.
    """
    # Imported locally: the notebook only imports `json` in a later cell, so
    # a fresh kernel would otherwise hit a NameError at the json.dump below.
    import json

    path_to_weights_folder = _create_weights_folder(
        output_folder=output_folder,
        model_name=model_name,
        additional_info=additional_info
    )

    path_to_logs = os.path.join(path_to_weights_folder, "logs")
    os.makedirs(path_to_logs, exist_ok=True)

    folder_of_best_weights = os.path.join(path_to_weights_folder, "best_weights")
    os.makedirs(folder_of_best_weights, exist_ok=True)

    # path_to_weights_folder
    #   model_name
    #       configuration_file.json
    #       logs/       <--- for tensorboard
    #           train/
    #           validation/
    #       best_weights/ <--- best weights according to validation loss
    #           best-weights.h5
    #       model.epoch-01_train-loss-0.01_val-loss-0.01.h5 <--- all weights

    callbacks = [
        # Save a checkpoint after every epoch.
        tf.keras.callbacks.ModelCheckpoint(
            filepath=os.path.join(
                path_to_weights_folder,
                'model.epoch-{epoch:02d}_train-loss-{loss:.5f}_val-loss-{val_loss:.5f}.h5'),
        ),

        # Keep only the weights of the epoch with the lowest validation loss.
        tf.keras.callbacks.ModelCheckpoint(
            filepath=os.path.join(
                folder_of_best_weights,
                'best-weights.h5'
            ),
            save_weights_only=True,
            monitor='val_loss',
            mode='min',
            save_best_only=True
        ),

        # TensorBoard logs, with weight histograms every epoch.
        tf.keras.callbacks.TensorBoard(
            log_dir=path_to_logs,
            histogram_freq=1
        )
    ]

    print(f"Path to weights and config: {path_to_weights_folder}")
    with open(os.path.join(path_to_weights_folder, "configuration_file.json"), "w") as f:
        json.dump(model_params, f)

    return callbacks

In [None]:
# Build and train the model under the TPU distribution strategy.
with tpu_strategy.scope():
    model = build_model(
        model_params=model_params
    )

    # The comma after `output_folder` was missing, which made this whole cell
    # a SyntaxError.
    callbacks = _create_folders_and_callbacks(
        output_folder="~/",
        model_params=model_params,
        model_name="DreamTeam_model",
        additional_info=""
    )

    model.fit(
        preprocessed_xtrain,
        preprocessed_ytrain,
        epochs=model_params["epochs"],
        batch_size=model_params["batch_size"],
        callbacks=callbacks,
        validation_data=(preprocessed_xval, preprocessed_yval)
    )

# Testing

In [None]:
# REQUIRED: path of the weights file to evaluate. Training writes it as
# `best-weights.h5` inside the run's "best_weights" folder; it is loaded with
# `model.load_weights` in the testing cell.
path_to_best_weights = "" # within the folder "best_weights"

In [None]:
# Load the test sequences: tab-separated, no header row, two named columns.
test_columns = ["sequence", "expression"]
test_set = pd.read_csv(path_to_test_sequences, sep="\t", header=None, names=test_columns)
def _preprocess_x(list_of_sequences):
    def seq2feature(data,mapper,worddim):
        transformed = np.zeros([data.shape[0],1,len(data[0]),4] , dtype=np.bool )
        for i in tqdm(range(data.shape[0])):
            for j,k in enumerate(data[i]):
                #print j,k
                transformed[i,0,j] = mapper[k] 
                #print mapper[k]
        return transformed

    # Add 'N' to sequences that are not of length 110.
    for i in range(0,len(list_of_sequences)) : 
        if (len(list_of_sequences[i]) > 110) :
            list_of_sequences[i] = list_of_sequences[i][-110:]
        if (len(list_of_sequences[i]) < 110) : 
            while (len(list_of_sequences[i]) < 110) :
                list_of_sequences[i] = 'N'+list_of_sequences[i]
                
    A_onehot = np.array([1,0,0,0] ,  dtype=np.bool)
    C_onehot = np.array([0,1,0,0] ,  dtype=np.bool)
    G_onehot = np.array([0,0,1,0] ,  dtype=np.bool)
    T_onehot = np.array([0,0,0,1] ,  dtype=np.bool)
    N_onehot = np.array([0,0,0,0] ,  dtype=np.bool)
    
    mapper = {'A':A_onehot,'C':C_onehot,'G':G_onehot,'T':T_onehot,'N':N_onehot}
    worddim = len(mapper['A'])
    seqdata = np.asarray(list_of_sequences)
    
    seqdata_transformed = seq2feature(seqdata, mapper, worddim)
    
    return np.squeeze(seqdata_transformed)

# Encode the test sequences. A list copy is passed so the `test_set` frame is
# never modified by the preprocessing.
test_x = _preprocess_x(list(test_set["sequence"]))

# Pad the test set to 1500 * 1024 rows by cycling through it from the start.
# The repeat counts are derived from the actual number of test rows instead of
# being hard-coded for one specific file.
# NOTE(review): the reason for padding to exactly 1500 batches is not visible
# here — confirm whether padding to the next multiple of 1024 would suffice.
n_test_rows = test_x.shape[0]
n_padded_rows = 1500 * 1024
if n_test_rows == 0 or n_test_rows > n_padded_rows:
    raise ValueError(
        f"Expected between 1 and {n_padded_rows} test sequences, got {n_test_rows}."
    )
filled_test_x = test_x[np.arange(n_padded_rows) % n_test_rows]


with tpu_strategy.scope():
    model = build_model(
        model_params=model_params
    )

    model.load_weights(path_to_best_weights)

    pred = model.predict(
        filled_test_x,
        batch_size=1024,
        verbose=1
    )

# Drop the predictions that belong to the padding rows.
real_predictions = pred[:n_test_rows, :].flatten()
import json
from collections import OrderedDict

# NOTE(review): absolute, machine-specific path — consider exposing it next to
# the other required input paths.
with open('/home/b330-admin/data/sample_submission.json', 'r') as submission_file:
    ground = json.load(submission_file)

# The keys of the sample submission are parsed as integers and used as
# positions into `real_predictions`.
indices = np.array([int(key) for key in ground])

# Map each position (as a string key) to its prediction as a plain Python
# float, in the order of the sample submission.
PRED_DATA = OrderedDict(
    (str(i), float(real_predictions[i])) for i in indices
)
    
def dump_predictions(prediction_dict, prediction_file):
    """Write ``prediction_dict`` as JSON to the file at ``prediction_file``."""
    with open(prediction_file, mode='w') as handle:
        handle.write(json.dumps(prediction_dict))
    
# Write the submission dictionary to the configured output file as JSON.
dump_predictions(PRED_DATA, output_path_of_final_predictions)