In [None]:
import os
import pickle
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr
from itertools import product
from sklearn.linear_model import ElasticNet
from sklearn.neural_network import MLPRegressor
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Train, select and save MLP regressors for the five stratified data splits.
#
# For every split `rs` (1..5) and every repeat `rep` (0..9):
#   1. fit one MLP per hidden-layer configuration on the training rows and
#      score it on the validation rows (MSE and Pearson r),
#   2. refit the configuration with the lowest validation MSE on the
#      training + validation rows,
#   3. predict on all rows and pickle [best_para, mlp, Y_pred] to
#      ../../result/model_GTEx_phastcons_MLP/<rs>/model<rep>.para

input_dir_fmt = "../../result/input_GTEx_phastcons/stratified_%d/"
model_dir_fmt = "../../result/model_GTEx_phastcons_MLP/%d/"

# Settings shared by the search fits AND the final refit.  The original refit
# omitted alpha and therefore fell back to scikit-learn's default L2 penalty,
# so the saved model was trained under a different regularisation than the
# one used to choose its architecture.
mlp_kwargs = dict(max_iter=1000, activation="relu", solver="adam", alpha=0)

# Loop-invariant inputs: the gene order and the hyperparameter grid do not
# depend on the split or the repeat, so build them once.
valid_genes = pd.read_csv("../../result/network_GTEx_phastcons/valid_genes", sep="\t")
gene_order = valid_genes['genes'].values
hidden_size_combo = list(product([512, 128, 32], [512, 128, 32, 8, 4], [512, 128, 32, 8, 4]))

for rs in range(1, 6):
    # read data
    input_dir = input_dir_fmt % rs
    X_train = pd.read_csv(input_dir + "X_train", sep="\t", index_col=0)
    X_valid = pd.read_csv(input_dir + "X_valid", sep="\t", index_col=0)
    X_test = pd.read_csv(input_dir + "X_test", sep="\t", index_col=0)
    Y_train = pd.read_csv(input_dir + "Y_train", sep="\t", index_col=0)
    Y_valid = pd.read_csv(input_dir + "Y_valid", sep="\t", index_col=0)
    Y_test = pd.read_csv(input_dir + "Y_test", sep="\t", index_col=0)
    X = pd.concat([X_train, X_valid, X_test])
    Y = pd.concat([Y_train, Y_valid, Y_test]) * 100  # targets are scaled by 100

    # split masks: column 0 = train rows, 1 = validation rows, 2 = test rows,
    # in the same row order as the concatenated X / Y
    split_id = np.repeat([0, 1, 2], [len(X_train), len(X_valid), len(X_test)])
    train_mask = split_id == 0
    valid_mask = split_id == 1
    test_mask = split_id == 2
    mask = pd.DataFrame({0: train_mask, 1: valid_mask, 2: test_mask}, index=X.index)

    # re-order data to match the network index
    X = X.loc[gene_order]
    Y = Y.loc[gene_order]
    mask = mask.loc[gene_order]

    # rows used for the final refit: training and validation together
    fit_rows = mask[0] | mask[1]

    # model
    model_dir = model_dir_fmt % rs
    os.makedirs(model_dir, exist_ok=True)
    for rep in range(10):
        # A distinct but fixed seed per (split, repeat): repeats still differ
        # from one another, while a re-run reproduces the same saved models.
        seed = rs * 100 + rep

        # hyperparameter search on the validation rows
        mse_list = []
        p_list = []  # validation Pearson r, recorded alongside the MSE for inspection
        for hidden_size in hidden_size_combo:
            mlp = MLPRegressor(hidden_layer_sizes=hidden_size, random_state=seed, **mlp_kwargs)
            mlp.fit(X[mask[0]], Y[mask[0]].values.reshape(-1))
            Y_pred = mlp.predict(X[mask[1]])
            Y_true_valid = Y[mask[1]].values.reshape(-1)
            mse_list.append(mean_squared_error(Y_true_valid, Y_pred.reshape(-1)))
            p_list.append(pearsonr(Y_true_valid, Y_pred.reshape(-1))[0])
        # selection is by lowest validation MSE only
        best_para = hidden_size_combo[np.argmin(mse_list)]

        # refit the selected architecture with the same settings as the search
        mlp = MLPRegressor(hidden_layer_sizes=best_para, random_state=seed, **mlp_kwargs)
        mlp.fit(X[fit_rows], Y[fit_rows].values.reshape(-1))
        Y_pred = mlp.predict(X)  # predictions for every row (train, valid and test)

        # save model, hyperparameters and results
        prefix = model_dir + "model%d" % rep
        with open(prefix+".para", "wb") as f:
            pickle.dump([best_para, mlp, Y_pred], f)