In [1]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler
from plink_datasets import *
from datasets import *
from ANN_MCC import *
import sys
import matplotlib.pyplot as plt
import sklearn.metrics as metrics
import scikitplot as skplt
import pickle
import matplotlib.image as mpimg
import subprocess

In [2]:
# --- Configuration ---------------------------------------------------------
# Phenotype table that is joined onto every genotype split below.
pheno_path = "/home/user/directory/phenotypes/pheno.txt"
# Root directory of all model runs; must end with "/" because paths are built
# by plain string concatenation.
data_path = "/home/user/directory/model_runs/"
# Identifier of this run: passed to the split script and used as the
# sub-directory name under `data_path`.
idx = "600"

# --- Split the data --------------------------------------------------------
# Runs the external R script with (idx, data_path) as arguments. Presumably it
# writes the train/valid/test files read below -- confirm against the script.
# Its stdout/stderr are discarded, so `check=True` is the only way a failed
# split surfaces here (raises CalledProcessError) instead of silently leaving
# stale or missing files for the loaders.
subprocess.run(
    ["/home/user/Phenotype_prediction/data_management/split_data.R", idx, data_path],
    stdout=subprocess.DEVNULL,
    stderr=subprocess.STDOUT,
    check=True,
)

# --- Load each split and attach the phenotypes -------------------------------
train_path = data_path + idx + "/train"
val_path = data_path + idx + "/valid"
test_path = data_path + idx + "/test"

# The phenotype file is `pheno_path` (defined above); the previous code passed
# an undefined `hos_path`, which fails on a fresh kernel.
g_train, g_val, g_test = [
    merge_geno_pheno(geno_path=split_path, pheno_path=pheno_path, delim=" ")
    for split_path in (train_path, val_path, test_path)
]

# Bundle the three pre-split sets; feature scaling and phenotype shuffling are
# switched off, sample shuffling is on.
gwas_data = PreSplit(G_train=g_train,
                     G_val=g_val,
                     G_test=g_test,
                     scale=False,
                     shuffle=True,
                     shuffle_phenotype=False)

# --- Class weight for the loss ---------------------------------------------
X_train, y_train = gwas_data["train"]
# Weight = 1 / mean(y_train). If the labels are 0/1 this equals
# n_samples / n_positives -- TODO confirm the label encoding.
# NOTE: raises ZeroDivisionError when the mean of y_train is exactly 0.
y_train_mean = y_train.mean().item()
w = torch.tensor(1 / y_train_mean)

In [3]:
# Network built from the project's ANN class: first argument is the number of
# columns of X_train, followed by [25] and 1 (presumably hidden-layer sizes and
# output width -- confirm against ANN_MCC), with ReLU as the activation.
net = ANN(X_train.shape[1], [25], 1, act_func=nn.ReLU, mlp_m=True)

# Binary cross-entropy on logits; `w` (computed in the data-loading cell) is
# used as the weight of the positive class.
weighted_bce = nn.BCEWithLogitsLoss(pos_weight=w)

# Everything handed to the project's train() helper, collected in one place so
# the hyper-parameters are easy to scan.
train_kwargs = dict(
    net=net,
    dataset=gwas_data,
    batch_size=300,
    nepochs=150,
    criterion=weighted_bce,
    evaluate=MCCLoss_bin,
    test=test_mcc_bin,
    learning_rate=5e-3,
    l1_const=0,
    l2_const=0.1,
    early_stopping=True,
    verbose=True,
)

# train() hands back the per-epoch training losses, the validation losses and
# the fitted model, in that order.
trainloss_list, valloss_list, model = train(**train_kwargs)

Epoch: 1 Training loss: 6.169233322143555 Correlation: -0.06675314903259277
Epoch: 2 Training loss: 5.228658676147461 Correlation: -0.04624195769429207
Epoch: 3 Training loss: 5.081997394561768 Correlation: -0.05331677198410034
Epoch: 4 Training loss: 2.9280364513397217 Correlation: -0.0038161154370754957
Epoch: 5 Training loss: 2.783190965652466 Correlation: 0.004978541284799576
Epoch: 6 Training loss: 2.181175947189331 Correlation: 0.003167770802974701
Epoch: 7 Training loss: 2.0707356929779053 Correlation: 0.05559272691607475
Epoch: 8 Training loss: 2.2267184257507324 Correlation: 0.033366769552230835
Epoch: 9 Training loss: 2.3239798545837402 Correlation: 0.04952580854296684
Epoch: 10 Training loss: 1.900946855545044 Correlation: 0.06058149412274361
Epoch: 11 Training loss: 1.809238314628601 Correlation: 0.01725410670042038
Epoch: 12 Training loss: 2.214118003845215 Correlation: 0.035829994827508926
Epoch: 13 Training loss: 1.7634093761444092 Correlation: 0.018149331212043762
Epoch

In [4]:
# --- Persist the training curves -------------------------------------------
# One row per recorded epoch; both lists come from train() in the previous cell.
train_performance = pd.DataFrame({'TRAIN_LOSS': np.asarray(trainloss_list),
                                  'VAL_LOSS': np.asarray(valloss_list)})
train_performance.to_csv(data_path + idx + "/train_performance.csv", sep="\t", index=False)

# --- Score the held-out test split -----------------------------------------
X_test, y_test = gwas_data["test"]

# Switch to inference mode so that any dropout / batch-norm layers behave
# deterministically, and skip autograd bookkeeping for the forward pass.
# (Assumes `model` is a torch.nn.Module, as implied by it being called and
# passed to torch.save -- confirm against ANN_MCC.)
model.eval()
with torch.no_grad():
    y_pred_raw = model(X_test)

y_true = y_test.view(-1, 1).numpy()

# RAW_PRED holds the untransformed model outputs (presumably logits, since
# training used BCEWithLogitsLoss -- apply a sigmoid downstream if
# probabilities are needed).
test_performance = pd.DataFrame({'RAW_PRED': y_pred_raw.detach().numpy().flatten(),
                                 'LABELS': y_true.flatten()})
test_performance.to_csv(data_path + idx + "/test_performance.csv", sep="\t", index=False)

# --- Persist the model -----------------------------------------------------
# NOTE: despite the file name, this pickles the whole module object, not just
# its state_dict. Loading it back requires the model class to be importable
# under the same module path. Kept as-is so existing loaders keep working.
torch.save(model, data_path + idx + "/model_weights.pth")