In [1]:
from pyfaidx import Fasta
import pandas as pd
import sys
import os
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, WeightedRandomSampler, RandomSampler, Sampler
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from argparse import Namespace
import tqdm
import itertools
from collections import Counter
from sklearn.metrics import average_precision_score
import gzip
from functools import partial
from ray import tune
from ray.tune import JupyterNotebookReporter
from ray.tune.schedulers import ASHAScheduler



In [2]:
from torchdataset import *
from torchsamplers import *
from torchclassifiers import *
from torchutils import *

In [3]:
### GLOBALS
# --- Which experiment to run ---
TF = "CTCF"                # transcription factor; part of the split-data CSV path
SOURCE_GENOME = "mm10"     # genome build; part of the split-data CSV path
GENOME_FASTA = '/storage/home/dzb5732/work/domain_adaptive_nn/genomes/mm10_no_alt_analysis_set_ENCODE.fasta'

# --- Sequence featurisation ---
FEAT_METHOD = "ohe"        # "ohe" switches args.ohe on; any other value switches it off
KMER_LENGTH = 7            # forwarded to the dataset loader as k

# --- Run-time switches ---
PYTORCH_DEVICE = "cuda"    # requested device; downgraded to CPU later if CUDA is unavailable
PILOT_STUDY = False        # forwarded to get_sampler(..., mini=...) through args.pilot
TRAIN = True               # NOTE(review): not referenced in the cells shown here

# Model Name to Object Mapper

# State defining arguments

In [4]:
# Bundle every run setting into one Namespace that is handed to the helpers below.
args = Namespace(
    # --- Data and path information ---
    # Disabled entry, kept for reference (MODEL_NAME is not defined in the cells shown):
#    model_state_file=f'{MODEL_NAME}_{FEAT_METHOD}.pth',
    tf_csv=f'/storage/home/dzb5732/work/domain_adaptive_nn/data/{SOURCE_GENOME}/{TF}/split_data.csv.gz',
    genome_fasta=GENOME_FASTA,
    # --- Vectorizer parameters ---
    ohe=(FEAT_METHOD == "ohe"),
    k=KMER_LENGTH,
    # --- Run-time parameters ---
    cuda=(PYTORCH_DEVICE == "cuda"),
    expand_filepaths_to_save_dir=True,
    seed=1337,
    pilot=PILOT_STUDY,
)

# CUDA is only used when it was requested above AND the runtime actually has it.
args.cuda = torch.cuda.is_available() and args.cuda

print("Using CUDA: {}".format(args.cuda))

if args.cuda:
    args.device = torch.device("cuda")
else:
    args.device = torch.device("cpu")

# Seed the random number generators so repeated runs are comparable.
set_seed_everywhere(args.seed, args.cuda)

Using CUDA: True


# Initialize Dataset and Model

In [5]:
def load_data(args):
    """Build the dataset described by ``args``.

    Args:
        args: object exposing ``tf_csv``, ``genome_fasta``, ``ohe`` and ``k``.

    Returns:
        Whatever ``TFDataset.load_dataset_and_vectorizer_from_path`` returns
        for those settings.
    """
    build = TFDataset.load_dataset_and_vectorizer_from_path
    return build(args.tf_csv, args.genome_fasta, ohe=args.ohe, k=args.k)

In [6]:
def make_samplers(dataset, args):
    """Create one weighted sampler per split, train first and then valid.

    The dataset is switched to each split before its sampler is requested, so
    on return the dataset is left on the ``'valid'`` split.

    Args:
        dataset: object exposing ``set_split(name)``; passed to ``get_sampler``.
        args: object exposing ``pilot``, forwarded as ``mini``.

    Returns:
        Tuple ``(train_sampler, valid_sampler)``.
    """
    built = []
    for split_name in ('train', 'valid'):
        # The split must be active before the sampler for it is generated.
        dataset.set_split(split_name)
        built.append(get_sampler(dataset, weighted=True, mini=args.pilot))
    return tuple(built)

# Train function

In [7]:
def train_func(config, checkpoint_dir, args):
    """Train and validate one TFHybrid configuration as a Ray Tune trial.

    After every epoch the model and optimizer state are checkpointed through
    ``tune.checkpoint_dir`` and the mean validation loss is reported to Tune
    under the metric name ``loss``.

    Args:
        config: dict of hyperparameters. Required keys: ``cnn_filters``,
            ``cnn_kernel``, ``pool_size``, ``pool_strides``, ``lstm_out``,
            ``fc1_nodes``, ``fc2_nodes``, ``drop_prob``, ``lr``,
            ``batch_size``. Optional key ``max_epochs`` (default 10) sets the
            number of epochs to run.
        checkpoint_dir: directory holding a file named ``checkpoint`` to
            resume from; any falsy value starts from scratch.
        args: settings object forwarded to ``load_data`` and
            ``make_samplers``.

    Raises:
        RuntimeError: if the validation loader yields no batch, since no
            validation loss can be computed in that case.
    """
    dataset = load_data(args)
    train_sampler, valid_sampler = make_samplers(dataset, args)

    net = TFHybrid(config["cnn_filters"],
                   config["cnn_kernel"],
                   config["pool_size"],
                   config["pool_strides"],
                   config["lstm_out"],
                   config["fc1_nodes"],
                   config["fc2_nodes"],
                   config["drop_prob"])

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)
    net.to(device)

    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(net.parameters(), lr=config["lr"])
    # Halves the learning rate once the validation loss has stopped improving
    # for more than `patience` epochs; stepped after each validation pass.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                                     mode='min', factor=0.5,
                                                     patience=1)

    if checkpoint_dir:
        # The checkpoint stores (model_state, optimizer_state); the plateau
        # counters of the scheduler are not stored and restart from zero.
        model_state, optimizer_state = torch.load(
            os.path.join(checkpoint_dir, "checkpoint"))
        net.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    batch_size = int(config["batch_size"])

    # Both loaders wrap the SAME dataset object; which rows they serve depends
    # on the split that is active when the loader is iterated. A sampler is
    # supplied, so shuffle must stay False.
    dataset.set_split("train")
    train_loader = DataLoader(dataset=dataset, batch_size=batch_size,
                              sampler=train_sampler, shuffle=False,
                              drop_last=True, num_workers=4)

    dataset.set_split("valid")
    valid_loader = DataLoader(dataset=dataset, batch_size=batch_size,
                              sampler=valid_sampler, shuffle=False,
                              drop_last=True, num_workers=4)

    max_epochs = int(config.get("max_epochs", 10))

    for epoch in range(max_epochs):  # loop over the dataset multiple times
        running_loss = 0.0
        running_steps = 0
        net.train()
        dataset.set_split("train")
        for i, data in enumerate(train_loader):
            # each batch is a dict with the inputs and the targets
            inputs = data["x_data"].to(device)
            labels = data["y_target"].to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels.float())
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            running_steps += 1
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1,
                                                running_loss / running_steps))
                # Reset BOTH accumulators so the next print averages only the
                # batches seen since this one.
                running_loss = 0.0
                running_steps = 0

        # Validation loss
        net.eval()
        dataset.set_split("valid")
        val_loss = 0.0
        val_steps = 0
        with torch.no_grad():
            for data in valid_loader:
                inputs = data["x_data"].to(device)
                labels = data["y_target"].to(device)

                outputs = net(inputs)

                val_loss += criterion(outputs, labels.float()).item()
                val_steps += 1

        if val_steps == 0:
            # drop_last=True discards a trailing partial batch, so a validation
            # sampler shorter than one batch yields nothing at all.
            raise RuntimeError(
                "Validation loader produced no batches; cannot compute a "
                "validation loss (batch_size={}).".format(batch_size))

        mean_val_loss = val_loss / val_steps
        scheduler.step(mean_val_loss)

        with tune.checkpoint_dir(epoch) as epoch_checkpoint_dir:
            path = os.path.join(epoch_checkpoint_dir, "checkpoint")
            torch.save((net.state_dict(), optimizer.state_dict()), path)

        tune.report(loss=mean_val_loss)

# Hyperparameter optimization (hpopt) main function

In [8]:
def main(args, checkpoint_dir="/storage/home/dzb5732/scratch/", num_samples=20, max_num_epochs=25, gpus_per_trial=1,
         local_dir="/storage/home/dzb5732/scratch/ray_results/"):
    """Run a Ray Tune search over ``train_func`` and return the best trial.

    Args:
        args: settings object bound to ``train_func`` as its ``args`` argument.
        checkpoint_dir: bound to ``train_func`` as its ``checkpoint_dir``
            keyword. NOTE(review): Ray Tune's function API passes its own
            ``checkpoint_dir`` keyword to the trainable, which presumably
            overrides this value -- confirm against the installed Ray version.
        num_samples: number of configurations sampled from the search space.
        max_num_epochs: handed to the ASHA scheduler as ``max_t``. This
            function does not forward it to ``train_func``.
        gpus_per_trial: GPUs reserved for each trial.
        local_dir: directory where Ray Tune writes its results.

    Returns:
        The trial with the lowest last-reported ``loss``.

    Raises:
        RuntimeError: if no best trial can be determined (Tune returned
            ``None``, e.g. because no trial reported a ``loss``).
    """
    # Search space; single-element choices pin a value while still listing it
    # in the trial table.
    config = {
          "cnn_filters":tune.choice([120, 240]),
          "cnn_kernel":tune.choice([20, 40, 60]),
          "pool_size":tune.choice([15, 30]),
          "pool_strides":tune.choice([15]),
          "lstm_out":tune.choice([32, 64, 128, 256]),
          "fc1_nodes":tune.choice([512, 1024]),
          "fc2_nodes":tune.choice([128, 256, 512]),
          "drop_prob":tune.choice([0.5]),
          "lr": tune.choice([1e-3]),
          "batch_size": tune.choice([512])
    }

    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2)

    reporter = JupyterNotebookReporter(overwrite=True,
        metric_columns=["loss", "training_iteration"])

    result = tune.run(
        partial(train_func, checkpoint_dir=checkpoint_dir, args=args),
        # 4 CPUs per trial; train_func builds its DataLoaders with num_workers=4.
        resources_per_trial={"cpu": 4, "gpu": gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter,
        local_dir=local_dir,)

    best_trial = result.get_best_trial("loss", "min", "last")
    if best_trial is None:
        # Fail with an explicit message instead of an AttributeError on
        # `None.config` below.
        raise RuntimeError(
            "Ray Tune could not determine a best trial for metric 'loss'; "
            "check the trial logs under {}".format(local_dir))

    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(best_trial.last_result["loss"]))

    return best_trial

In [9]:
if __name__ == "__main__":
    # Launch the search: 40 sampled configurations, ASHA capped at 25
    # iterations. Adjust gpus_per_trial to change the GPUs given to each trial.
    best_trial = main(
        args,
        num_samples=40,
        max_num_epochs=25,
        gpus_per_trial=1,
    )

Trial name,status,loc,batch_size,cnn_filters,cnn_kernel,drop_prob,fc1_nodes,fc2_nodes,lr,lstm_out,pool_size,pool_strides,loss,training_iteration
DEFAULT_eb178_00000,TERMINATED,,512,240,40,0.5,1024,256,0.001,32,15,15,0.212046,4
DEFAULT_eb178_00001,TERMINATED,,512,120,60,0.5,1024,512,0.001,32,15,15,0.240639,1
DEFAULT_eb178_00002,TERMINATED,,512,240,20,0.5,1024,512,0.001,256,30,15,0.20741,2
DEFAULT_eb178_00003,TERMINATED,,512,120,40,0.5,1024,128,0.001,256,15,15,0.182485,10
DEFAULT_eb178_00004,TERMINATED,,512,120,40,0.5,1024,256,0.001,256,30,15,0.233503,1
DEFAULT_eb178_00005,TERMINATED,,512,120,20,0.5,512,256,0.001,256,30,15,0.220591,2
DEFAULT_eb178_00006,TERMINATED,,512,240,20,0.5,512,512,0.001,32,15,15,0.263245,1
DEFAULT_eb178_00007,TERMINATED,,512,240,20,0.5,1024,256,0.001,128,15,15,0.243081,1
DEFAULT_eb178_00008,TERMINATED,,512,120,20,0.5,1024,128,0.001,128,15,15,0.24713,1
DEFAULT_eb178_00009,TERMINATED,,512,240,20,0.5,1024,256,0.001,128,30,15,0.246438,1


2021-08-20 19:35:59,580	INFO tune.py:550 -- Total run time: 4490.72 seconds (4490.48 seconds for the tuning loop).


Best trial config: {'cnn_filters': 240, 'cnn_kernel': 40, 'pool_size': 30, 'pool_strides': 15, 'lstm_out': 64, 'fc1_nodes': 1024, 'fc2_nodes': 256, 'drop_prob': 0.5, 'lr': 0.001, 'batch_size': 512}
Best trial final validation loss: 0.1672579643897491


In [9]:
### CNN HPOPT
# NOTE(review): the output stored under this cell has no lstm_out column and
# lists values (e.g. batch_size 128, lr 0.0001) that the search space in main()
# as shown does not offer, so it presumably came from an earlier CNN-only
# version of main/train_func -- rerun before relying on it.

if __name__ == "__main__":
    # Adjust gpus_per_trial to change the GPUs given to each trial.
    best_trial = main(
        args,
        num_samples=40,
        max_num_epochs=25,
        gpus_per_trial=1,
    )

Trial name,status,loc,batch_size,cnn_filters,cnn_kernel,drop_prob,fc1_nodes,fc2_nodes,lr,pool_size,pool_strides,loss,training_iteration
DEFAULT_1356b_00000,TERMINATED,,512,240,40,0.5,512,128,0.001,15,15,0.182842,10
DEFAULT_1356b_00001,TERMINATED,,128,80,20,0.5,512,256,0.0001,30,15,0.425668,1
DEFAULT_1356b_00002,TERMINATED,,128,120,60,0.5,512,256,0.001,15,15,0.213006,1
DEFAULT_1356b_00003,TERMINATED,,128,80,60,0.5,1024,128,0.001,30,15,0.211352,1
DEFAULT_1356b_00004,TERMINATED,,256,120,60,0.5,1024,64,0.001,30,15,0.217837,1
DEFAULT_1356b_00005,TERMINATED,,128,80,20,0.5,1024,256,0.001,15,15,0.226068,1
DEFAULT_1356b_00006,TERMINATED,,128,240,40,0.5,512,64,0.001,15,15,0.196725,4
DEFAULT_1356b_00007,TERMINATED,,512,240,20,0.5,512,64,0.001,30,15,0.237534,1
DEFAULT_1356b_00008,TERMINATED,,256,120,20,0.5,512,64,0.0001,30,15,0.478181,1
DEFAULT_1356b_00009,TERMINATED,,128,200,40,0.5,512,128,0.001,30,15,0.199999,2


2021-08-20 13:03:19,018	INFO tune.py:550 -- Total run time: 2766.97 seconds (2766.66 seconds for the tuning loop).


Best trial config: {'cnn_filters': 240, 'cnn_kernel': 40, 'pool_size': 15, 'pool_strides': 15, 'fc1_nodes': 512, 'fc2_nodes': 128, 'drop_prob': 0.5, 'lr': 0.001, 'batch_size': 512}
Best trial final validation loss: 0.18284228635896552


# Scratch