In [1]:
%load_ext autoreload
%autoreload 2
%load_ext autotime

import os
import sys
sys.path.append('..')

from collections import OrderedDict

import splintr as sp
from splintr.splice import rmats_subset_top_events
sp.verbose = True

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import GridSearchCV

import torch
from torch.utils.data import Dataset, DataLoader, ConcatDataset, WeightedRandomSampler
import torch.nn.functional as F
from torchsummary import summary

from ax import optimize

from tqdm.autonotebook import tqdm
tqdm.pandas()

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

# Select the compute device: first CUDA GPU when available, otherwise CPU
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")

# Seed NumPy and PyTorch so sampling and weight initialisation are repeatable
seed = 99
np.random.seed(seed)
torch.manual_seed(seed)

# Limit intra-op CPU parallelism. This must be a *call*: assigning
# `torch.set_num_threads = 16` would overwrite the function with an int and
# leave the thread count unchanged.
torch.set_num_threads(16)

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


# Load and transform dataset

In [2]:
# Location of the splice-event feature table
data_dir = '../data/features'
feature_file = f'{data_dir}/SE.txt'

# Subset the top events with the splintr helper, then keep only rows whose
# inclusion-level difference is positive (upregulated AS events)
feature_df = rmats_subset_top_events(feature_file, 5)
is_upregulated = feature_df['IncLevelDifference'] > 0
feature_df = feature_df.loc[is_upregulated]

time: 93.6 ms


In [3]:
# Randomize sample order (a permutation of all row positions)
n_samples = feature_df.shape[0]
rand_sample_i = np.random.choice(n_samples, size=n_samples, replace=False)

# Determine dataset split sizes (80% / 10% / 10%).
# Validation and test sizes are floored; the training split takes whatever is
# left, so the three splits always partition the data exactly regardless of
# how the rounding falls (a fixed "+1" is only correct when exactly one sample
# is left over).
valid_size = int(n_samples * 0.1)
test_size = int(n_samples * 0.1)
train_size = n_samples - valid_size - test_size

# Split into training, validation, and test using consecutive slices of the
# shuffled positions
valid_end = train_size + valid_size
train_df = feature_df.iloc[rand_sample_i[:train_size]]
valid_df = feature_df.iloc[rand_sample_i[train_size:valid_end]]
test_df = feature_df.iloc[rand_sample_i[valid_end:valid_end + test_size]]
datasets_df = [train_df, valid_df, test_df]

# Sanity check: every sample lands in exactly one split
assert len(train_df) + len(valid_df) + len(test_df) == n_samples, \
    "train/valid/test splits do not cover the dataset"

time: 4.7 ms


In [4]:
# Settings used when turning splice events into sequence datasets
seq_length = 250
genome_fa = '../data/hg19.fa'
k = 10


def _event_dataset(events, transforms):
    """Wrap one split's events in a SpliceEventDataset using `transforms`."""
    return sp.SpliceEventDataset(feature_file=events,
                                 genome_fa=genome_fa,
                                 transform=transforms)


# For each split, instantiate every splice event k times: once with the
# pad+crop pipeline and once with pad+crop followed by reverse complement.
all_data = []
for split_df in datasets_df:
    replicates = []
    for _ in tqdm(range(k), total=k):
        # Pad and crop
        pad_crop = [sp.PadSequence(seq_length), sp.CropSequence(seq_length)]
        replicates.append(_event_dataset(split_df, pad_crop))

        # Pad and crop, then reverse complement
        pad_crop_rc = [sp.PadSequence(seq_length), sp.CropSequence(seq_length), sp.ReverseComplement()]
        replicates.append(_event_dataset(split_df, pad_crop_rc))

    # Merge the 2*k replicate datasets of this split into one dataset
    all_data.append(ConcatDataset(replicates))

train_dataset, valid_dataset, test_dataset = all_data

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10), HTML(value='')))


time: 8.79 s


In [5]:
# Show the class distribution, then encode the categorical `sample` column
# as integer codes; `label_names` is the (codes, unique names) pair.
print(feature_df['sample'].value_counts())
label_names = pd.factorize(feature_df['sample'])

label_codes = label_names[0]
num_classes = int(label_codes.max() + 1)
print(f'Classes: {num_classes}')


def _balanced_sampler(dataset):
    """Build a WeightedRandomSampler that weights each item inversely to its class frequency."""
    item_labels = [item[1] for item in dataset]  # label is the second element of each sample
    class_weight = 100. / pd.Series(item_labels).value_counts()  # one weight per class
    item_weight = class_weight[item_labels].to_numpy()  # expand to one weight per item
    return WeightedRandomSampler(weights=item_weight, num_samples=len(item_weight))


# One sampler per split, in the same order as `all_data`
samplers = [_balanced_sampler(split) for split in all_data]

train_sampler, valid_sampler, test_sampler = samplers

AQR       234
HNRNPC    219
bg        175
U2AF2     135
RBM15      83
U2AF1      55
Name: sample, dtype: int64
Classes: 6
time: 6.67 s


In [18]:
# Scratch check of splintr's private conv-padding helper; the recorded output
# for these arguments is -7.
# NOTE(review): the argument meaning is not visible from this notebook
# (presumably input width, output width, kernel width, stride) — confirm
# against sp.learning before relying on it.
sp.learning._calc_conv_pad(250, 50, 40, 5)

-7

time: 2.41 ms


In [58]:
# Run(num_classes=6, c1_in=250, c1_out=50, c1_kernel_w=10, c1_filter=64, c1_stride_w=5, c2_out=6, c2_kernel_w=4, c2_filter=8, c2_stride_w=4, fc_out=8, batch_size=128, lr=0.01, weight_decay=0, dropout=0)
# Grid of settings; sp.RunBuilder.get_runs yields one `run` per combination.
params = OrderedDict(
    # model parameters
    num_classes = [num_classes],
    c1_in = [seq_length],
    c1_out = [50],
    c1_kernel_w = [20],
    c1_filter = [64],
    c1_stride_w = [5],
    c2_out = [6],
    c2_kernel_w = [4],
    c2_filter = [8],
    c2_stride_w = [4],
    fc_out = [8],

    # hyperparameters
    batch_size = [32],
    lr = [0.0001],
    weight_decay = [0],
    dropout = [0]
)

num_epochs = 30  # training epochs per run
# NOTE(review): absolute, machine-specific log path — consider making it configurable.
log_dir = '/home/ubuntu/tb/8-05-19-6class/'

manager = sp.RunManager()
is_first_run = True
for run in sp.RunBuilder.get_runs(params):
    # Initialize model on the selected device. `.to(device)` works for both
    # CUDA and the CPU fallback, whereas `.cuda(...)` fails without a GPU.
    network = sp.SplintrNet(num_classes=run.num_classes,
                            c1_in=run.c1_in,
                            c1_out=run.c1_out,
                            c1_kernel_w=run.c1_kernel_w,
                            c1_filter=run.c1_filter,
                            c1_stride_w=run.c1_stride_w,
                            c2_out=run.c2_out,
                            c2_kernel_w=run.c2_kernel_w,
                            c2_filter=run.c2_filter,
                            c2_stride_w=run.c2_stride_w,
                            dropout=run.dropout,
                            fc_out=run.fc_out).to(device)

    # Class-balanced loaders (samplers were built once per split)
    train_loader = DataLoader(train_dataset, batch_size=run.batch_size, sampler=train_sampler)
    valid_loader = DataLoader(valid_dataset, batch_size=run.batch_size, sampler=valid_sampler)

    optimizer = torch.optim.Adam(network.parameters(), lr=run.lr, weight_decay=run.weight_decay)

    # Display brief summary of first model
    if is_first_run:
        is_first_run = False
        # torchsummary expects the device as the string "cuda" or "cpu"
        summary(network, input_size=(4, 4, seq_length), device=device.type)

    # Perform training
    manager.begin_run(run, network, train_loader, valid_loader, log_dir)
    network.to(device)  # make sure the model is still on the target device
    for epoch in range(num_epochs):

        manager.begin_epoch()

        # Train on each batch with dropout (and any other train-only layers) enabled
        network.train()
        for seqs, labels in train_loader:
            # Move the batch to the device once and reuse it below
            seqs, labels = seqs.to(device), labels.to(device)

            preds = network(seqs) # pass batch
            loss = F.cross_entropy(preds, labels) # calculate loss
            optimizer.zero_grad() # zero gradients
            loss.backward() # calculate gradients
            optimizer.step() # update weights

            manager.track_train_loss(loss)
            manager.track_train_num_correct(preds, labels)

        # Check validation set in eval mode so dropout does not perturb the
        # validation metrics, and without tracking gradients
        network.eval()
        with torch.no_grad():
            for seqs, labels in valid_loader:
                seqs, labels = seqs.to(device), labels.to(device)

                preds = network(seqs)
                loss = F.cross_entropy(preds, labels)

                manager.track_valid_loss(loss)
                manager.track_valid_num_correct(preds, labels)

        manager.end_epoch()
    manager.end_run(train_class_names=label_names[1],
                    valid_class_names=label_names[1])
manager.save('../results')

Unnamed: 0,run,epoch,train_loss,valid_loss,train_accuracy,valid_accuracy,epoch_duration,run_duration,num_classes,c1_in,...,c1_stride_w,c2_out,c2_kernel_w,c2_filter,c2_stride_w,fc_out,batch_size,lr,weight_decay,dropout
0,1,1,1.79334,1.816674,0.171845,0.161111,6.987569,7.193526,6,250,...,5,6,4,8,4,8,32,0.0001,0,0.1
1,1,2,1.793093,1.818034,0.169348,0.149444,6.848788,14.120342,6,250,...,5,6,4,8,4,8,32,0.0001,0,0.1


KeyboardInterrupt: 

time: 24.2 s
