In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import scipy.stats as ss
import seaborn as sns
sc.settings.set_figure_params(dpi=100)

### Read in Data

In [None]:
import pickle as pkl
# read in the aggregated values
# NOTE(review): pickle executes arbitrary code on load; only read these files from a trusted source
with open('../external_data/db.ags.pkl', 'rb') as f:
    ags = pkl.load(f)
# the TRA pickle used to be opened (and wrapped / printed) twice; once is enough
with open('../external_data/db.tras.pkl', 'rb') as f:
    tras = pkl.load(f)
with open('../external_data/db.paired_tcrs.pkl', 'rb') as f:
    paired_tcrs = pkl.load(f)
# wrap in Series so that .map / .iloc / .apply are available further down
ags, tras = pd.Series(ags), pd.Series(tras)
print(len(ags)); print(len(tras))

### Setup Model

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import torchvision.transforms as transforms

In [None]:
def train(epoch, loss_func):
    """Run one training epoch over the global `train_loader` and print the mean loss.

    Relies on the notebook-level `model`, `optimizer`, `train_loader` and `device`.

    Parameters
    ----------
    epoch : int
        Epoch number, used only in the progress message.
    loss_func : int
        Which objective to optimise: 1 -> `loss_function1`, 2 -> `loss_function2`,
        3 -> `loss_function3`.

    Returns
    -------
    float
        Summed batch losses divided by the number of samples in the training set.

    Raises
    ------
    ValueError
        If `loss_func` is not 1, 2 or 3 (previously this surfaced as an unbound
        `loss` variable inside the loop).
    """
    loss_fns = {1: loss_function1, 2: loss_function2, 3: loss_function3}
    if loss_func not in loss_fns:
        raise ValueError('loss_func must be 1, 2 or 3, got {!r}'.format(loss_func))
    compute_loss = loss_fns[loss_func]

    model.train()
    train_loss = 0
    for batch in train_loader:
        # the loader yields 1-tuples; element 0 is the data tensor
        data = batch[0].to(device)
        optimizer.zero_grad()
        (recon_batch, recon_len), mu, logvar = model(data)
        loss = compute_loss(recon_batch, recon_len, data, mu, logvar)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()

    avg_loss = train_loss / len(train_loader.dataset)
    print('====> Epoch: {} Average loss: {:.4f}'.format(epoch, avg_loss))
    return avg_loss
    
def test(epoch, loss_func):
    """Evaluate the global `model` on the global `test_loader` and print the mean loss.

    Relies on the notebook-level `model`, `test_loader` and `device`. No gradients
    are tracked during evaluation.

    Parameters
    ----------
    epoch : int
        Unused; kept so the signature mirrors `train`.
    loss_func : int
        Which objective to evaluate: 1 -> `loss_function1`, 2 -> `loss_function2`,
        3 -> `loss_function3`.

    Returns
    -------
    float
        Summed batch losses divided by the number of samples in the test set.

    Raises
    ------
    ValueError
        If `loss_func` is not 1, 2 or 3 (previously such a value silently produced
        a loss of 0).
    """
    loss_fns = {1: loss_function1, 2: loss_function2, 3: loss_function3}
    if loss_func not in loss_fns:
        raise ValueError('loss_func must be 1, 2 or 3, got {!r}'.format(loss_func))
    compute_loss = loss_fns[loss_func]

    model.eval()
    test_loss = 0
    with torch.no_grad():
        for batch in test_loader:
            # the loader yields 1-tuples; element 0 is the data tensor
            data = batch[0].to(device)
            (recon_batch, recon_len), mu, logvar = model(data)
            test_loss += compute_loss(recon_batch, recon_len, data, mu, logvar).item()

    test_loss /= len(test_loader.dataset)
    print('====> Test set loss: {:.4f}'.format(test_loss))
    return test_loss

In [None]:
# key hyperparameters; ConvVAE reads these module-level names
# sequence axis length of every input / output tensor
protein_len = 48
# number of input channels fed to the first convolution (the encoder drops the last channel)
init_embed_size = 49
# convolution geometry shared by all (transposed) convolutions
init_kernel_size, init_kernel_stride, init_kernel_padding = 3, 1, 1
# channel counts of the first and second convolution
init_cnn_filters = 128
secn_cnn_filters = 128
# size of the latent vector and of the hidden layer of the scalar head
latent_dim = 32
n_nodes_len = 32
# the 20 one-letter residue codes, in the order used for the output channels
vocab = list('ACDEFGHIKLMNPQRSTVWY')
# we want the embedding output to be the vocab with the length to allow for reconstruction
out_embed_size = len(vocab)

# define the convolutional variational autoencoder
class ConvVAE(nn.Module):
    """Convolutional variational autoencoder with an extra scalar output head.

    Encoder: two 1-D convolutions followed by two linear layers producing the
    latent mean and log-variance. Decoder: a linear layer, two transposed 1-D
    convolutions ending in a sigmoid, plus a small two-layer linear head that
    emits one scalar per sample (presumably the predicted sequence length --
    confirm against the loss functions).

    Every constructor argument is optional. An argument left as ``None`` falls
    back to the notebook-level variable of the same role, so ``ConvVAE()``
    behaves as before. The sizes needed by ``decode`` are captured at
    construction time, so changing the notebook globals afterwards no longer
    affects an existing model.

    Parameters
    ----------
    embed_size : int, optional      (global ``init_embed_size``)
        Input channels seen by the first convolution.
    seq_len : int, optional         (global ``protein_len``)
        Length of the sequence axis.
    first_filters : int, optional   (global ``init_cnn_filters``)
    second_filters : int, optional  (global ``secn_cnn_filters``)
    latent_size : int, optional     (global ``latent_dim``)
    out_size : int, optional        (global ``out_embed_size``)
        Output channels of the reconstruction.
    len_nodes : int, optional       (global ``n_nodes_len``)
        Hidden width of the scalar head.
    kernel_size, stride, padding : int, optional
        (globals ``init_kernel_size`` / ``init_kernel_stride`` / ``init_kernel_padding``)
        Must keep the sequence length unchanged, because the linear layers are
        sized with ``seq_len``.
    """

    def __init__(self, embed_size=None, seq_len=None, first_filters=None, second_filters=None,
                 latent_size=None, out_size=None, len_nodes=None,
                 kernel_size=None, stride=None, padding=None):
        super(ConvVAE, self).__init__()

        # resolve each hyperparameter once: explicit argument first, notebook global otherwise
        embed_size = init_embed_size if embed_size is None else embed_size
        seq_len = protein_len if seq_len is None else seq_len
        first_filters = init_cnn_filters if first_filters is None else first_filters
        second_filters = secn_cnn_filters if second_filters is None else second_filters
        latent_size = latent_dim if latent_size is None else latent_size
        out_size = out_embed_size if out_size is None else out_size
        len_nodes = n_nodes_len if len_nodes is None else len_nodes
        kernel_size = init_kernel_size if kernel_size is None else kernel_size
        stride = init_kernel_stride if stride is None else stride
        padding = init_kernel_padding if padding is None else padding

        # remembered for the reshape in decode()
        self.seq_len = seq_len
        self.second_filters = second_filters

        # encoding
        self.fc1 = nn.Conv1d(
            in_channels=embed_size, out_channels=first_filters, kernel_size=kernel_size,
            stride=stride, padding=padding,
        )
        self.fc2 = nn.Conv1d(
            in_channels=first_filters, out_channels=second_filters, kernel_size=kernel_size,
            stride=stride, padding=padding,
        )
        # variational sampling: mean (fc31) and log-variance (fc32) heads, then the way back
        self.fc31 = nn.Linear(second_filters*seq_len, latent_size)
        self.fc32 = nn.Linear(second_filters*seq_len, latent_size)
        self.fc4 = nn.Linear(latent_size, second_filters*seq_len)
        # decoding
        self.fc5 = nn.ConvTranspose1d(
            in_channels=second_filters, out_channels=first_filters, kernel_size=kernel_size,
            stride=stride, padding=padding,
        )
        self.fc6 = nn.ConvTranspose1d(
            in_channels=first_filters, out_channels=out_size, kernel_size=kernel_size,
            stride=stride, padding=padding,
        )
        # scalar head fed by the flattened output of fc5
        self.fc7 = nn.Linear(first_filters*seq_len, len_nodes)
        self.fc8 = nn.Linear(len_nodes, 1)

    def encode(self, x):
        """Return ``(mu, logvar)`` for a batch; the last input channel is not encoded."""
        h = nn.functional.leaky_relu(self.fc1(x[:, :-1, :]))
        h = nn.functional.leaky_relu(self.fc2(h))
        h = h.flatten(start_dim=1)
        return self.fc31(h), self.fc32(h)

    def reparameterize(self, mu, logvar):
        """Draw ``z = mu + eps * exp(0.5 * logvar)`` with ``eps`` standard normal."""
        std = torch.exp(0.5*logvar)
        eps = torch.randn_like(std)
        return mu + eps*std

    def decode(self, z):
        """Map latent vectors to ``(reconstruction, scalar)``.

        The reconstruction has ``out_size`` channels with values in [0, 1]; the
        scalar output has shape ``(batch, 1)``.
        """
        h = nn.functional.leaky_relu(self.fc4(z))
        h = h.view(-1, self.second_filters, self.seq_len)
        h = nn.functional.leaky_relu(self.fc5(h))
        recon = torch.sigmoid(self.fc6(h))
        scalar = self.fc8(nn.functional.leaky_relu(self.fc7(h.flatten(start_dim=1))))
        return recon, scalar

    def forward(self, x):
        """Encode, sample, decode; returns ``((reconstruction, scalar), mu, logvar)``."""
        mu, logvar = self.encode(x)
        z = self.reparameterize(mu, logvar)
        return self.decode(z), mu, logvar

In [None]:
# Reconstruction + KL divergence losses summed over all elements and batch
def loss_function1(recon_x, recon_len, x, mu, logvar):
    """Summed binary cross-entropy of the reconstruction plus the KL divergence term.

    Only the first ``len(vocab)`` channels of ``x`` are used as the target;
    ``recon_len`` is accepted for signature parity but not used.
    """
    target = x[:, :len(vocab), :]
    recon_term = nn.functional.binary_cross_entropy(recon_x, target, reduction='sum')
    kl_term = -0.5 * (1 + logvar - mu.pow(2) - logvar.exp()).sum()
    return recon_term + kl_term
def loss_function2(recon_x, recon_len, x, mu, logvar):
    """Total squared error between ``recon_len`` and the last channel of ``x``.

    ``recon_x``, ``mu`` and ``logvar`` are accepted for signature parity but not used.
    """
    residual = recon_len - x[:, -1, :]
    return torch.sum(residual ** 2)
def loss_function3(recon_x, recon_len, x, mu, logvar):
    """Combined objective: summed BCE + KL divergence + total squared error.

    The BCE target is the first ``len(vocab)`` channels of ``x``; the squared
    error compares ``recon_len`` with the last channel of ``x``.
    """
    target = x[:, :len(vocab), :]
    recon_term = nn.functional.binary_cross_entropy(recon_x, target, reduction='sum')
    kl_term = -0.5 * (1 + logvar - mu.pow(2) - logvar.exp()).sum()
    sq_err = torch.sum((recon_len - x[:, -1, :]) ** 2)
    return recon_term + kl_term + sq_err

In [None]:
import pickle as pkl
# embed all of our unique TRAs
with open('../outs/map.tra_to_embed.extended.pkl', 'rb') as f:
    tra_to_embed = pkl.load(f)
# look up the stored embedding of every sequence and stack them into one float32 tensor
X_tras = torch.stack([emb.to(torch.float32) for emb in tras.map(tra_to_embed)])
# randomly split the data into train and test
torch.manual_seed(0); np.random.seed(0)
idxs_train = np.random.choice(range(len(X_tras)), size=round(len(X_tras)*0.75), replace=False)
# everything that was not drawn for training becomes the test set
idxs_test = np.array(range(len(X_tras)))
idxs_test = idxs_test[np.isin(idxs_test, idxs_train, invert=True)]
X_tras_train, X_tras_test = X_tras[idxs_train], X_tras[idxs_test]

In [None]:
from torch.utils.data import TensorDataset, DataLoader
# create a latent space for the TRAs
# run on the GPU when one is visible, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
batch_size = 2048

In [None]:
from tqdm import tqdm
from Levenshtein import distance as levenshtein
# define a method to reconstruct the sequence
def reconstruct(tmp_out, targ_len, curr_len, alphabet=None):
    """Resample a per-position score matrix to `targ_len` positions and decode it.

    Each row of `tmp_out` is linearly interpolated from `curr_len` evenly spaced
    points on [0, 1] onto `targ_len` evenly spaced points on [0, 1]; the
    highest-scoring row at every target position gives the emitted letter.

    Parameters
    ----------
    tmp_out : array-like, shape (n_letters, curr_len)
        Scores, one row per letter.
    targ_len : int or float
        Desired output length. Non-integer values (e.g. a predicted length) are
        rounded to the nearest integer; previously they were implicitly rounded
        up and the target grid overshot 1.
    curr_len : int
        Number of positions along the second axis of `tmp_out`.
    alphabet : sequence of str, optional
        Letter for each row; defaults to the notebook-level `vocab`.

    Returns
    -------
    str
        The decoded sequence; empty when the rounded target length is below 1.
    """
    letters = vocab if alphabet is None else alphabet
    n_out = int(round(targ_len))
    if n_out < 1:
        return ''
    scores = np.asarray(tmp_out)
    # compute the x-coordinates of the original; max(.., 1) avoids 0/0 for a single position
    xp = np.arange(curr_len) / max(curr_len - 1, 1)
    x = np.arange(n_out) / max(n_out - 1, 1)
    # interpolate the results
    res = np.array([np.interp(x, xp, row) for row in scores])
    return ''.join(letters[i] for i in res.argmax(axis=0))

In [None]:
# redefine functions to be silent
def train(model, train_loader, epoch, loss_func):
    """Run one silent training epoch of `model` over `train_loader`.

    Relies on the notebook-level `optimizer` and `device`.

    Parameters
    ----------
    model : nn.Module
        Model to optimise; called as ``model(batch)`` and expected to return
        ``((recon_batch, recon_len), mu, logvar)``.
    train_loader : iterable
        Yields tuples whose first element is the data tensor.
    epoch : int
        Unused; kept for signature compatibility.
    loss_func : int
        Which objective to optimise: 1 -> `loss_function1`, 2 -> `loss_function2`,
        3 -> `loss_function3`.

    Returns
    -------
    float
        Summed batch losses divided by the number of samples in the training set
        (previously accumulated but discarded).

    Raises
    ------
    ValueError
        If `loss_func` is not 1, 2 or 3 (previously this surfaced as an unbound
        `loss` variable inside the loop).
    """
    loss_fns = {1: loss_function1, 2: loss_function2, 3: loss_function3}
    if loss_func not in loss_fns:
        raise ValueError('loss_func must be 1, 2 or 3, got {!r}'.format(loss_func))
    compute_loss = loss_fns[loss_func]

    model.train()
    train_loss = 0
    for batch in train_loader:
        # the loader yields 1-tuples; element 0 is the data tensor
        data = batch[0].to(device)
        optimizer.zero_grad()
        (recon_batch, recon_len), mu, logvar = model(data)
        loss = compute_loss(recon_batch, recon_len, data, mu, logvar)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
    return train_loss / len(train_loader.dataset)

### Search for Stable Parameters

In [None]:
# set constants throughout the search
batch_size = 2048
# run on the GPU when one is visible, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# number of positions in the model's reconstruction, used as the interpolation source length
curr_len = 48  # this is a constant
# sweep init_cnn_filters over five seeds while the other hyperparameters stay fixed
df_recons = []
for seed in range(5):
    print('.', end='')
    # randomly split the data into train and test, select from a pool 100x batch size for speed
    torch.manual_seed(seed); np.random.seed(seed)
    # cap the pool at the dataset size so sampling without replacement cannot raise on small data
    pool_size = min(len(X_tras), round(100*batch_size))
    idxs_pool = np.random.choice(range(len(X_tras)), size=pool_size, replace=False)
    idxs_train = np.random.choice(idxs_pool, size=round(len(idxs_pool)*0.75), replace=False)
    idxs_test = idxs_pool[~np.isin(idxs_pool, idxs_train)]
    X_tras_train = X_tras[idxs_train]
    X_tras_test = X_tras[idxs_test]
    # the held-out sequences depend only on the split, so look them up once per seed
    tras_test = pd.Series(tras.iloc[idxs_test])
    true_lens = tras_test.apply(len)
    # create a latent space for the TRAs
    train_loader = DataLoader(dataset=TensorDataset(X_tras_train), batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(dataset=TensorDataset(X_tras_test), batch_size=batch_size, shuffle=False)
    # work through all model parameters
    secn_cnn_filters, latent_dim, n_nodes_len = 256, 32, 32
    for init_cnn_filters in [64, 128, 256, 512, 1024]:
        # initialize the model (ConvVAE picks up the current hyperparameter globals)
        model = ConvVAE().to(device)
        # set the seed for training
        torch.manual_seed(seed); np.random.seed(seed)
        # set the learning parameters
        lr = 0.0005; epochs = 20
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        # train the model with dual losses to balance between two objectives
        for epoch in range(1, epochs + 1):
            train(model, train_loader, epoch, loss_func=1)
            train(model, train_loader, epoch, loss_func=2)

        # set the seed for training
        torch.manual_seed(seed); np.random.seed(seed)
        # set the learning parameters
        lr = 0.001; epochs = 40
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        # train the model with an integrated loss
        for epoch in range(1, epochs + 1):
            train(model, train_loader, epoch, loss_func=3)

        # retrieve the predictions
        model.eval()
        recon_lens = []; recon_batchs = []
        with torch.no_grad():
            for batch in test_loader:
                batch = batch[0].to(device)
                (recon_batch, recon_len), _, _ = model(batch)
                recon_batchs.extend(recon_batch.cpu().numpy())
                # recon_len has one column; keep it as a flat list of floats
                recon_lens.extend(recon_len[:, 0].cpu().tolist())

        # test reconstruction; rows are collected in a list and the frame is built once,
        # which avoids the slow row-by-row enlargement of a DataFrame
        records = []
        for pred_len, true_len, true_seq, recon in zip(recon_lens, true_lens, tras_test, recon_batchs):
            records.append((
                pred_len, true_len, true_seq,
                reconstruct(recon, pred_len, curr_len),
                reconstruct(recon, true_len, curr_len),
            ))
        df_recon = pd.DataFrame(records, columns=['pred_len','true_len','true_seq','pred_seq_from_pred_len','pred_seq_from_true_len'])
        # assess via multiple metrics
        df_recon['pred_len_diff'] = df_recon['pred_len'] - df_recon['true_len']
        df_recon['leven_to_pred_len_recon'] = [levenshtein(t, p) for t, p in zip(df_recon['true_seq'], df_recon['pred_seq_from_pred_len'])]
        df_recon['leven_to_true_len_recon'] = [levenshtein(t, p) for t, p in zip(df_recon['true_seq'], df_recon['pred_seq_from_true_len'])]
        # record the configuration that produced these rows
        df_recon['init_cnn_filters'] = init_cnn_filters
        df_recon['secn_cnn_filters'] = secn_cnn_filters
        df_recon['latent_dim'] = latent_dim
        df_recon['n_nodes_len'] = n_nodes_len
        df_recon['seed'] = seed
        df_recons.append(df_recon)
# concatenate the data in order to examine the sequences
df_recon_init = pd.concat(df_recons, axis=0).reset_index(drop=True)

In [None]:
# sweep secn_cnn_filters over five seeds while the other hyperparameters stay fixed
# run on the GPU when one is visible, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# number of positions in the model's reconstruction, used as the interpolation source length
curr_len = 48  # this is a constant
df_recons = []
for seed in range(5):
    print('.', end='')
    # randomly split the data into train and test, select from a pool 100x batch size for speed
    torch.manual_seed(seed); np.random.seed(seed)
    # cap the pool at the dataset size so sampling without replacement cannot raise on small data
    pool_size = min(len(X_tras), round(100*batch_size))
    idxs_pool = np.random.choice(range(len(X_tras)), size=pool_size, replace=False)
    idxs_train = np.random.choice(idxs_pool, size=round(len(idxs_pool)*0.75), replace=False)
    idxs_test = idxs_pool[~np.isin(idxs_pool, idxs_train)]
    X_tras_train = X_tras[idxs_train]
    X_tras_test = X_tras[idxs_test]
    # the held-out sequences depend only on the split, so look them up once per seed
    tras_test = pd.Series(tras.iloc[idxs_test])
    true_lens = tras_test.apply(len)
    # create a latent space for the TRAs
    train_loader = DataLoader(dataset=TensorDataset(X_tras_train), batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(dataset=TensorDataset(X_tras_test), batch_size=batch_size, shuffle=False)
    # work through all model parameters
    init_cnn_filters, latent_dim, n_nodes_len = 256, 32, 32
    for secn_cnn_filters in [64, 128, 256, 512, 1024]:
        # initialize the model (ConvVAE picks up the current hyperparameter globals)
        model = ConvVAE().to(device)
        # set the seed for training
        torch.manual_seed(seed); np.random.seed(seed)
        # set the learning parameters
        lr = 0.0005; epochs = 20
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        # train the model with dual losses to balance between two objectives
        for epoch in range(1, epochs + 1):
            train(model, train_loader, epoch, loss_func=1)
            train(model, train_loader, epoch, loss_func=2)

        # set the seed for training
        torch.manual_seed(seed); np.random.seed(seed)
        # set the learning parameters
        lr = 0.001; epochs = 40
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        # train the model with an integrated loss
        for epoch in range(1, epochs + 1):
            train(model, train_loader, epoch, loss_func=3)

        # retrieve the predictions
        model.eval()
        recon_lens = []; recon_batchs = []
        with torch.no_grad():
            for batch in test_loader:
                batch = batch[0].to(device)
                (recon_batch, recon_len), _, _ = model(batch)
                recon_batchs.extend(recon_batch.cpu().numpy())
                # recon_len has one column; keep it as a flat list of floats
                recon_lens.extend(recon_len[:, 0].cpu().tolist())

        # test reconstruction; rows are collected in a list and the frame is built once,
        # which avoids the slow row-by-row enlargement of a DataFrame
        records = []
        for pred_len, true_len, true_seq, recon in zip(recon_lens, true_lens, tras_test, recon_batchs):
            records.append((
                pred_len, true_len, true_seq,
                reconstruct(recon, pred_len, curr_len),
                reconstruct(recon, true_len, curr_len),
            ))
        df_recon = pd.DataFrame(records, columns=['pred_len','true_len','true_seq','pred_seq_from_pred_len','pred_seq_from_true_len'])
        # assess via multiple metrics
        df_recon['pred_len_diff'] = df_recon['pred_len'] - df_recon['true_len']
        df_recon['leven_to_pred_len_recon'] = [levenshtein(t, p) for t, p in zip(df_recon['true_seq'], df_recon['pred_seq_from_pred_len'])]
        df_recon['leven_to_true_len_recon'] = [levenshtein(t, p) for t, p in zip(df_recon['true_seq'], df_recon['pred_seq_from_true_len'])]
        # record the configuration that produced these rows
        df_recon['init_cnn_filters'] = init_cnn_filters
        df_recon['secn_cnn_filters'] = secn_cnn_filters
        df_recon['latent_dim'] = latent_dim
        df_recon['n_nodes_len'] = n_nodes_len
        df_recon['seed'] = seed
        df_recons.append(df_recon)
# concatenate the data in order to examine the sequences
df_recon_secn = pd.concat(df_recons, axis=0).reset_index(drop=True)

In [None]:
# sweep latent_dim over five seeds while the other hyperparameters stay fixed
# run on the GPU when one is visible, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# number of positions in the model's reconstruction, used as the interpolation source length
curr_len = 48  # this is a constant
df_recons = []
for seed in range(5):
    print('.', end='')
    # randomly split the data into train and test, select from a pool 100x batch size for speed
    torch.manual_seed(seed); np.random.seed(seed)
    # cap the pool at the dataset size so sampling without replacement cannot raise on small data
    pool_size = min(len(X_tras), round(100*batch_size))
    idxs_pool = np.random.choice(range(len(X_tras)), size=pool_size, replace=False)
    idxs_train = np.random.choice(idxs_pool, size=round(len(idxs_pool)*0.75), replace=False)
    idxs_test = idxs_pool[~np.isin(idxs_pool, idxs_train)]
    X_tras_train = X_tras[idxs_train]
    X_tras_test = X_tras[idxs_test]
    # the held-out sequences depend only on the split, so look them up once per seed
    tras_test = pd.Series(tras.iloc[idxs_test])
    true_lens = tras_test.apply(len)
    # create a latent space for the TRAs
    train_loader = DataLoader(dataset=TensorDataset(X_tras_train), batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(dataset=TensorDataset(X_tras_test), batch_size=batch_size, shuffle=False)
    # work through all model parameters
    init_cnn_filters, secn_cnn_filters, n_nodes_len = 256, 256, 32
    for latent_dim in [8, 16, 32, 64, 128]:
        # initialize the model (ConvVAE picks up the current hyperparameter globals)
        model = ConvVAE().to(device)
        # set the seed for training
        torch.manual_seed(seed); np.random.seed(seed)
        # set the learning parameters
        lr = 0.0005; epochs = 20
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        # train the model with dual losses to balance between two objectives
        for epoch in range(1, epochs + 1):
            train(model, train_loader, epoch, loss_func=1)
            train(model, train_loader, epoch, loss_func=2)

        # set the seed for training
        torch.manual_seed(seed); np.random.seed(seed)
        # set the learning parameters
        lr = 0.001; epochs = 40
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        # train the model with an integrated loss
        for epoch in range(1, epochs + 1):
            train(model, train_loader, epoch, loss_func=3)

        # retrieve the predictions
        model.eval()
        recon_lens = []; recon_batchs = []
        with torch.no_grad():
            for batch in test_loader:
                batch = batch[0].to(device)
                (recon_batch, recon_len), _, _ = model(batch)
                recon_batchs.extend(recon_batch.cpu().numpy())
                # recon_len has one column; keep it as a flat list of floats
                recon_lens.extend(recon_len[:, 0].cpu().tolist())

        # test reconstruction; rows are collected in a list and the frame is built once,
        # which avoids the slow row-by-row enlargement of a DataFrame
        records = []
        for pred_len, true_len, true_seq, recon in zip(recon_lens, true_lens, tras_test, recon_batchs):
            records.append((
                pred_len, true_len, true_seq,
                reconstruct(recon, pred_len, curr_len),
                reconstruct(recon, true_len, curr_len),
            ))
        df_recon = pd.DataFrame(records, columns=['pred_len','true_len','true_seq','pred_seq_from_pred_len','pred_seq_from_true_len'])
        # assess via multiple metrics
        df_recon['pred_len_diff'] = df_recon['pred_len'] - df_recon['true_len']
        df_recon['leven_to_pred_len_recon'] = [levenshtein(t, p) for t, p in zip(df_recon['true_seq'], df_recon['pred_seq_from_pred_len'])]
        df_recon['leven_to_true_len_recon'] = [levenshtein(t, p) for t, p in zip(df_recon['true_seq'], df_recon['pred_seq_from_true_len'])]
        # record the configuration that produced these rows
        df_recon['init_cnn_filters'] = init_cnn_filters
        df_recon['secn_cnn_filters'] = secn_cnn_filters
        df_recon['latent_dim'] = latent_dim
        df_recon['n_nodes_len'] = n_nodes_len
        df_recon['seed'] = seed
        df_recons.append(df_recon)
# concatenate the data in order to examine the sequences
df_recon_ldim = pd.concat(df_recons, axis=0).reset_index(drop=True)

In [None]:
# sweep n_nodes_len over five seeds while the other hyperparameters stay fixed
# run on the GPU when one is visible, otherwise on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# number of positions in the model's reconstruction, used as the interpolation source length
curr_len = 48  # this is a constant
df_recons = []
for seed in range(5):
    print('.', end='')
    # randomly split the data into train and test, select from a pool 100x batch size for speed
    torch.manual_seed(seed); np.random.seed(seed)
    # cap the pool at the dataset size so sampling without replacement cannot raise on small data
    pool_size = min(len(X_tras), round(100*batch_size))
    idxs_pool = np.random.choice(range(len(X_tras)), size=pool_size, replace=False)
    idxs_train = np.random.choice(idxs_pool, size=round(len(idxs_pool)*0.75), replace=False)
    idxs_test = idxs_pool[~np.isin(idxs_pool, idxs_train)]
    X_tras_train = X_tras[idxs_train]
    X_tras_test = X_tras[idxs_test]
    # the held-out sequences depend only on the split, so look them up once per seed
    tras_test = pd.Series(tras.iloc[idxs_test])
    true_lens = tras_test.apply(len)
    # create a latent space for the TRAs
    train_loader = DataLoader(dataset=TensorDataset(X_tras_train), batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(dataset=TensorDataset(X_tras_test), batch_size=batch_size, shuffle=False)
    # work through all model parameters
    init_cnn_filters, secn_cnn_filters, latent_dim = 256, 256, 32
    for n_nodes_len in [8, 16, 32, 64, 128]:
        # initialize the model (ConvVAE picks up the current hyperparameter globals)
        model = ConvVAE().to(device)
        # set the seed for training
        torch.manual_seed(seed); np.random.seed(seed)
        # set the learning parameters
        lr = 0.0005; epochs = 20
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        # train the model with dual losses to balance between two objectives
        for epoch in range(1, epochs + 1):
            train(model, train_loader, epoch, loss_func=1)
            train(model, train_loader, epoch, loss_func=2)

        # set the seed for training
        torch.manual_seed(seed); np.random.seed(seed)
        # set the learning parameters
        lr = 0.001; epochs = 40
        optimizer = torch.optim.Adam(model.parameters(), lr=lr)
        # train the model with an integrated loss
        for epoch in range(1, epochs + 1):
            train(model, train_loader, epoch, loss_func=3)

        # retrieve the predictions
        model.eval()
        recon_lens = []; recon_batchs = []
        with torch.no_grad():
            for batch in test_loader:
                batch = batch[0].to(device)
                (recon_batch, recon_len), _, _ = model(batch)
                recon_batchs.extend(recon_batch.cpu().numpy())
                # recon_len has one column; keep it as a flat list of floats
                recon_lens.extend(recon_len[:, 0].cpu().tolist())

        # test reconstruction; rows are collected in a list and the frame is built once,
        # which avoids the slow row-by-row enlargement of a DataFrame
        records = []
        for pred_len, true_len, true_seq, recon in zip(recon_lens, true_lens, tras_test, recon_batchs):
            records.append((
                pred_len, true_len, true_seq,
                reconstruct(recon, pred_len, curr_len),
                reconstruct(recon, true_len, curr_len),
            ))
        df_recon = pd.DataFrame(records, columns=['pred_len','true_len','true_seq','pred_seq_from_pred_len','pred_seq_from_true_len'])
        # assess via multiple metrics
        df_recon['pred_len_diff'] = df_recon['pred_len'] - df_recon['true_len']
        df_recon['leven_to_pred_len_recon'] = [levenshtein(t, p) for t, p in zip(df_recon['true_seq'], df_recon['pred_seq_from_pred_len'])]
        df_recon['leven_to_true_len_recon'] = [levenshtein(t, p) for t, p in zip(df_recon['true_seq'], df_recon['pred_seq_from_true_len'])]
        # record the configuration that produced these rows
        df_recon['init_cnn_filters'] = init_cnn_filters
        df_recon['secn_cnn_filters'] = secn_cnn_filters
        df_recon['latent_dim'] = latent_dim
        df_recon['n_nodes_len'] = n_nodes_len
        df_recon['seed'] = seed
        df_recons.append(df_recon)
# concatenate the data in order to examine the sequences
df_recon_nlen = pd.concat(df_recons, axis=0).reset_index(drop=True)

In [None]:
# write each search to the disk
gridsearch_outputs = [
    (df_recon_init, '../outs/matrix.gridsearch_init.final.tra.csv'),
    (df_recon_secn, '../outs/matrix.gridsearch_secn.final.tra.csv'),
    (df_recon_ldim, '../outs/matrix.gridsearch_ldim.final.tra.csv'),
    (df_recon_nlen, '../outs/matrix.gridsearch_nlen.final.tra.csv'),
]
for frame, out_path in gridsearch_outputs:
    frame.to_csv(out_path)

In [None]:
# examine different first-layer filter counts: average each metric per (setting, seed)
data = df_recon_init.groupby(['init_cnn_filters','seed']).mean(numeric_only=True).reset_index()
# absolute value of the mean signed length error
data['pred_len_diff_abs'] = data['pred_len_diff'].abs()
for col in ['pred_len_diff_abs','leven_to_pred_len_recon','leven_to_true_len_recon']:
    # re-seed before every figure (presumably to keep the strip-plot jitter reproducible)
    np.random.seed(0)
    fig, ax = plt.subplots(figsize=[3.5, 4])
    ax.grid(False)
    sns.boxplot(
        data=data, x='init_cnn_filters', y=col, ax=ax,
        saturation=1, linecolor='dodgerblue', color='skyblue', linewidth=1.5, showfliers=False,
    )
    sns.stripplot(
        data=data, x='init_cnn_filters', y=col, ax=ax,
        edgecolor='dodgerblue', color='skyblue', linewidth=1.5, jitter=0.25, alpha=0.5,
    )
    ax.set_xlim(-1, 5)

In [None]:
# retrieve p-values: Kruskal-Wallis test across the init_cnn_filters settings in `data`
for col in ['pred_len_diff','leven_to_pred_len_recon','leven_to_true_len_recon']:
    samples = []
    for level in data['init_cnn_filters'].unique():
        # one sample of absolute per-seed values for each setting
        samples.append(data.loc[data['init_cnn_filters'] == level, col].abs().tolist())
    res = ss.kruskal(*samples)
    print(col, res[1])

In [None]:
# examine different second-layer filter counts: average each metric per (setting, seed)
data = df_recon_secn.groupby(['secn_cnn_filters','seed']).mean(numeric_only=True).reset_index()
# absolute value of the mean signed length error
data['pred_len_diff_abs'] = data['pred_len_diff'].abs()
for col in ['pred_len_diff_abs','leven_to_pred_len_recon','leven_to_true_len_recon']:
    # re-seed before every figure (presumably to keep the strip-plot jitter reproducible)
    np.random.seed(0)
    fig, ax = plt.subplots(figsize=[3.5, 4])
    ax.grid(False)
    sns.boxplot(
        data=data, x='secn_cnn_filters', y=col, ax=ax,
        saturation=1, linecolor='dodgerblue', color='skyblue', linewidth=1.5, showfliers=False,
    )
    sns.stripplot(
        data=data, x='secn_cnn_filters', y=col, ax=ax,
        edgecolor='dodgerblue', color='skyblue', linewidth=1.5, jitter=0.25, alpha=0.5,
    )
    ax.set_xlim(-1, 5)

In [None]:
# retrieve p-values: Kruskal-Wallis test across the secn_cnn_filters settings in `data`
for col in ['pred_len_diff','leven_to_pred_len_recon','leven_to_true_len_recon']:
    samples = []
    for level in data['secn_cnn_filters'].unique():
        # one sample of absolute per-seed values for each setting
        samples.append(data.loc[data['secn_cnn_filters'] == level, col].abs().tolist())
    res = ss.kruskal(*samples)
    print(col, res[1])

In [None]:
# examine different latent dimensions: average each metric per (setting, seed)
data = df_recon_ldim.groupby(['latent_dim','seed']).mean(numeric_only=True).reset_index()
# absolute value of the mean signed length error
data['pred_len_diff_abs'] = data['pred_len_diff'].abs()
for col in ['pred_len_diff_abs','leven_to_pred_len_recon','leven_to_true_len_recon']:
    # re-seed before every figure (presumably to keep the strip-plot jitter reproducible)
    np.random.seed(0)
    fig, ax = plt.subplots(figsize=[3.5, 4])
    ax.grid(False)
    sns.boxplot(
        data=data, x='latent_dim', y=col, ax=ax,
        saturation=1, linecolor='dodgerblue', color='skyblue', linewidth=1.5, showfliers=False,
    )
    sns.stripplot(
        data=data, x='latent_dim', y=col, ax=ax,
        edgecolor='dodgerblue', color='skyblue', linewidth=1.5, jitter=0.25, alpha=0.5,
    )
    ax.set_xlim(-1, 5)

In [None]:
# retrieve p-values: Kruskal-Wallis test across the latent_dim settings in `data`
for col in ['pred_len_diff','leven_to_pred_len_recon','leven_to_true_len_recon']:
    samples = []
    for level in data['latent_dim'].unique():
        # one sample of absolute per-seed values for each setting
        samples.append(data.loc[data['latent_dim'] == level, col].abs().tolist())
    res = ss.kruskal(*samples)
    print(col, res[1])

In [None]:
# examine different hidden widths of the length head: average each metric per (setting, seed)
data = df_recon_nlen.groupby(['n_nodes_len','seed']).mean(numeric_only=True).reset_index()
# absolute value of the mean signed length error
data['pred_len_diff_abs'] = data['pred_len_diff'].abs()
for col in ['pred_len_diff_abs','leven_to_pred_len_recon','leven_to_true_len_recon']:
    # re-seed before every figure (presumably to keep the strip-plot jitter reproducible)
    np.random.seed(0)
    fig, ax = plt.subplots(figsize=[3.5, 4])
    ax.grid(False)
    sns.boxplot(
        data=data, x='n_nodes_len', y=col, ax=ax,
        saturation=1, linecolor='dodgerblue', color='skyblue', linewidth=1.5, showfliers=False,
    )
    sns.stripplot(
        data=data, x='n_nodes_len', y=col, ax=ax,
        edgecolor='dodgerblue', color='skyblue', linewidth=1.5, jitter=0.25, alpha=0.5,
    )
    ax.set_xlim(-1, 5)

In [None]:
# retrieve p-values: Kruskal-Wallis test across the n_nodes_len settings in `data`
for col in ['pred_len_diff','leven_to_pred_len_recon','leven_to_true_len_recon']:
    samples = []
    for level in data['n_nodes_len'].unique():
        # one sample of absolute per-seed values for each setting
        samples.append(data.loc[data['n_nodes_len'] == level, col].abs().tolist())
    res = ss.kruskal(*samples)
    print(col, res[1])

### Examine Best Hyperparameters

In [None]:
def train(epoch, loss_func):
    """Run one silent training epoch over the global `train_loader`.

    Relies on the notebook-level `model`, `optimizer`, `train_loader` and `device`.

    Parameters
    ----------
    epoch : int
        Unused; kept for signature compatibility.
    loss_func : int
        Which objective to optimise: 1 -> `loss_function1`, 2 -> `loss_function2`,
        3 -> `loss_function3`.

    Returns
    -------
    float
        Summed batch losses divided by the number of samples in the training set.

    Raises
    ------
    ValueError
        If `loss_func` is not 1, 2 or 3 (previously this surfaced as an unbound
        `loss` variable inside the loop).
    """
    loss_fns = {1: loss_function1, 2: loss_function2, 3: loss_function3}
    if loss_func not in loss_fns:
        raise ValueError('loss_func must be 1, 2 or 3, got {!r}'.format(loss_func))
    compute_loss = loss_fns[loss_func]

    model.train()
    train_loss = 0
    for batch in train_loader:
        # the loader yields 1-tuples; element 0 is the data tensor
        data = batch[0].to(device)
        optimizer.zero_grad()
        (recon_batch, recon_len), mu, logvar = model(data)
        loss = compute_loss(recon_batch, recon_len, data, mu, logvar)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
    return train_loss / len(train_loader.dataset)
    
def test(epoch, loss_func):
    """Silently evaluate the global `model` on the global `test_loader`.

    Relies on the notebook-level `model`, `test_loader` and `device`. No gradients
    are tracked during evaluation.

    Parameters
    ----------
    epoch : int
        Unused; kept so the signature mirrors `train`.
    loss_func : int
        Which objective to evaluate: 1 -> `loss_function1`, 2 -> `loss_function2`,
        3 -> `loss_function3`.

    Returns
    -------
    float
        Summed batch losses divided by the number of samples in the test set.

    Raises
    ------
    ValueError
        If `loss_func` is not 1, 2 or 3 (previously such a value silently produced
        a loss of 0).
    """
    loss_fns = {1: loss_function1, 2: loss_function2, 3: loss_function3}
    if loss_func not in loss_fns:
        raise ValueError('loss_func must be 1, 2 or 3, got {!r}'.format(loss_func))
    compute_loss = loss_fns[loss_func]

    model.eval()
    test_loss = 0
    with torch.no_grad():
        for batch in test_loader:
            # the loader yields 1-tuples; element 0 is the data tensor
            data = batch[0].to(device)
            (recon_batch, recon_len), mu, logvar = model(data)
            test_loss += compute_loss(recon_batch, recon_len, data, mu, logvar).item()
    return test_loss / len(test_loader.dataset)

In [None]:
import pickle as pkl
# define the best hyperparameters / lowest we can get away with
# NOTE(review): presumably ConvVAE() picks these up as notebook globals -- confirm against its definition
init_cnn_filters = 256
secn_cnn_filters = 256
latent_dim = 32
n_nodes_len = 32
# define standard testing parameters
batch_size = 2048
device = 'cuda'
# one entry per seed: loss curves of both training stages and the scored reconstructions
train_losses_12, test_losses_12, train_losses_3, test_losses_3, df_recons = [], [], [], [], []
for seed in range(5):
    # randomly split the data into train (75%) and test (remaining 25%)
    print('splitting the data...', end='')
    torch.manual_seed(seed); np.random.seed(seed)
    idxs_train = np.random.choice(range(len(X_tras)), size=round(len(X_tras)*0.75), replace=False)
    idxs_test = np.array(range(len(X_tras)))
    idxs_test = idxs_test[~np.isin(idxs_test, idxs_train)]
    X_tras_train = X_tras[idxs_train]
    X_tras_test = X_tras[idxs_test]
    # create dataloaders; the test loader is not shuffled so predictions stay aligned with idxs_test
    train_loader = DataLoader(dataset=TensorDataset(X_tras_train), batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(dataset=TensorDataset(X_tras_test), batch_size=batch_size, shuffle=False)
    # initialize the model
    print('creating the model...', end='')
    model = ConvVAE().to(device)

    # stage 1: every epoch runs a full pass with loss #1, then a full pass with loss #2,
    # evaluating on the test split after each pass
    torch.manual_seed(seed); np.random.seed(seed)
    lr = 0.0005; epochs = 20
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    train_losses = []; test_losses = []
    for epoch in range(1, epochs + 1):
        train_losses_, test_losses_ = [], []
        for loss_func in [1, 2]:
            train_losses_.append(train(epoch, loss_func))
            test_losses_.append(test(epoch, loss_func))
        train_losses.append(train_losses_)
        test_losses.append(test_losses_)
    train_losses_12.append(train_losses)
    test_losses_12.append(test_losses)

    # stage 2: continue training the same model with loss #3 and a fresh Adam optimizer
    torch.manual_seed(seed); np.random.seed(seed)
    lr = 0.001; epochs = 40
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    train_losses = []; test_losses = []
    for epoch in range(1, epochs + 1):
        train_losses.append(train(epoch, loss_func=3))
        test_losses.append(test(epoch, loss_func=3))
    train_losses_3.append(train_losses)
    test_losses_3.append(test_losses)

    # compute additional metrics via predictions on the held-out split
    model.eval()
    print('testing...', end='')
    recon_lens = []; recon_batchs = []
    with torch.no_grad():
        for i, data in enumerate(test_loader):
            data = data[0].to(device)
            (recon_batch, recon_len), _, _ = model(data)
            recon_batchs.extend(recon_batch.clone().detach().cpu().numpy())
            recon_lens.extend(recon_len.clone().detach().cpu().tolist())
    # every predicted length arrives as a one-element list; unwrap it
    recon_lens = [x[0] for x in recon_lens]
    # retrieve the original sequences of the test split
    tras_test = pd.Series(tras.iloc[idxs_test])

    # set parameters for reconstruction
    print('scoring the data...', end='')
    curr_len = 48  # this is a constant
    true_lens = tras_test.apply(len)
    n_sequences = len(tras_test)
    # score at most 1000 held-out sequences; min() avoids the ValueError that
    # choice(..., replace=False) raises when fewer than 1000 are available
    values = np.random.choice(range(n_sequences), size=min(1000, n_sequences), replace=False)
    # collect rows first and build the frame once instead of growing it row by row
    recon_columns = ['pred_len','true_len','true_seq','pred_seq_from_pred_len','pred_seq_from_true_len']
    records = []
    for idx in values:
        pred_len = recon_lens[idx]
        true_len = true_lens.iloc[idx]
        true_seq = tras_test.iloc[idx]
        # reconstruct once with the predicted length and once with the true length
        recon_seq_from_pred_len = reconstruct(recon_batchs[idx], pred_len, curr_len)
        recon_seq_from_true_len = reconstruct(recon_batchs[idx], true_len, curr_len)
        records.append((pred_len, true_len, true_seq, recon_seq_from_pred_len, recon_seq_from_true_len))
    df_recon = pd.DataFrame(records, index=values, columns=recon_columns)
    # assess via multiple metrics
    print('saving this round...')
    df_recon['pred_len_diff'] = df_recon['pred_len'] - df_recon['true_len']
    df_recon['leven_to_pred_len_recon'] = [
        levenshtein(true, pred) for true, pred in zip(df_recon['true_seq'], df_recon['pred_seq_from_pred_len'])]
    df_recon['leven_to_true_len_recon'] = [
        levenshtein(true, pred) for true, pred in zip(df_recon['true_seq'], df_recon['pred_seq_from_true_len'])]
    # tag every row with the configuration and seed that produced it
    df_recon['init_cnn_filters'] = init_cnn_filters
    df_recon['secn_cnn_filters'] = secn_cnn_filters
    df_recon['latent_dim'] = latent_dim
    df_recon['n_nodes_len'] = n_nodes_len
    df_recon['seed'] = seed
    df_recons.append(df_recon)
# stack all seeds into one frame with a fresh 0..n-1 index
df_recon = pd.concat(df_recons, axis=0).reset_index(drop=True)
df_recon['pred_len_diff_abs'] = df_recon['pred_len_diff'].abs()

In [None]:
# persist the per-seed loss curves and reconstruction tables to disk
import pickle as pkl
result = (
    train_losses_12, test_losses_12,
    train_losses_3, test_losses_3,
    df_recons,
)
with open('../models/tra.fiveiter.results.pkl', 'wb') as f:
    pkl.dump(result, f)

In [None]:
# reload the saved results and rebuild the combined reconstruction table
import pickle as pkl
with open('../models/tra.fiveiter.results.pkl', 'rb') as f:
    saved = pkl.load(f)
train_losses_12, test_losses_12, train_losses_3, test_losses_3, df_recons = saved
# stack all seeds into one frame with a fresh 0..n-1 index
df_recon = pd.concat(df_recons, axis=0).reset_index(drop=True)
df_recon['pred_len_diff_abs'] = df_recon['pred_len_diff'].abs()

In [None]:
# one narrow box + strip plot per metric, drawn from the per-seed means (one point per seed)
ys = ['pred_len_diff','pred_len_diff_abs','leven_to_pred_len_recon','leven_to_true_len_recon']
seed_means = df_recon.groupby('seed').mean(numeric_only=True)
box_style = dict(linewidth=1.5, saturation=1, showfliers=False, linecolor='dodgerblue', color='skyblue')
strip_style = dict(linewidth=1.5, s=6, alpha=0.5, color='skyblue', edgecolor='dodgerblue')
for y in ys:
    fig, ax = plt.subplots(figsize=[1.5, 4])
    ax.grid(False)
    sns.boxplot(y=y, data=seed_means, ax=ax, **box_style)
    sns.stripplot(y=y, data=seed_means, ax=ax, **strip_style)
    ax.set_xlim(-1, 1)

In [None]:
# bar plot of counts; bars are shaded by their share of the total unless a fixed color is given
def plot_bar(counts, cmap, edgecolor, labelrotation=90, figsize=None, color=None):
    """Draw a bar chart of ``counts`` on a new figure and return its axes.

    Parameters
    ----------
    counts : presumably a pandas Series (needs ``.index`` and ``.sum()``) -- TODO confirm
        Bar heights; the index supplies the x positions / labels.
    cmap : callable
        Maps each bar's fraction of the total to a color. Only used when
        ``color`` is None.
    edgecolor : color
        Edge color of the bars.
    labelrotation : float, default 90
        Rotation of the x tick labels in degrees.
    figsize : sequence of two numbers, optional
        Figure size; defaults to ``[6, 4]``.
    color : color, optional
        If given, every bar uses this color and ``cmap`` is ignored.

    Returns
    -------
    matplotlib Axes holding the bar chart.
    """
    # imported locally: in the visible part of the notebook, to_hex is only
    # imported in a cell after this definition
    from matplotlib.colors import to_hex

    if color is None:
        # shade each bar by its fraction of the total count
        fractions = counts / counts.sum()
        colors = [to_hex(cmap(frac)) for frac in fractions]
    else:
        colors = [color] * len(counts)
    if figsize is None:
        figsize = [6, 4]
    fig, ax = plt.subplots(figsize=figsize); ax.grid(False)
    ax.bar(counts.index, counts, edgecolor=edgecolor, lw=1.5, color=colors)
    ax.tick_params(axis='x', labelrotation=labelrotation)
    return ax

In [None]:
# matplotlib.cm.get_cmap was removed in Matplotlib 3.9; pyplot.get_cmap remains available
from matplotlib.pyplot import get_cmap
from matplotlib.colors import to_hex
# sequential colour map, also used by the per-seed plots further down
cmap = get_cmap('Blues')
fig, ax = plt.subplots(figsize=[6, 4]); ax.grid(False)
# count how often each Levenshtein distance occurs per seed, then stack the
# per-seed counts in long form so barplot shows the mean across seeds with a 95% CI
# NOTE(review): a distance that never occurs for a seed adds no row (rather than a 0),
# so its bar averages only over the seeds where it was observed
counts = []
for seed in range(5):
    count = df_recon.loc[df_recon['seed'] == seed, 'leven_to_pred_len_recon'].value_counts()
    counts.append(count.sort_index())
count = pd.concat(counts, axis=0).reset_index(); count.columns = ['x','y']
# show every integer distance between the observed min and max, including gaps
order = list(range(min(count['x']), max(count['x'])+1))
sns.barplot(x='x', y='y', data=count, order=order, edgecolor='dodgerblue', color='skyblue', linewidth=1.5,
            saturation=1, err_kws={'color': 'dodgerblue', 'linewidth': 1.5}, capsize=0.3, errorbar=('ci', 95),
            ax=ax)
ax.set(xlabel='Levenshtein distance prediction vs. truth', ylabel='N-Observations')
ax.set_xlim(-1, order[-1]+1)

In [None]:
# five random draws (with replacement) of sequences reconstructed exactly: Levenshtein distance 0
np.random.seed(0)
exact = df_recon['leven_to_pred_len_recon'].eq(0)
idxs = np.random.choice(df_recon.index[exact], size=5)
df_recon.loc[idxs, ['true_seq','pred_seq_from_pred_len']]

In [None]:
# five random draws (with replacement) of sequences that are one edit away: Levenshtein distance 1
np.random.seed(0)
one_edit = df_recon['leven_to_pred_len_recon'].eq(1)
idxs = np.random.choice(df_recon.index[one_edit], size=5)
df_recon.loc[idxs, ['true_seq','pred_seq_from_pred_len']]

In [None]:
# five random draws (with replacement) of sequences at Levenshtein distance 2
np.random.seed(0)
two_edits = df_recon['leven_to_pred_len_recon'].eq(2)
idxs = np.random.choice(df_recon.index[two_edits], size=5)
df_recon.loc[idxs, ['true_seq','pred_seq_from_pred_len']]

In [None]:
# five random draws (with replacement) of sequences at Levenshtein distance 3
np.random.seed(0)
three_edits = df_recon['leven_to_pred_len_recon'].eq(3)
idxs = np.random.choice(df_recon.index[three_edits], size=5)
df_recon.loc[idxs, ['true_seq','pred_seq_from_pred_len']]

In [None]:
# five random draws (with replacement) of sequences at Levenshtein distance 4
np.random.seed(0)
four_edits = df_recon['leven_to_pred_len_recon'].eq(4)
idxs = np.random.choice(df_recon.index[four_edits], size=5)
df_recon.loc[idxs, ['true_seq','pred_seq_from_pred_len']]

In [None]:
# five random draws (with replacement) of sequences at Levenshtein distance 5
np.random.seed(0)
five_edits = df_recon['leven_to_pred_len_recon'].eq(5)
idxs = np.random.choice(df_recon.index[five_edits], size=5)
df_recon.loc[idxs, ['true_seq','pred_seq_from_pred_len']]

In [None]:
# per-seed density of the absolute difference between predicted and true length
fig, ax = plt.subplots(figsize=[6, 4]); ax.grid(False)
for seed in range(5):
    values = df_recon.loc[df_recon['seed'] == seed, 'pred_len_diff_abs']
    # legend labels are 1-based; the colormap range is offset so the lightest shades are skipped
    sns.kdeplot(values, label=seed+1, color=to_hex(cmap((seed+2)/8)), lw=1.5, bw_method=0.2, ax=ax)
# kdeplot draws a density estimate, not raw counts, so label the y-axis accordingly
ax.set(xlabel='Absolute length difference (predicted vs. true)', ylabel='Density')
ax.legend(bbox_to_anchor=(.99, .5), bbox_transform=ax.transAxes, loc='center left', frameon=False)

In [None]:
# per-seed density of the signed difference between predicted and true length
fig, ax = plt.subplots(figsize=[6, 4]); ax.grid(False)
for seed in range(5):
    values = df_recon.loc[df_recon['seed'] == seed, 'pred_len_diff']
    # legend labels are 1-based; the colormap range is offset so the lightest shades are skipped
    sns.kdeplot(values, label=seed+1, color=to_hex(cmap((seed+2)/8)), lw=1.5, bw_method=0.2, ax=ax)
# this plots pred_len_diff (pred_len - true_len), not a Levenshtein distance, and
# kdeplot draws a density estimate rather than raw counts
ax.set(xlabel='Length difference (predicted - true)', ylabel='Density')
ax.legend(bbox_to_anchor=(.99, .5), bbox_transform=ax.transAxes, loc='center left', frameon=False)

In [None]:
# loss #1 per epoch for every seed: solid line = train, dashed line = test
fig, ax = plt.subplots(figsize=[4, 4]); ax.grid(False)
for idx, (losses_train, losses_test) in enumerate(zip(train_losses_12, test_losses_12)):
    seed_color = to_hex(cmap((idx+2)/8))
    # column 0 of the stage-1 curves holds loss #1
    losses_train = np.array(losses_train)[:, 0]
    losses_test = np.array(losses_test)[:, 0]
    ax.plot(losses_train, label=idx+1, color=seed_color, lw=1.5)
    ax.plot(losses_test, color=seed_color, lw=1.5, linestyle='--')
ax.set(xlabel='Epochs', ylabel='Loss #1')
ax.legend(bbox_to_anchor=(.99, .5), bbox_transform=ax.transAxes, loc='center left', frameon=False)

In [None]:
# loss #2 per epoch for every seed: solid line = train, dashed line = test
fig, ax = plt.subplots(figsize=[4, 4]); ax.grid(False)
for idx, (losses_train, losses_test) in enumerate(zip(train_losses_12, test_losses_12)):
    seed_color = to_hex(cmap((idx+2)/8))
    # column 1 of the stage-1 curves holds loss #2
    losses_train = np.array(losses_train)[:, 1]
    losses_test = np.array(losses_test)[:, 1]
    ax.plot(losses_train, label=idx+1, color=seed_color, lw=1.5)
    ax.plot(losses_test, color=seed_color, lw=1.5, linestyle='--')
ax.set(xlabel='Epochs', ylabel='Loss #2')
ax.legend(bbox_to_anchor=(.99, .5), bbox_transform=ax.transAxes, loc='center left', frameon=False)

In [None]:
# loss #3 (second training stage) per epoch for every seed: solid line = train, dashed line = test
fig, ax = plt.subplots(figsize=[4, 4]); ax.grid(False)
for idx, (losses_train, losses_test) in enumerate(zip(train_losses_3, test_losses_3)):
    seed_color = to_hex(cmap((idx+2)/8))
    line_style = dict(color=seed_color, lw=1.5)
    ax.plot(losses_train, label=idx+1, **line_style)
    ax.plot(losses_test, linestyle='--', **line_style)
ax.set(xlabel='Epochs', ylabel='Loss #3 (FineTune)')
ax.legend(bbox_to_anchor=(.99, .5), bbox_transform=ax.transAxes, loc='center left', frameon=False)