In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn import init
from torch.utils.data import *

from CNN import *
from GradCAMUtils import *
from Utils import *

import numpy as np
import pandas as pd
import seaborn as sns

import os

from fastai import *
from fastai.text import *
from fastai.vision import *
from fastai.imports import *

from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
import umap

from matplotlib.widgets import Button
from matplotlib.widgets import TextBox
from matplotlib.ticker import MultipleLocator
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab

from scipy.stats import norm
from scipy.stats import genextreme

In [2]:
"""Constants"""
# Maximum sequence length: every sequence is expected to fit in this many
# positions (original author's note: 626/798).
SEQUENCE_LENGTH = 798

# Default mini-batch size (individual cells may pass their own value).
BATCH_SIZE = 16

# Maps each sequence character to a 3-element indicator list; '-' maps to all
# zeros. Presumably C/H/E are Q3 secondary-structure states and '-' is padding
# -- TODO confirm against Map_Tokens.
vocab = {'C': [0,0,1], 'H': [0,1,0], 'E': [1,0,0], '-':[0,0,0]}

# Default figure size for every seaborn/matplotlib plot in this notebook.
sns.set(rc={'figure.figsize':(15,15)})

# Directories (relative to the notebook) for saved models and datasets.
model_path = Path("./Models/")
path = Path("./Datasets/")

# Show all rows and columns when displaying DataFrames (no truncation).
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Encoders that transform the string labels (fold / family) into integers.
le_fold = preprocessing.LabelEncoder()
le_fam = preprocessing.LabelEncoder()

# torch.cuda.set_device()
# Restrict this process to GPU index 1. This only takes effect if it is set
# before CUDA is initialised, so it must stay ahead of the query below.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

cuda_gpu = torch.cuda.is_available()   # check whether a GPU is available

# Release cached, unused GPU memory held by PyTorch's allocator.
torch.cuda.empty_cache()

In [3]:
# multitask dataset overwrite of Dataset
class MultitaskDataset(Dataset):
    """`Dataset` yielding an encoded sequence, its (fold, family) labels and a length term.

    Args:
        data: tensor of encoded sequences, indexed by sample along dim 0.
        labels_fold: tensor with one fold label per sample.
        labels_fam: tensor with one family label per sample.
        paddings: tensor with one padding count per sample.
        cuda: when true, every returned tensor is moved to the GPU on access.
        sequence_length: padded sequence length used for the length term
            (defaults to 798, the value that used to be hard-coded here).

    Each item is a tuple ``(x, y, p)`` where ``x`` is the float sequence,
    ``y`` is the float tensor ``[fold, family]`` and
    ``p = sequence_length - 2 * paddings``.
    Presumably ``p`` is the unpadded length under symmetric padding -- TODO confirm.
    """
    def __init__(self, data, labels_fold, labels_fam, paddings, cuda = True, sequence_length = 798):
        # `.float()` already yields float32 tensors; no extra FloatTensor wrapping needed.
        self.data = data.float()
        self.y_fam = labels_fam.float()
        self.y_fold = labels_fold.float()
        self.paddings = sequence_length - 2 * paddings.float()

        self.cuda = cuda

    def __len__(self): return len(self.data)

    def __getitem__(self, i:int):
        sequence = self.data[i]
        # Label order is (fold, family); downstream code unpacks it positionally.
        labels = torch.stack([self.y_fold[i], self.y_fam[i]])
        length_term = self.paddings[i]
        if self.cuda:
            return sequence.cuda(), labels.cuda(), length_term.cuda()
        return sequence, labels, length_term

# a helper function to load the data into custom dataset
def Dataset_Loader(df, le_fam, le_fold, vocab, BATCH_SIZE, cuda = True, fit = True):
    """Build a `MultitaskDataset` and a non-shuffling `DataLoader` from a DataFrame.

    Args:
        df: frame providing the columns `q3seqTokens`, `fold`, `family` and `paddings`.
        le_fam: label encoder applied to the `family` column.
        le_fold: label encoder applied to the `fold` column.
        vocab: token vocabulary handed to `Map_Tokens` (helper defined outside this file).
        BATCH_SIZE: batch size of the returned loader.
        cuda: forwarded to `MultitaskDataset`.
        fit: when true (default, the previous behaviour) both encoders are
            re-fitted on `df`. Pass false to reuse already-fitted encoders so
            that several splits share one label-to-integer mapping; labels the
            encoder has not seen then raise `ValueError`.

    Returns:
        `(dataset, dataloader)`. The loader keeps the row order of `df`.
    """
    # Re-fitting per split gives every split its own integer codes, so callers
    # that compare labels across splits should fit once and pass fit=False after.
    encode_fold = le_fold.fit_transform if fit else le_fold.transform
    encode_fam = le_fam.fit_transform if fit else le_fam.transform

    x_train = torch.LongTensor(Map_Tokens(df.q3seqTokens, vocab))
    y_train_fold = torch.LongTensor(encode_fold(df["fold"].values.ravel()))
    y_train_fam = torch.LongTensor(encode_fam(df["family"].values.ravel()))
    paddings = torch.LongTensor(df["paddings"].values.ravel())

    ds = MultitaskDataset(x_train, y_train_fold, y_train_fam, paddings, cuda)
    dl = DataLoader(
        ds,
        batch_size=BATCH_SIZE,
        shuffle=False,
        pin_memory=False)
    return ds, dl

In [4]:
# load pretrained final model
# NOTE: unpickling executes code stored in the file -- only load trusted checkpoints.
# The `with` block closes the file handle, which the bare open() call never did.
with open("../PretrainedModels/CNNAttention.pickle", 'rb') as pretrained_file:
    model = pickle.load(pretrained_file)

In [None]:
# cluster definition
# Each list holds the family names assigned to one cluster; the prefix
# (GTA/GTB/GTC) matches the fold suffix of the family names it contains.
GTA_0 = ["GT14-A", "GT16-A", "GT2-A","GT25-A","GT45-A", "GT49-A", "GT60-A"]
GTA_1 = ["GT15-A","GT17-A","GT31-A","GT34-A","GT43-A","GT6-A","GT62-A","GT67-A","GT7-A","GT77-A"]
GTB_0 = ["GT1-B","GT10-B","GT20-B","GT28-B","GT37-B","GT38-B","GT4-B","GT47-B","GT5-B","GT63-B","GT72-B","GT79-B","GT9-B","GT90-B","GT93-B"]
GTB_1 = ["GT23-B","GT3-B","GT35-B","GT41-B"]
# This list was previously also bound to GTB_1, silently discarding the one above.
GTB_2 = ["GT104-B","GT30-B","GT70-B"]
GTC_0 = ["GT39-C","GT57-C","GT66-C"]
GTC_1 = ["GT50-C","GT58-C","GT87-C"]
GTC_2 = ["GT22-C","GT83-C"]

In [5]:
# load data
df = pd.read_csv("../Datasets/gt_training.non_augmented.csv")

# Rows whose family is in GTA_0 form the pool that is split into train/validation.
# The original code read from `df_large`, which is never defined in this
# notebook; `df` is the frame loaded above.
df_cluter = df.loc[df['family'].isin(GTA_0)]

train_df, val_df = Train_Test_Val_split(df_cluter, test_size=0.05, shuffle=False)
# Test rows: fold "A" sequences whose family is NOT in GTA_0. A single combined
# mask replaces the chained .loc calls that indexed with a mis-sized boolean Series.
test_df = df.loc[(df["fold"] == "A") & ~df['family'].isin(GTA_0)]

# NOTE(review): "../Datasets/..." is a placeholder path -- fill in the real file.
df_gtu = pd.read_csv("../Datasets/...")

Train_ds, Train_dl = Dataset_Loader(train_df, le_fam, le_fold, vocab, BATCH_SIZE=20, cuda = cuda_gpu)

Val_ds, Val_dl = Dataset_Loader(val_df, le_fam, le_fold, vocab, BATCH_SIZE=20, cuda = cuda_gpu)

Test_ds, Test_dl = Dataset_Loader(test_df, le_fam, le_fold, vocab, BATCH_SIZE=20, cuda = cuda_gpu)

Val_u_ds, Val_u_dl = Dataset_Loader(df_gtu, le_fam, le_fold, vocab, BATCH_SIZE=20, cuda = cuda_gpu)

19912 2490 2490


In [12]:
# One subset of `df` per fold label, selected with a boolean mask on the "fold" column.
df_A, df_B, df_C, df_Lyso = (
    df[df["fold"].eq(fold_label)] for fold_label in ("A", "B", "C", "lyso")
)

In [6]:
class autoencoder(nn.Module):
    """Frozen encoder followed by a trainable transposed-convolution decoder.

    Args:
        model: encoder module. Its forward pass must return a 4-tuple whose
            third element is a feature map with 512 channels, shaped
            (batch, 512, length).

    The decoder upsamples with three `ConvTranspose1d` layers (512 -> 512 -> 512 -> 3
    channels) and ends in a sigmoid, so every output value lies in [0, 1].
    With these layer settings a feature length of 7 produces 798 output positions.
    Attribute names are kept stable because pickled models / state dicts depend on them.
    """
    def __init__(self, model):
        super().__init__()
        # pretrained cnn model
        self.encoder = model
        # Freeze the encoder: only the decoder layers created below are trained.
        for param in self.encoder.parameters():
            param.requires_grad = False

        # decoder
        self.trans1 = nn.ConvTranspose1d(in_channels=512,
                                         out_channels=512,
                                         kernel_size=15,
                                         stride=15,
                                         dilation=2,
                                         padding=3)

        self.relu1 = torch.nn.ReLU(inplace=True)

        self.in1 = nn.InstanceNorm1d(512)

        self.trans2 = nn.ConvTranspose1d(in_channels=512,
                                         out_channels=512,
                                         kernel_size=7,
                                         stride=7,
                                         dilation=2,
                                         padding=1,
                                         output_padding=1)

        self.relu2 = torch.nn.ReLU(inplace=True)

        self.in2 = nn.InstanceNorm1d(512)

        self.trans3 = nn.ConvTranspose1d(in_channels=512,
                                         out_channels=3,
                                         kernel_size=3,
                                         stride=1)

    def forward(self, x):
        """Encode `x`, decode the feature map and return a (batch, length, 3) tensor."""
        # Only the third encoder output (the feature map) feeds the decoder.
        _, _, x, _ = self.encoder(x)
        x = self.trans1(x)
        x = self.relu1(x)
        x = self.in1(x)
        x = self.trans2(x)
        x = self.relu2(x)
        x = self.in2(x)
        x = self.trans3(x)
        # torch.sigmoid replaces the deprecated F.sigmoid (same values).
        x = torch.sigmoid(x)
        # (batch, 3, length) -> (batch, length, 3)
        x = x.transpose(1,2)
        return x

In [7]:
def _average_loss(model, criterion, dl, optimizer=None):
    """Run one pass over `dl` and return the per-sample average loss.

    When `optimizer` is given, a gradient step is taken on every batch.
    Returns NaN for an empty loader.
    """
    weighted_sum = 0.0
    n_samples = 0
    for xb, _, p in dl:
        if optimizer is not None:
            optimizer.zero_grad()
        output = model(xb)
        xb = xb.float()
        # Normalise the batch loss by the summed length term of the batch.
        loss = criterion(output, xb)/(p.sum())
        if optimizer is not None:
            loss.backward()
            optimizer.step()
        weighted_sum += loss.item()*xb.size(0)
        n_samples += xb.size(0)
    return weighted_sum/n_samples if n_samples else float("nan")


def fit_autoencoder(epochs, model, criterion, optimizer, train_dl, val_dl_test, val_dl, patience = 5):
    """Train `model` with early stopping and plot the loss curves.

    Args:
        epochs: maximum number of epochs.
        model: module to train; called as `model(xb)`.
        criterion: loss called as `criterion(output, xb)`.
        optimizer: optimizer stepped once per training batch.
        train_dl: training loader yielding `(xb, yb, p)` batches.
        val_dl_test: second evaluation loader; its loss is printed as
            "Val OOD Loss" and plotted, but does not drive early stopping.
        val_dl: validation loader whose loss drives early stopping.
        patience: patience forwarded to `EarlyStopping`.

    Returns:
        The trained `model`.

    Fixes relative to the previous version:
      * every loss is recomputed from scratch each epoch (the second
        validation loss used to accumulate across epochs);
      * losses are averaged over samples, not over batches, which matches the
        batch-size weighting applied to each batch loss;
      * the third curve plots the per-epoch list instead of a single scalar,
        and the legend is drawn so the labels are visible.
    """
    early_stopping = EarlyStopping(patience=patience, verbose=True)
    train_loss_list = []
    val_loss_list = []
    val_loss_test_list = []

    for epoch in range(epochs):
        model.train()
        train_loss = _average_loss(model, criterion, train_dl, optimizer)

        model.eval()
        with torch.no_grad():
            val_loss = _average_loss(model, criterion, val_dl)
            val_loss_test = _average_loss(model, criterion, val_dl_test)

        train_loss_list.append(train_loss)
        val_loss_list.append(val_loss)
        val_loss_test_list.append(val_loss_test)

        print("Epoch :{} \tTraining Loss :{:.6f}.".format(epoch+1,train_loss))

        print("Epoch :{} \tVal Loss :{:.6f}.".format(epoch+1,val_loss))

        print("Epoch :{} \tVal OOD Loss :{:.6f}.".format(epoch+1,val_loss_test))

        early_stopping(val_loss, model)

        if early_stopping.early_stop:
            print("Early stopping")
            break

    fig, ax = plt.subplots()
    ax.plot(val_loss_list, 'b', label = "Validation")
    ax.plot(train_loss_list,'r', label = "Train")
    ax.plot(val_loss_test_list,'y', label = "Validation Lyso")
    # One tick per completed epoch; also safe when no epoch ran at all.
    ax.set_xticks(np.arange(len(train_loss_list)))
    ax.legend()

    return model

In [9]:
# training the model
# model_autoencoder = autoencoder(model).cuda()
# criterion = nn.MSELoss(reduction="sum")
# optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model_autoencoder.parameters()), lr=1e-5)

# model_A = fit_autoencoder(60, model_autoencoder, criterion, optimizer, Train_dl, Val_dl, Val_u_dl, patience=1)

In [None]:
# load for inference
# NOTE: unpickling executes code stored in the file -- only load trusted checkpoints.
# The `autoencoder` class must already be defined for this to unpickle.
# NOTE(review): this rebinds `model`, which earlier held the pretrained encoder.
with open('../PretrainedModels/A_subcluster_0.pickle','rb') as checkpoint_file:
    model = pickle.load(checkpoint_file)

In [None]:
def get_rerr(df, le_fam, le_fold, vocab, BATCH_SIZE=1, cuda=cuda_gpu, model=None, name=None, criterion=None):
    """Compute one reconstruction error per row of `df`.

    Args:
        df: frame with the columns required by `Dataset_Loader` (including `family`).
        le_fam, le_fold, vocab: forwarded to `Dataset_Loader`.
        BATCH_SIZE: loader batch size; errors are per sample for any value.
        cuda: forwarded to `Dataset_Loader`.
        model: trained autoencoder, called as `model(xb)`. Required.
        name: optional label for the run; currently unused.
        criterion: loss called as `criterion(output, target)`; defaults to
            `nn.MSELoss(reduction="sum")`.

    Returns:
        DataFrame with columns `fold` and `Err`, one row per sample in `df` order.
        NOTE(review): the `fold` column actually holds the row's *family*; the
        column name is kept so existing CSV consumers keep working.

    Raises:
        ValueError: if `model` is not supplied.

    Fixes relative to the previous version: the signature was a SyntaxError
    (required parameters after defaulted ones); the loop iterated the global
    `Train_dl` rather than `df`; it called the undefined `model_A`; and it
    relied on a global `criterion`.
    """
    if model is None:
        raise ValueError("get_rerr requires a trained `model`")
    if criterion is None:
        criterion = nn.MSELoss(reduction="sum")

    _, dl = Dataset_Loader(df, le_fam, le_fold, vocab, BATCH_SIZE=BATCH_SIZE, cuda=cuda)
    # The loader does not shuffle, so batch order follows the row order of `df`.
    families = df["family"].tolist()

    model.eval()
    reconstruction_err = []
    row = 0
    with torch.no_grad():
        for xb, _, p in dl:
            output = model(xb)
            xb = xb.float()
            # Score each sample on its own so labels stay aligned for any batch size.
            for j in range(xb.size(0)):
                loss = criterion(output[j], xb[j])/p[j]
                reconstruction_err.append([families[row], loss.item()])
                row += 1

    return pd.DataFrame(reconstruction_err, columns=["fold", "Err"])

In [None]:
# Arguments shared by every reconstruction-error run. Everything is passed by
# keyword: the original calls placed positional arguments after keyword
# arguments (a SyntaxError) and passed an undefined `name`.
rerr_common = dict(le_fam=le_fam, le_fold=le_fold, vocab=vocab, BATCH_SIZE=1, cuda=cuda_gpu, model=model)

rerr_A = get_rerr(df=df_cluter, name="GTA_0", **rerr_common)
rerr_B = get_rerr(df=df_B, name="B", **rerr_common)
rerr_C = get_rerr(df=df_C, name="C", **rerr_common)
# The lyso frame is defined as `df_Lyso` (capital L); `df_lyso` does not exist.
rerr_lyso = get_rerr(df=df_Lyso, name="lyso", **rerr_common)
rerr_gtu = get_rerr(df=df_gtu, name="gtu", **rerr_common)

In [None]:
# Persist the reconstruction errors of the training cluster.
rerr_A.to_csv("rerr_gta0_training.csv", index = False)
# NOTE(review): the original list also contained `reconstruction_err_gtc`,
# which is never defined in this notebook (NameError on a fresh kernel). It is
# dropped here; `rerr_C` already covers fold C -- confirm nothing else was meant.
alltest_rerr= pd.concat([rerr_B, rerr_C, rerr_lyso, rerr_gtu], axis=0)
alltest_rerr.to_csv("rerr_gta0_Alltest.csv", index = False)