In [1]:
import random
import pandas as pd
import numpy as np
import os

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings(action='ignore')

import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score
from torchsampler import ImbalancedDatasetSampler 
from allennlp.commands.elmo import ElmoEmbedder
from pathlib import Path

ModuleNotFoundError: No module named 'torch'

In [2]:
# Central hyper-parameter table read by the data loaders, the training loop
# and the prediction thresholding below.
CFG = {
    'NUM_WORKERS': 4,        # DataLoader worker processes
    'EPOCHS': 10,            # number of passes over the training loader
    'LEARNING_RATE': 5e-3,   # initial learning rate handed to the optimizer
    'BATCH_SIZE': 32,        # samples per DataLoader batch
    'THRESHOLD': 0.5,        # probability cut-off used to binarise predictions
    'SEED': 41,              # value passed to seed_everything
}

def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, the ``PYTHONHASHSEED`` environment variable,
    NumPy's legacy global RNG and PyTorch (CPU and all CUDA devices), and
    configures cuDNN for deterministic behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also seed every visible GPU, not only the current one (no-op without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may select different kernels from run to run, which
    # defeats the deterministic flag above, so it has to be switched off.
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['SEED']) # Fix the random seed for reproducibility

def get_preprocessing(data_type, new_df):
    """Pull the epitope sequences (and labels, unless testing) out of a table.

    Args:
        data_type: Name of the split; the value 'test' means no labels are read.
        new_df: Object indexable by 'epitope_seq' and, for non-test splits,
            by 'label'.

    Returns:
        A pair ``(epitope_seq_list, label_list)``. Every sequence entry is a
        1-tuple ``(sequence,)`` because it is produced by a single-argument
        ``zip``; ``label_list`` is ``None`` when ``data_type`` is 'test'.
    """
    # Single-argument zip wraps each sequence in a 1-tuple; the batch handling
    # later in this notebook indexes into that wrapper, so it is kept as is.
    epitope_seq_list = [wrapped for wrapped in tqdm(zip(new_df['epitope_seq']))]

    label_list = list(new_df['label']) if data_type != 'test' else None

    print(f'{data_type} dataframe preprocessing was done.')
    return epitope_seq_list, label_list

class CustomDataset(Dataset):
    """Map-style dataset over epitope sequences with optional labels.

    Args:
        epitope_seq_list: Indexable collection of sequence entries.
        label_list: Indexable collection of labels aligned with
            ``epitope_seq_list``, or ``None`` for unlabelled data.

    Raises:
        ValueError: If labels are given but their count differs from the
            number of sequences.
    """

    def __init__(self, epitope_seq_list, label_list):
        if label_list is not None and len(label_list) != len(epitope_seq_list):
            raise ValueError(
                f'Got {len(epitope_seq_list)} sequences but {len(label_list)} labels.'
            )
        self.epitope_seq_list = epitope_seq_list
        self.label_list = label_list

    def __getitem__(self, index):
        """Return ``(sequence, label)`` when labels exist, else just the sequence."""
        # Use locals only: writing the current item onto ``self`` would leave
        # stale per-item state on the dataset object for no benefit.
        epitope_seq = self.epitope_seq_list[index]
        if self.label_list is None:
            return epitope_seq
        return epitope_seq, self.label_list[index]

    def __len__(self):
        """Number of sequence entries held by the dataset."""
        return len(self.epitope_seq_list)

NameError: name 'torch' is not defined

In [None]:
# Read the labelled training table and the unlabelled test table.
all_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

# Positional split, Train : Validation = 0.8 : 0.2 (leading rows train,
# trailing rows validate; the rows are not shuffled before splitting).
train_len = len(all_df)
train_idx = np.arange(train_len)
split_at = int(train_len*0.8)
train_df = all_df.iloc[train_idx[:split_at]]
val_df = all_df.iloc[train_idx[split_at:]]

# Turn each split into plain Python lists of sequences / labels.
train_epitope_seq_list, train_label_list = get_preprocessing('train', train_df)
val_epitope_seq_list, val_label_list = get_preprocessing('val', val_df)
test_epitope_seq_list, test_label_list = get_preprocessing('test', test_df)

# Settings shared by all three loaders; only the training loader shuffles.
loader_kwargs = dict(batch_size=CFG['BATCH_SIZE'], num_workers=CFG['NUM_WORKERS'])

train_dataset = CustomDataset(train_epitope_seq_list, train_label_list)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)

val_dataset = CustomDataset(val_epitope_seq_list, val_label_list)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)

test_dataset = CustomDataset(test_epitope_seq_list, None)
test_loader = DataLoader(test_dataset, shuffle=False, **loader_kwargs)

In [None]:
def train(seqvec, classifier, optimizer, train_loader, val_loader, scheduler, device):
    """Train the classifier head on pooled embeddings and keep the best checkpoint.

    For every batch the sequences are embedded with ``seqvec.embed_batch``,
    each embedding is pooled into one vector, and the classifier is optimised
    with ``BCEWithLogitsLoss``. After each epoch ``validation`` is run and,
    when the validation F1 improves, the state dicts of ``seqvec.elmo_bilm``
    and ``classifier`` are written to ``./best_model_seqvec_*.pth`` files
    (the previous best files are deleted).

    Note: the pooled embeddings are rebuilt with ``torch.tensor(...)``, so the
    loss has no autograd path back into ``seqvec.elmo_bilm``; only the
    classifier receives gradients from this loop.

    Args:
        seqvec: Embedder exposing ``elmo_bilm`` and ``embed_batch``.
        classifier: Module mapping a batch of pooled vectors to logits.
        optimizer: Optimizer stepped once per batch.
        train_loader: Loader yielding ``(wrapped_sequences, labels)`` batches.
        val_loader: Loader passed through to ``validation``.
        scheduler: Optional LR scheduler, stepped once per batch when given.
        device: Device that models, inputs and labels are moved to.

    Returns:
        The best validation F1 seen (0 if no epoch scored above 0).
    """
    model_seqvec = seqvec.elmo_bilm
    model_seqvec.to(device)
    classifier.to(device)
    criterion = nn.BCEWithLogitsLoss().to(device)

    best_val_f1 = 0
    # Paths of the checkpoints currently on disk for the best score so far.
    best_paths = ()
    for epoch in range(1, CFG['EPOCHS']+1):
        model_seqvec.train()
        classifier.train()
        train_loss = []
        # Hand tqdm the loader itself (not iter(loader)) so it can show a total.
        for epitope_char_seq, label in tqdm(train_loader):
            label = label.float().to(device)

            optimizer.zero_grad()

            # The batch arrives as a one-element wrapper around the sequences;
            # each sequence string is split into its characters.
            epitope_char_seq = [list(seq) for seq in epitope_char_seq[0]]
            embeddings = seqvec.embed_batch(epitope_char_seq)
            # Sum over the first axis, then average over the next one (the
            # original note states this gives a [1024] vector per sequence).
            # Stacking tensors avoids a tensor -> Python list -> tensor trip.
            protein_embd_batch = torch.stack(
                [torch.tensor(embedding).sum(dim=0).mean(dim=0) for embedding in embeddings]
            ).to(device)
            output = classifier(protein_embd_batch)
            loss = criterion(output, label)

            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())

            # The scheduler advances per batch, not per epoch.
            if scheduler is not None:
                scheduler.step()

        val_loss, val_f1, pred_label, pred_proba_label = validation(seqvec, classifier, val_loader, criterion, device)
        print(f'Epoch : [{epoch}] Train Loss : [{np.mean(train_loss):.5f}] Val Loss : [{val_loss:.5f}] Val F1 : [{val_f1:.5f}]')

        if best_val_f1 < val_f1:
            # Drop the checkpoints of the previous best score, if any exist.
            for stale_path in best_paths:
                if os.path.exists(stale_path):
                    os.remove(stale_path)
            best_val_f1 = val_f1
            best_paths = (
                f'./best_model_seqvec_{best_val_f1:.5f}.pth',
                f'./best_model_seqvec_classifier_{best_val_f1:.5f}.pth',
            )
            torch.save(model_seqvec.state_dict(), best_paths[0], _use_new_zipfile_serialization=False)
            torch.save(classifier.state_dict(), best_paths[1], _use_new_zipfile_serialization=False)
            print('Model Saved.')
    return best_val_f1

def validation(seqvec, classifier, val_loader, criterion, device):
    """Evaluate the classifier on a labelled loader without tracking gradients.

    Args:
        seqvec: Embedder exposing ``elmo_bilm`` and ``embed_batch``.
        classifier: Module mapping a batch of pooled vectors to logits.
        val_loader: Loader yielding ``(wrapped_sequences, labels)`` batches.
        criterion: Loss applied to ``(logits, labels)``.
        device: Device that inputs and labels are moved to.

    Returns:
        ``(mean_loss, macro_f1, pred_label, pred_proba_label)`` where
        ``pred_label`` is a NumPy array of 0/1 values obtained by comparing
        the sigmoid probabilities against ``CFG['THRESHOLD']`` and
        ``pred_proba_label`` is the list of those probabilities.
    """
    model_seqvec = seqvec.elmo_bilm
    model_seqvec.eval()
    classifier.eval()
    pred_proba_label = []
    true_label = []
    val_loss = []
    with torch.no_grad():
        # Hand tqdm the loader itself (not iter(loader)) so it can show a total.
        for epitope_char_seq, label in tqdm(val_loader):
            label = label.float().to(device)

            # The batch arrives as a one-element wrapper around the sequences.
            epitope_char_seq = [list(seq) for seq in epitope_char_seq[0]]
            embeddings = seqvec.embed_batch(epitope_char_seq)
            # Sum over the first axis, then average over the next one (the
            # original note states this gives a [1024] vector per sequence).
            # Stacking tensors avoids a tensor -> Python list -> tensor trip.
            protein_embd_batch = torch.stack(
                [torch.tensor(embedding).sum(dim=0).mean(dim=0) for embedding in embeddings]
            ).to(device)
            output = classifier(protein_embd_batch)
            loss = criterion(output, label)

            model_pred = torch.sigmoid(output).to('cpu')

            pred_proba_label += model_pred.tolist()
            true_label += label.to('cpu').tolist()

            val_loss.append(loss.item())

    pred_label = np.where(np.array(pred_proba_label)>CFG['THRESHOLD'], 1, 0)
    val_f1 = f1_score(true_label, pred_label, average='macro')
    return np.mean(val_loss), val_f1, pred_label, pred_proba_label

def inference(seqvec, classifier, test_loader, device):
    """Predict probabilities and 0/1 labels for an unlabelled loader.

    Args:
        seqvec: Embedder exposing ``elmo_bilm`` and ``embed_batch``.
        classifier: Module mapping a batch of pooled vectors to logits.
        test_loader: Loader whose batches are one-element containers holding
            the sequences of that batch.
        device: Device that the pooled inputs are moved to.

    Returns:
        ``(pred_label, pred_proba_label)``: a NumPy array of 0/1 values
        obtained by comparing the sigmoid probabilities against
        ``CFG['THRESHOLD']``, and the list of those probabilities.
    """
    model_seqvec = seqvec.elmo_bilm
    model_seqvec.eval()
    classifier.eval()
    pred_proba_label = []
    with torch.no_grad():
        # Each batch is a one-element container, hence the trailing comma.
        # Hand tqdm the loader itself (not iter(loader)) so it can show a total.
        for epitope_char_seq, in tqdm(test_loader):
            epitope_char_seq = [list(seq) for seq in epitope_char_seq]
            embeddings = seqvec.embed_batch(epitope_char_seq)
            # Sum over the first axis, then average over the next one (the
            # original note states this gives a [1024] vector per sequence).
            # Stacking tensors avoids a tensor -> Python list -> tensor trip.
            protein_embd_batch = torch.stack(
                [torch.tensor(embedding).sum(dim=0).mean(dim=0) for embedding in embeddings]
            ).to(device)
            output = classifier(protein_embd_batch)
            model_pred = torch.sigmoid(output).to('cpu')
            pred_proba_label += model_pred.tolist()

    pred_label = np.where(np.array(pred_proba_label)>CFG['THRESHOLD'], 1, 0)
    return pred_label, pred_proba_label

In [None]:
# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class Classifier(nn.Module):
    """Two-layer head that maps a pooled embedding to a single logit.

    Layer order: LeakyReLU -> BatchNorm1d -> Linear(in, in // 4) ->
    LeakyReLU -> BatchNorm1d -> Linear(in // 4, 1).

    Args:
        in_channels: Size of each input vector.
        negative_slope: Slope of the LeakyReLU for negative inputs. The first
            positional argument of ``nn.LeakyReLU`` is the slope, not an
            ``inplace`` flag, so passing ``True`` there gives a slope of 1.0
            and turns the activation into the identity.
    """

    def __init__(self, in_channels, negative_slope=0.01):
        super(Classifier, self).__init__()
        # Module positions inside the Sequential are unchanged, so state-dict
        # keys stay the same as before.
        self.classifier = nn.Sequential(
            nn.LeakyReLU(negative_slope),
            nn.BatchNorm1d(in_channels),
            nn.Linear(in_channels, in_channels//4),
            nn.LeakyReLU(negative_slope),
            nn.BatchNorm1d(in_channels//4),
            nn.Linear(in_channels//4, 1)
        )

    def forward(self, x):
        """Return one logit per input row, flattened to a 1-D tensor."""
        x = self.classifier(x).view(-1)
        return x

# pathlib does not expand '~' on its own; resolve the home directory explicitly
# so the weight/option files can actually be found.
model_dir = Path('~/dacon/uniref50_v2').expanduser()
weights = model_dir / 'weights.hdf5'
options = model_dir / 'options.json'

# The embedder is given an integer GPU index (-1 selects the CPU) rather than
# a torch.device object.
elmo_cuda_device = 0 if device.type == 'cuda' else -1
seqvec  = ElmoEmbedder(str(options), str(weights), cuda_device=elmo_cuda_device)
model_seqvec = seqvec.elmo_bilm
classifier = Classifier(in_channels=1024)

model_seqvec = nn.DataParallel(model_seqvec)
classifier = nn.DataParallel(classifier)

model_seqvec.eval()
classifier.eval()
# NOTE(review): train() rebuilds the pooled embeddings with torch.tensor(...),
# so no gradient reaches the ELMo parameters listed here; only the classifier
# group is effectively updated.
optimizer = torch.optim.Adam([{'params':model_seqvec.parameters()}, {'params':classifier.parameters()}], lr = CFG["LEARNING_RATE"])
# T_max counts batches because train() steps the scheduler after every batch.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=len(train_loader)*CFG['EPOCHS'], eta_min=0)
best_score = train(seqvec, classifier, optimizer, train_loader, val_loader, scheduler, device)
print(f'Best Validation F1 Score : [{best_score:.5f}]')

In [None]:
# Use the GPU index only when CUDA is actually selected; -1 selects the CPU.
elmo_cuda_device = 0 if device.type == 'cuda' else -1
seqvec  = ElmoEmbedder(
    str(Path(options).expanduser()),  # expanduser: pathlib leaves '~' untouched
    str(Path(weights).expanduser()),
    cuda_device=elmo_cuda_device,
)
model_seqvec = seqvec.elmo_bilm
model_seqvec = nn.DataParallel(model_seqvec)
# map_location lets a checkpoint written on a GPU be restored on the current device.
best_model_seqvec_checkpoint = torch.load(f'./best_model_seqvec_{best_score:.5f}.pth', map_location=device)
# The ELMo weights were saved from the unwrapped module, so the DataParallel
# wrapper needs every key prefixed with 'module.'.
best_model_seqvec_checkpoint = {'module.'+k: v for k, v in best_model_seqvec_checkpoint.items()}
model_seqvec.load_state_dict(best_model_seqvec_checkpoint)
model_seqvec.eval()

classifier = Classifier(in_channels=1024)
classifier = nn.DataParallel(classifier)
best_model_classifier_checkpoint = torch.load(f'./best_model_seqvec_classifier_{best_score:.5f}.pth', map_location=device)
classifier.load_state_dict(best_model_classifier_checkpoint)
classifier.eval()
classifier.to(device)

pred_test, pred_prob_test = inference(seqvec, classifier, test_loader, device)
pred_test = np.array(pred_test)

# Write the predicted labels into the submission template.
submit = pd.read_csv('./sample_submission.csv')
submit['label'] = pred_test.astype(np.int32)

submit.to_csv(f'./Presentation_ELMO_RF_{best_score:.5f}.csv', index=False)
print('Done.')