In [1]:
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset
from tqdm import tqdm, tqdm_notebook
from transformers import ElectraTokenizer


# ============================================================================
class BERTClassifier(nn.Module):
    def __init__(self,
                 bert,
                 hidden_size=768,
                 num_classes=10,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        attention_mask = self.gen_attention_mask(token_ids, valid_length)
        output = self.bert(input_ids=token_ids, token_type_ids=segment_ids.long(),
                           attention_mask=attention_mask.float().to(token_ids.device))
        first_token_tensor = output[0]

        if self.dr_rate:
            out = self.dropout(first_token_tensor[:, 0])
        return self.classifier(out)


# GPU 사용 시
device = torch.device("cuda:0")
print(device)

model = torch.load('./output/2/koelectrabert_epoch2.pt', map_location='cpu').to(device)
tokenizer = ElectraTokenizer.from_pretrained("monologg/koelectra-base-v3-discriminator")
print("load model")

# ============================================================================
# Setting parameters
max_len = 32
batch_size = 400
warmup_ratio = 0.1
num_epochs = 15
max_grad_norm = 1
log_interval = 3000
learning_rate = 5e-5


# ============================================================================
def padding(inputs, pad_token, pad_length=0, pad=True):
    pad_token = pad_token[0]

    if pad_length > len(inputs):
        if pad is True:
            length = (pad_length - len(inputs))
            pad_seq = [pad_token] * length
            inputs.extend(pad_seq)

    else:
        if pad is True:
            inputs = inputs[:pad_length]
    return np.array(inputs)


# ============================================================================
test = []
with open("./inputs/test_data", 'rt', encoding='UTF-8') as f:
    for i in f:
        test.append(i[:-1])

submission = pd.read_csv('./inputs/sample_submission.csv')


# ============================================================================
class BERTDatasetTest(Dataset):
    def __init__(self, dataset, electra_tokenizer, max_len, pad):
        self.sentences = [electra_tokenizer.convert_tokens_to_ids(electra_tokenizer.tokenize(i)) for i in dataset]
        self.length = [np.int32(len(i)) for i in self.sentences]
        for e in range(len(self.sentences)):
            self.sentences[e] = padding(self.sentences[e], tokenizer.convert_tokens_to_ids(['PAD']), max_len, pad)
        self.segment = np.zeros((len(self.sentences), max_len))

    def __getitem__(self, i):
        return self.sentences[i], self.length[i], self.segment[i]

    def __len__(self):
        return (len(self.sentences))


data_test = BERTDatasetTest(test, tokenizer, max_len, True)

test_dataloader = torch.utils.data.DataLoader(data_test, batch_size=batch_size, num_workers=0)
print("create test dataset")

# ============================================================================
predict = []
model.eval()
# with torch.no_grad():
for batch_id, (token_ids, valid_length, segment_ids) in enumerate(tqdm_notebook(test_dataloader)):
    token_ids = token_ids.long().to(device)
    segment_ids = segment_ids.long().to(device)
    valid_length = valid_length
    out = model(token_ids, valid_length, segment_ids)

    predict.extend(torch.argmax(out, dim=-1).data.cpu().numpy())

# ============================================================================
predict = np.array(predict)
for e, i in enumerate(predict):
    predict[e] = i + 1
submission['Prediction'] = predict
submission.to_csv('submission4.csv', index=False)


cuda:1
load model
create test dataset


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_id, (token_ids, valid_length, segment_ids) in enumerate(tqdm_notebook(test_dataloader)):


  0%|          | 0/1000 [00:00<?, ?it/s]