In [1]:
import os
import re
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F

#from torch.utils.data.dataset import Dataset

#import torchtext
#from torchtext import data

import gluonnlp as nlp

import transformers
from transformers import AdamW
from transformers import AutoTokenizer
from transformers import AutoModel

import kobert.pytorch_kobert
import kobert.utils

In [2]:
device = torch.device("cuda:0")
print(device)

cuda:0


# vocab, tokenizer

In [3]:
#vocab 획득
_, vocab = kobert.pytorch_kobert.get_pytorch_kobert_model()

using cached model
using cached model


In [4]:
#tokenizer 획득
tokenizer_path = kobert.utils.get_tokenizer()
tokenizer = nlp.data.BERTSPTokenizer(tokenizer_path, vocab, lower=False)
max_input_length = 512

using cached model


In [None]:
#huggingface kobert tokenizer 획득
#tokenizer = AutoTokenizer.from_pretrained("monologg/kobert-lm")
#max_input_length = 512

# Pandas Dataset

In [33]:
#pandas dataset
train = pd.read_csv('./train.csv', keep_default_na=False)
test = pd.read_csv('./test.csv', keep_default_na=False)
sample_submission = pd.read_csv('./sample_submission.csv')

In [34]:
#preprocessing
train = train.dropna(how='any')
train['data'] = train['data'].str.replace("[^.,ㄱ-ㅎㅏ-ㅣ가-힣 ]", "")
test['data'] = test['data'].str.replace("[^.,ㄱ-ㅎㅏ-ㅣ가-힣 ]", "")

In [26]:
def encode_data(data_df, max_length, isTest=False):
    
    tokens = []
    valid_lengths = []
    labels = []
    transform = nlp.data.BERTSentenceTransform(tokenizer, max_seq_length=max_length, pad=False, pair=False)
     
    for i in range(len(data_df['data'])):
        
        if len(data_df['data'][i]) != 0:
            token = tokenizer.convert_tokens_to_ids(tokenizer(data_df['data'][i]))
            
            if len(token) > 512:
                token = token[:512]
            
            valid_length = len(token)
            
            token += [1] * (max_length - valid_length)
            
            if isTest==False:
                label = data_df['category'][i]

            tokens.append(token)
            valid_lengths.append(valid_length)
            
            if isTest==False:
                labels.append(label)
                
        if (i+1) % 1000 == 0:
            print("%05d/%05d finish" % (i+1, len(data_df['data'])))
            
    if isTest==False:
        return tokens, valid_lengths, labels 
    else:
        return tokens, valid_lengths

In [36]:
max_length = 512
tokens, valid_lengths, labels = encode_data(train, max_length)

01000/40000 finish
02000/40000 finish
03000/40000 finish
04000/40000 finish
05000/40000 finish
06000/40000 finish
07000/40000 finish
08000/40000 finish
09000/40000 finish
10000/40000 finish
11000/40000 finish
12000/40000 finish
13000/40000 finish
14000/40000 finish
15000/40000 finish
16000/40000 finish
17000/40000 finish
18000/40000 finish
19000/40000 finish
20000/40000 finish
21000/40000 finish
22000/40000 finish
23000/40000 finish
24000/40000 finish
25000/40000 finish
26000/40000 finish
27000/40000 finish
28000/40000 finish
29000/40000 finish
30000/40000 finish
31000/40000 finish
32000/40000 finish
33000/40000 finish
34000/40000 finish
35000/40000 finish
36000/40000 finish
37000/40000 finish
38000/40000 finish
39000/40000 finish
40000/40000 finish


In [37]:
trainX = []
for idx in range(len(tokens)):
    trainX.append([np.array(tokens[idx]), np.array(valid_lengths[idx]), np.array(labels[idx])])
trainY = labels

train = []
tokens = []
valid_lengths = []
labels = []

In [29]:
tokens, valid_lengths = encode_data(test, max_length=max_length, isTest=True)

01000/05000 finish
02000/05000 finish
03000/05000 finish
04000/05000 finish
05000/05000 finish


In [32]:
testX = []
for idx in range(len(tokens)):
    testX.append([np.array(tokens[idx]), np.array(valid_lengths[idx])])

train = []
tokens = []
valid_lengths = []

In [5]:
trainX = torch.load('trainX.pt')
trainY = torch.load('trainY.pt')

testX = torch.load('testX.pt')

# model

In [3]:
class BERTClassifier(nn.Module):
    def __init__(self, bert, hidden_size=768, num_classes=2, dr_rate=None, params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate
        
        self.classifier = nn.Linear(hidden_size, num_classes)
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)
    
    def forward(self, token, segment, mask):
        _, pooler = self.bert(input_ids = token, token_type_ids = segment, attention_mask=mask)
        if self.dr_rate:
            out = self.dropout(pooler)
        out = self.classifier(out)
        out = F.softmax(out)
        return out

In [4]:
#bert model 획득
bertmodel = AutoModel.from_pretrained("monologg/kobert-lm").to(device)

In [20]:
model_parameters = filter(lambda p: p.requires_grad, model.parameters())
params = sum([np.prod(p.size()) for p in model_parameters])
print(params)

92189187


In [6]:
learning_rate = 5e-7
num_epochs = 5
dr_rate = 0.2

In [7]:
model = BERTClassifier(bertmodel, num_classes=3, dr_rate=dr_rate).to(device)

# train

In [15]:
def calc_accuracy(X,Y):
    X = X.cpu()
    Y = Y.cpu()
    max_vals, max_indices = torch.max(X,1)
    correct_cnt = (max_indices == Y).sum().data.numpy()
    train_acc = correct_cnt/max_indices.size()[0]
    return correct_cnt, train_acc

In [17]:
batch_size = 16

train_loader = torch.utils.data.DataLoader(trainX, batch_size=batch_size, num_workers=1)

optimizer = AdamW(model.parameters(), lr=learning_rate)

loss_function = nn.CrossEntropyLoss()

epochs = 4

In [21]:
model.train()

#with torch.no_grad():
for epoch in range(epochs):
    train_acc = 0

    for batch_number, data in enumerate((train_loader)):

        optimizer.zero_grad()

        token_ids = data[0].to(device)
        valid_length = data[1].to(device)
        label = data[2].to(device)

        segment_ids = torch.zeros_like(token_ids).long().to(device)
        attention_masks = torch.zeros_like(token_ids).long().to(device)

        out = model(token_ids, segment_ids, attention_masks)

        loss = loss_function(out, label)
        loss.backward()
        optimizer.step()
        correct_cnt, acc = calc_accuracy(out, label)
        train_acc += correct_cnt
        if batch_number % 50 == 0:
            print('Epoch: ' + str(epoch+1) + '\tBatch: [' + str(batch_number) + '/' + str(len(train_loader)) + ']\tAcc: ' + str(acc*100) + '%\tLoss: ' + str(loss.data))
    
    train_acc /= (len(train_loader) * batch_size)
    print('\nEpoch: ' + str(epoch+1) + '\tEpoch Acc: ' + str(train_acc) + '%\n')


  out = F.softmax(out)


Epoch: 1	Batch: [0/2500]	Acc: 18.75	Loss: tensor(1.1036, device='cuda:0')
Epoch: 1	Batch: [50/2500]	Acc: 50.0	Loss: tensor(1.0871, device='cuda:0')
Epoch: 1	Batch: [100/2500]	Acc: 25.0	Loss: tensor(1.1034, device='cuda:0')
Epoch: 1	Batch: [150/2500]	Acc: 31.25	Loss: tensor(1.1095, device='cuda:0')
Epoch: 1	Batch: [200/2500]	Acc: 43.75	Loss: tensor(1.0913, device='cuda:0')
Epoch: 1	Batch: [250/2500]	Acc: 37.5	Loss: tensor(1.0757, device='cuda:0')
Epoch: 1	Batch: [300/2500]	Acc: 43.75	Loss: tensor(1.0981, device='cuda:0')
Epoch: 1	Batch: [350/2500]	Acc: 62.5	Loss: tensor(1.0704, device='cuda:0')
Epoch: 1	Batch: [400/2500]	Acc: 37.5	Loss: tensor(1.0862, device='cuda:0')
Epoch: 1	Batch: [450/2500]	Acc: 68.75	Loss: tensor(1.0521, device='cuda:0')
Epoch: 1	Batch: [500/2500]	Acc: 62.5	Loss: tensor(1.0501, device='cuda:0')
Epoch: 1	Batch: [550/2500]	Acc: 68.75	Loss: tensor(1.0658, device='cuda:0')
Epoch: 1	Batch: [600/2500]	Acc: 56.25	Loss: tensor(1.0629, device='cuda:0')
Epoch: 1	Batch: [650/

Epoch: 3	Batch: [350/2500]	Acc: 75.0	Loss: tensor(0.7699, device='cuda:0')
Epoch: 3	Batch: [400/2500]	Acc: 93.75	Loss: tensor(0.6709, device='cuda:0')
Epoch: 3	Batch: [450/2500]	Acc: 81.25	Loss: tensor(0.7233, device='cuda:0')
Epoch: 3	Batch: [500/2500]	Acc: 87.5	Loss: tensor(0.6876, device='cuda:0')
Epoch: 3	Batch: [550/2500]	Acc: 87.5	Loss: tensor(0.6728, device='cuda:0')
Epoch: 3	Batch: [600/2500]	Acc: 93.75	Loss: tensor(0.6484, device='cuda:0')
Epoch: 3	Batch: [650/2500]	Acc: 100.0	Loss: tensor(0.5847, device='cuda:0')
Epoch: 3	Batch: [700/2500]	Acc: 68.75	Loss: tensor(0.8429, device='cuda:0')
Epoch: 3	Batch: [750/2500]	Acc: 68.75	Loss: tensor(0.8022, device='cuda:0')
Epoch: 3	Batch: [800/2500]	Acc: 87.5	Loss: tensor(0.7345, device='cuda:0')
Epoch: 3	Batch: [850/2500]	Acc: 68.75	Loss: tensor(0.8738, device='cuda:0')
Epoch: 3	Batch: [900/2500]	Acc: 75.0	Loss: tensor(0.7997, device='cuda:0')
Epoch: 3	Batch: [950/2500]	Acc: 87.5	Loss: tensor(0.7118, device='cuda:0')
Epoch: 3	Batch: [1

In [22]:
torch.save({
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
}, 'checkpoint.tar')

In [None]:
checkpoint = torch.load('checkpoint.tar')
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']

# evaluate

In [40]:
model.eval()

eval_loader = torch.utils.data.DataLoader(testX, batch_size=batch_size, num_workers=1)

predictions = []

with torch.no_grad():
    for batch_number, data in enumerate((eval_loader)):

        token_ids = data[0].to(device)
        valid_length = data[1].to(device)

        segment_ids = torch.zeros_like(token_ids).long().to(device)
        attention_masks = torch.zeros_like(token_ids).long().to(device)

        out = model(token_ids, segment_ids, attention_masks)

        _, prediction = torch.max(out.cpu(),1)
        
        predictions.extend(prediction)

        if batch_number % 50 == 0:
            print('Batch: [' + str(batch_number) + '/' + str(len(eval_loader)) + ']')

  out = F.softmax(out)


Batch: [0/313]
Batch: [50/313]
Batch: [100/313]
Batch: [150/313]
Batch: [200/313]
Batch: [250/313]
Batch: [300/313]


In [53]:
index = [i for i in range(len(predictions))]
predictions = [int(p) for p in predictions]

s1 = pd.core.series.Series(index)
s2 = pd.core.series.Series(predictions)

submission = pd.DataFrame(data=dict(index=index, category=predictions))

In [54]:
print(submission.head())

   index  category
0      0         0
1      1         2
2      2         1
3      3         0
4      4         2


In [56]:
submission.to_csv('submission.csv', index=False, header=True)

# torchtext dataset

In [None]:
def new_tokenizer(sentence):
    tokens = tokenizer.tokenize(sentence)
    tokens = tokens[:max_input_length-2]
    return tokens

def PreProc(sentences):
    return [tokenizer.convert_tokens_to_ids(x) for x in sentences]

In [None]:
#torchtext.data label 생성
INDEX = torchtext.data.Field()
TEXT = torchtext.data.Field(batch_first = True, 
                  use_vocab = False, 
                  #tokenize = new_tokenizer, 
                  #preprocessing = PreProc,
                  init_token = '[CLS]',
                  eos_token = '[SEP]',
                  pad_token = '[PAD]',
                  unk_token = '[UNK]')
LABEL = torchtext.data.LabelField()

In [None]:
#torchtext.data dataset 획득
test_data = data.TabularDataset(
    path='./test.csv', format='csv',
    fields=[('index', INDEX), ('data', TEXT)], skip_header=True)

In [None]:
#torchtext.data dataset 획득
train_data = data.TabularDataset(
    path='./train.csv', format='csv',
    fields=[('index', INDEX), ('category', LABEL), ('data', TEXT)], skip_header=True)

In [None]:
print(vars(train_data[0]))

# gluonnlp dataset

In [None]:
#gluonnlp dataset class
class BERTDataset(Dataset):
    def __init__(self, dataset, bert_tokenizer, pad, pair, max_len=512, hasLabel=True):
        transform = gluonnlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, pad=pad, pair=pair)
        
        self.sentences = [transform([i[2]]) for i in dataset] #segment embedding 제외
        if hasLabel:
            self.labels = [[i[1]] for i in dataset]
    
    def __getitem__(self, i):
        return self.sentences[i], self.labels[i]
    
    def __len__(self):
        return len(self.sentences)

In [None]:
#gluonnlp dataset
train_dataset = gluonnlp.data.TSVDataset('./train.csv', field_separator=gluonnlp.data.utils.Splitter(','), num_discard_samples=1)
for i in train_dataset:
    if len(i) < 2:
        print(i)
        
train_dataset = BERTDataset(test_dataset, tokenizer, pad=True, pair=False, hasLabel=True)

test_dataset = gluonnlp.data.TSVDataset('./test.csv', field_separator=gluonnlp.data.utils.Splitter(','), num_discard_samples=1)
test_dataset = BERTDataset(test_dataset, tokenizer, pad=True, pair=False, hasLabel=False)