In [1]:
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import re

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# 모델제작

# 데이터 준비

In [3]:
with open('./chars-4996', encoding='utf-8-sig') as f:
    content = f.read()
    keys = ["<pad>", "<s>", "</s>", "<unk>"] + list(content)
vocab = dict()
for i, key in enumerate(keys): vocab[key] = i

In [4]:
with open('./namuwikitext_20200302.dev', 'r', encoding='utf-8') as f:
    content = f.readlines()

# string, label 제작
1. string 변형
2. 라벨제작
3. Padding

In [74]:
add_prob = 0.5
del_prob = 0.15

seq_len = 128

def labeling(data):
    string_data = torch.Tensor()
    label_data = torch.Tensor()

    i = 0
    while i < len(data):
        # 문장의 끝이 아니고, 다음 단어가 띄어쓰기일 경우
        is_space = True if (i < len(data)-1) and (data[i+1] == vocab[" "]) else False

        if is_space: # '다음'문자가 띄어쓰기면, 띄어쓰기를 제거하거나
            state = 2 if torch.rand(1) < del_prob else 0
        else: # '다음' 문자가 띄어쓰기가 아니라면, 띄어쓰기를 추가
            state = 1 if data[i] != vocab[" "] and torch.rand(1) < add_prob else 0
        
        # string
        # state 0: data[i]
        # state 1: data[i] + " "
        # state 2: data[i] // '다음' 띄어쓰기 제거  // 이건 state0과 같이 동작한 뒤, 나중에 index jump로 해결
        string_data = torch.cat((string_data, torch.tensor([data[i], vocab[" "]])), dim=0) if state == 1 else torch.cat((string_data, torch.tensor([data[i]])), dim=0)
        
        # label
        # state 0: 0
        # state 1: 2// 띄어쓰기를 추가한 경우 이므로, 제거하라는 라벨 붙이기
        # state 2: 1// 다음 띄어쓰기를 제거했으므로, 추가하라는 의미 부여
        if state == 0:
            label_data = torch.cat((label_data, torch.tensor([0])), dim=0)
        elif state == 1:
            label_data = torch.cat((label_data, torch.tensor([0, 2])), dim=0)
        else:
            label_data = torch.cat((label_data, torch.tensor([1])), dim=0)
        # One hot vector로 표현
        """
        if state == 0:
            label_data = torch.cat((label_data, torch.tensor([[0, 0, 0]])), dim=0)
        elif state == 1:
            label_data = torch.cat((label_data, torch.tensor([[0, 0, 0]])), dim=0)
            label_data = torch.cat((label_data, torch.tensor([[0, 0, 1]])), dim=0)
        else:
            label_data = torch.cat((label_data, torch.tensor([[0, 1, 0]])), dim=0)
        """
        # 띄어쓰기를 삭제한 경우, 다음 data 건너 뛰기
        i += 2 if state == 2 else 1

    # 문자열 Padding
    string_data = torch.cat((torch.tensor([vocab["<s>"]]), string_data, torch.tensor([vocab["</s>"]])), dim=0)
    string_data = torch.cat((string_data, torch.tensor([vocab['<pad>']] * (seq_len - len(string_data)))), dim=0) if len(string_data) < seq_len else string_data[:seq_len]

    # 라벨 Padding
    label_data = torch.cat((torch.tensor([0]), label_data, torch.tensor([0])), dim=0)
    label_data = torch.cat((label_data, torch.tensor([-1] * (seq_len - len(label_data)))), dim=0) if len(label_data) < seq_len else label_data[:seq_len]
    # 라벨 One-hot padding
    # label_data = torch.cat((torch.tensor([[0, 0, 0]]), label_data, torch.tensor([[0, 0, 0]])), dim=0)
    # label_data = torch.cat((label_data, torch.tensor([[0, 0, 0]] * (seq_len - len(label_data)))), dim=0) if len(label_data) < seq_len else label_data[:seq_len]
    
    return string_data, label_data


# Dataset 구성

In [75]:
from tqdm import tqdm

string_ds = torch.Tensor()
label_ds = torch.Tensor()

# 이렇게 선언해놓으면 병렬처리 어려움
# 한번에 다 받은다음에 나중에 쪼개는게 훨 빠름
for line in tqdm(content[:5000]):
    data = torch.Tensor()
    line = ' '.join(line.strip().split())   # 띄어쓰기 2개 이상 있는 문장 제거
    for char in line:
        try:
            data = torch.cat((data, torch.tensor([vocab[char]])), dim=0)
        except:
            data = torch.cat((data,torch.tensor([vocab['<unk>']])), dim=0)
    if len(data) > 0:
        string_data, label_data = labeling(data)
        string_ds = torch.cat((string_ds, string_data), dim=0)
        label_ds = torch.cat((label_ds, label_data), dim=0)
    
string_ds = string_ds.view(len(string_ds) // seq_len, seq_len)
label_ds = label_ds.view(len(label_ds) // seq_len, seq_len)

100%|██████████| 5000/5000 [02:28<00:00, 33.75it/s] 


# Batch 만들기

In [76]:
from torch.utils.data import DataLoader
string_ds = string_ds.type(torch.long)
label_ds = label_ds.type(torch.long)
string_dl = DataLoader(string_ds, batch_size=64, shuffle=True, num_workers=2, pin_memory=False)
label_dl = DataLoader(label_ds, batch_size=64, shuffle=False, num_workers=2, pin_memory=False)

In [77]:
import torch.nn.functional as F

class SpacingModel(nn.Module):
    def __init__(
        self, 
        vocab_size, 
        hidden_size, 
        num_classes = 3, 
        conv_activation="relu", 
        dense_activation="relu",
        kernel_and_filter_sizes = [
            (2, 8),
            (3, 8),
            (4, 8),
            (5, 8)
        ],
        dropout_rate = 0.3
        ):
        super(SpacingModel, self).__init__()

        # 5000개(vocab_size)의 단어를 각각 48차원(hidden_size)으로 Embedding 진행
        self.embeddings = nn.Embedding(vocab_size, hidden_size)
        
        in_channels = hidden_size
        # 2, 3, 4, 5 증가하는 Kernel size는 1d Conv에서, Ngram의 역할 수행
        # conv1d (Batch, Channel, Seq_len) (in:Channel-in, out: Channel-out)
        
        self.convs = list()
        for kernel_size, filter_size in kernel_and_filter_sizes:
            layer = nn.Sequential(
                nn.Conv1d(in_channels=in_channels, out_channels=filter_size, kernel_size=kernel_size, padding=kernel_size//2, bias=False),
                nn.ReLU(inplace=True)
            )
            self.convs.append(layer)

        # 한번에 몇개씩 Pooling할지 설정
        self.pools = []
        for _, filter_size in kernel_and_filter_sizes:
            self.pools.append(nn.MaxPool1d(filter_size))

        self.dropout1 = nn.Dropout(p=dropout_rate)
        self.linear1 = nn.Sequential(
            # @@@pooling한 filter의 합
            nn.Linear(len(kernel_and_filter_sizes), hidden_size),
            nn.ReLU(inplace=True)
        )
        self.dropout2 = nn.Dropout(p=dropout_rate)
        self.linear2 = nn.Linear(hidden_size, num_classes)    
    
    def forward(self, x):
        
        #Tensorflow / embedded: [batch, seq_len, hidden_size]
        #Pytorch / embedded: [batch, hidden_size(channel), seq_len(len)]로 바꿔줘야함
        
        # Embedding 만 딱 마쳤을때는 [batch, seq_len, hidden_size] 그래서 permute로 바꿔줌
        # x shape = [batch, seq_len, channel]을 permute시켜서 [batch, channel(hidden_size), seq_len]으로 변경
        x = self.embeddings(x).permute(0, 2, 1)
        features = []
        for conv, pool in zip(self.convs, self.pools):
            y = conv(x)
            # 현재 y shape = [batch, channel, seq_len]
            y = pool(y.permute(0, 2, 1))    # seq_len은 놔두고, channel에 대해서만 pooling
            # 현재 y shape = [batch, seq_len, channel]
            #print('conv shape:', y.shape)
            features.append(y)
        
        # features shape = [batch, seq_len, #filters의 합]
        features = torch.cat(features, dim=-1)
        features = self.dropout1(features)

        # projected shape = (batch, seq_len, hidden_size)
        projected = self.dropout2(self.linear1(features))

        # result (batch, seq_len, 3)
        # after permute (batch, 3, seq_len) // loss 계산을 위해 [batch, nb_classes, seq_len] 형식으로 변경
        result = self.linear2(projected).permute(0, 2, 1)
        #print('result shape:', result.shape)
        return result

# Train part

In [66]:
from torch.optim.lr_scheduler import ReduceLROnPlateau

In [91]:
vocab_size = 5000
hidden_size = 48    # Embedding size
kernel_and_filter_sizes = [[3, 8], [5, 16], [7, 16], [9, 16]]
model = SpacingModel(
    vocab_size=vocab_size,
    hidden_size=hidden_size,
    num_classes=3,
    conv_activation='relu',
    dense_activation='relu',
    kernel_and_filter_sizes=kernel_and_filter_sizes,
    dropout_rate=0.1)
model.to(device)

#criterion = nn.NLLLoss(ignore_index=-1)
#criterion = nn.CrossEntropyLoss()
criterion = nn.CrossEntropyLoss(ignore_index=-1, reduction='mean')
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# lr scheduler 도입
lr_scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=2, verbose=1)

best_acc = 0
epochs = 30
predicted= list()

def train(epoch):
    model.train()

    train_loss = 0
    correct = 0
    total = 0
    
    for index, (inputs, targets) in enumerate(zip(string_dl, label_dl)):
        inputs, targets = inputs.to(device),targets.to(device)
        optimizer.zero_grad()   # optimizer에 저장되어있던 gradient 제거

        outputs = model(inputs)
        # print('-'*32)
        # print(outputs[0][0][:8])
        # print(outputs[0][1][:8])
        # print(outputs[0][2][:8])
        # print(targets[0][:8])

        #print(outputs, targets)
        loss = criterion(outputs, targets)
        loss.backward() # loss 미분하여 Grad 계산
        optimizer.step() # w, b 적용하여 초기화
        
        lr_scheduler.step(loss)

        #train_loss += loss.item()
        _, predicted = outputs.max(1)
        print(predicted[0][:8])
        # print(predicted[0][:8])
        # print('-'*32)
        
        total += targets.size(0) * seq_len    # 64 (==batch_size) * seq_len
        correct += (predicted == targets).sum().item()
        if (index+1) % 20 == 0:
            print(f'[Train] | epoch: {epoch+1}/{epochs} | batch: {index+1}/{len(string_dl)} | loss: {loss.item():.4f} | Acc: {correct / total * 100:.4f}')
for epoch in range(epochs):
    train(epoch)

tensor([2, 2, 2, 2, 2, 2, 2, 2])
tensor([2, 2, 2, 2, 2, 2, 2, 2])
tensor([2, 2, 2, 2, 2, 2, 2, 2])
tensor([2, 2, 2, 2, 2, 2, 2, 2])
tensor([2, 2, 2, 2, 2, 2, 2, 2])
tensor([2, 2, 2, 2, 2, 2, 2, 2])
tensor([2, 2, 2, 2, 2, 0, 0, 2])
tensor([2, 0, 2, 2, 2, 2, 2, 2])
tensor([0, 2, 2, 2, 2, 2, 0, 0])
tensor([2, 2, 2, 0, 2, 0, 0, 2])
tensor([0, 2, 2, 0, 0, 0, 2, 2])
tensor([0, 2, 2, 0, 0, 2, 0, 0])
tensor([0, 0, 0, 2, 0, 0, 2, 0])
tensor([0, 0, 0, 0, 0, 0, 2, 0])
tensor([2, 2, 2, 0, 0, 2, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 2])
tensor([0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 2, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0])
[Train] | epoch: 1/30 | batch: 20/62 | loss: 0.8549 | Acc: 18.3505
tensor([0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0])
tensor([0, 0, 0, 0, 0, 0, 0, 0])
tensor([0

KeyboardInterrupt: 