In [1]:
# import modules
import json
import torch
import tokenization

import numpy as np

from transformer.model import Classifier

In [2]:
# read configuration file
# A context manager closes the file handle deterministically instead of
# leaving it to the garbage collector; JSON is UTF-8 by specification, so the
# encoding is pinned rather than taken from the platform default.
with open('config.json', encoding='utf-8') as config_file:
    config = json.load(config_file)
config

{'kor_vocab_length': 50000,
 'eng_vocab_length': 28998,
 'd_model': 768,
 'd_ff': 2048,
 'd_k': 64,
 'd_v': 64,
 'num_layers': 12,
 'num_heads': 8,
 'start_word': '[SOS]',
 'end_word': '[EOS]',
 'sep_word': '[SEP]',
 'cls_word': '[CLS]',
 'pad_word': '[PAD]',
 'mask_word': '[MASK]'}

In [3]:
# configure device: use CUDA when torch reports it as available, else the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cpu')

In [4]:
# configure tokenizer: built from the English vocabulary file with
# do_lower_case disabled, then tried on a short sample sentence so the
# resulting tokens are shown as the cell output
tokenizer = tokenization.FullTokenizer(
    do_lower_case=False,
    vocab_file='vocab/eng_vocab.txt',
)
tokenizer.tokenize('I love you')




['I', 'love', 'you']

In [5]:
# define sample dataset: each entry is [text, emotion label]
dataset = [
    ['i feel awful about it too because it s my job to get him in a position to succeed and it just didn t happen here', 'sadness'],
    ['i don t feel comfortable around you', 'joy'],
    ['i constantly feel an anxious twinge to start something great', 'fear'],
    ['i was already feeling drained', 'sadness'],
    ['im not feeling bitter today', 'anger'],
    ['i hate myself for feeling grumpy about being pregnant', 'anger']]

# Sorted so that the label -> index mapping is the same on every run.
emotion_list = sorted({emotion for _text, emotion in dataset})

# Every sequence is forced to exactly this many tokens.
token_length = 50

texts = []
for text, _emotion in dataset:
    # Truncate first: padding alone would leave sentences longer than
    # token_length at their original size and produce a ragged batch that
    # cannot be converted to a single tensor.
    tokens = tokenizer.tokenize(text)[:token_length]
    # Right-pad with the pad token up to the fixed length (no-op when full).
    tokens += [config['pad_word']] * (token_length - len(tokens))
    texts.append(tokenizer.convert_tokens_to_ids(tokens))

# Guard the invariant the tensor conversion relies on.
assert all(len(ids) == token_length for ids in texts), 'ragged token ids'

# Class index of each sample = position of its emotion in emotion_list.
labels = [emotion_list.index(emotion) for _text, emotion in dataset]
print('texts : ')
print(texts)
print('labels : ')
print(labels)

texts : 
[[180, 1633, 9686, 1166, 1124, 1317, 1274, 1124, 190, 1141, 2263, 1108, 1245, 1142, 1109, 172, 1702, 1108, 9383, 1107, 1124, 1200, 1240, 191, 3335, 1305, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [180, 1276, 191, 1633, 6064, 1215, 1130, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [180, 7482, 1633, 1128, 11843, 5932, 2178, 1108, 1840, 1382, 1634, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [180, 1110, 1642, 2298, 11096, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [13282, 1138, 2298, 9180, 2054, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [180, 4821, 1993, 1113, 2298, 178, 27323, 1185, 1166, 1219, 6393, 0, 0, 0

In [6]:
# configure model, optimizer, criterion
# Seed torch's global RNG before the model is built so that weight
# initialisation (and subsequent random ops drawn from the same generator)
# repeat under Restart Kernel -> Run All on the same setup.
seed = 42
torch.manual_seed(seed)

# Id of the padding token, handed to the model as pad_index.
pad_index = tokenizer.convert_tokens_to_ids([config['pad_word']])[0]
classifier = Classifier(
        vocab_size=config['eng_vocab_length'],
        d_model=config['d_model'],
        d_ff=config['d_ff'], d_k=config['d_k'],
        d_v=config['d_v'], n_heads=config['num_heads'],
        n_layers=config['num_layers'], pad_index=pad_index,
        device=device, num_classes=len(emotion_list)).to(device)
optimizer = torch.optim.Adam(classifier.parameters(), lr=5e-5)
# CrossEntropyLoss takes raw logits and integer class indices.
criterion = torch.nn.CrossEntropyLoss()

In [7]:
# convert texts, labels to torch tensor
# Both become int64 tensors created directly on the selected device.
texts = torch.as_tensor(texts, dtype=torch.long, device=device)
labels = torch.as_tensor(labels, dtype=torch.long, device=device)

In [8]:
# Number of full-batch optimisation steps over the sample dataset.
num_epochs = 10

# Make training mode explicit so the cell behaves the same when re-run after
# any code that switched the model to eval mode.
classifier.train()

for epoch in range(num_epochs):
    # Clear gradients left over from the previous step.
    optimizer.zero_grad()
    # The model returns a pair; only the first element (the logits) is used.
    logits, _ = classifier(texts)
    loss = criterion(logits, labels)
    loss.backward()
    optimizer.step()

    print(epoch, loss.item())

0 1.4097980260849
1 1.6343640089035034
2 1.1189793348312378
3 0.18272121250629425
4 0.14093776047229767
5 0.12058594077825546
6 0.1174880638718605
7 0.038760535418987274
8 0.008962128311395645
9 0.0030036389362066984


In [9]:
test_dataset = [
    'i feel awful about it too because it s my job to get him',
    'i don t feel comfortable']

# torch.no_grad() only turns off gradient tracking; it does not change the
# module's mode. Switch to eval mode so that any layers that behave
# differently during training (e.g. dropout, if Classifier uses it) are in
# inference behaviour, and restore the previous mode afterwards so re-running
# the training cell is unaffected.
was_training = classifier.training
classifier.eval()
try:
    for test_data in test_dataset:
        tokens = tokenizer.tokenize(test_data)
        ids = tokenizer.convert_tokens_to_ids(tokens)
        # Wrap in a list to form a batch of one sequence.
        tensor = torch.as_tensor([ids], dtype=torch.long).to(device)
        with torch.no_grad():
            logits, _ = classifier(tensor)
        # Highest-scoring class for the single item, as a plain Python int
        # so it can index emotion_list directly.
        pred = torch.argmax(logits, dim=1)[0].item()
        print('---------')
        print('text')
        print(test_data)
        print('emotion')
        print(emotion_list[pred])
        print('---------')
finally:
    classifier.train(was_training)

---------
text
i feel awful about it too because it s my job to get him
emotion
sadness
---------
---------
text
i don t feel comfortable
emotion
joy
---------
