In [1]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"   
os.environ["CUDA_VISIBLE_DEVICES"]="1"

In [2]:
# Import modules
import os
import time
import pickle
from tqdm import tqdm

# Import PyTorch
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

# Import custom modules
from dataset import CustomDataset, PadCollate
from model.optimizer import Ralamb, WarmupLinearSchedule
from model.model import Transformer

In [3]:
import argparse

parser = argparse.ArgumentParser(description='Parsing Method')
parser.add_argument('--preprocessing', action='store_true')
parser.add_argument('--training', action='store_true')
# Path setting
parser.add_argument('--data_path', default='/HDD/dataset/korean-hate-speech-detection/', type=str,
                    help='Original data path')
parser.add_argument('--save_path', default='./preprocessing', type=str,
                    help='Preprocessed data & Model checkpoint file path')
# Preprocessing setting
parser.add_argument('--vocab_size', default=24000, type=int, help='Vocabulary size; Default is 24000')
parser.add_argument('--pad_idx', default=0, type=int, help='pad index')
parser.add_argument('--bos_idx', default=1, type=int, help='index of bos token')
parser.add_argument('--eos_idx', default=2, type=int, help='index of eos token')
parser.add_argument('--unk_idx', default=3, type=int, help='index of unk token')
parser.add_argument('--max_len', default=150, type=int, help='Max Length of Source Sentence; Default is 150')
# Model setting
parser.add_argument('--d_model', default=768, type=int, help='Model dimension; Default is 768')
parser.add_argument('--d_embedding', default=256, type=int, help='Embedding dimension; Default is 256')
parser.add_argument('--n_head', default=12, type=int, help='Mutlihead count; Default is 12')
parser.add_argument('--dim_feedforward', default=2048, type=int, help='Feedforward layer dimension; Default is 2048')
parser.add_argument('--n_layers', default=12, type=int, help='Layer count; Default is 12')
# Training setting
parser.add_argument('--num_epochs', default=30, type=int, help='Epoch count; Default is 30')
parser.add_argument('--batch_size', default=16, type=int, help='Batch size; Default is 16')
parser.add_argument('--dropout', default=0.3, type=float, help='Dropout ratio; Default is 0.3')
parser.add_argument('--lr', default=1e-3, type=float, help='Learning rate; Default is 1e-3')
parser.add_argument('--w_decay', default=5e-4, type=float, help='Weight decay ratio; Default is 5e-4')
args = parser.parse_args(list())

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#===================================#
#============Data Load==============#
#===================================#

# 1) Data open
print('Data Load & Setting!')
with open(os.path.join(args.save_path, 'processed.pkl'), 'rb') as f:
    data_ = pickle.load(f)
    train_indices = data_['train_indices']
    valid_indices = data_['valid_indices']
    train_title_indices = data_['train_title_indices']
    valid_title_indices = data_['valid_title_indices']
    train_total_indices = data_['train_total_indices']
    valid_total_indices = data_['valid_total_indices']
    train_label = data_['train_label']
    valid_label = data_['valid_label']
    word2id = data_['word2id']
    id2word = data_['id2word']
    vocab_num = len(word2id.keys())
    del data_

dataset_dict = {
    'train': CustomDataset(train_total_indices, train_indices, 
                        train_title_indices, train_label,
                        max_len=args.max_len),
    'valid': CustomDataset(valid_total_indices, valid_indices, 
                        valid_title_indices, valid_label,
                        max_len=args.max_len),
}
dataloader_dict = {
    'train': DataLoader(dataset_dict['train'], collate_fn=PadCollate(), drop_last=True,
                        batch_size=16, shuffle=True, pin_memory=True,
                        num_workers=2),
    'valid': DataLoader(dataset_dict['valid'], collate_fn=PadCollate(), drop_last=True,
                        batch_size=16, shuffle=True, pin_memory=True,
                        num_workers=2)
}
print(f"Total number of trainingsets  iterations - {len(dataset_dict['train'])}, {len(dataloader_dict['train'])}")

Data Load & Setting!
Total number of trainingsets  iterations - 7896, 493


In [5]:
# 1) Model initiating
print("Instantiating models...")
model = Transformer(vocab_num=vocab_num, pad_idx=args.pad_idx, bos_idx=args.bos_idx, 
                    eos_idx=args.eos_idx, max_len=args.max_len, d_model=args.d_model, 
                    d_embedding=args.d_embedding, n_head=args.n_head, 
                    dim_feedforward=args.dim_feedforward, n_layers=args.n_layers, 
                    dropout=args.dropout, device=device)
optimizer = Ralamb(params=filter(lambda p: p.requires_grad, model.parameters()),
                lr=args.lr, weight_decay=args.w_decay)
scheduler = WarmupLinearSchedule(optimizer, warmup_steps=len(dataloader_dict['train'])*3, 
                                t_total=len(dataloader_dict['train'])*args.num_epochs)
criterion = nn.CrossEntropyLoss(ignore_index=args.pad_idx)
model.to(device)

Instantiating models...


Transformer(
  (dropout): Dropout(p=0.3, inplace=False)
  (transformer_embedding): TransformerEmbedding(
    (token): TokenEmbedding(12000, 256, padding_idx=0)
    (linear_layer): Linear(in_features=256, out_features=768, bias=True)
    (position): PositionalEmbedding()
    (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (embedding_dropout): Dropout(p=0.1, inplace=False)
  )
  (output_linear): Linear(in_features=768, out_features=256, bias=False)
  (output_norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  (output_linear2): Linear(in_features=256, out_features=3, bias=False)
  (encoders): ModuleList(
    (0): TransformerEncoderLayer(
      (self_attn): MultiheadAttention(
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
      )
      (linear1): Linear(in_features=768, out_features=2048, bias=True)
      (dropout): Dropout(p=0.3, inplace=False)
      (linear2): Linear(in_features=2048, out_features=768, bias=True)
      (norm1): Laye

In [6]:
start_epoch = 0

# 1) Pre-setting
best_val_loss = None

# 2) Training start
for e in range(start_epoch, args.num_epochs):
    start_time_e = time.time()
    for phase in ['train', 'valid']:
        if phase == 'train':
            model.train()
        if phase == 'valid':
            model.eval()
            val_loss = 0
        for i, (total, comment, title, label) in enumerate(dataloader_dict[phase]):
            break
        break
    break

In [7]:
# Source, Target  setting
total = total.to(device)
comment = comment.to(device)
label = label.to(device)

# Optimizer setting
optimizer.zero_grad()

In [8]:
# Model / Calculate loss
with torch.set_grad_enabled(phase == 'train'):
    output = model(comment)

In [17]:
output[:,0]

tensor([[ 0.9738,  0.3198, -0.3971],
        [ 0.4141,  0.6645,  0.1555],
        [-0.1024,  1.0948, -0.9971],
        [ 0.2354,  0.3047,  0.6112],
        [-0.6235, -0.3289, -0.7982],
        [ 0.9417, -0.1104,  0.0428],
        [ 0.2064,  0.8898, -0.4838],
        [-0.1077,  1.0312,  0.0220],
        [ 0.2509,  0.3141, -0.3987],
        [ 0.8201,  0.1591, -1.0772],
        [ 0.4695,  1.1079, -0.4658],
        [-0.2693,  0.4440, -0.8266],
        [ 0.8681,  0.2959, -0.6716],
        [ 0.6637, -0.0824, -1.4008],
        [ 0.0335,  0.6816, -0.5836],
        [-0.1174,  0.2077, -1.4999]], device='cuda:0',
       grad_fn=<SelectBackward>)

In [19]:
output[:,0].size()

torch.Size([16, 3])

In [10]:
label

tensor([0, 0, 1, 2, 1, 2, 1, 0, 0, 1, 2, 2, 2, 1, 0, 0], device='cuda:0')

In [20]:
loss = criterion(output[:,0], label)

In [21]:
loss

tensor(1.2599, device='cuda:0', grad_fn=<NllLossBackward>)

In [35]:
sum(output[:,0].max(dim=1)[1] == label).item() / len(label)

0.3125

In [30]:
sum(output[:,0].max(dim=1)[1] == label)

tensor(5, device='cuda:0')