https://dacon.io/competitions/official/235747/codeshare/3054?page=1&dtype=recent

In [1]:
import pandas as pd
import os 

# Preprocessing

In [2]:
# Directory that holds the competition CSV files loaded in the next cell.
PATH = './dataset/'

In [3]:
# Read the three competition tables from PATH as UTF-8 encoded CSV files.
train_data, test_data, topic_dict = (
    pd.read_csv(os.path.join(PATH, csv_name), encoding='utf-8')
    for csv_name in ('train_data.csv', 'test_data.csv', 'topic_dict.csv')
)

In [4]:
# Remove the 'index' column from both splits; drop() returns a new frame.
train_data = train_data.drop(columns=['index'])
test_data = test_data.drop(columns=['index'])

In [5]:
# Display the training frame as the cell output.
train_data

Unnamed: 0,title,topic_idx
0,인천→핀란드 항공기 결항…휴가철 여행객 분통,4
1,실리콘밸리 넘어서겠다…구글 15조원 들여 美전역 거점화,4
2,이란 외무 긴장완화 해결책은 미국이 경제전쟁 멈추는 것,4
3,NYT 클린턴 측근韓기업 특수관계 조명…공과 사 맞물려종합,4
4,시진핑 트럼프에 중미 무역협상 조속 타결 희망,4
...,...,...
45649,KB금융 미국 IB 스티펠과 제휴…선진국 시장 공략,1
45650,1보 서울시교육청 신종코로나 확산에 개학 연기·휴업 검토,2
45651,게시판 키움증권 2020 키움 영웅전 실전투자대회,1
45652,답변하는 배기동 국립중앙박물관장,2


# Make Dataset, DataLoader

In [6]:
# from eunjeon import Mecab
# from kobert_tokenizer import KoBERTTokenizer
from transformers import AdamWeightDecay
from transformers import AutoTokenizer
from tqdm import tqdm
import numpy as np

import torch

In [7]:
# Peek at the title stored under index label 0 of the training frame.
train_data['title'][0]

'인천→핀란드 항공기 결항…휴가철 여행객 분통'

In [8]:
def convert_data(tokenizer, data_df, case, max_len=None):
    """Tokenize every title in ``data_df`` and return the ids as NumPy arrays.

    Args:
        tokenizer: Object whose ``encode(text, max_length=..., padding=...,
            truncation=...)`` returns a list of token ids.
        data_df: DataFrame with a ``title`` column and, when
            ``case == 'train'``, a ``topic_idx`` column.
        case: ``'train'`` to also return the labels; any other value returns
            the inputs only.
        max_len: Length passed to the tokenizer as ``max_length``. Defaults
            to the module-level ``MAX_LEN`` when omitted.

    Returns:
        ``([tokens, segments], targets)`` when ``case == 'train'``, otherwise
        the 1-tuple ``([tokens, segments],)``. ``segments`` is always an empty
        list because segment ids are not produced.

    Note:
        Each encoding is requested padded/truncated to ``max_len`` and then its
        first and last ids are sliced off, so rows are expected to hold
        ``max_len - 2`` ids (assuming the tokenizer honours ``max_length``).
    """
    if max_len is None:
        max_len = MAX_LEN

    tokens, segments = [], []

    # Iterate over the column values positionally so that frames whose index
    # is not 0..n-1 (e.g. after filtering or shuffling) neither raise KeyError
    # nor pick the wrong row.
    for title in tqdm(data_df['title'].tolist()):
        encoded = tokenizer.encode(title, max_length=max_len, padding='max_length', truncation=True)
        tokens.append(encoded[1:-1])

    tokens = np.array(tokens)

    if case == 'train':
        # np.array() copies the column and keeps its integer dtype.
        targets = np.array(data_df['topic_idx'])
        return [tokens, segments], targets

    # Explicit 1-tuple: the same shape the original trailing comma produced.
    return ([tokens, segments],)
    

In [9]:
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    """Map-style dataset yielding ``(token_ids, one_hot_label)`` pairs from a CSV.

    The CSV must provide a ``title`` column (text) and a ``topic_idx`` column
    (integer class id); ``__getitem__`` raises ``KeyError`` if either is missing.
    """

    def __init__(self, csv_path, tokenizer, max_seq_len, pad_idx, num_labels=7):
        """Load the CSV and remember the encoding settings.

        Args:
            csv_path: Path of the CSV file, read with ``pandas.read_csv``.
            tokenizer: Object whose ``encode(text)`` returns a list of token ids.
            max_seq_len: Fixed length of every returned id sequence.
            pad_idx: Token id used to right-pad short sequences.
            num_labels: Length of the one-hot label vector. Defaults to 7,
                the value that used to be hard-coded.
        """
        self.data = pd.read_csv(csv_path)
        self.tokenizer = tokenizer
        self.max_len = max_seq_len
        self.pad_idx = pad_idx
        self.num_labels = num_labels

    def __len__(self):
        """Return the number of rows in the loaded CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(context, label)`` for the row at position ``idx``.

        ``context`` is a ``LongTensor`` of exactly ``max_len`` token ids and
        ``label`` is a float one-hot vector of length ``num_labels``.
        """
        # Drop the first and last ids of the encoding (presumably the special
        # start/end tokens the tokenizer adds — confirm for other tokenizers).
        ids = self.tokenizer.encode(self.data['title'].iloc[idx])[1:-1]

        # Truncate before padding so every item has exactly max_len ids; a
        # longer title would otherwise give a longer tensor, and batches of
        # unequal-length tensors cannot be stacked.
        ids = ids[:self.max_len]
        ids = ids + [self.pad_idx] * (self.max_len - len(ids))

        context = torch.LongTensor(ids)
        label = torch.zeros(self.num_labels)
        label[int(self.data['topic_idx'].iloc[idx])] = 1

        return context, label

In [10]:
BATCH_SIZE = 64
MAX_LEN = 30

# get tokenizer
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-large")

# Read the padding id from the tokenizer vocabulary instead of hard-coding it,
# so the datasets pad with the id the tokenizer actually uses.
PAD_IDX = tokenizer.vocab['[PAD]']

# make dataset
train_dataset = CustomDataset(os.path.join(PATH, 'train_data.csv'), tokenizer=tokenizer,
                              max_seq_len=MAX_LEN, pad_idx=PAD_IDX)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# NOTE(review): CustomDataset.__getitem__ reads a 'topic_idx' column; confirm
# test_data.csv has one, otherwise iterating test_loader raises KeyError.
test_dataset = CustomDataset(os.path.join(PATH, 'test_data.csv'), tokenizer=tokenizer,
                             max_seq_len=MAX_LEN, pad_idx=PAD_IDX)
# Do not shuffle the test split: predictions must stay aligned with CSV rows.
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# # 64개의 token seq랑, label 들을 반환한다.
# for batch in train_loader:
#     print(len(batch[0]))
#     print(batch[1][0])
#     break

# Build Model

Transformer의 encoder n개를 통과시킨 뒤 Affine Layer를 지나는 model이다.

- d_model = 512
- d_word_vec = 512
- n_layers = 6
- n_head = 8
- d_inner = 2048
- dropout = 0.1
- n_position = 50
- pad_idx = 1 (tokenizer.vocab['[PAD]'])

In [11]:
# Look up the id of the '[PAD]' token in the tokenizer vocabulary.
tokenizer.vocab['[PAD]']

1

In [12]:
def get_pad_mask(seq, pad_idx):
    """Return a boolean mask that is True wherever ``seq`` differs from ``pad_idx``.

    A singleton axis is inserted at position -2, so an input of shape
    ``(..., length)`` yields a mask of shape ``(..., 1, length)``.
    """
    is_real_token = seq.ne(pad_idx)
    return is_real_token.unsqueeze(dim=-2)

In [26]:

from torch import nn
from model.Models import Encoder

class TransformerClassifier(nn.Module):
    """Transformer encoder followed by a linear projection to label logits.

    NOTE(review): the linear layer is applied to the encoder output as-is, so
    if ``Encoder`` returns one vector per position the logits are per position
    as well — confirm this is the shape the loss function expects.
    """

    def __init__(self, n_vocab, pad_idx, d_word_vec=512, d_model=512, d_inner=2048,
                 n_layers=6, n_head=8, d_k=64, dropout=0.1, n_position=50, num_labels=7):
        """Build the encoder and the output projection.

        Args:
            n_vocab: Vocabulary size, forwarded to ``Encoder`` as ``n_src_vocab``.
            pad_idx: Padding token id; forwarded to ``Encoder`` and used to
                build the padding mask in ``forward``.
            d_word_vec, d_model, d_inner, n_layers, n_head, dropout, n_position:
                Forwarded unchanged to ``Encoder``.
            d_k: Forwarded to ``Encoder`` as both ``d_k`` and ``d_v``.
            num_labels: Output size of the final linear layer.
        """
        super().__init__()
        self.encoder = Encoder(n_src_vocab=n_vocab, d_word_vec=d_word_vec, n_layers=n_layers,
                               d_inner=d_inner, n_head=n_head, d_k=d_k, d_v=d_k, d_model=d_model,
                               pad_idx=pad_idx, dropout=dropout, n_position=n_position, scale_emb=False)
        self.linear = nn.Linear(d_model, num_labels, bias=True)

        self.pad_idx = pad_idx

    def forward(self, context):
        """Encode ``context`` (token ids) and project the encoder output to logits."""
        mask = get_pad_mask(context, self.pad_idx)

        # Only the first value returned by the encoder is used; any further
        # return values are ignored. (A leftover per-call debug print of the
        # tensor devices was removed here.)
        enc_output, *_ = self.encoder(context, mask)
        seq_logit = self.linear(enc_output)

        return seq_logit

# Train !


In [15]:
# Training hyper-parameters looked up by key in the training cells.
CFG = dict(
    EPOCHS=10,
    LEARNING_RATE=1e-5,
    BATCH_SIZE=64,
    SEED=42,
)

In [32]:
# Use the first CUDA device when one is available, otherwise the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

classifier_model = TransformerClassifier(
    n_vocab=32000,
    pad_idx=tokenizer.vocab['[PAD]'],
    d_word_vec=512,
    d_model=512,
    d_inner=2048,
    n_layers=6,
    n_head=8,
    d_k=64,
    dropout=0.1,
    n_position=MAX_LEN,
)

# Alternative kept for reference: AdamWeightDecay(1e-5, weight_decay_rate=1e-4)
optimizer = torch.optim.Adam(classifier_model.parameters(), lr=CFG['LEARNING_RATE'])
criterion = nn.CrossEntropyLoss()

In [60]:
import numpy as np

def train(model, train_loader, optimizer, device, criterion):
    """Train ``model`` for ``CFG['EPOCHS']`` epochs, printing the mean loss per epoch.

    Args:
        model: Module called as ``model(X)``; moved to ``device`` first.
        train_loader: Iterable yielding ``(X, Y)`` tensor pairs.
        optimizer: Optimizer whose ``zero_grad()`` and ``step()`` are called
            once per batch.
        device: Device passed to ``.to(...)`` for the model, loss and tensors.
        criterion: Loss module called as ``criterion(output, Y)``.

    Returns:
        None.

    NOTE(review): ``Y`` is cast to int64 before the loss is computed. The
    dataset defined in this notebook yields one-hot float labels, so confirm
    that the target and the model output have the shapes ``criterion`` expects.
    """
    model.to(device)
    criterion = criterion.to(device)   # loss
    # TODO 1: pick an evaluation metric and track it next to the loss.

    for epoch in range(1, CFG['EPOCHS'] + 1):
        model.train()
        train_loss = []

        # The description must be an f-string; a plain literal shows the text
        # "{epoch}" verbatim in the progress bar.
        for X, Y in tqdm(train_loader, desc=f'Epoch {epoch} Training ...'):
            X = X.to(device)
            Y = Y.to(torch.int64).to(device)

            optimizer.zero_grad()
            output = model(X)
            loss = criterion(output, Y)

            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())

        print(f"Epoch [{epoch}]  |  Train Loss : [{np.mean(train_loss):.5f}]  |  Train Metric: not yet ")   # TODO 1

        # TODO 2: when this epoch's metric improves, save the model as the best model.


train(classifier_model, train_loader, optimizer, device, criterion)

Eopch {epoch} Training ...:   0%|          | 0/714 [00:00<?, ?it/s]


RuntimeError: CUDA error: no kernel image is available for execution on the device
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

### ERROR 1 : nn.Embedding() 은 input Tensor가 정수형(Long/Int)이어야 한다.
c:\Users\jaeng\Desktop\VSC\DACON\text_classification\main.ipynb 셀 23 in TransformerClassifier.forward(self, context)
...
   2197     # remove once script supports set_grad_enabled
   2198     _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 2199 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)

RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)

------------------------

학습 해보려 하는데,
- 모델 내부에서 int로 받아야하는 'indices'라는 놈에 floatTensor 가 들어가고있어 말썽인 상황이다. 내일 해결해봐야지.
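- torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse) 의 첫번째 argument weight가 Int이어야하는데 Float로 들어온건가? → 아니다. 에러 메시지의 'indices'는 weight가 아니라 input(토큰 인덱스)을 가리킨다. weight는 원래 Float이고, input이 Long/Int여야 하는데 FloatTensor로 들어온 것이다.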

# ERROR 2: 
-> 3014 return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)

IndexError: Target -9223372036854775808 is out of bounds.

model 어딘가에서 normalization 일어나고있지 않나?
linear 결과에 norm 추가해야하나? 다른 모델 봐야겠다.
- Y 찍어보니 그 안에 엄청 큰 절댓값이 존재한다.. 뭐지?

-1.9357e+16 이 값이, -19356592969351168 이 된다. 왜 저 값이 존재하지?
Y label 만드는 과정에서 오류가 있을거다.

# ERROR 3: model 과 Tensor 의 .to(device) 함수 차이
https://stackoverflow.com/questions/59560043/what-is-the-difference-between-model-todevice-and-model-model-todevice

---> 36 train(classifier_model, train_loader, optimizer, device, criterion)
AttributeError: 'TransformerClassifier' object has no attribute 'device'

A model can be moved to the GPU in place with the following code:
```
a = my_model()  # a is in cpu
a.to(device)    # a is moved to gpu
```

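But a Tensor is not moved in place: `.to(device)` returns a new tensor, so the result must be assigned back.
```
a = torch.Tensor([1,2,3])
a.to(device)    # a is still on the cpu (the returned tensor is discarded)
a = a.to(device)   # a is now on the gpu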
```