https://dacon.io/competitions/official/235747/codeshare/3054?page=1&dtype=recent

In [None]:
import pandas as pd
import os 
import torch

In [None]:
# Use the first CUDA device when torch reports one, otherwise stay on the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

print(device)

In [None]:
# GPU / torch compatibility check.
# A small tensor is allocated on the GPU when torch reports CUDA as available,
# and on the CPU otherwise, so this cell does not crash on CPU-only machines
# (the notebook already falls back to 'cpu' when choosing `device`).
t = torch.empty(3, 4, device='cuda:0' if torch.cuda.is_available() else 'cpu')
t.device
# If this runs without an error, the reported device can be used.

# Preprocessing

In [None]:
# Directory holding the CSV files; joined with each file name when they are read.
PATH = './dataset/'

In [None]:
# Read the train split, the test split and the topic lookup table from PATH,
# all decoded as UTF-8.
train_data, test_data, topic_dict = (
    pd.read_csv(os.path.join(PATH, csv_name), encoding='utf-8')
    for csv_name in ('train_data.csv', 'test_data.csv', 'topic_dict.csv')
)

In [None]:
# Remove the 'index' column from both frames.
train_data, test_data = (
    frame.drop(columns='index') for frame in (train_data, test_data)
)

In [None]:
# Notebook display of the training frame (after the 'index' column was dropped).
train_data

# Make Dataset, DataLoader

In [None]:
# from eunjeon import Mecab
# from kobert_tokenizer import KoBERTTokenizer
from transformers import AdamWeightDecay
from transformers import AutoTokenizer
from tqdm import tqdm
import numpy as np

import torch

In [None]:
# Show the 'title' value stored under index label 0 as a sample of the raw text.
train_data['title'][0]

In [None]:
# Split the data into tokens and labels and return them as NumPy arrays.
def convert_data(tokenizer, data_df, case):
    """Tokenize the ``title`` column and return tokens (and labels) as arrays.

    Args:
        tokenizer: Object whose ``encode(text, max_length=..., padding=...,
            truncation=...)`` returns a list of token ids.
        data_df: DataFrame with a ``title`` column and, when
            ``case == 'train'``, a ``topic_idx`` column.
        case: ``'train'`` to also return the labels; any other value returns
            the inputs only.

    Returns:
        ``([tokens, segments], targets)`` when ``case == 'train'``; otherwise
        the one-element tuple ``([tokens, segments],)``. ``segments`` is always
        an empty list because segment ids are not generated.
    """
    # Segment ids are never built; the empty list is kept so the returned
    # structure stays the same.
    segments = []

    # Iterate over the column values instead of ``data_df['title'][i]``: that
    # form is a label lookup and raises KeyError (or picks another row) when
    # the frame's index is not 0..n-1, e.g. after filtering rows.
    # NOTE(review): every title is padded/truncated to MAX_LEN and then the
    # first and last ids are sliced off, so rows come out MAX_LEN - 2 long and
    # (presumably) still contain the end-of-sequence token of short titles —
    # confirm this is intended.
    tokens = np.array([
        tokenizer.encode(title, max_length=MAX_LEN, padding='max_length', truncation=True)[1:-1]
        for title in tqdm(data_df['title'], total=len(data_df))
    ])

    if case == 'train':
        # np.array copies the column, so the result is independent of data_df.
        targets = np.array(data_df['topic_idx'])

        return [tokens, segments], targets

    # The trailing comma is kept on purpose: the non-train result is a 1-tuple.
    return [tokens, segments],

    

In [None]:
from torch.utils.data import Dataset, DataLoader


class CustomDataset(Dataset):
    """Dataset that tokenizes the ``title`` column of a CSV on the fly.

    Every item is a fixed-length ``LongTensor`` of token ids: the first and
    last ids returned by ``tokenizer.encode`` are stripped, the rest is
    truncated to ``max_seq_len`` and right-padded with ``pad_idx``.

    Args:
        csv_path: Path of the CSV file; it must have a ``title`` column and,
            in ``'train'`` mode, a ``topic_idx`` column.
        tokenizer: Object whose ``encode(text)`` returns a list of token ids.
        max_seq_len: Length of every returned token tensor.
        pad_idx: Token id used for padding.
        mode: ``'train'`` (items are ``(context, label)``) or ``'eval'``
            (items are the 1-tuple ``(context,)``).
        num_labels: Size of the one-hot label vector. Defaults to 7.

    Raises:
        ValueError: If ``mode`` is neither ``'train'`` nor ``'eval'``.
    """

    def __init__(self, csv_path, tokenizer, max_seq_len, pad_idx, mode, num_labels=7):
        # Validate before touching the file so a bad mode fails fast.
        # ValueError is a subclass of Exception, so `except Exception` still works.
        if mode not in ('train', 'eval'):
            raise ValueError(f'\'--mode\' should be \'train\' or \'eval\'. But your arg is \'{mode}\'')
        self.mode = mode
        self.data = pd.read_csv(csv_path)
        self.tokenizer = tokenizer
        self.max_len = max_seq_len
        self.pad_idx = pad_idx
        self.num_labels = num_labels

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Drop the first and last ids produced by the tokenizer.
        context = self.tokenizer.encode(self.data['title'][idx])[1:-1]
        # Truncate first: without this, a title longer than max_len yields a
        # longer tensor than its neighbours (the pad count goes negative and
        # adds nothing), so samples cannot be stacked into one batch.
        context = context[:self.max_len]
        context += [self.pad_idx] * (self.max_len - len(context))
        context = torch.LongTensor(context)

        if self.mode == 'train':
            # One-hot float label of length num_labels.
            label = torch.zeros(self.num_labels)
            label[int(self.data['topic_idx'][idx])] = 1.0

            return context, label
        # 1-tuple so that callers can unpack with `for X, in loader`.
        return context,

In [None]:
BATCH_SIZE = 64
MAX_LEN = 30

tokenizer = AutoTokenizer.from_pretrained('klue/roberta-large')

# Take the padding id from the tokenizer instead of hard-coding 1, so the
# datasets pad with the same id the model later treats as padding.
PAD_IDX = tokenizer.pad_token_id

# Build the datasets and loaders. The CSV locations reuse PATH so they stay in
# sync with the files read during preprocessing.
train_dataset = CustomDataset(os.path.join(PATH, 'train_data.csv'), tokenizer=tokenizer,
                              max_seq_len=MAX_LEN, pad_idx=PAD_IDX, mode='train')
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# shuffle=False keeps the test batches in file order.
test_dataset = CustomDataset(os.path.join(PATH, 'test_data.csv'), tokenizer=tokenizer,
                             max_seq_len=MAX_LEN, pad_idx=PAD_IDX, mode='eval')
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Walk the test loader once and count its batches; every batch is unpacked as
# a 1-tuple, which is the shape eval-mode items come in.
count = sum(1 for (_batch,) in test_loader)
print(count, count * BATCH_SIZE)

# Build Model

transformer의 encoder n개를 통과시킨 뒤, 마지막 output을 Affine Layer에 통과시키는 model이다. (현재 구현은 첫번째 token만 쓰는 것이 아니라, 모든 token의 output을 하나로 펼쳐서(flatten) Linear layer 2개에 넣는다.)

- d_model = 512
- d_word_vec = 512
- n_layers = 6
- n_head = 8
- d_inner = 2048
- dropout = 0.1
- n_position = 50
- pad_idx = tokenizer.vocab['[PAD]'] (아래 셀에서 값을 확인)

In [None]:
# Look up the id of the '[PAD]' token; the same lookup supplies pad_idx when
# the classifier is constructed.
tokenizer.vocab['[PAD]']

In [None]:

from torch import nn
from model.Models import Encoder
from einops import rearrange

def get_pad_mask(seq, pad_idx):
    """Return a boolean mask that is True wherever ``seq`` is not ``pad_idx``.

    ``unsqueeze(-2)`` inserts a singleton axis before the last one, so a
    ``(batch, seq_len)`` input yields a ``(batch, 1, seq_len)`` mask.
    """
    return (seq != pad_idx).unsqueeze(-2)


class TransformerClassifier(nn.Module):
    """Transformer-encoder text classifier.

    Token ids go through ``Encoder``; the encoder output of all positions is
    flattened into one vector per sample and passed through
    ``Linear -> ReLU -> Linear`` to produce one score per class.

    Args:
        n_vocab: Vocabulary size handed to the encoder.
        pad_idx: Token id treated as padding (encoder and attention mask).
        d_word_vec, d_model, d_inner, n_layers, n_head, d_k, dropout,
        n_position: Forwarded to ``Encoder`` (``d_v`` is set equal to ``d_k``).
        num_labels: Number of output classes.
        max_seq_len: Sequence length of the inputs; it fixes the input width
            of the first linear layer (``max_seq_len * d_model``), so inputs
            must have exactly this many tokens.
    """

    def __init__(self, n_vocab, pad_idx, d_word_vec=512, d_model=512, d_inner=2048,
                 n_layers=6, n_head=8, d_k=64, dropout=0.1, n_position=50, num_labels=7,max_seq_len=30):

        super().__init__()
        self.encoder = Encoder(n_src_vocab=n_vocab, d_word_vec=d_word_vec, n_layers=n_layers,
                               d_inner=d_inner, n_head=n_head, d_k=d_k, d_v=d_k, d_model=d_model,
                               pad_idx=pad_idx, dropout=dropout, n_position=n_position, scale_emb=False)
        self.linear1 = nn.Linear(max_seq_len*d_model, d_model, bias=True)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(d_model, num_labels, bias=True)
        self.pad_idx = pad_idx

    def forward(self, context):
        """Return raw class scores of shape ``(batch, num_labels)``.

        Args:
            context: Integer token-id tensor of shape ``(batch, max_seq_len)``.
        """
        mask = get_pad_mask(context, self.pad_idx)
        enc_output, *_ = self.encoder(context, mask)
        # (batch, seq, d_model) -> (batch, seq * d_model)
        flat = enc_output.flatten(start_dim=1)
        hidden = self.relu(self.linear1(flat))
        # No activation on the last layer: a ReLU here clamps every negative
        # class score to 0, and the loss expects unnormalized logits.
        return self.linear2(hidden)

### ERROR 5: 모델 반환모양이 ( b seq class ) 이다. ( b c ) 이어야하는데..?

in model forward... context.shape torch.Size([64, 30])
in model forward... mask.shape torch.Size([64, 1, 30])
in model forward... enc_output.shape torch.Size([64, 30, 512])      # linear 을 잘못 수행하였다.
in model forward... after_linear.shape torch.Size([64, 30, 7])
-> linear 추가해서, 한번 더 거치게 하였다.

### 개선점 1

output에 softmax 안했는데, 어떤 영향이 있는거지?

# Train !


In [None]:
# Training hyper-parameters, read by key when the optimizer and the training
# loop are set up.
CFG = dict(
    EPOCHS=20,
    LEARNING_RATE=1e-5,
    BATCH_SIZE=64,
    SEED=42,
)

In [None]:
# Size the model from the tokenizer and MAX_LEN rather than from literals:
# - n_vocab covers every id the tokenizer can emit,
# - max_seq_len must equal the padded input length because it fixes the width
#   of the first linear layer (it used to match MAX_LEN only via its default).
classifier_model = TransformerClassifier(n_vocab=len(tokenizer), pad_idx=tokenizer.vocab['[PAD]'],
                                        d_word_vec=512, d_model=512, d_inner=2048,
                                        n_layers=6, n_head=8, d_k=64, dropout=0.1,
                                        n_position=MAX_LEN, max_seq_len=MAX_LEN)
# optimizer = AdamWeightDecay(1e-5, weight_decay_rate=1e-4)
optimizer = torch.optim.Adam(params = classifier_model.parameters(), lr = CFG['LEARNING_RATE'])
criterion = nn.CrossEntropyLoss()

In [None]:

import numpy as np

def train(model, train_loader, optimizer, device, criterion, epochs=None):
    """Train ``model`` in place and print the mean loss after every epoch.

    Args:
        model: Module to train; it is moved to ``device``.
        train_loader: Iterable yielding ``(X, Y)`` batches.
        optimizer: Optimizer already bound to ``model``'s parameters.
        device: Device the model, the loss and every batch are moved to.
        criterion: Loss module called as ``criterion(output, Y)``.
        epochs: Number of passes over ``train_loader``. ``None`` (default)
            uses ``CFG['EPOCHS']``.

    Returns:
        None.
    """
    if epochs is None:
        epochs = CFG['EPOCHS']

    model.to(device)
    criterion = criterion.to(device)   # loss
    # TODO 1: choose an evaluation metric and track it next to the loss.

    for epoch in range(1, epochs + 1):
        model.train()
        train_loss = []

        for X, Y in tqdm(train_loader, desc='Training ...'):
            X = X.to(device)
            Y = Y.to(device)

            optimizer.zero_grad()
            output = model(X)
            loss = criterion(output, Y)

            loss.backward()
            optimizer.step()

            train_loss.append(loss.item())

        print(f"Epoch {epoch}  |  Train Loss : [{np.mean(train_loss):.5f}]  |  Train Metric: not yet ")   # TODO 1

        # TODO 2: when this epoch's metric improves, save the model as the best checkpoint.


train(classifier_model, train_loader, optimizer, device, criterion)

### ERROR 1 : nn.embedding() 은 input Tensor가 int 형이어야한다.
c:\Users\jaeng\Desktop\VSC\DACON\text_classification\main.ipynb 셀 23 in TransformerClassifier.forward(self, context)
...
   2197     # remove once script supports set_grad_enabled
   2198     _no_grad_embedding_renorm_(weight, input, max_norm, norm_type)
-> 2199 return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)

RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)

------------------------

학습 해보려 하는데,
- 모델 내부에서 int로 받아야하는 'indices'라는 놈에 floatTensor 가 들어가고있어 말썽인 상황이다. 내일 해결해봐야지.
- torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse) 의 첫번째 argument weight가 Int이어야하는데 Float로 들어온건가?

### ERROR 2: 
-> 3014 return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)

IndexError: Target -9223372036854775808 is out of bounds.

model 어딘가에서 normalization 일어나고있지 않나?
linear 결과에 norm 추가해야하나? 다른 모델 봐야겠다.
- Y 찍어보니 그 안에 엄청 큰 절댓값이 존재한다.. 뭐지?

-1.9357e+16 이 값이, -19356592969351168 이 된다. 왜 저 값이 존재하지?
Y label 만드는 과정에서 오류가 있을거다.

### ERROR 3: model 과 Tensor 의 .to(device) 함수 차이
https://stackoverflow.com/questions/59560043/what-is-the-difference-between-model-todevice-and-model-model-todevice

---> 36 train(classifier_model, train_loader, optimizer, device, criterion)
AttributeError: 'TransformerClassifier' object has no attribute 'device'

Model can be placed in GPU with code,
```
a = my_model()  # a is in cpu
a.to(device)    # a is moved to gpu
```

But Tensor cannot be moved to GPU with the same code.
```
a = torch.Tensor([1,2,3])
a.to(device)    # a is in cpu
a = a.to(device)   # a is now in gpu
```

### ERROR 4: 
RuntimeError: CUDA error: no kernel image is available for execution on the device
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

https://ndb796.tistory.com/744  을 보고 해결했다.
- 현재 커널의 python 버전과, CUDA 버전이 호환되지 않아서 발생하는 문제다.
- python 버전을 3.6 -> 3.8 업그레이드함으로 해결했다.(사실은 conda 환경을 새로 팜)
    - 내가 가진 CUDA버전이 3.6이랑 호환이 안된다.

# Inference

In [None]:
def predict(model, test_loader, device):
    """Run ``model`` over ``test_loader`` and return all outputs as one array.

    The model is moved to ``device`` and put in eval mode; gradients are
    disabled while the batches are processed. Each batch is expected to be a
    1-tuple holding the input tensor.

    Returns:
        ``np.ndarray`` built from the per-sample model outputs, in loader order.
    """
    model.to(device)
    model.eval()

    collected = []
    with torch.no_grad():
        for (batch,) in tqdm(test_loader, desc='Inferencing ...'):
            scores = model(batch.to(device))
            collected.extend(scores.cpu().tolist())

    # Hand back every prediction at once as a single NumPy array.
    return np.array(collected)

## Model Load

In [None]:
# TODO 2: the best checkpoint should be loaded here; for now the model that was
# just trained in this session is reused as-is.
# (quick-and-dirty for the moment)
model = classifier_model
preds = predict(model, test_loader, device)

preds.shape

In [None]:
# Persist the raw predictions; np.save appends '.npy' to a name without it,
# which is the file name the later np.load call reads.
# NOTE(review): absolute, machine-specific path — confirm it exists where this runs.
np.save('/home/kist/Desktop/JH/DACON_transformer/test_preds', preds)

# Test_data output checking

In [None]:
import numpy as np

# Reload the saved predictions so the steps below can run without re-inferencing.
# NOTE(review): absolute, machine-specific path — confirm it exists where this runs.
preds = np.load('/home/kist/Desktop/JH/DACON_transformer/test_preds.npy')

In [None]:
# Turn each row of scores into a label: the position of its largest value.
pred_outputs = [np.argmax(row) for row in preds]


In [None]:
# output inference
import pandas as pd

# Eyeball check: print the topic table, the test rows and the first ten
# predicted labels side by side.
print(pd.read_csv('./dataset/topic_dict.csv'))
print(pd.read_csv('./dataset/test_data.csv'))
print(pred_outputs[:10])
# The string below is a note-to-self (roughly: "looks about right; check it in
# detail separately"); it has no effect at runtime.
"""
얼추 잘 맞는것같기도 하자. 따로 자세히 보자.
"""

In [None]:
import pandas as pd

# Build the submission: keep the 'index' column of the sample file and attach
# the predicted labels as 'topic_idx'.
sample_sub = pd.read_csv('./dataset/sample_submission.csv')
real_sub = pd.DataFrame(sample_sub['index']).assign(topic_idx=pred_outputs)

# The string below is a note-to-self asking whether shuffle=False on the test
# loader keeps the predictions in file order; it has no effect at runtime.
"""
Q. 혹시 test dataset에서 'shuffle=False' 하면 순서대로 반영되는게 맞는건가?
"""
real_sub.to_csv('first_submission_TransformerEncModel.csv', index=False)