https://dacon.io/competitions/official/235747/codeshare/3054?page=1&dtype=recent

In [1]:
import pandas as pd
import os 

# Preprocessing

In [3]:
# Directory that holds train_data.csv, test_data.csv and topic_dict.csv.
PATH = './dataset/'

In [4]:
def _read_utf8_csv(filename):
    """Read `filename` from the PATH directory as a UTF-8 encoded CSV."""
    return pd.read_csv(os.path.join(PATH, filename), encoding='utf-8')


# Train / test splits.
train_data = _read_utf8_csv('train_data.csv')
test_data = _read_utf8_csv('test_data.csv')

# Topic lookup table.
topic_dict = _read_utf8_csv('topic_dict.csv')

In [5]:
# Discard the 'index' column from both splits.
train_data = train_data.drop(columns=['index'])
test_data = test_data.drop(columns=['index'])

In [6]:
# Display the training DataFrame.
train_data

Unnamed: 0,title,topic_idx
0,인천→핀란드 항공기 결항…휴가철 여행객 분통,4
1,실리콘밸리 넘어서겠다…구글 15조원 들여 美전역 거점화,4
2,이란 외무 긴장완화 해결책은 미국이 경제전쟁 멈추는 것,4
3,NYT 클린턴 측근韓기업 특수관계 조명…공과 사 맞물려종합,4
4,시진핑 트럼프에 중미 무역협상 조속 타결 희망,4
...,...,...
45649,KB금융 미국 IB 스티펠과 제휴…선진국 시장 공략,1
45650,1보 서울시교육청 신종코로나 확산에 개학 연기·휴업 검토,2
45651,게시판 키움증권 2020 키움 영웅전 실전투자대회,1
45652,답변하는 배기동 국립중앙박물관장,2


# Make Dataset, DataLoader

In [49]:
# from eunjeon import Mecab
# from kobert_tokenizer import KoBERTTokenizer
from transformers import AutoTokenizer
import tqdm as tqdm
import numpy as np

import torch

In [24]:
# tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
# Load the tokenizer registered under the "klue/roberta-large" checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-large")

In [43]:
# Length of the longest encoded training title. `encode` is called with its
# defaults, so whatever extra tokens it adds are counted. 0 if there are no titles.
MAX_LEN = max(
    (len(tokenizer.encode(title)) for title in train_data['title']),
    default=0,
)

MAX_LEN

29

In [32]:
# Peek at the raw title stored under index label 0.
train_data['title'][0]

'인천→핀란드 항공기 결항…휴가철 여행객 분통'

In [45]:
# Split the data into tokens and labels and return them as NumPy arrays.
def convert_data(tokenizer, data_df, case):
    """Tokenize every title in `data_df` into a fixed-width id matrix.

    Titles are encoded *without* special tokens and padded / truncated to
    ``MAX_LEN - 2`` ids, which is the same row width the previous
    ``encode(..., max_length=MAX_LEN, padding='max_length')[1:-1]`` produced.
    The previous form sliced after padding, so it removed a trailing pad id
    instead of the closing special token and left that token inside every
    row shorter than ``MAX_LEN``.

    Args:
        tokenizer: Object whose ``encode`` accepts ``add_special_tokens``,
            ``max_length``, ``padding`` and ``truncation`` keyword arguments.
        data_df: DataFrame with a 'title' column, plus a 'topic_idx' column
            when ``case == 'train'``.
        case: ``'train'`` to also return the labels; any other value returns
            the inputs only.

    Returns:
        ``([tokens, segments], targets)`` when ``case == 'train'``; otherwise
        the one-element tuple ``([tokens, segments],)``. ``segments`` is
        always an empty list (segment ids are not built here).
    """
    # The notebook's `import tqdm as tqdm` binds the *module*, which is not
    # callable; import the progress-bar class locally instead.
    from tqdm import tqdm

    # Assumes MAX_LEN counts the two special tokens `encode` adds by default,
    # leaving MAX_LEN - 2 positions for the actual title tokens.
    content_len = MAX_LEN - 2

    # Iterate over the column itself so rows are taken in positional order
    # regardless of the DataFrame's index labels.
    tokens = np.array([
        tokenizer.encode(
            title,
            add_special_tokens=False,
            max_length=content_len,
            padding='max_length',
            truncation=True,
        )
        for title in tqdm(data_df['title'])
    ])
    segments = []

    if case == 'train':
        targets = np.array(list(data_df['topic_idx']))

        return [tokens, segments], targets

    # The trailing comma (a one-element tuple) is kept on purpose so existing
    # callers that unpack or index the result keep working.
    return [tokens, segments],
    

In [50]:
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    """Map-style dataset that serves tokenized titles read from a CSV file.

    Args:
        csv_path: Path of a CSV file with a 'title' column and, for labelled
            splits, a 'topic_idx' column.
        tokenizer: Object with ``encode(text, add_special_tokens=False)``
            returning a list of ids. ``pad_token_id`` is read only when
            ``max_len`` is given.
        max_len: Optional fixed item length. When set, every item is
            truncated / right-padded with ``tokenizer.pad_token_id`` to
            exactly ``max_len`` ids. When ``None`` (default) items keep their
            natural length, so batching them needs a padding ``collate_fn``.
    """

    def __init__(self, csv_path, tokenizer, max_len=None):
        self.data = pd.read_csv(csv_path)
        self.tokenizer = tokenizer
        self.max_len = max_len
        # A split without a 'topic_idx' column is served without labels
        # instead of failing with KeyError.
        self.has_labels = 'topic_idx' in self.data.columns

    def __len__(self):
        """Number of rows in the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(context, label)``, or just ``context`` for unlabelled data.

        ``context`` is a 1-D ``torch.long`` tensor of token ids without
        special tokens; ``label`` is a 0-D ``torch.long`` tensor.
        """
        title = self.data['title'].iloc[idx]
        # Use the tokenizer handed to this dataset, not a notebook global.
        token_ids = list(self.tokenizer.encode(title, add_special_tokens=False))

        if self.max_len is not None:
            token_ids = token_ids[:self.max_len]
            missing = self.max_len - len(token_ids)
            token_ids = token_ids + [self.tokenizer.pad_token_id] * missing

        # torch.tensor copies the *values*; torch.Tensor(int) would instead
        # allocate an uninitialised tensor of that size.
        context = torch.tensor(token_ids, dtype=torch.long)
        if not self.has_labels:
            return context

        label = torch.tensor(int(self.data['topic_idx'].iloc[idx]), dtype=torch.long)
        return context, label

In [47]:
BATCH_SIZE = 64

# get tokenizer
tokenizer = AutoTokenizer.from_pretrained("klue/roberta-large")


def pad_collate(batch):
    """Pad the variable-length title tensors of one batch to a common length.

    Items may be ``(context, label)`` pairs or bare ``context`` tensors.
    Contexts are right-padded with ``tokenizer.pad_token_id``; the default
    collate function cannot stack tensors of different lengths.
    """
    from torch.nn.utils.rnn import pad_sequence

    labelled = isinstance(batch[0], (tuple, list))
    contexts = [item[0] for item in batch] if labelled else list(batch)
    padded = pad_sequence(contexts, batch_first=True, padding_value=tokenizer.pad_token_id)

    if not labelled:
        return padded
    labels = torch.stack([torch.as_tensor(item[1]) for item in batch])
    return padded, labels


# make dataset
train_dataset = CustomDataset(os.path.join(PATH, 'train_data.csv'), tokenizer=tokenizer)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad_collate)

test_dataset = CustomDataset(os.path.join(PATH, 'test_data.csv'), tokenizer=tokenizer)
# Keep test rows in file order so predictions line up with the CSV.
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=pad_collate)

# Build Model

Transformer의 encoder n개를 통과시킨 뒤 Affine Layer를 지나는 model이다.

- d_model = 512
- d_word_vec = 512
- n_layers = 6
- n_head = 8
- d_inner = 2048
- dropout = 0.1
- n_position = 50
- pad_idx = 1 (`tokenizer.vocab['[PAD]']`)

In [79]:
# Look up the vocabulary id of the '[PAD]' token (presumably the model's pad_idx — confirm).
tokenizer.vocab['[PAD]']

1

In [80]:
def get_pad_mask(seq, pad_idx):
    """Mark the positions of `seq` that are not padding.

    Args:
        seq: Tensor of token ids (must support ``.unsqueeze``).
        pad_idx: Id that denotes padding.

    Returns:
        The element-wise ``seq != pad_idx`` result with an extra singleton
        dimension inserted at position -2.
    """
    is_real_token = seq != pad_idx
    expanded = is_real_token.unsqueeze(-2)
    return expanded

In [None]:
from torch import nn
from model.Models import Encoder

class Naroho(nn.Module):
    
    def __init__(self, n_vocab, pad_idx, d_word_vec=512, d_model=512, d_inner=2048,
                 n_layers=6, n_head=8, d_k=64, dropout=0.1, n_position=50):
        super().__init__()
        self.encoder = Encoder(n_src_vocab=n_vocab, d_word_vec=d_word_vec, n_layers=n_layers, 
                               n_head=n_head, d_k=d_k, d_v=d_k, d_model=d_model, pad_idx=pad_idx,
                               dropout=dropout, n_position=n_position, scale_emb=False)
        self.linear = nn.Linear(d_model,n_vocab,bias=False)
        
        self.pad_idx = pad_idx
    
    def forward(self, context):

        mask = get_pad_mask(context, self.pad_idx)
        
        enc_output, *_ = self.encoder(context, mask)
        seq_logit = self.linear(enc_output)