In [1]:
import torch
from konlpy.tag import Mecab
from collections import defaultdict
import pickle
import pandas as pd
import seaborn as sns

# Create the Mecab tagger once; the analysis cells further down all call
# `mecab.pos` on this same instance.
mecab = Mecab()
# Smoke test: tag one unspaced sample sentence and print the result.
# Later cells read the POS tag of each returned item as `morph[1]`.
morphs = mecab.pos("아버지가방에들어가신다.", join=False)
print(morphs)

In [2]:
class RE_Dataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs per-field encodings with their labels.

    Attributes:
        tokenized_dataset: Mapping of field name -> indexable collection with
            one entry per sample.
        labels: Indexable collection of labels; its length defines the
            dataset size.
    """

    def __init__(self, tokenized_dataset, labels):
        self.labels = labels
        self.tokenized_dataset = tokenized_dataset

    def __len__(self):
        """Return the number of samples, i.e. the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, label under 'labels'."""
        sample = {}
        for field, column in self.tokenized_dataset.items():
            sample[field] = torch.tensor(column[idx])
        # Written last, so it replaces any 'labels' field from the encodings.
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

In [3]:
def preprocessing_dataset(dataset, label_type):
    """Turn the raw, integer-column frame into a frame with named columns.

    Args:
        dataset: Frame indexed by integer column labels. Column 1 is used as
            the sentence, 2/3/4 as the first entity and its start/end
            offsets, 5/6/7 as the second entity and its start/end offsets,
            and 8 as the relation name.
        label_type: Lookup from relation name to numeric class id.

    Returns:
        pandas.DataFrame with the columns ``sentence``, ``entity_01``,
        ``entity_02``, ``label``, ``entity_01_start``, ``entity_01_end``,
        ``entity_02_start`` and ``entity_02_end`` (in that order).
    """
    # Rows tagged 'blind' get the placeholder id 100 instead of a lookup.
    encoded_labels = [
        100 if relation == 'blind' else label_type[relation]
        for relation in dataset[8]
    ]
    named_columns = {
        'sentence': dataset[1],
        'entity_01': dataset[2],
        'entity_02': dataset[5],
        'label': encoded_labels,
        'entity_01_start': dataset[3],
        'entity_01_end': dataset[4],
        'entity_02_start': dataset[6],
        'entity_02_end': dataset[7],
    }
    return pd.DataFrame(named_columns)

In [4]:
def load_data(dataset_dir, label_type_path='/opt/ml/input/data/label_type.pkl'):
    """Read a tab-separated dataset file and return the preprocessed frame.

    Args:
        dataset_dir: Path of the header-less TSV file to read.
        label_type_path: Path of the pickled relation-name -> class-id
            lookup. Defaults to the location this notebook has always used,
            so existing single-argument calls behave as before.

    Returns:
        The DataFrame produced by ``preprocessing_dataset``.
    """
    # Load the label lookup.
    # NOTE: pickle.load can execute arbitrary code; only point
    # `label_type_path` at files you trust.
    with open(label_type_path, 'rb') as f:
        label_type = pickle.load(f)
    # Load the raw dataset; header=None gives integer column labels.
    dataset = pd.read_csv(dataset_dir, delimiter='\t', header=None)
    # Preprocess into named columns with numeric labels.
    dataset = preprocessing_dataset(dataset, label_type)

    return dataset

In [5]:
def tokenized_dataset(dataset, tokenizer, ent_token = False):
    """Tokenize every row as an (entity pair, sentence) segment pair.

    Args:
        dataset: Column-indexable container providing ``entity_01``,
            ``entity_02``, ``entity_01_start``, ``entity_01_end``,
            ``entity_02_start``, ``entity_02_end`` and ``sentence``.
        tokenizer: Callable invoked as ``tokenizer(first, second, **options)``.
        ent_token: When truthy, both entity spans of each sentence are
            wrapped in ``[ENT]`` ... ``[/ENT]`` before tokenizing. End
            offsets are treated as inclusive.

    Returns:
        Whatever ``tokenizer`` returns for the two segment lists.
    """
    column_names = (
        'entity_01',
        'entity_02',
        'entity_01_start',
        'entity_01_end',
        'entity_02_start',
        'entity_02_end',
        'sentence',
    )
    entity_pairs = []
    marked_sentences = []

    for subj, obj, s1, e1, s2, e2, text in zip(*(dataset[name] for name in column_names)):
        entity_pairs.append(subj + '[SEP]' + obj)

        if not ent_token:
            continue

        # Order the two spans by start offset so that the markers are
        # inserted left to right.
        if s1 < s2:
            spans = ((s1, e1), (s2, e2))
        elif s1 >= s2:
            spans = ((s2, e2), (s1, e1))
        else:
            # Offsets that compare neither way leave the sentence untouched.
            spans = None

        if spans is not None:
            (left_start, left_end), (right_start, right_end) = spans
            text = ''.join([
                text[:left_start],
                '[ENT]',
                text[left_start:left_end + 1],
                '[/ENT]',
                text[left_end + 1:right_start],
                '[ENT]',
                text[right_start:right_end + 1],
                '[/ENT]',
                text[right_end + 1:],
            ])
        marked_sentences.append(text)

    # With the option on, use the marked sentences; otherwise the raw ones.
    if ent_token:
        second_segment = marked_sentences
    else:
        second_segment = list(dataset['sentence'])

    return tokenizer(
        entity_pairs,
        second_segment,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=100,
        add_special_tokens=True,
    )

In [8]:
# Load the training TSV through `load_data` and keep the sentence column as a
# plain Python list for the morphological analysis below.
df = load_data("/opt/ml/input/data/train/train_renew.tsv")
sentences = df['sentence'].tolist()

In [9]:
# Count how many times each POS tag occurs over all training sentences.
part_list = defaultdict(int)

for sentence in sentences:
    # Each analysed item is a (surface, tag) pair; only the tag is counted.
    for _surface, tag in mecab.pos(sentence, join=False):
        part_list[tag] += 1

In [2]:
# Show the POS tag -> occurrence count mapping.
part_list

In [11]:
# Two-column frame (tag, count) used as the plotting input.
# NOTE(review): this rebinds `df`, which earlier held the dataset returned by
# `load_data`; a distinct name would avoid the shadowing.
df = pd.DataFrame(data = {"part" : part_list.keys(), "count" : part_list.values()}, columns = ["part", "count"] )

In [3]:
import matplotlib.pyplot as plt

# Bar chart of the per-tag counts on an explicitly created, wide axes.
fig, ax = plt.subplots(figsize=(30, 10))
sns.barplot(x="part", y="count", data=df, ax=ax)
# The x ticks (and with them the tag names) are removed from the axis.
ax.set_xticks([])
plt.show()

In [8]:
# Unique (surface, tag) items found in the training sentences, one set per
# tag family.
postpositions = set()
conjunctions = set()

# Postposition (particle) tags.
postposition_tag = ['JKS', 'JKC', 'JKG', 'JKO', 'JKB', 'JKV', 'JKQ', 'JX', 'JC']
# Conjunction tags.
# NOTE(review): the mecab-ko-dic tag table lists MAG as the general-adverb tag
# and MAJ as the conjunctive-adverb tag — confirm MAG is what is wanted here.
conjunction_tag = ['MAG']

for sentence in sentences:
    morphs = mecab.pos(sentence, join=False)
    # An item is kept whole (surface and tag) when its tag is in the list.
    postpositions.update(item for item in morphs if item[1] in postposition_tag)
    conjunctions.update(item for item in morphs if item[1] in conjunction_tag)

print(len(postpositions), len(conjunctions), sep='\n')

In [4]:
# Preview up to ten collected items from each set (set order is arbitrary).
print(list(postpositions)[:10])
print(list(conjunctions)[:10])

In [16]:
# Entity에 불용어가 있는지 확인
df = load_data("/opt/ml/input/data/train/train_renew.tsv")
entity1_list = df['entity_01'].tolist()
entity2_list = df['entity_02'].tolist()

In [7]:
# Collect the unique (surface, tag) items that occur inside the entity
# strings, using the tag lists defined for the sentence-level analysis.
postpositions = set()
conjunctions = set()

# Scan BOTH entity columns. The second pass must walk entity2_list; walking
# entity1_list twice would leave the second entity column unanalysed.
for entity_list in (entity1_list, entity2_list):
    for entity in entity_list:
        morphs = mecab.pos(entity, join=False)
        for morph in morphs:
            # morph[1] is the POS tag; the whole item is stored.
            if morph[1] in postposition_tag:
                postpositions.add(morph)

            if morph[1] in conjunction_tag:
                conjunctions.add(morph)

print(len(postpositions))
print(len(conjunctions))

In [5]:
# Preview up to twenty postposition items found inside entities.
print(list(postpositions)[:20])

In [6]:
# Preview up to twenty items found inside entities whose tag is in
# `conjunction_tag`.
print(list(conjunctions)[:20])

In [20]:
# NOTE(review): this import sits mid-notebook; consider moving it to the
# import cell at the top.
from transformers import AutoTokenizer
# Load the tokenizer published under the "kykim/bert-kor-base" identifier.
tokenizer = AutoTokenizer.from_pretrained("kykim/bert-kor-base")

# Vocabulary of that tokenizer, as returned by `get_vocab()`.
vocab = tokenizer.get_vocab()