In [26]:
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM

In [20]:
# Hugging Face Hub identifier of the checkpoint; the tokenizer, config and
# model cells below all load from this same name.
model_name = 'skt/ko-gpt-trinity-1.2B-v0.5'

In [21]:
# Load the tokenizer associated with `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [23]:
# Load the configuration associated with `model_name`; it is passed to the
# model loader in the next cell.
model_config = AutoConfig.from_pretrained(model_name)
# NOTE(review): disabled override that would set `num_labels` to 30 on the
# config. Presumably left over from a classification setup - confirm before
# re-enabling or deleting.
#model_config.num_labels = 30

In [27]:
# Instantiate the causal-LM from `model_name`, using the config loaded above.
model = AutoModelForCausalLM.from_pretrained(model_name, config=model_config)

In [28]:
import ast
import pickle

import torch

import pandas as pd
import numpy as np

In [29]:
# NOTE: despite its name, `data_dir` holds the path of the training CSV file
# itself, not a directory.
data_dir = '../dataset/train/train.csv'
# Raw training frame; later cells read its 'sentence', 'object_entity' and
# 'subject_entity' columns.
data = pd.read_csv(data_dir)

In [13]:
def add_entity_tokens(sentence, object_entity, subject_entity):
    """Wrap the object and subject entity spans of a sentence in typed markers.

    The object span becomes ``#+<type>+<text>#`` and the subject span becomes
    ``@^<type>^<text>@``, where ``<type>`` is the Korean label mapped from the
    entity's type code and ``<text>`` is ``sentence[start_idx:end_idx + 1]``.

    Args:
        sentence: Original sentence text.
        object_entity: String containing a Python dict literal with at least
            the keys ``start_idx``, ``end_idx`` (inclusive) and ``type``.
        subject_entity: Same format as ``object_entity``, for the subject.

    Returns:
        The sentence with both spans wrapped in their markers.

    Raises:
        KeyError: If a required key is missing or the type code is not one of
            PER, ORG, LOC, POH, DAT, NOH.
        ValueError, SyntaxError: Propagated from ``ast.literal_eval`` when an
            entity string is not a valid literal.

    Note:
        No overlap check is performed: the span with the smaller ``start_idx``
        is spliced first (the subject wins ties) and the text between the two
        spans is taken as-is.
    """
    type_labels = {'PER' : '인물', 'ORG' : '기관', 'LOC' : '지명', 'POH' : '기타', 'DAT' : '날짜', 'NOH' : '수량'}

    def parse(entity):
        # Evaluate the literal once and reuse it for all three fields
        # (the previous version re-parsed the string for every field).
        info = ast.literal_eval(entity)
        return int(info['start_idx']), int(info['end_idx']), type_labels[info['type']]

    obj_start, obj_end, obj_type = parse(object_entity)
    subj_start, subj_end, subj_type = parse(subject_entity)

    # Build each marked span once; only the splice order depends on position.
    obj_marked = '#' + '+' + obj_type + '+' + sentence[obj_start:obj_end + 1] + '#'
    subj_marked = '@' + '^' + subj_type + '^' + sentence[subj_start:subj_end + 1] + '@'

    if obj_start < subj_start:
        first_start, first_end, first_marked = obj_start, obj_end, obj_marked
        second_start, second_end, second_marked = subj_start, subj_end, subj_marked
    else:
        first_start, first_end, first_marked = subj_start, subj_end, subj_marked
        second_start, second_end, second_marked = obj_start, obj_end, obj_marked

    return (
        sentence[:first_start]
        + first_marked
        + sentence[first_end + 1:second_start]
        + second_marked
        + sentence[second_end + 1:]
    )


def ent_preprocess(data):
    """Replace every sentence with its entity-marked version.

    Applies ``add_entity_tokens`` row by row, using the row's 'sentence',
    'object_entity' and 'subject_entity' columns.

    Note:
        The 'sentence' column of the passed DataFrame is overwritten in place;
        the same DataFrame object is returned.
    """
    def mark_row(row):
        # Argument order follows add_entity_tokens: sentence, object, subject.
        return add_entity_tokens(row['sentence'], row['object_entity'], row['subject_entity'])

    data['sentence'] = data.apply(mark_row, axis=1)
    return data

# WARNING: not idempotent. ent_preprocess overwrites data['sentence'], so
# running this cell a second time inserts the markers again into the
# already-marked text, using the start/end indices of the original sentences.
# Reload `data` from the CSV before re-running.
data = ent_preprocess(data)

In [30]:
# Two sample sentences, encoded with identical settings
# (padded / truncated to 64 tokens).
encode_kwargs = dict(max_length=64, padding='max_length', truncation=True)

ex_sentence = data['sentence'][41]
ex_sentence2 = data['sentence'][20]

ex_encoding = tokenizer(ex_sentence, **encode_kwargs)
ex_encoding2 = tokenizer(ex_sentence2, **encode_kwargs)

In [31]:
# Show the first example three ways: decoded token ids, the raw ids,
# and the sentence that was fed to the tokenizer.
ex_input_ids = ex_encoding['input_ids']
print(tokenizer.decode(ex_input_ids))
print(ex_input_ids)
print(ex_sentence)

이 캐릭터가 작가 스탠 리와 만화가 잭 커비가 만든 "" 에서 캡틴 아메리카의 전쟁 시절 애정 상대로 처음으로 등장하였을 당시에는 이름이 없었다.<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
[29976, 40658, 32603, 37329, 30284, 25512, 30048, 31625, 36523, 30582, 32420, 31210, 30013, 378, 32162, 49671, 40738, 25792, 31110, 31487, 49112, 33175, 32770, 30672, 46009, 38371, 33736, 31545, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
이 캐릭터가 작가 스탠 리와 만화가 잭 커비가 만든 "" 에서 캡틴 아메리카의 전쟁 시절 애정 상대로 처음으로 등장하였을 당시에는 이름이 없었다.


In [32]:
# Show the second example: decoded token ids, then the raw ids.
ex_input_ids2 = ex_encoding2['input_ids']
print(tokenizer.decode(ex_input_ids2))
print(ex_input_ids2)

1971년 대선을 앞두고 김종필은 1971년 선거에서 박정희 당선을 위해 무려 600억원이나 썼다고 밝혔다.<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
[46252, 29983, 31097, 34123, 34828, 29356, 25768, 46252, 41040, 39789, 30087, 31097, 30234, 34519, 40390, 31523, 30195, 33251, 30062, 32733, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]


In [18]:
# Tokenize a space-separated list of candidate label words to inspect how the
# tokenizer splits them (padded / truncated to 20 tokens).
# NOTE: this reuses the name `ex_encoding`, replacing any encoding previously
# stored under it.
stc = "인물 지명 기관 기타 수량 날짜 나머지 단체 사람"
ex_encoding = tokenizer(stc,
                max_length=20,
                padding='max_length',
                truncation=True)
print(ex_encoding['input_ids'])
# A single print: the nested print(print(...)) also emitted a spurious "None",
# because print() returns None.
print(tokenizer.decode(ex_encoding['input_ids']))

[31169, 36514, 32043, 32112, 29985, 21809, 39231, 32960, 32964, 30086, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
인물 지명 기관 기타 수량 날짜 나머지 단체 사람<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>
None
