1. Few-shot 학습을 위한 개체명 리스트 생성

2. GPT-3를 사용해 개체명 리스트 확장

3. GPT-3를 사용하여 확장된 개체명 인식 데이터셋 생성

4. NER 모델 학습

---
# Install & load
----

In [1]:
!pip install openai
!pip install transformers
!pip install seqeval

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openai
  Downloading openai-0.27.2-py3-none-any.whl (70 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.1/70.1 KB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
Collecting multidict<7.0,>=4.5
  Downloading multidict-6.0.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (114 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.2/114.2 KB[0m [31m14.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting async-timeout<5.0,>=4.0.0a3
  Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)
Collecting frozenlist>=1.1.1
  Downloading frozenlist-1.3.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x

[참고블로그](https://medium.com/@yongsun.yoon/%EB%8D%B0%EC%9D%B4%ED%84%B0-%EC%97%86%EC%9D%B4-ner-%EB%AA%A8%EB%8D%B8-%ED%95%99%EC%8A%B5%ED%95%98%EA%B8%B0-90c4c24953a)

In [2]:
import os
import openai


key_path = '/content/drive/MyDrive/2.Study/NER/OpenAI_Key.txt'

with open(key_path, 'r') as f:
  value = f.read()
  
# OpenAI Key 등록 [노출 조심]
openai.api_key = value

In [3]:
##### pytorch #####
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset


##### 시각화 #####
from PIL import Image
import matplotlib.pyplot as plt
import seaborn as sns 

##### 기본 모듈 #####
import pandas as pd
import numpy as np
import os
import random
import json
import math
import easydict
from pprint import pprint
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import re 
import time

##### 디버깅 #####
import pdb

##### cuda #####
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') # GPU 할당

##### 경고무시 #####
import warnings
warnings.filterwarnings(action='ignore')

from transformers import AutoTokenizer, AutoModelForTokenClassification

---
# 개체명 리스트 작성
---

In [4]:
# 예시를 위한 엔티티 생성

real_entities = [
    {
        'class_name': 'hotel name',
        'entity_names': [
            'Ritz-Carlton Hotel',
            'Marriott',
            'The Luxury Collection Hotels & Resorts',
            'St Regis Hotels',
            'Hyatt'
        ]
    },

    {
        'class_name': 'room type',
        'entity_names': [
            'Single room',
            'twin room',
            'Double room',
            'deluxe room',
            'Suites',
        ]    
    },
    {
        'class_name': 'person name',
        'entity_names': [
            'Yongsun Yoon',
            'Steve Adams',
            'Donnie K. Schneider',
            'Eleanor Lockhart',
            'Jacqueline R. French'
        ]
    },
    {
        'class_name': 'date',
        'entity_names': [
            '3/4/2022',
            'November 27th',
            'December 15, 2023',
            'Feb. 8',
            'Saturday, Jul 22'
        ]
    },
    {
        'class_name': 'hotel supplies',
        'entity_names': [
            'shampoo',
            'Coffee kit',
            'towels',
            'Wine glass',
            'fan'
        ]
    }
]

---
# GPT-3를 사용해 개체명 리스트 확장 
---

In [5]:
# 모델에서 텍스트 생성 
def generate(prompts, model='text-davinci-003', n=1, max_tokens=512):
    response = openai.Completion.create(
        model = model,
        prompt = prompts, # 입력으로 사용될 문장을 지정
        echo = False, # 입력으로 받은 문장을 다시 출력할지 여부
        n = n, # 생성할 문장 개수
        max_tokens = max_tokens, # 모델이 생성할 최대 토큰 수 
        # stop = '\n'
    )
    
    # response.choices는 모델이 생성한 출력 결과를 담고 있는 리스트
    texts = [c.text.strip() for c in response.choices]
    return texts

# 새로운 엔티티 이름을 생성하는 모델에 입력할 프롬프트 문장 생성
def construct_entity_prompt(class_name, entity_names, k=10):
    prompt = f'These are <{class_name}> entity names. Generate {k} new <{class_name}> entity names.\n\n'
    prompt += 'Entity names:\n'
    for e in entity_names:
        prompt += f'- {e}\n'
    prompt += '\nGenerated names:\n-'
    return prompt


# 모델이 생성한 엔티티 이름 목록을 정제하는 함수
def postprocess_entities(synthetic_entities):
    processed = []
    for ents in synthetic_entities:
        ents = f'- {ents}'.split('\n')
        ents = [e.split('-')[1].strip() for e in ents]
        processed += ents
    return processed    

In [6]:
synthetic_entities = []
for real_ent in tqdm(real_entities):
    class_name, entity_names = real_ent['class_name'], real_ent['entity_names']
    # GPT 모델에 입력할 prompt 문장 생성
    prompt = construct_entity_prompt(class_name, entity_names)
    # 새로운 엔티티 생성 
    syn_entities = generate(prompt, n=10)
    # 생성된 새로운 엔티티 정제
    syn_entities = postprocess_entities(syn_entities)
    # 중복된 엔티티 제거
    syn_entities = list(set(syn_entities))

    synthetic_entities.append({'class_name': class_name, 'entity_names': syn_entities})

  0%|          | 0/5 [00:00<?, ?it/s]

In [7]:
prompt

'These are <hotel supplies> entity names. Generate 10 new <hotel supplies> entity names.\n\nEntity names:\n- shampoo\n- Coffee kit\n- towels\n- Wine glass\n- fan\n\nGenerated names:\n-'

In [8]:
synthetic_entities[0]

{'class_name': 'hotel name',
 'entity_names': ['Starlight Spectacular Resort',
  'Crystal Palace Inn',
  'Urban Oceanside Resort',
  'Castle Inn & Suites',
  'Emerit Hotels & Resorts',
  'Comfort Harbor Inn',
  'Four Seasons Executive Galleria',
  'Tower Royal Hotel',
  'The Crown Hotel',
  'Round Table Resorts',
  'InterContinental Resort',
  'Cobalt Hotel',
  'Lords Palace Suites',
  'Regal Sky Lodging',
  'Boulevard Plaza Hotel',
  'Splendid Expanse Hotel',
  'The Silver Spur Inn',
  'Seaport Village Inn',
  'Regal Splendor Hotels & Resorts',
  'The Fairmont Inn',
  'InterContinental Hotels & Resorts',
  'Luxury Palace Haven',
  'The Presidential Retreats',
  'Westin Hotels & Resorts',
  'Waterfront Getaway Inn',
  'The Regent Suites & Spa',
  'Residence Inn & Suites',
  'Elite Summit Inn Hotels',
  'Platinum Towers',
  'Smorgasbord Place Mansion',
  'StayVille Hotels',
  'Grand Oasis Inn & Suites',
  'Grand Hyatt',
  'W Hotels',
  'Elite Coronet Suites',
  'Vibrant Palaces Hotel',


In [9]:
# 직접 작성한 엔티티와 모델 생성 엔티티 합치기
all_entities = []
for real, synthetic in zip(real_entities, synthetic_entities):
    all_entities.append({
        'class_name': real['class_name'],
        'entity_names': list(set(real['entity_names'] + synthetic['entity_names']))
    })

---
# GPT-3를 사용하여 확장된 개체명 인식 데이터셋 생성
----

In [10]:
# 엔티티 리스트에서 랜덤으로 엔티티를 선택하여 반환하는 함수
def sample_entities(all_entities, min_k=1, max_k=3):
    k = np.random.randint(min_k, max_k+1)
    idxs = np.random.choice(range(len(all_entities)), size=k, replace=False)

    entities = []
    for i in idxs:
        ents = all_entities[i]
        name = np.random.choice(ents['entity_names'])
        entities.append({'class_name': ents['class_name'], 'entity_name': name})
    
    return entities

# 선택된 엔티티와 문장을 GPT 모델의 입력으로 사용될 prompt 문장 생성
def construct_sentence_prompt(entities, style='dialog'):
    prompt = f'Generate a {style} sentence including following entities.\n\n'

    entities_string = ', '.join([f"{e['entity_name']}({e['class_name']})" for e in entities])
    prompt += f'Entities: {entities_string}\n'
    prompt += 'Sentence:'
    return prompt    

# 생성된 문장과 엔티티 목록을 통해 각 토큰에 대한 레이블 생성
def construct_labels(generated, entities, class2idx):

    # 생성된 문장의 길이만큼 outside 레이블로 구성된 리스트 labels을 생성
    labels = [class2idx['outside']] * len(generated)
    for ent in entities:
        l = class2idx[ent['class_name']]
        # 생성된 문장에서 현재 엔티티의 이름이 등장하는 모든 위치를 찾음
        for span in re.finditer(ent['entity_name'].lower(), generated.lower()):
            # 찾은 위치 값
            s, e = span.start(), span.end()
            labels[s] = l
            # 찾은 위치의 해당하는 토큰들의 레이블을 현재 엔티티의 클래스 레이블로 설정
            # 만약 l = 3, s = 10, e = 14일 경우
            # labels[10] = 3, labels[11:14] = [4,4,4]
            labels[s+1:e] = [l+1] * (e-s-1)
    return labels    

In [11]:
class2idx = {e['class_name']: i*2 for i, e in enumerate(all_entities)}
class2idx['outside'] = len(class2idx) * 2
'''index가 2씩 증가하는 이유는 
   label을 만들 때, 엔티티 클래스를 표시하는 숫자와 
   엔티티의 시작점을 표시하는 숫자 사이에 구분을 두기 위함'''

data = []
for _ in tqdm(range(100)):
    batch_entities = [sample_entities(all_entities) for _ in range(10)]
    batch_prompts = [construct_sentence_prompt(ents) for ents in batch_entities]
    batch_generated = generate(batch_prompts, model='text-davinci-002')

    for generated, entities in zip(batch_generated, batch_entities):
        labels = construct_labels(generated, entities, class2idx)
        data.append({'text': generated, 'labels': labels})

    time.sleep(10)


  0%|          | 0/100 [00:00<?, ?it/s]

---
# NER 모델 학습
----

---
### Dataset & DataLoader
---

In [13]:
LABELS = ['B-HT', 'I-HT', 'B-RT', 'I-RT', 'B-PS', 'I-PS', 'B-DT', 'I-DT', 'B-SP', 'I-SP', 'O']


def pad_sequences(seqs, pad_val, max_length):     
    _max_length = max([len(s) for s in seqs])
    max_length = min(max_length, _max_length)
    
    padded_seqs = []
    for seq in seqs:
        seq = seq[:max_length]
        pads = [pad_val] * (max_length - len(seq))
        seq = seq + pads
        padded_seqs.append(seq)

    return padded_seqs


class Dataset(torch.utils.data.Dataset):
    def __init__(self, data, tokenizer, max_length, split='train'):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.split = split 

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        text = item['text']
        char_labels = item['labels']

        inputs = self.tokenizer(text)
        input_ids = inputs.input_ids
        attention_mask = inputs.attention_mask

        labels = []
        for i in range(len(input_ids)):
            # i가 원래 문장에서 시작과 끝 문자열의 인덱스 값 반환
            span = inputs.token_to_chars(i)
            if span is None:
                labels.append(len(LABELS)-1) # O = [PAD], [UNK]
            else:
                labels.append(char_labels[span.start])
        
        return input_ids, attention_mask, labels


    def collate_fn(self, batch):
        input_ids, attention_mask, labels = zip(*batch)
        input_ids = pad_sequences(input_ids, self.tokenizer.pad_token_id, self.max_length)
        attention_mask = pad_sequences(attention_mask, 0, self.max_length)
        labels = pad_sequences(labels, -100, self.max_length)

        return torch.tensor(input_ids), torch.tensor(attention_mask), torch.tensor(labels)

In [14]:
# 토크나이저 로드
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

# 데이터셋 분리
rand_idxs = np.random.permutation(range(len(data)))
train_idxs = rand_idxs[100:]
valid_idxs = rand_idxs[:100]

train_data = [data[i] for i in train_idxs]
valid_data = [data[i] for i in valid_idxs]

# 데이터 로드
train_dataset = Dataset(train_data, tokenizer, 256)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=train_dataset.collate_fn)

valid_dataset = Dataset(valid_data, tokenizer, 256)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=16, shuffle=False, collate_fn=valid_dataset.collate_fn)

Downloading (…)lve/main/config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [49]:
next(iter(valid_loader))

(tensor([[    0,  4148,   719,   820,  1187,     6,    38,    40,   213,     7,
              5,   537,     4,     2,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1],
         [    0,  2522,  4187,   929,    16,  1969,    13,   110,   220,  8728,
            120,  4384,     4,     2,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1],
         [    0,   133,  3459, 37377,    16,    66,     9,   388,     4,     2,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
              1,     1,     1],
         [    0,   113, 13755,    47,   686,    47,   236, 24503, 37928,  2781,
              7,   110,   929,    15,   644,   195,     6, 15294,  1917,     2,
              1,     1, 

---
# Train
---

In [21]:
def train(model, loader, device, outside_weight=0.9):
    model.train()

    label_weight = torch.ones(model.num_labels)
    label_weight[-1] = outside_weight
    label_weight = label_weight.to(device)

    pbar = tqdm(loader)
    for batch in pbar:
        batch = [b.to(device) for b in batch]
        input_ids, attention_mask, labels = batch
        
        outputs = model(input_ids, attention_mask)
        logits = outputs.logits
        logits = logits.view(-1, model.num_labels)
        labels = labels.view(-1)

        loss = F.cross_entropy(logits, labels, weight=label_weight)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        pbar.set_postfix({'loss': loss.item()})

def predict(model, loader, device):
    model.eval()

    total_preds, total_labels = [], []
    for batch in tqdm(loader):
        batch = [b.to(device) for b in batch]
        input_ids, attention_mask, labels = batch
        with torch.no_grad():
            outputs = model(input_ids, attention_mask, labels=labels)
        
        preds = outputs.logits.argmax(dim=-1)
        total_preds += preds.cpu().tolist()
        total_labels += labels.cpu().tolist()

    return total_preds, total_labels      

def evaluate(model, loader, device):
    preds, labels = predict(model, loader, device)
    preds, labels = remove_padding(preds, labels)
    entity_f1 = entity_f1_func(preds, labels)
    char_f1 = char_f1_func(preds, labels)
    return entity_f1, char_f1

In [22]:
from sklearn.metrics import f1_score
from seqeval.metrics import f1_score as ner_f1_score
from seqeval.scheme import IOB2
import itertools

# 패딩 레이블(-100)을 가진 데이터 제거
def remove_padding(preds, labels):
    removed_preds, removed_labels = [], []
    for p, l in zip(preds, labels):
        if -100 not in l: continue

        idx = l.index(-100)
        removed_preds.append(p[:idx])
        removed_labels.append(l[:idx])
    
    return removed_preds, removed_labels

#  entity-level F1 score를 계산하여 반환하는 함수
def entity_f1_func(preds, targets):
    preds = [[LABELS[p] for p in pred] for pred in preds]
    targets = [[LABELS[t] for t in target] for target in targets]
    entity_macro_f1 = ner_f1_score(targets, preds, average="macro", mode="strict", scheme=IOB2)
                      # IOB 방식에서는 처음 시작하는 토큰은 B로, 중간에 위치하는 토큰은 I로 시작
                      # BIO 방식에서는 첫 번째 토큰은 B로 시작하지 않고, 중간에 위치하는 토큰은 I로 시작
    f1 = entity_macro_f1 * 100.0
    return round(f1, 2)

# character-level F1 score를 계산하여 반환하는 함수
def char_f1_func(preds, targets):
    label_indices = list(range(len(LABELS)))
    preds = list(itertools.chain(*preds)) # chain은 두 리스트를 이어 붙여 하나의 iterator를 반환
    targets = list(itertools.chain(*targets))
    f1 = f1_score(targets, preds, labels=label_indices, average='macro', zero_division=True) * 100.0
    return round(f1, 2)

In [23]:
model_path = '/content/drive/MyDrive/2.Study/NER'

num_labels = len(LABELS)
id2label = {i:l for i,l in enumerate(LABELS)}
label2id = {l:i for i,l in enumerate(LABELS)}

model = AutoModelForTokenClassification.from_pretrained('roberta-base', num_labels=num_labels, id2label=id2label, label2id=label2id)
_ = model.train().to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)

best_score = 0.0
for ep in range(10):
    train(model, train_loader, device)
    entity_f1, char_f1 = evaluate(model, valid_loader, device)
    print(f'ep: {ep:02d} | entity f1: {entity_f1:.2f} | char f1: {char_f1:.2f}')

    if entity_f1 > best_score:
        model.save_pretrained(model_path)
        tokenizer.save_pretrained(model_path)
        best_score = entity_f1
        print(best_score)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able

  0%|          | 0/57 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

ep: 00 | entity f1: 92.71 | char f1: 94.51


  0%|          | 0/57 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

ep: 01 | entity f1: 93.20 | char f1: 94.59


  0%|          | 0/57 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

ep: 02 | entity f1: 94.40 | char f1: 96.27


  0%|          | 0/57 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

ep: 03 | entity f1: 93.83 | char f1: 96.24


  0%|          | 0/57 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

ep: 04 | entity f1: 94.69 | char f1: 96.38


  0%|          | 0/57 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

ep: 05 | entity f1: 95.32 | char f1: 96.68


  0%|          | 0/57 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

ep: 06 | entity f1: 95.32 | char f1: 96.81


  0%|          | 0/57 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

ep: 07 | entity f1: 94.70 | char f1: 96.41


  0%|          | 0/57 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

ep: 08 | entity f1: 94.78 | char f1: 96.42


  0%|          | 0/57 [00:00<?, ?it/s]

  0%|          | 0/7 [00:00<?, ?it/s]

ep: 09 | entity f1: 95.52 | char f1: 96.96


---
# Inference
----

In [26]:
num_labels = len(LABELS)
id2label = {i:l for i,l in enumerate(LABELS)}
label2id = {l:i for i,l in enumerate(LABELS)}

model = AutoModelForTokenClassification.from_pretrained('roberta-base', num_labels=num_labels, id2label=id2label, label2id=label2id)
tokenizer = AutoTokenizer.from_pretrained('/content/drive/MyDrive/2.Study/NER/GPT_for_NER_file')

state_dict = torch.load('/content/drive/MyDrive/2.Study/NER/GPT_for_NER_file/pytorch_model.bin')
model.load_state_dict(state_dict)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForTokenClassification: ['lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able

<All keys matched successfully>

In [94]:
def inference_fn(sentence):
  inputs = tokenizer(
      [sentence],
      max_length=256,
      padding="max_length",
      truncation=True)
  
  model.eval()
  with torch.no_grad():
    inputs_ids = torch.tensor(value['input_ids'])
    attention_mask = torch.tensor(value['attention_mask'])
    outputs = model(inputs_ids, attention_mask) 
    probs = outputs.logits[0].softmax(dim=1)
    top_probs, preds = torch.topk(probs, dim=1, k=1)
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    predicted_tags = [id2label[pred.item()] for pred in preds]
    result = []
    for token, predicted_tag, top_prob in zip(tokens, predicted_tags, top_probs):
        if token not in [tokenizer.pad_token, tokenizer.cls_token, tokenizer.sep_token]:
            token_result = {"token": token,
                            "predicted_tag": predicted_tag,
                            "top_prob": str(round(top_prob[0].item(), 4))}
            result.append(token_result)
            
  return {"sentence": sentence,"result": result}

In [97]:
text = 'Can I leave my luggage in the hotel lobby on November 2nd?'
inference_fn(text)

{'sentence': 'Can I leave my luggage in the hotel lobby on November 2nd?',
 'result': [{'token': 'Can', 'predicted_tag': 'O', 'top_prob': '0.9999'},
  {'token': 'ĠI', 'predicted_tag': 'O', 'top_prob': '0.9999'},
  {'token': 'Ġleave', 'predicted_tag': 'O', 'top_prob': '0.9999'},
  {'token': 'Ġmy', 'predicted_tag': 'O', 'top_prob': '0.9999'},
  {'token': 'Ġluggage', 'predicted_tag': 'O', 'top_prob': '0.9999'},
  {'token': 'Ġin', 'predicted_tag': 'O', 'top_prob': '0.9999'},
  {'token': 'Ġthe', 'predicted_tag': 'O', 'top_prob': '0.9999'},
  {'token': 'Ġhotel', 'predicted_tag': 'O', 'top_prob': '0.9999'},
  {'token': 'Ġlobby', 'predicted_tag': 'O', 'top_prob': '0.9996'},
  {'token': 'Ġon', 'predicted_tag': 'O', 'top_prob': '0.9999'},
  {'token': 'ĠNovember', 'predicted_tag': 'B-DT', 'top_prob': '0.999'},
  {'token': 'Ġ2', 'predicted_tag': 'I-DT', 'top_prob': '0.9987'},
  {'token': 'nd', 'predicted_tag': 'I-DT', 'top_prob': '0.9981'},
  {'token': '?', 'predicted_tag': 'O', 'top_prob': '0.999

In [102]:
text = "Hello, I'm Hong Gil dong and I decided to reserve a party room on January 5th."
inference_fn(text)

# 라벨에 없는 홍길동, 파티도 인식

{'sentence': "Hello, I'm Hong Gil dong and I decided to reserve a party room on January 5th.",
 'result': [{'token': 'Hello', 'predicted_tag': 'O', 'top_prob': '0.9999'},
  {'token': ',', 'predicted_tag': 'O', 'top_prob': '0.9999'},
  {'token': 'ĠI', 'predicted_tag': 'O', 'top_prob': '0.9999'},
  {'token': "'m", 'predicted_tag': 'O', 'top_prob': '0.9999'},
  {'token': 'ĠHong', 'predicted_tag': 'B-PS', 'top_prob': '0.9986'},
  {'token': 'ĠGil', 'predicted_tag': 'I-PS', 'top_prob': '0.9975'},
  {'token': 'Ġd', 'predicted_tag': 'I-PS', 'top_prob': '0.9939'},
  {'token': 'ong', 'predicted_tag': 'I-PS', 'top_prob': '0.9963'},
  {'token': 'Ġand', 'predicted_tag': 'O', 'top_prob': '0.9999'},
  {'token': 'ĠI', 'predicted_tag': 'O', 'top_prob': '0.9999'},
  {'token': 'Ġdecided', 'predicted_tag': 'O', 'top_prob': '0.9999'},
  {'token': 'Ġto', 'predicted_tag': 'O', 'top_prob': '0.9999'},
  {'token': 'Ġreserve', 'predicted_tag': 'O', 'top_prob': '0.9999'},
  {'token': 'Ġa', 'predicted_tag': 'O', '

In [111]:
Generated_data = []
for i in data:
  i['text'] = i['text'].replace('\n','')
  i['text'] = i['text'].replace("\ ",'')
  Generated_data.append(i['text'])

Generated_data

['"Welcome to Imperial Lodge & Suites, Joseph C. Wu. How may I help you?"',
 '"Hello, my name is Reece Sanchez and I\'m calling from the Imperial Lodge & Suites. We are in need of some new bath mats."',
 'Will our hotel have nutrition bars on Saturday, Oct 6?',
 '"We\'re sorry, we don\'t have any Pocket Sprayers from Four Seasons Hotels & Resorts in stock."',
 "Hi, my name is Antonio W. Miller and I'm interested in reserving a room at The Presidential Retreats.",
 'Nash Donovan, can I ask you a question?',
 'Leslie J. Clarke, can you please help me with this project?',
 '"Nash Donovan, we are so happy to have you stay with us at Luxury Palace Haven! We want to make sure you have the best possible experience, so please let us know if you need anything at all, including air fresheners for your room."',
 "We offer bedding kits for St Regis Hotels' Relax Rooms.",
 "You're in luck! Summer House Hotels & Resorts has a King room available for tonight.",
 'Check out our selection of coasters f

In [115]:
file_path = "/content/drive/MyDrive/2.Study/NER/GPT_for_NER_file/Generated_data.txt"


with open(file_path, "w") as file:
  for i in Generated_data:
    file.write(i + '\n')

file.close()