1. Few-shot 학습을 위한 개체명 리스트 생성

2. GPT-3를 사용해 개체명 리스트 확장

3. GPT-3를 사용하여 확장된 개체명 인식 데이터셋 생성

4. NER 모델 학습

---
# Install & load
----

In [1]:
!pip install openai

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting openai
  Downloading openai-0.27.2-py3-none-any.whl (70 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.1/70.1 KB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting aiohttp
  Downloading aiohttp-3.8.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
Collecting aiosignal>=1.1.2
  Downloading aiosignal-1.3.1-py3-none-any.whl (7.6 kB)
Collecting multidict<7.0,>=4.5
  Downloading multidict-6.0.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (114 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.2/114.2 KB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting frozenlist>=1.1.1
  Downloading frozenlist-1.3.3-cp39-cp39-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux

[참고블로그](https://medium.com/@yongsun.yoon/%EB%8D%B0%EC%9D%B4%ED%84%B0-%EC%97%86%EC%9D%B4-ner-%EB%AA%A8%EB%8D%B8-%ED%95%99%EC%8A%B5%ED%95%98%EA%B8%B0-90c4c24953a)

In [7]:
import os
import openai


key_path = '/content/drive/MyDrive/2.Study/NER/OpenAI_Key.txt'

# Read the API key from a private file so the secret never appears in the notebook.
with open(key_path, 'r', encoding='utf-8') as f:
  # Text files usually end with a newline; leaving it in the key corrupts
  # the credential sent with every request, so strip surrounding whitespace.
  value = f.read().strip()

# Register the OpenAI key [be careful not to expose it]
openai.api_key = value

In [50]:
##### pytorch #####
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset


##### 시각화 #####
from PIL import Image
import matplotlib.pyplot as plt
import seaborn as sns 

##### 기본 모듈 #####
import pandas as pd
import numpy as np
import os
import random
import json
import math
import easydict
from pprint import pprint
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import re 
import time

##### 디버깅 #####
import pdb

##### cuda #####
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu') # GPU 할당

##### 경고무시 #####
import warnings
warnings.filterwarnings(action='ignore')


---
# 개체명 리스트 작성
---

In [10]:
# Hand-written seed entities used as few-shot examples.
# Each item pairs one entity class ('class_name') with five example
# surface forms ('entity_names') for that class.

real_entities = [
    {
        'class_name': 'hotel name',
        'entity_names': [
            'Ritz-Carlton Hotel',
            'Marriott',
            'The Luxury Collection Hotels & Resorts',
            'St Regis Hotels',
            'Hyatt'
        ]
    },

    {
        'class_name': 'room type',
        'entity_names': [
            'Single room',
            'twin room',
            'Double room',
            'deluxe room',
            'Suites',
        ]    
    },
    {
        'class_name': 'person name',
        'entity_names': [
            'Yongsun Yoon',
            'Steve Adams',
            'Donnie K. Schneider',
            'Eleanor Lockhart',
            'Jacqueline R. French'
        ]
    },
    {
        'class_name': 'date',
        'entity_names': [
            '3/4/2022',
            'November 27th',
            'December 15, 2023',
            'Feb. 8',
            'Saturday, Jul 22'
        ]
    },
    {
        'class_name': 'hotel supplies',
        'entity_names': [
            'shampoo',
            'Coffee kit',
            'towels',
            'Wine glass',
            'fan'
        ]
    }
]

---
# GPT-3를 사용해 개체명 리스트 확장 
---

In [36]:
# Generate text from the model
def generate(prompts, model='text-davinci-003', n=1, max_tokens=512):
    """Request completions from the OpenAI Completion endpoint.

    Args:
        prompts: A single prompt string, or a list of prompt strings that is
            sent as one batched request.
        model: Name of the completion model to query.
        n: Number of completions to generate per prompt.
        max_tokens: Maximum number of tokens the model may generate per
            completion.

    Returns:
        A list with the whitespace-stripped text of every returned choice,
        ordered by the ``index`` field the API attaches to each choice.
    """
    response = openai.Completion.create(
        model=model,
        prompt=prompts,  # prompt text (or batch of prompts) to complete
        echo=False,  # do not repeat the prompt in the returned text
        n=n,  # completions per prompt
        max_tokens=max_tokens,  # generation length limit
    )

    # For batched prompts the choices are not guaranteed to arrive in prompt
    # order; callers pair results with prompts positionally, so restore the
    # order using the index reported for each choice.
    ordered_choices = sorted(response.choices, key=lambda c: c['index'])
    return [c.text.strip() for c in ordered_choices]

# Build the prompt that asks the model for new entity names
def construct_entity_prompt(class_name, entity_names, k=10):
    """Return a few-shot prompt requesting ``k`` new names for a class.

    The prompt states the task, lists every name in ``entity_names`` as a
    '- ' bullet, and ends with an opening '-' so the model continues the
    bulleted list.
    """
    instruction = f'These are <{class_name}> entity names. Generate {k} new <{class_name}> entity names.\n\n'
    bullets = ''.join(f'- {name}\n' for name in entity_names)
    pieces = [instruction, 'Entity names:\n', bullets, '\nGenerated names:\n-']
    return ''.join(pieces)


# Clean up the entity-name lists generated by the model
def postprocess_entities(synthetic_entities):
    """Flatten generated bullet lists into a list of entity names.

    Args:
        synthetic_entities: Iterable of completion strings. Each string holds
            one entity name per line; lines may start with a '-' bullet (the
            first line normally does not, because the prompt already ends
            with '-').

    Returns:
        A flat list of entity names in generation order, with the leading
        bullet and surrounding whitespace removed. Blank lines are skipped.
        Hyphens inside a name (e.g. 'Ritz-Carlton Hotel') are preserved.
    """
    processed = []
    for ents in synthetic_entities:
        for line in ents.split('\n'):
            name = line.strip()
            # Remove only the leading bullet marker; splitting on every '-'
            # would cut hyphenated names short.
            if name.startswith('-'):
                name = name[1:].strip()
            if name:
                processed.append(name)
    return processed

In [37]:
# Expand every hand-written entity class with model-generated names.
# The result mirrors real_entities: one dict per class, in the same order.
synthetic_entities = []
for real_ent in tqdm(real_entities):
    class_name, entity_names = real_ent['class_name'], real_ent['entity_names']
    # Build the prompt sent to the GPT model
    prompt = construct_entity_prompt(class_name, entity_names)
    # Generate new entities (10 completions for this prompt)
    syn_entities = generate(prompt, n=10)
    # Clean up the generated entities into a flat list of names
    syn_entities = postprocess_entities(syn_entities)
    # Remove duplicate entities (the set round-trip does not keep order)
    syn_entities = list(set(syn_entities))
    
    synthetic_entities.append({'class_name': class_name, 'entity_names': syn_entities})

  0%|          | 0/5 [00:00<?, ?it/s]

In [44]:
prompt

'These are <hotel supplies> entity names. Generate 10 new <hotel supplies> entity names.\n\nEntity names:\n- shampoo\n- Coffee kit\n- towels\n- Wine glass\n- fan\n\nGenerated names:\n-'

In [43]:
synthetic_entities[0]

{'class_name': 'hotel name',
 'entity_names': ['The Majestic Palace Hotel',
  'Four Seasons Hotels & Resorts',
  'Omni Hotels & Resorts',
  'The Venetian Room Hotel',
  'Park Avenue Lodge',
  'Radisson Hotels & Suites',
  'The Villa Suites Hotel',
  'Grand Trump Hotel',
  'Atlantis Beachfront Resort',
  'The Majestic Estates Inn',
  'The Peninsula Hotels',
  'Sheraton Hotels',
  'The Summit by Fairmont',
  'Waldorf Astoria Hotels & Resorts',
  'Intercontinental Hotels & Resorts',
  'RoomMates Hotel',
  'Embassy Suites Hotels',
  'The Embassy Suites Hotel',
  'Park Plaza Hotels & Resorts',
  'Four Seasons Resort & Spa',
  'Savoyard Suites and Lodgings',
  'Hilton Hotels & Resorts',
  'The Royal Flair Resort',
  'The Regal Royal Resorts',
  'Westin Hotels',
  'Radisson Blu Resort Corniche',
  'The Plaza Hotel',
  'QT Hotels & Resorts',
  'Regal Palatial Suites',
  'Arabian Riviera Resort',
  'The Palacial Harmony Hotel',
  'Oakwood Manor & Suites',
  'The Peninsula Hotels and Resorts',
 

In [45]:
# Merge the hand-written entities with the model-generated ones.
# zip pairs the two lists positionally, so both must list the classes in the
# same order; names are de-duplicated per class through a set.
all_entities = []
for real, synthetic in zip(real_entities, synthetic_entities):
    all_entities.append({
        'class_name': real['class_name'],
        'entity_names': list(set(real['entity_names'] + synthetic['entity_names']))
    })

---
# GPT-3를 사용하여 확장된 개체명 인식 데이터셋 생성
----

In [None]:
# Randomly pick entities from the entity list
def sample_entities(all_entities, min_k=1, max_k=3):
    """Sample one entity name from each of a few randomly chosen classes.

    A class count between ``min_k`` and ``max_k`` (inclusive) is drawn first,
    then that many distinct classes are selected, and finally one name is
    drawn from each selected class.

    Returns:
        A list of dicts with keys 'class_name' and 'entity_name'.
    """
    num_classes = np.random.randint(min_k, max_k+1)
    chosen = np.random.choice(range(len(all_entities)), size=num_classes, replace=False)

    return [
        {
            'class_name': all_entities[pos]['class_name'],
            'entity_name': np.random.choice(all_entities[pos]['entity_names']),
        }
        for pos in chosen
    ]

# Build the sentence-generation prompt for the selected entities
def construct_sentence_prompt(entities, style='dialog'):
    """Return a prompt asking for one sentence that contains the entities.

    Each entity is rendered as ``name(class)`` and the renderings are joined
    with ', ' on a single 'Entities:' line; the prompt ends with 'Sentence:'
    so the model continues with the sentence itself.
    """
    entities_string = ', '.join(
        f"{ent['entity_name']}({ent['class_name']})" for ent in entities
    )
    parts = (
        f'Generate a {style} sentence including following entities.\n\n',
        f'Entities: {entities_string}\n',
        'Sentence:',
    )
    return ''.join(parts)

# Build per-character labels from the generated sentence and its entities
def construct_labels(generated, entities, class2idx):
    """Label every character of ``generated`` with an entity id.

    Args:
        generated: The generated sentence.
        entities: Dicts with 'class_name' and 'entity_name' keys describing
            the entities the sentence was asked to contain.
        class2idx: Mapping from class name to its label id; must contain the
            key 'outside'.

    Returns:
        A list with one label per character of ``generated``. Characters not
        covered by any entity get ``class2idx['outside']``. For each
        case-insensitive occurrence of an entity name, the first character
        gets the class id and the remaining characters get class id + 1
        (e.g. id 3 matched at [10, 14) gives labels[10] = 3 and
        labels[11:14] = [4, 4, 4]).
    """
    labels = [class2idx['outside']] * len(generated)
    for ent in entities:
        name = ent['entity_name']
        if not name:
            # An empty pattern matches at every position, including one past
            # the end of the text, which would index out of range.
            continue
        begin_id = class2idx[ent['class_name']]
        # Escape the name so characters such as '.', '(' or '+' are matched
        # literally instead of being interpreted as regex syntax.
        pattern = re.escape(name)
        # Match on the original text with IGNORECASE rather than on lowered
        # copies, so the span offsets always index into `generated`.
        for span in re.finditer(pattern, generated, flags=re.IGNORECASE):
            s, e = span.start(), span.end()
            labels[s] = begin_id
            labels[s+1:e] = [begin_id + 1] * (e - s - 1)
    return labels

In [48]:
# Assign every class an even id (0, 2, 4, ...). construct_labels writes the
# class id on the first character of a match and id + 1 on the remaining
# characters, so two consecutive ids are reserved per class.
class2idx = {e['class_name']: i*2 for i, e in enumerate(all_entities)}
# 'outside' (no entity) takes the next even id after all classes.
class2idx['outside'] = len(class2idx) * 2
# The bare string below is the original author's Korean note on why the ids
# advance in steps of two; it is an expression statement with no effect.
'''index가 2씩 증가하는 이유는 
   label을 만들 때, 엔티티 클래스를 표시하는 숫자와 
   엔티티의 시작점을 표시하는 숫자 사이에 구분을 두기 위함'''

# Build the synthetic dataset: 100 requests of 10 prompts each.
data = []
for _ in tqdm(range(100)):
    # Sample 10 entity sets and turn each one into a sentence prompt.
    batch_entities = [sample_entities(all_entities) for _ in range(10)]
    batch_prompts = [construct_sentence_prompt(ents) for ents in batch_entities]
    # Send all 10 prompts in a single batched completion request.
    batch_generated = generate(batch_prompts, model='text-davinci-002')

    # Pair each generated sentence with its entity set and derive labels.
    for generated, entities in zip(batch_generated, batch_entities):
        labels = construct_labels(generated, entities, class2idx)
        data.append({'text': generated, 'labels': labels})

    # Pause between requests — presumably to stay under the API rate limit.
    time.sleep(10)


{'hotel name': 0,
 'room type': 2,
 'person name': 4,
 'date': 6,
 'hotel supplies': 8,
 'outside': 10}

In [51]:
class2idx

{'hotel name': 0,
 'room type': 2,
 'person name': 4,
 'date': 6,
 'hotel supplies': 8,
 'outside': 10}

---
# NER 모델 학습
----