# Korean Sentence Correction with GPT

한국어 문법교정 GPT3 모델 구축하기


In [None]:
%pip install pandas numpy scipy scikit-learn torch accelerate transformers ipywidgets tqdm matplotlib --upgrade

# Transformers 를 사용하기 위한 라이브러리들 설치

### AI-Hub data 정제

예시 데이터는 다음과 같다.

이 프로젝트에 사용하는 AI-Hub 데이터는 용량이 112GB이며 국외 반출이 불가하므로 데이터 관리에 주의를 요한다.

이 데이터를 `pandas.DataFrame`(컬럼: `'index'`, `'text'`, `'corrected'`)으로 정제하는 과정을 거친다.

```json
{  "id": "100008-1-1-1",
  "fileName": "TX_CA_1_100008-1-1-1",
  "dataSet": "한국어 철자 및 맞춤법 교정용 병렬 데이터",
  "domain": "CA",
  "ko": "지금까지 다녀 본 여행지 중 좋았던 곳 추천해줘.",
  "corrected": "지금까지 다녀 본 여행지 중 좋았던 곳 추천해 줘.",
  "error": [
    {
      "errorType": "spac",
      "startPoint": 22,
      "endPoint": 27
    }
  ]
}
```

In [None]:
from typing import List, Dict
from tqdm.notebook import tqdm

from json_processing import get_json_files, read_json_file

import pandas as pd


# Root directory under which the data/ folder is looked up.
LOCATION = '.'

# Results of get_json_files for the train and validate folders
# (presumably collections of JSON file paths — confirm in json_processing).
train = get_json_files(f'{LOCATION}/data/train')
validate = get_json_files(f'{LOCATION}/data/validate')

# Start from empty frames; the loading cell fills them in.
train_data = pd.DataFrame()
validate_data = pd.DataFrame()

In [None]:
from json_processing import get_json_files, read_json_file
import multiprocessing as mp

# Parse the JSON files in worker processes, then build each DataFrame with a
# single concat (concatenating inside a loop re-copies all rows every step).
# https://zerohertz.github.io/multiprocessing/
with mp.Pool() as pool:
    train_raw = list(tqdm(pool.imap(read_json_file, train), total=len(train)))
    # Validation entries are passed to read_json_file unchanged, exactly like
    # the train entries, since both lists come from get_json_files.
    validate_raw = list(tqdm(pool.imap(read_json_file, validate), total=len(validate)))

train_data = pd.concat(train_raw, ignore_index=True)
# pd.concat raises on an empty list; keep an empty frame when there are no
# validation files.
validate_data = pd.concat(validate_raw, ignore_index=True) if validate_raw else pd.DataFrame()

train_data.to_csv(f'{LOCATION}/train.csv')
validate_data.to_csv(f'{LOCATION}/validate.csv')

### Transformer 학습 준비

정제된 데이터는 토크나이저로 인코딩한 뒤 DataLoader에 담아 학습에 사용한다.

파인튜닝 대상 모델은 다음과 같다.
[kykim/gpt3-kor-small_based_on_gpt2](https://github.com/kiyoungkim1/LMkor)

In [3]:
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizerFast

import pandas as pd
import torch

# Root directory for the CSV files read further down in this cell.
LOCATION = '.'

# Load the tokenizer published with the pretrained Korean GPT checkpoint
# (the model itself is loaded in a later cell).
tokenizer = BertTokenizerFast.from_pretrained("kykim/gpt3-kor-small_based_on_gpt2")
max_length = 100  # fixed encoded length handed to TextDataset
loader_size = 16  # NOTE(review): not referenced in the visible code; the DataLoaders hard-code batch_size=32

# Dataset class shared by the training and validation splits
class TextDataset(Dataset):
    """Map-style dataset of (original, corrected) sentence pairs as token ids.

    Args:
        data: DataFrame with a 'text' column (original sentence) and a
            'corrected' column (corrected sentence).
        tokenizer: object exposing ``encode(text, max_length=..., truncation=...,
            padding=...)`` and, optionally, ``pad_token_id``.
        max_length: every sequence is truncated or padded to this many ids.

    Each item is a dict with two ``torch.long`` tensors of length
    ``max_length``: ``'input_ids'`` (encoded 'text') and ``'output_ids'``
    (encoded 'corrected', used as labels). Padding positions in
    ``'output_ids'`` are replaced by -100, the index that PyTorch's
    cross-entropy and the Hugging Face LM heads ignore, so the loss is not
    dominated by predicting padding.
    """

    # Label value skipped by the loss.
    IGNORE_INDEX = -100

    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def _encode(self, text):
        """Encode ``text`` to exactly ``max_length`` ids as a long tensor."""
        ids = self.tokenizer.encode(text, max_length=self.max_length, truncation=True, padding='max_length')
        return torch.tensor(ids, dtype=torch.long)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]  # one row lookup serves both columns
        input_ids = self._encode(row['text'])
        output_ids = self._encode(row['corrected'])

        # Mask padding in the labels only; the inputs keep their pad ids.
        pad_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_id is not None:
            output_ids[output_ids == pad_id] = self.IGNORE_INDEX

        return {
            'input_ids': input_ids,
            'output_ids': output_ids
        }

# Load the cleaned sentence pairs from the CSV files under LOCATION.
train = pd.read_csv(f'{LOCATION}/train.csv')
validate = pd.read_csv(f'{LOCATION}/validate.csv')

train_dataset = TextDataset(train, tokenizer, max_length)
validate_dataset = TextDataset(validate, tokenizer, max_length)

# Shuffle the training batches every epoch so the model does not see the rows
# in the same stored order each time; validation order stays fixed so that
# evaluation is reproducible.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
validate_loader = DataLoader(validate_dataset, batch_size=32, shuffle=False)

In [4]:
from transformers import GPT2LMHeadModel
from tqdm.notebook import tqdm  # Assuming you're using a Jupyter notebook
from sklearn.metrics import roc_auc_score, f1_score  # Only needed if using accuracy in compute_metrics

import torch
import pickle

# Pick the compute device: Apple MPS first, then CUDA, otherwise CPU.
if torch.backends.mps.is_available():
    device = torch.device('mps')
elif torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Pretrained checkpoint, moved onto the selected device and put in training mode.
model = GPT2LMHeadModel.from_pretrained("kykim/gpt3-kor-small_based_on_gpt2")
model = model.to(device=device)
model.train()

# One record per epoch is appended here by the training loop.
all_metrics = []

# Optimisation settings.
lr = 2e-5
loss = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

# Training and validation function definitions
def train(train_loader, model, optimizer):
    """Run one optimisation pass over ``train_loader``.

    Each batch's 'input_ids' are fed to ``model`` with the batch's
    'output_ids' as labels; the loss reported by the model is
    back-propagated and ``optimizer`` takes one step per batch.

    Returns:
        The mean of the per-batch losses (sum divided by the number of batches).
    """
    model.train()
    target_device = model.device
    running_loss = 0.0
    for step in tqdm(train_loader):
        source_ids = step['input_ids'].to(target_device)
        target_ids = step['output_ids'].to(target_device)

        optimizer.zero_grad()
        batch_loss = model(source_ids, labels=target_ids).loss
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
    return running_loss / len(train_loader)

def validate(validate_loader, model):
    """Evaluate ``model`` on ``validate_loader`` without updating weights.

    Each batch's 'input_ids' are fed to ``model`` with the batch's
    'output_ids' as labels. Token-level predictions and labels are collected
    as flat lists across all batches, because scikit-learn's classification
    metrics reject nested per-sequence lists of token ids.

    Returns:
        A ``(mean_loss, roc_auc, f1)`` tuple:
        * ``mean_loss``: sum of per-batch losses divided by the batch count.
        * ``roc_auc``: always ``float('nan')``. ROC-AUC needs per-class
          scores and cannot be computed from argmax token ids; the slot is
          kept so callers can still unpack three values.
        * ``f1``: macro-averaged F1 over all scored token positions.
    """
    model.eval()
    device = model.device
    total_loss = 0.0
    predictions = []
    targets = []
    with torch.no_grad():
        for batch in tqdm(validate_loader):
            inputs = batch['input_ids'].to(device)
            labels = batch['output_ids'].to(device)
            outputs = model(inputs, labels=labels)
            total_loss += outputs.loss.item()

            # The causal LM head scores token t+1 from position t (the model
            # shifts labels the same way for its loss), so align predictions
            # with the labels one step ahead.
            next_token_preds = outputs.logits[:, :-1, :].argmax(dim=-1)
            next_token_labels = labels[:, 1:]

            # Skip positions labelled -100, which the loss ignores as well.
            scored = next_token_labels != -100
            predictions.extend(next_token_preds[scored].tolist())
            targets.extend(next_token_labels[scored].tolist())

    f1 = f1_score(targets, predictions, average='macro')
    roc_auc = float('nan')
    return total_loss / len(validate_loader), roc_auc, f1

# Training and validation loop
epochs = 10
for epoch in tqdm(range(epochs)):
    train_loss = train(train_loader, model, optimizer)
    validate_loss, roc_auc, f1 = validate(validate_loader, model)

    print(f"Epoch {epoch + 1}/{epochs}, Train Loss: {train_loss}, Validate Loss: {validate_loss}, ROC-AUC: {roc_auc}, F1 Score: {f1}")
    all_metrics.append({
        'epoch':epoch + 1,
        'train_loss': train_loss,
        'val_loss': validate_loss,
        'roc_auc': roc_auc,
        'f1': f1
    })
    # Rewrite the whole history after every epoch so an interrupted run still
    # leaves its metrics on disk. The path must be an f-string: without the
    # prefix the file would be opened inside a literal "{LOCATION}" directory.
    with open(f"{LOCATION}/all_metrics.pkl", "wb") as f:
        pickle.dump(all_metrics, f)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/68801 [00:00<?, ?it/s]

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import pickle

# Load the per-epoch metric records saved during training.
with open(f"{LOCATION}/all_metrics.pkl", "rb") as f:
    all_metrics = pickle.load(f)

df_metrics = pd.DataFrame(all_metrics)

# Plot the loss curves. The validation loss is stored under the 'val_loss'
# key in each record, so that is the column to read.
plt.plot(df_metrics["train_loss"], label="Train Loss")
plt.plot(df_metrics["val_loss"], label="Validate Loss")
plt.legend()
plt.show()

# Plot the classification metrics.
plt.plot(df_metrics["f1"], label="F1 Score")
plt.plot(df_metrics["roc_auc"], label="ROC AUC")
plt.legend()
plt.show()