In [1]:
# Hugging Face 라이브러리 적용 - 기계 번역 모델 성능 확인
# AI HUB 관광지 소개 다국어 번역 데이터셋 적용

# Cell 1
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch.optim import AdamW
from torch.amp import autocast
from torch.cuda.amp import GradScaler
import evaluate   # ✅ 최신 BLEU 평가 라이브러리
import json, glob
import random
import os

device = "cuda" if torch.cuda.is_available() else "cpu"
model_name = "t5-small"   # RTX 3060 6GB 환경에 적합한 경량 모델

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

# BLEU metric 로드
bleu = evaluate.load("sacrebleu")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 데이터셋 전처리
# 폴더명: TL_영어_관광지,TL_영어_레포츠... -> 도메인/카테고리 구분
# 파일명:en_2_0_gg_0_(평화누리길김포시첫째길)대명항-문수산성_2.json
# 내용: 각 JSON 안에는 annotations 배열이 있고, k_context(한국어) ↔ t_context(영어) 페어

ko_lines, en_lines = [], []
folders = [ # 폴더 리스트 정의
    './llm_data/ai_hub_tourist_accommodation_english/*.json',
    './llm_data/ai_hub_tourist_attraction_english/*.json',
    './llm_data/ai_hub_tourist_cultural_facility_english/*.json',
    './llm_data/ai_hub_tourist_entertainment_english/*.json',
    './llm_data/ai_hub_tourist_leisure_sports_english/*.json',
    './llm_data/ai_hub_tourist_restaurant_english/*.json',
    './llm_data/ai_hub_tourist_shopping_english/*.json'
]

In [3]:
# 데이터셋 전처리
# 폴더명: TL_영어_관광지,TL_영어_레포츠... -> 도메인/카테고리 구분
# 파일명:en_2_0_gg_0_(평화누리길김포시첫째길)대명항-문수산성_2.json
# 내용: 각 JSON 안에는 annotations 배열이 있고, k_context(한국어) ↔ t_context(영어) 페어

# 모든 JSON 읽기
for folder in folders:
    for path in glob.glob(folder):
        with open(path, encoding='utf-8') as f:
            js = json.load(f)

        for ann in js.get('annotations', []):
            if ann.get('language') == 'en':
                ko = ann.get('k_context')
                en = ann.get('t_context')

                if ko and en:
                    ko_split = [line.strip() for line in ko.splitlines() if line.strip()]
                    en_split = [line.strip() for line in en.splitlines() if line.strip()]

                    # 줄 수가 다르면 최소 길이에 맞춰 강제 매칭
                    min_len = min(len(ko_split), len(en_split))
                    ko_lines.extend(ko_split[:min_len])
                    en_lines.extend(en_split[:min_len])

print(f"Total sentence pairs after cleaning: {len(ko_lines)} vs {len(en_lines)}")

Total sentence pairs after cleaning: 2524283 vs 2524283


In [4]:
# 데이터 전처리 저장

# 최종 저장 단계에서도 안전하게 min_len 맞추기
min_len = min(len(ko_lines), len(en_lines))
ko_lines = ko_lines[:min_len]
en_lines = en_lines[:min_len]

# 샘플링 (데이터셋 줄이기)
sample_size = 50000   # 원하는 크기 (예: 50k 문장)

ko_lines = ko_lines[:sample_size]
en_lines = en_lines[:sample_size]

out_dir = './llm_data/ai_hub_tourist_ko_en'
ko_path = f'{out_dir}/origin_train_ko.txt'
en_path = f'{out_dir}/origin_train_en.txt'

assert len(ko_lines) == len(en_lines), "KO/EN line count mismatch!"

with open(ko_path, 'w', encoding='utf-8') as fko, \
     open(en_path, 'w', encoding='utf-8') as fen:
    for k, e in zip(ko_lines, en_lines):
        fko.write(k.strip() + '\n')
        fen.write(e.strip() + '\n')
print('저장 완료:', ko_path, en_path)

저장 완료: ./llm_data/ai_hub_tourist_ko_en/origin_train_ko.txt ./llm_data/ai_hub_tourist_ko_en/origin_train_en.txt


In [5]:
# 데이터 분할
# - 전체 병렬 데이터 로드: train_ko.txt, train_en.txt를 읽어서 리스트로 만든다
# - 랜덤 셔플: 순서를 섞어 데이터 누수를 방지
# - 비율 분할: 8:1:1 = train:validation:test
# - 각 파일 저장: train_ko.txt, valid_ko.txt, test_ko.txt와 대응하는 영어 파일을 저장

# 파일 읽기 (경로 수정)
with open('./llm_data/ai_hub_tourist_ko_en/origin_train_ko.txt', encoding='utf-8') as fko, \
     open('./llm_data/ai_hub_tourist_ko_en/origin_train_en.txt', encoding='utf-8') as fen:
    ko_lines = [line.strip() for line in fko if line.strip()]
    en_lines = [line.strip() for line in fen if line.strip()]

# 길이 검증
assert len(ko_lines) == len(en_lines), f"KO/EN mismatch: {len(ko_lines)} vs {len(en_lines)}"
print("counts:", len(ko_lines), len(en_lines))

# 병렬 데이터 묶기
pairs = list(zip(ko_lines, en_lines))
print("pairs:", len(pairs))

# 랜덤 셔플
random.seed(42) # 재현성 위해 고정
random.shuffle(pairs)

# 분할 비율
n = len(pairs)
train_end = int(n * 0.8)
valid_end = int(n * 0.9)

train_pairs = pairs[:train_end] # 0% ~ 80%
valid_pairs = pairs[train_end:valid_end] # 80% ~ 90%
test_pairs = pairs[valid_end:] # 90% ~ 100%

counts: 50000 50000
pairs: 50000


In [6]:
# 분할 데이터 저장

# 저장 함수
def save_pairs(pairs, ko_path, en_path):
    with open(ko_path, 'w', encoding='utf-8') as fko, \
        open(en_path, 'w', encoding='utf-8') as fen:
        for k, e in pairs:
            fko.write(k + '\n')
            fen.write(e + '\n')

# 저장 실행
out_dir = './llm_data/ai_hub_tourist_ko_en'
save_pairs(train_pairs, f'{out_dir}/train_ko.txt', f'{out_dir}/train_en.txt')
save_pairs(valid_pairs, f'{out_dir}/valid_ko.txt', f'{out_dir}/valid_en.txt')
save_pairs(test_pairs, f'{out_dir}/test_ko.txt', f'{out_dir}/test_en.txt')

print('데이터셋 분할 및 저장 완료')

데이터셋 분할 및 저장 완료


In [7]:
# Cell 2
class TranslationDataset(Dataset):
    def __init__(self, src_file, tgt_file, tokenizer, max_len=64):
        with open(src_file, encoding="utf-8") as fsrc, \
             open(tgt_file, encoding="utf-8") as ftgt:
            self.src_lines = [line.strip() for line in fsrc if line.strip()]
            self.tgt_lines = [line.strip() for line in ftgt if line.strip()]
        assert len(self.src_lines) == len(self.tgt_lines), "Source/Target mismatch!"
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.src_lines)

    def __getitem__(self, idx):
        src = self.src_lines[idx]
        tgt = self.tgt_lines[idx]

        src_enc = self.tokenizer(src,
                                 max_length=self.max_len,
                                 padding="max_length",
                                 truncation=True,
                                 return_tensors="pt")
        tgt_enc = self.tokenizer(tgt,
                                 max_length=self.max_len,
                                 padding="max_length",
                                 truncation=True,
                                 return_tensors="pt")

        return {
            "input_ids": src_enc["input_ids"].squeeze(),
            "attention_mask": src_enc["attention_mask"].squeeze(),
            "labels": tgt_enc["input_ids"].squeeze(),
            "target_text": tgt
        }

In [8]:
# Cell 3
train_dataset = TranslationDataset("./llm_data/ai_hub_tourist_ko_en/train_ko.txt",
                                   "./llm_data/ai_hub_tourist_ko_en/train_en.txt",
                                   tokenizer)
valid_dataset = TranslationDataset("./llm_data/ai_hub_tourist_ko_en/valid_ko.txt",
                                   "./llm_data/ai_hub_tourist_ko_en/valid_en.txt",
                                   tokenizer)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=4)

optimizer = AdamW(model.parameters(), lr=1e-4)
scaler = GradScaler()

  scaler = GradScaler()


In [9]:
# Cell 4
num_epochs = 5
for epoch in range(num_epochs):
    model.train()
    total_train_loss = 0

    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        with autocast("cuda"):
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            labels=labels)
            loss = outputs.loss

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_train_loss += loss.item()

    avg_train_loss = total_train_loss / len(train_loader)

    # 검증 루프 + BLEU 평가
    model.eval()
    total_val_loss = 0
    predictions, references = [], []

    with torch.no_grad():
        for batch in valid_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            with autocast("cuda"):
                outputs = model(input_ids=input_ids,
                                attention_mask=attention_mask,
                                labels=labels)
                loss = outputs.loss
            total_val_loss += loss.item()

            generated_ids = model.generate(input_ids=input_ids,
                                           attention_mask=attention_mask,
                                           max_length=64)
            preds = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
            refs = [[batch["target_text"][i]] for i in range(len(preds))]

            predictions.extend(preds)
            references.extend(refs)

    avg_val_loss = total_val_loss / len(valid_loader)
    bleu_score = bleu.compute(predictions=predictions, references=references)

    print(f"Epoch {epoch+1}/{num_epochs} | "
          f"Train Loss: {avg_train_loss:.4f} | "
          f"Val Loss: {avg_val_loss:.4f} | "
          f"BLEU: {bleu_score['score']:.2f}")

    

Epoch 1/5 | Train Loss: 0.2209 | Val Loss: 0.1190 | BLEU: 67.52
Epoch 2/5 | Train Loss: 0.1286 | Val Loss: 0.1055 | BLEU: 67.28
Epoch 3/5 | Train Loss: 0.1148 | Val Loss: 0.0987 | BLEU: 68.69
Epoch 4/5 | Train Loss: 0.1064 | Val Loss: 0.0938 | BLEU: 69.23
Epoch 5/5 | Train Loss: 0.1002 | Val Loss: 0.0909 | BLEU: 68.63


In [10]:
# 모델 저장
model.save_pretrained(f"./results_translation_huggingface_t5-small/checkpoints/epoch_{epoch+1}")
tokenizer.save_pretrained(f"./results_translation_huggingface_t5-small/checkpoints/epoch_{epoch+1}")

('./results_translation_huggingface_t5-small/checkpoints/epoch_5\\tokenizer_config.json',
 './results_translation_huggingface_t5-small/checkpoints/epoch_5\\special_tokens_map.json',
 './results_translation_huggingface_t5-small/checkpoints/epoch_5\\spiece.model',
 './results_translation_huggingface_t5-small/checkpoints/epoch_5\\added_tokens.json',
 './results_translation_huggingface_t5-small/checkpoints/epoch_5\\tokenizer.json')

In [3]:
# 모델 로드
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'PyTorch Version: {torch.__version__}, Device: {device}')

model_path = './results_translation_huggingface_t5-small/checkpoints/epoch_5'
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSeq2SeqLM.from_pretrained(model_path).to(device)

PyTorch Version: 2.7.1+cu118, Device: cuda


In [None]:
# 추론(Inference)

# 한국어 입력 문장
# input_text = "이 호텔은 서울 중심에 위치해 있습니다."
# input_text = "이 식당은 전통 한식을 제공합니다."
# input_text = '이 호텔은 깨끗합니다'
input_text = '이 박물관은 무료 입장이 가능합니다'

# 토크나이즈
inputs = tokenizer(input_text, return_tensors='pt').to(device)

# 번역
# generated_ids = model.generate(**inputs, max_length=64, num_beams=5)
generated_ids = model.generate(**inputs, max_length=64, do_sample=True, top_k=50, top_p=0.95)

output_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

print('번역 결과:', output_text)


번역 결과: Gapyeong-ro, Gapyeong-eup, Gapyeong-gun, Gyeonggi-do
