PyTorch에서 제공하는 Transformer 모델을 활용해 독일어→영어 번역 모델을 만들어 보자.  
데이터는 다국어 데이터세트 중 하나인 Multi30k를 사용한다.

In [None]:
# Install (or upgrade) spaCy; it is used further down for tokenization.
!pip install -U spacy



In [None]:
# Download the small German and English spaCy pipelines that are later loaded
# with spacy.load(). NOTE: as spaCy's own output says, the runtime may have to
# be restarted before the newly installed packages can be loaded.
!python -m spacy download de_core_news_sm
!python -m spacy download en_core_web_sm

Collecting de-core-news-sm==3.8.0
  Using cached https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.8.0/de_core_news_sm-3.8.0-py3-none-any.whl (14.6 MB)
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m52.1 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can no

In [None]:
# NOTE(review): portalocker is not imported by any cell visible here —
# presumably a leftover torchtext dependency; confirm it is still needed.
!pip install portalocker

# 1. 데이터세트 다운로드 및 전처리

In [None]:
import os
from pathlib import Path
import gzip
from collections import Counter
import spacy

# -------------------------
# Configuration
# -------------------------
# Translation direction: German source -> English target.
SRC_LANGUAGE = "de"
TGT_LANGUAGE = "en"
# Indices of the special tokens; each one equals the token's position in
# special_symbols, which build_vocab places at the front of the vocabulary.
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
special_symbols = ["<unk>", "<pad>", "<bos>", "<eos>"]

# Local directory the dataset archive is extracted into.
DATA_DIR = Path("multi30k_data")
DATA_DIR.mkdir(exist_ok=True)

# -------------------------
# Dataset download
# -------------------------
# Fetch a zip snapshot of the multi30k/dataset GitHub repository (master
# branch) and unpack it under DATA_DIR; -o overwrites files from earlier runs.
!wget -q https://github.com/multi30k/dataset/archive/refs/heads/master.zip -O master.zip
!unzip -q -o master.zip -d {DATA_DIR}

# Gzipped training sentences for each language inside the extracted archive.
BASE_DIR = DATA_DIR / "dataset-master" / "data" / "task1" / "raw"
TRAIN_DE_FILE = BASE_DIR / "train.de.gz"
TRAIN_EN_FILE = BASE_DIR / "train.en.gz"

# -------------------------
# Read gzip-compressed text
# -------------------------
def read_gzip_lines(path) -> list[str]:
    """Return the lines of a UTF-8 encoded, gzip-compressed text file.

    Lines are split only at real newlines (``\\n``, ``\\r\\n``, ``\\r``).
    ``str.splitlines()`` is deliberately avoided: it also breaks on characters
    such as U+2028, ``\\x0b``, ``\\x0c`` and ``\\x85``, which would turn one
    sentence into several entries and shift the line-by-line pairing between
    the two language files.

    Args:
        path: Path of the ``.gz`` file to read.

    Returns:
        One string per line, without the trailing newline.
    """
    with gzip.open(path, "rt", encoding="utf-8") as f:
        # Text mode normalises every line ending to "\n" before we see it.
        return [line.rstrip("\n") for line in f]

# Load the training sentences of both languages into memory.
train_de_lines = read_gzip_lines(TRAIN_DE_FILE)
train_en_lines = read_gzip_lines(TRAIN_EN_FILE)

# -------------------------
# SpaCy tokenizer
# -------------------------
# NOTE: spacy.load raises OSError [E050] when a model package cannot be found
# by the running kernel — download the models first and restart the runtime
# if they were installed in this session.
spacy_de = spacy.load("de_core_news_sm")
spacy_en = spacy.load("en_core_web_sm")

def tokenize(text, language):
    """Split *text* into a list of token strings using spaCy.

    The German pipeline is used when ``language`` is ``"de"``; any other
    value falls back to the English pipeline.
    """
    if language == "de":
        nlp = spacy_de
    else:
        nlp = spacy_en
    return [token.text for token in nlp(text)]

# -------------------------
# Vocabulary construction (without torchtext)
# -------------------------
def build_vocab(lines, language, min_freq=1):
    """Build a token <-> index vocabulary from raw sentences.

    Args:
        lines: Iterable of sentences (strings) to tokenize and count.
        language: Language code forwarded to ``tokenize``.
        min_freq: Minimum number of occurrences a token needs to be kept.

    Returns:
        A dict with three entries:
        ``"itos"`` - list mapping index -> token, special symbols first;
        ``"stoi"`` - dict mapping token -> index;
        ``"default_index"`` - ``UNK_IDX``, the index of ``"<unk>"``.
    """
    counter = Counter()
    for line in lines:
        counter.update(tokenize(line, language))

    # A corpus token that is spelled like a special symbol must not be added a
    # second time: the duplicate would overwrite its entry in ``stoi`` and
    # break the fixed UNK/PAD/BOS/EOS indices.
    reserved = set(special_symbols)
    itos = list(special_symbols)
    itos.extend(
        tok
        for tok, freq in counter.items()
        if freq >= min_freq and tok not in reserved
    )
    stoi = {tok: i for i, tok in enumerate(itos)}
    return {"itos": itos, "stoi": stoi, "default_index": UNK_IDX}

# One vocabulary per language, keyed by language code and built from the
# training sentences only.
vocab_transform = {
    "de": build_vocab(train_de_lines, "de"),
    "en": build_vocab(train_en_lines, "en")
}

# -------------------------
# Inspect the result
# -------------------------
# The first four entries of each sample are the special symbols.
print("DE vocab sample:", vocab_transform["de"]["itos"][:20])
print("EN vocab sample:", vocab_transform["en"]["itos"][:20])

OSError: [E050] Can't find model 'de_core_news_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [2]:
# Same language and special-token settings as in the data-preparation cell,
# repeated here — presumably so this cell can run on its own after a kernel
# restart; confirm and keep both copies in sync.
SRC_LANGUAGE = "de"
TGT_LANGUAGE = "en"
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = 0, 1, 2, 3
special_symbols = ["<unk>", "<pad>", "<bos>", "<eos>"]