In [None]:
pip install datasets

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import BertTokenizer
from datasets import load_dataset

# Load the IWSLT 2017 "iwslt2017-ko-en" configuration and print the loaded object.
dataset = load_dataset(path="iwslt2017", name="iwslt2017-ko-en")
print(dataset)

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import BertTokenizer
from datasets import load_dataset

# Load the IWSLT 2017 "iwslt2017-en-ko" configuration (Korean/English pair).
dataset = load_dataset(path="iwslt2017", name="iwslt2017-en-ko")

# Tokenizers: both languages use the same multilingual BERT checkpoint,
# but each side keeps its own tokenizer object.
tokenizer_en = BertTokenizer.from_pretrained("bert-base-multilingual-cased")
tokenizer_ko = BertTokenizer.from_pretrained("bert-base-multilingual-cased")

# Batch preprocessing function
def collate_fn(batch):
    """Tokenize a list of translation examples into padded id tensors.

    Args:
        batch: Sequence of dataset items; each item is indexed as
            ``item['translation']['ko']`` and ``item['translation']['en']``.

    Returns:
        Tuple ``(src_ids, tgt_ids)``: the ``input_ids`` tensors produced by
        ``tokenizer_ko`` for the Korean sentences and by ``tokenizer_en`` for
        the English sentences. Each side is padded (``padding=True``) and
        truncated (``truncation=True``) by its tokenizer.
    """
    ko_sentences = [example['translation']['ko'] for example in batch]
    src_ids = tokenizer_ko(
        ko_sentences, padding=True, truncation=True, return_tensors="pt"
    ).input_ids

    en_sentences = [example['translation']['en'] for example in batch]
    tgt_ids = tokenizer_en(
        en_sentences, padding=True, truncation=True, return_tensors="pt"
    ).input_ids

    return src_ids, tgt_ids

# DataLoaders: only the training split is shuffled; both batch with collate_fn.
train_loader = DataLoader(
    dataset=dataset['train'],
    batch_size=32,
    shuffle=True,
    collate_fn=collate_fn,
)
valid_loader = DataLoader(
    dataset=dataset['validation'],
    batch_size=32,
    collate_fn=collate_fn,
)

class Encoder(nn.Module):
    """Embed a batch of token ids and summarize it with a single-layer GRU.

    Args:
        input_dim: Number of rows in the embedding table (source vocabulary size).
        emb_dim: Embedding vector size.
        hidden_dim: GRU hidden state size.
    """

    def __init__(self, input_dim, emb_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.rnn = nn.GRU(input_size=emb_dim, hidden_size=hidden_dim, batch_first=True)

    def forward(self, src):
        """Return the final GRU hidden state for ``src``.

        Args:
            src: Token ids of shape (batch_size, seq_len).

        Returns:
            Tensor of shape (1, batch_size, hidden_dim); the per-step GRU
            outputs are discarded.
        """
        # (batch_size, seq_len) -> (batch_size, seq_len, emb_dim)
        token_vectors = self.embedding(src)
        # Only the last hidden state is passed on to the decoder.
        _, final_state = self.rnn(token_vectors)
        return final_state

class Decoder(nn.Module):
    """Single-step GRU decoder that maps one token per sample to vocabulary scores.

    Args:
        output_dim: Target vocabulary size (embedding rows and output scores).
        emb_dim: Embedding vector size.
        hidden_dim: GRU hidden state size.
    """

    def __init__(self, output_dim, emb_dim, hidden_dim):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=emb_dim)
        self.rnn = nn.GRU(input_size=emb_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc_out = nn.Linear(in_features=hidden_dim, out_features=output_dim)

    def forward(self, input, hidden):
        """Run one decoding step.

        Args:
            input: Token ids for the current step, shape (batch_size,).
            hidden: Previous GRU hidden state, shape (1, batch_size, hidden_dim).

        Returns:
            Tuple ``(prediction, hidden)``: unnormalized scores of shape
            (batch_size, output_dim) and the updated hidden state.
        """
        # Add a length-1 time axis so the batch_first GRU sees (batch_size, 1).
        step_tokens = input.unsqueeze(1)
        step_vectors = self.embedding(step_tokens)  # (batch_size, 1, emb_dim)
        step_output, next_hidden = self.rnn(step_vectors, hidden)
        # Drop the time axis again before projecting onto the vocabulary.
        scores = self.fc_out(step_output.squeeze(1))
        return scores, next_hidden

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes with teacher forcing.

    NOTE: a second ``Seq2Seq`` class is defined later in this file and
    replaces this one at module level.

    Args:
        encoder: Module called as ``encoder(src)`` that returns the initial
            decoder hidden state.
        decoder: Module called as ``decoder(input, hidden)`` that returns
            ``(prediction, hidden)`` for a single time step.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, tgt):
        """Score every target position after the first.

        Args:
            src: Source token ids, shape (batch_size, src_len).
            tgt: Target token ids, shape (batch_size, tgt_len). Column 0 is
                used as the first decoder input.

        Returns:
            Tensor of shape (batch_size, tgt_len - 1, output_dim); entry
            ``[:, t - 1]`` is the prediction for target position ``t``.

        Raises:
            RuntimeError: from ``torch.stack`` when ``tgt_len`` is 1, because
                no decoding step is run and there is nothing to stack.
        """
        hidden = self.encoder(src)  # (1, batch_size, hidden_dim)
        outputs = []

        # First token of EVERY sample (column 0), not the first sample's whole
        # sequence: the decoder expects a (batch_size,) tensor per step.
        input = tgt[:, 0]
        for t in range(1, tgt.size(1)):
            output, hidden = self.decoder(input, hidden)
            outputs.append(output)
            # Teacher forcing: feed the ground-truth token as the next input.
            input = tgt[:, t]

        return torch.stack(outputs, dim=1)  # (batch_size, tgt_len - 1, output_dim)


class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes with teacher forcing.

    Args:
        encoder: Module called as ``encoder(src)`` that returns the initial
            decoder hidden state.
        decoder: Module called as ``decoder(input, hidden)`` that returns
            ``(prediction, hidden)``; it must expose ``fc_out.out_features``,
            which is read to size the output buffer.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, src, tgt):
        """Score every target position after the first.

        Args:
            src: Source token ids, shape (batch_size, src_len).
            tgt: Target token ids, shape (batch_size, tgt_len). Column 0 is
                used as the first decoder input.

        Returns:
            Float tensor of shape (batch_size, tgt_len - 1, output_dim) on
            ``src.device``; entry ``[:, t - 1]`` holds the prediction for
            target position ``t``.
        """
        # The encoder's final hidden state seeds the decoder.
        state = self.encoder(src)  # (1, batch_size, hidden_dim)

        n_steps = tgt.size(1) - 1
        vocab_size = self.decoder.fc_out.out_features

        # Pre-allocated buffer, filled one decoding step at a time.
        scores = torch.zeros(src.size(0), n_steps, vocab_size, device=src.device)

        # Decoding starts from column 0 of tgt (usually the start token).
        step_input = tgt[:, 0]  # (batch_size,)
        for step in range(n_steps):
            step_scores, state = self.decoder(step_input, state)
            scores[:, step] = step_scores
            # Teacher forcing: the ground-truth token becomes the next input.
            step_input = tgt[:, step + 1]

        return scores



# Hyperparameters
INPUT_DIM = len(tokenizer_ko)   # source-side embedding rows = tokenizer length
OUTPUT_DIM = len(tokenizer_en)  # target-side embedding rows / output scores
EMB_DIM = 128
HIDDEN_DIM = 256
N_EPOCHS = 10

# Build the model and the optimizer
encoder = Encoder(INPUT_DIM, EMB_DIM, HIDDEN_DIM)
decoder = Decoder(OUTPUT_DIM, EMB_DIM, HIDDEN_DIM)
model = Seq2Seq(encoder, decoder)

optimizer = torch.optim.Adam(model.parameters())
# collate_fn pads every target batch to its longest sentence (padding=True).
# Skip those pad positions so the loss only covers real target tokens.
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer_en.pad_token_id)

# Training function
def train(model, iterator, optimizer, criterion):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: Module called as ``model(src, tgt)`` that returns scores of
            shape (batch_size, tgt_len - 1, output_dim).
        iterator: Sized iterable yielding ``(src, tgt)`` batches.
        optimizer: Optimizer over the model's parameters.
        criterion: Loss called as ``criterion(scores, targets)`` with
            flattened scores and flattened target ids.

    Returns:
        Sum of the batch losses divided by ``len(iterator)``.
    """
    model.train()
    running_loss = 0

    for src, tgt in iterator:
        optimizer.zero_grad()

        # The full target sequence goes in; the model predicts positions 1..end.
        scores = model(src, tgt)
        flat_scores = scores.view(-1, scores.shape[-1])

        # Drop the first target token so the labels line up with the predictions.
        flat_targets = tgt[:, 1:].reshape(-1)

        batch_loss = criterion(flat_scores, flat_targets)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(iterator)

# Train for N_EPOCHS passes over the training set, printing the mean loss after each.
for epoch in range(N_EPOCHS):
    train_loss = train(
        model=model,
        iterator=train_loader,
        optimizer=optimizer,
        criterion=criterion,
    )
    print(f'Epoch: {epoch + 1}, Train Loss: {train_loss:.3f}')