In [None]:
import random
import numpy as np
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
import nltk
from nltk.tokenize import word_tokenize
from collections import defaultdict
import torch.nn.functional as F
import matplotlib.pyplot as plt

In [None]:
class TranslationData(Dataset):
    """Parallel sentence pairs encoded as fixed-length tensors of vocabulary indices.

    A separate vocabulary is built for each side from the tokenized sentences,
    with ``<pad>``, ``<sos>`` and ``<eos>`` reserved at indices 0, 1 and 2.

    Args:
        data: Sequence of pairs; element 0 of each pair is stored as the
            English sentence and element 1 as the Korean sentence.
        max_length: Length every encoded sentence is padded or truncated to.
    """

    def __init__(self, data, max_length=100):
        self.eng_sentence = [pair[0] for pair in data]
        self.kor_sentence = [pair[1] for pair in data]
        self.eng_vocab = self.build_vocab(self.eng_sentence)
        self.kor_vocab = self.build_vocab(self.kor_sentence)

        # Special-token indices are read from the Korean (target) vocabulary.
        self.sos_idx = self.kor_vocab['<sos>']
        self.eos_idx = self.kor_vocab['<eos>']
        self.padding_idx = self.kor_vocab['<pad>']

        self.max_length = max_length

        print("영어 문장:", self.eng_sentence)
        print("한국어 문장:", self.kor_sentence)
        print("영어 어휘:", self.eng_vocab)
        print("한국어 어휘:", self.kor_vocab)

    def build_vocab(self, sentences):
        """Return a token -> index dict covering every token in ``sentences``.

        Indices 0-2 are reserved for ``<pad>``, ``<sos>`` and ``<eos>``; other
        tokens are numbered in order of first appearance.
        """
        vocab = {'<pad>': 0, '<sos>': 1, '<eos>': 2}
        for sentence in sentences:
            for word in word_tokenize(sentence):
                if word not in vocab:
                    vocab[word] = len(vocab)
        return vocab

    @staticmethod
    def _fit_length(indices, max_length, pad_idx, eos_idx=None):
        """Return ``indices`` padded or truncated to exactly ``max_length`` items.

        Shorter sequences are right-padded with ``pad_idx``.  When a sequence
        has to be cut and ``eos_idx`` is given, the last kept position is
        overwritten with ``eos_idx`` so the end marker is never lost.
        """
        if len(indices) < max_length:
            return indices + [pad_idx] * (max_length - len(indices))

        fitted = indices[:max_length]
        if eos_idx is not None and fitted and len(indices) > max_length:
            fitted[-1] = eos_idx
        return fitted

    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.kor_sentence)

    def __getitem__(self, idx):
        """Return ``(english, korean)`` index tensors for pair ``idx``.

        The Korean side is wrapped in ``<sos>`` ... ``<eos>``; both sides have
        length ``max_length``.
        """
        eng_tokens = word_tokenize(self.eng_sentence[idx])
        kor_tokens = word_tokenize(self.kor_sentence[idx])

        eng_indices = [self.eng_vocab[tok] for tok in eng_tokens if tok in self.eng_vocab]
        kor_indices = (
            [self.sos_idx]
            + [self.kor_vocab[tok] for tok in kor_tokens if tok in self.kor_vocab]
            + [self.eos_idx]
        )

        # Pad the source with the source vocabulary's own pad index rather than
        # borrowing the target one.
        eng_indices = self._fit_length(eng_indices, self.max_length, self.eng_vocab['<pad>'])
        kor_indices = self._fit_length(
            kor_indices, self.max_length, self.padding_idx, eos_idx=self.eos_idx
        )

        # An explicit dtype keeps the tensors integral even for an empty list.
        return (
            torch.tensor(eng_indices, dtype=torch.long),
            torch.tensor(kor_indices, dtype=torch.long),
        )

# Build the dataset at module level using the default max_length (100).
# NOTE(review): `data` is not defined in the visible code; presumably a list of
# (english, korean) sentence pairs created in an earlier cell. Confirm.
# The `__main__` block further down rebinds `dataset` with its own max_length.
dataset = TranslationData(data)

class Encoder(nn.Module):
    """Embed source token indices and run them through a single GRU.

    Args:
        input_size: Number of rows in the embedding table.
        hidden_size: Width of both the embeddings and the GRU state.
    """

    def __init__(self, input_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input_seq):
        """Return the GRU's ``(output, hidden)`` pair for ``input_seq``."""
        # The GRU is built without batch_first, so the first two dimensions of
        # the embedded input are swapped before it is fed in.
        time_major = self.embedding(input_seq).transpose(0, 1)
        return self.gru(time_major)

class Decoder(nn.Module):
    """Single-step GRU decoder: one token per sequence in, one row of scores out.

    Args:
        input_size: Number of rows in the embedding table.
        hidden_size: Width of both the embeddings and the GRU state.
        output_size: Number of scores produced by the final linear layer.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)

    def forward(self, input, hidden):
        """Advance the GRU by one step and return ``(scores, new_hidden)``."""
        # A leading dimension of size 1 is added so the GRU sees a
        # one-step sequence; it is removed again before the projection.
        step_embedding = self.embedding(input.unsqueeze(0))
        gru_out, next_hidden = self.gru(step_embedding, hidden)
        scores = self.out(gru_out.squeeze(0))
        return scores, next_hidden

class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes one target position at a time.

    Args:
        encoder: Called as ``encoder(input_seq)``; must return
            ``(outputs, hidden)``.
        decoder: Called as ``decoder(tokens, hidden)``; must return
            ``(logits, hidden)`` and expose an ``output_size`` attribute.
    """

    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, input_seq, target_seq, teacher_forcing_ratio=0.5):
        """Decode ``target_seq.size(1)`` steps and return the logits of each step.

        Args:
            input_seq: Source token indices, batch on dimension 0.
            target_seq: Decoder inputs, batch on dimension 0. Column 0 is fed
                to the decoder first (callers in this file put ``<sos>``
                there); later columns are only used when teacher forcing is
                drawn for that step.
            teacher_forcing_ratio: Probability, drawn independently per step,
                of feeding the next column of ``target_seq`` instead of the
                decoder's own argmax prediction.

        Returns:
            Tensor of shape ``(batch, target_length, decoder.output_size)``
            where ``outputs[:, t]`` holds the logits for the token that
            follows ``target_seq[:, t]``. This matches callers that pass
            ``kor[:, :-1]`` and score the result against ``kor[:, 1:]``.
        """
        batch_size = input_seq.size(0)
        target_length = target_seq.size(1)
        target_vocab_size = self.decoder.output_size

        # Allocate on the input's device instead of relying on a module-level
        # `device` global that only exists when the file runs as a script.
        outputs = torch.zeros(
            batch_size, target_length, target_vocab_size, device=input_seq.device
        )
        if target_length == 0:
            return outputs

        _, hidden = self.encoder(input_seq)

        # The decoder GRU expects a leading layer dimension; add it only when
        # the encoder returned a bare (batch, hidden) state.
        if hidden.dim() == 2:
            hidden = hidden.unsqueeze(0)

        decoder_input = target_seq[:, 0]

        for t in range(target_length):
            output, hidden = self.decoder(decoder_input, hidden)
            outputs[:, t, :] = output

            if t + 1 == target_length:
                break

            teacher_force = random.random() < teacher_forcing_ratio
            decoder_input = target_seq[:, t + 1] if teacher_force else output.argmax(1)

        return outputs

def translate(model, word, dataset, device):
    """Translate a single English word and return the first predicted Korean token.

    Args:
        model: Callable as ``model(src, tgt, teacher_forcing_ratio=...)``
            returning logits of shape ``(1, steps, vocab)``.
        word: English token to look up in ``dataset.eng_vocab``.
        dataset: Object providing ``eng_vocab``, ``kor_vocab``, ``padding_idx``,
            ``sos_idx``, ``eos_idx`` and ``max_length``.
        device: Device the input tensors are created on.

    Returns:
        The first predicted token that is not ``<pad>``/``<sos>``/``<eos>``,
        a fixed message when ``word`` is not in the English vocabulary, or
        ``None`` when only special tokens were predicted.
    """
    if word not in dataset.eng_vocab:
        return "데이터에 해당 단어가 없습니다."

    eng_indices = [dataset.eng_vocab[word]]
    eng_indices += [dataset.padding_idx] * (dataset.max_length - len(eng_indices))
    test_input = torch.tensor(eng_indices).unsqueeze(0).to(device)

    # Only the first column is meaningful; the remaining zeros are placeholders
    # that fix the number of decoding steps.
    test_target = torch.zeros((1, dataset.max_length), dtype=torch.long, device=device)
    test_target[0, 0] = dataset.sos_idx

    with torch.no_grad():
        # Teacher forcing must be off: the dummy target holds no real tokens,
        # so forcing would feed <pad> to the decoder at random steps.
        output = model(test_input, test_target, teacher_forcing_ratio=0.0)

    predicted = torch.argmax(output, dim=2)[0].tolist()

    # Invert the vocabulary once instead of rebuilding key/value lists per token.
    index_to_word = {index: token for token, index in dataset.kor_vocab.items()}
    special = {dataset.padding_idx, dataset.sos_idx, dataset.eos_idx}

    for index in predicted:
        if index not in special:
            return index_to_word[index]

    return None


def prepare_data(dataset, batch_size, train_ratio=0.8):
    """Randomly split ``dataset`` and wrap both parts in DataLoaders.

    Args:
        dataset: Dataset to split.
        batch_size: Batch size used by both loaders.
        train_ratio: Fraction of samples (rounded down) placed in the
            training split; the remainder becomes the validation split.

    Returns:
        ``(train_dataloader, val_dataloader)``; only the training loader
        shuffles.
    """
    total = len(dataset)
    n_train = int(train_ratio * total)
    n_val = total - n_train

    train_part, val_part = random_split(dataset, [n_train, n_val])

    return (
        DataLoader(train_part, batch_size=batch_size, shuffle=True),
        DataLoader(val_part, batch_size=batch_size),
    )

def train_model(model, train_dataloader, val_dataloader, criterion, optimizer, num_epochs, device):
    """Run ``num_epochs`` rounds of training followed by validation.

    Each epoch switches the model to train mode, calls ``train_epoch``, then
    switches to eval mode and calls ``evaluate_epoch``, printing one summary
    line per epoch.

    Returns:
        Four lists with one entry per epoch, in this order: training loss,
        validation loss, training accuracy, validation accuracy.
    """
    model.train()
    history = {'train_loss': [], 'val_loss': [], 'train_acc': [], 'val_acc': []}

    for epoch_no in range(1, num_epochs + 1):
        # Training pass.
        model.train()
        train_loss, train_acc = train_epoch(model, train_dataloader, criterion, optimizer, device)
        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc)

        # Validation pass.
        model.eval()
        val_loss, val_acc = evaluate_epoch(model, val_dataloader, criterion, device)
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)

        print(f'Epoch {epoch_no}/{num_epochs}, 훈련 손실{train_loss:.4f}, 훈련 정확도: {train_acc:.2f}%, 검증 손실: {val_loss:.4f}, 검증 정확도: {val_acc:.2f}%')

    return history['train_loss'], history['val_loss'], history['train_acc'], history['val_acc']

def evaluate_epoch(model, dataloader, criterion, device):
    """Compute average loss and token accuracy over ``dataloader`` without gradients.

    The model receives every target column except the last as decoder input
    and is scored against every target column except the first.

    Args:
        model: Callable as ``model(eng, decoder_input, teacher_forcing_ratio=...)``
            returning logits of shape ``(batch, steps, vocab)``. The caller is
            expected to have put it in eval mode.
        dataloader: Iterable of ``(eng, kor)`` index-tensor batches that also
            supports ``len()``.
        criterion: Loss taking ``(logits, targets)`` with flattened positions.
        device: Device the batches are moved to.

    Returns:
        ``(avg_loss, accuracy)`` where ``avg_loss`` is the mean per-batch loss
        and ``accuracy`` is the percentage of target positions predicted
        correctly. Both are ``0.0`` for an empty dataloader.
    """
    epoch_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for eng, kor in dataloader:
            eng, kor = eng.to(device), kor.to(device)

            # Teacher forcing is disabled so validation never sees the ground
            # truth it is scored on and the metrics are deterministic.
            logits = model(eng, kor[:, :-1], teacher_forcing_ratio=0.0)
            logits = logits.reshape(-1, logits.size(-1))
            kor_target = kor[:, 1:].reshape(-1)

            epoch_loss += criterion(logits, kor_target).item()

            # argmax over raw logits picks the same class as argmax over their
            # softmax, so no normalisation is needed here.
            predicted_indices = logits.argmax(dim=-1)
            correct += (predicted_indices == kor_target).sum().item()
            total += kor_target.numel()

    num_batches = len(dataloader)
    avg_loss = epoch_loss / num_batches if num_batches > 0 else 0.0
    accuracy = (correct / total * 100) if total > 0 else 0.0

    return avg_loss, accuracy

def plot_loss_and_accuracy(train_loss, val_loss, train_accuracy, val_accuracy):
    """Show loss and accuracy curves in two side-by-side panels.

    Args:
        train_loss: Per-epoch training loss values.
        val_loss: Per-epoch validation loss values.
        train_accuracy: Per-epoch training accuracy values.
        val_accuracy: Per-epoch validation accuracy values.
    """
    # One entry per panel: the two curves, their legend labels, title, y-label.
    panels = [
        (train_loss, val_loss,
         'Training Loss', 'Validation Loss',
         'Training and Validation Loss', 'Loss'),
        (train_accuracy, val_accuracy,
         'Training Accuracy', 'Validation Accuracy',
         'Training and Validation Accuracy', 'Accuracy (%)'),
    ]

    plt.figure(figsize=(15, 5))

    for position, panel in enumerate(panels, start=1):
        train_curve, val_curve, train_label, val_label, title, y_label = panel
        plt.subplot(1, 2, position)
        plt.plot(train_curve, label=train_label)
        plt.plot(val_curve, label=val_label)
        plt.title(title)
        plt.xlabel('Epochs')
        plt.ylabel(y_label)
        plt.legend()

    plt.tight_layout()
    plt.show()

# Script entry point: build the data loaders and model, train, then plot the curves.
# Every name assigned here is a module-level global.
if __name__ == '__main__':
    # Hyperparameters for this run.
    max_length = 10
    hidden_size = 100
    batch_size = 128
    num_epochs = 10
    learning_rate = 0.001

    # Use the GPU when CUDA is available, otherwise fall back to the CPU.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    # NOTE(review): `data` is not defined in the visible code; presumably the
    # list of (english, korean) pairs from an earlier cell. Confirm.
    dataset = TranslationData(data, max_length)
    train_dataloader, val_dataloader = prepare_data(dataset, batch_size)
    
    # Vocabulary sizes determine the embedding and output layer dimensions.
    eng_vocab_size = len(dataset.eng_vocab)
    kor_vocab_size = len(dataset.kor_vocab)
    
    # The decoder both reads and emits Korean tokens, so its input and output
    # sizes are the Korean vocabulary size.
    encoder = Encoder(input_size=eng_vocab_size, hidden_size=hidden_size).to(device)
    decoder = Decoder(input_size=kor_vocab_size, hidden_size=hidden_size, output_size=kor_vocab_size).to(device)
    model = Seq2Seq(encoder, decoder).to(device)
    
    # NOTE(review): no ignore_index is passed, so <pad> target positions
    # contribute to the loss like any other token.
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    
    # Train and collect the per-epoch loss/accuracy histories.
    train_loss_values, val_loss_values, train_accuracy_values, val_accuracy_values = train_model(
        model, train_dataloader, val_dataloader, criterion, optimizer, num_epochs, device
    )
    
    plot_loss_and_accuracy(train_loss_values, val_loss_values, train_accuracy_values, val_accuracy_values)

    


    

