In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from collections import Counter
import random
import re
import datasets
import tqdm
import math
from functools import partial
import math
import argparse
import os
import collections
import json
import sentencepiece
import shutil
import copy
import multiprocessing
import transformers
from dataclasses import dataclass, field
from evaluate import load

# Float32 matmul precision:
# set "high" if you have a GPU with compute capability >= 8.0 else "highest"
torch.set_float32_matmul_precision("high")
if torch.cuda.is_available():
    # Only touch the cuDNN benchmark flag when a CUDA device is actually present.
    torch.backends.cudnn.benchmark = True

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Environment sanity check: import each key library and print its version right away,
# so a broken install is reported at the exact library that fails.
import torch
print(torch.__version__, torch.__file__)

import transformers
print(transformers.__version__)

# NOTE(review): this binds the name `evaluate` to the module. A function that is also
# called `evaluate` is defined further down in this notebook, so re-running this cell
# afterwards rebinds the name back to the module and hides that function.
import evaluate
print(evaluate.__version__)

from evaluate import load
print("evaluate.load import OK")


2.10.0+cpu c:\Users\PC\Downloads\a2\venv\Lib\site-packages\torch\__init__.py
5.1.0
0.4.6
evaluate.load import OK


# Training config

In [3]:
## You can modify some options, such as batch_size, depending on your environment.
# Shape reminder for text batches: token ids are (batch, seq_len) before the embedding
# layer and (batch, seq_len, embed_dim) after it.
training_config = dict(
    batch_size=16,
    epochs=3,
    lr=1e-4,
    warmup_steps=50,
    device="cuda" if torch.cuda.is_available() else "cpu",
    gradient_accumulate_steps=1,
)

# Dataset load

In [None]:
# Every record has the form {"english": "...", "korean": "..."}.

# Download the Korean-English parallel corpus (train split).
dataset = datasets.load_dataset(
    "lemon-mint/korean_english_parallel_wiki_augmented_v1",
    split="train",
)

# Length filter: keep a pair only if BOTH sides are longer than 128 and shorter
# than 8192 characters.
dataset = dataset.filter(
    lambda pair: all(128 < len(pair[lang]) < 8192 for lang in ("english", "korean"))
)

valid_set = dataset.select(range(10000))          # first 10,000 pairs: validation
train_set = dataset.select(range(10000, 110000))  # next 100,000 pairs: training

# Tokenizer: turns text into a list of integer token ids.
tokenizer = transformers.AutoTokenizer.from_pretrained("Helsinki-NLP/opus-mt-ko-en")

# Register the padding / end-of-sentence / start-of-sentence tokens that the
# pretrained tokenizer does not define yet (insertion order: pad, eos, bos).
additional_special_tokens = {
    attr: token
    for attr, token in (("pad_token", "<pad>"), ("eos_token", "</s>"), ("bos_token", "<s>"))
    if getattr(tokenizer, attr) is None
}
tokenizer.add_special_tokens(additional_special_tokens)

def collate_fn(batch):
    """Tokenize a list of ``{"english", "korean"}`` records into one padded batch.

    Both sides are tokenized with the module-level ``tokenizer`` using:

    * ``padding=True``          - pad to the longest sentence in the batch,
    * ``truncation=True`` / ``max_length=512`` - cut sentences longer than 512 tokens,
    * ``pad_to_multiple_of=64`` - round the padded length up to a multiple of 64,
    * ``return_tensors="pt"``   - return PyTorch tensors.

    The tokenizer's ``attention_mask`` is 1 on real tokens and 0 on padding.

    The Korean (target) ids are prefixed with the BOS token, because
    ``Seq2Seq.generate`` starts decoding from ``sos_idx`` and the loss predicts
    token ``t + 1`` from input token ``t``: without the prefix the first target
    token would never be predicted and the BOS embedding would never be trained.
    The prefix is skipped when the tokenizer has no BOS id or already emitted one.

    Args:
        batch: list of dataset records, each with "english" and "korean" strings.

    Returns:
        dict with
        ``encoder_input_ids``      - English token ids,
        ``encoder_attention_mask`` - English attention mask,
        ``decoder_input_ids``      - Korean token ids (BOS-prefixed),
        ``labels``                 - a copy of ``decoder_input_ids``.

    NOTE(review): ``labels`` deliberately keeps real token ids on padded
    positions instead of -100, because ``evaluate()`` decodes ``batch["labels"]``
    with the tokenizer. Padded targets therefore have to be excluded inside the
    loss computation.
    """
    english_texts = [item["english"] for item in batch]
    korean_texts = [item["korean"] for item in batch]

    tokenizer_kwargs = dict(
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=512,
        pad_to_multiple_of=64,
    )
    english = tokenizer(english_texts, **tokenizer_kwargs)
    korean = tokenizer(korean_texts, **tokenizer_kwargs)

    decoder_input_ids = korean["input_ids"]
    bos_id = tokenizer.bos_token_id
    already_prefixed = bos_id is not None and bool((decoder_input_ids[:, 0] == bos_id).all())
    if bos_id is not None and not already_prefixed:
        bos_column = torch.full(
            (decoder_input_ids.size(0), 1), bos_id, dtype=decoder_input_ids.dtype
        )
        decoder_input_ids = torch.cat([bos_column, decoder_input_ids], dim=1)

    return {
        "encoder_input_ids": english["input_ids"],
        "encoder_attention_mask": english["attention_mask"],
        "decoder_input_ids": decoder_input_ids,
        # Separate tensor so in-place edits of one entry never leak into the other.
        "labels": decoder_input_ids.clone(),
    }



To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Generating train split: 100%|██████████| 503245/503245 [00:00<00:00, 507041.13 examples/s]
Filter: 100%|██████████| 503245/503245 [00:02<00:00, 169336.74 examples/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


### Model implementation

In [24]:
'''
Seq2Seq 구조: Encoder(양방향 LSTM)가 입력 문장을 읽고, Decoder(단방향 LSTM)가 한 토큰씩 출력을 생성합니다. 
Attention은 Decoder가 매 스텝마다 Encoder의 어떤 부분에 집중할지 결정합니다.
Input Feeding: 이전 타임스텝의 attention context를 현재 입력 임베딩에 concat해서 넣는 기법입니다. 
Decoder가 이전에 어디를 봤는지 알 수 있게 해줍니다.
'''
@dataclass
class ModelConfig:
    """Hyper-parameters shared by the embedding, encoder, decoder and attention modules."""

    # --- vocabulary / layer widths ---
    vocab_size: int = 50000
    encoder_hidden_dim: int = 512  # hidden dimension of the encoder LSTM
    decoder_hidden_dim: int = 512  # hidden dimension of the decoder LSTM
    hidden_dim: int = 512          # hidden dimension of other modules such as attention
    embed_dim: int = 512           # embedding dimension

    # --- special token ids ---
    pad_idx: int = 0
    sos_idx: int = 1
    eos_idx: int = 2

    # --- recurrent stack ---
    n_layers: int = 1
    dropout: float = 0.1

    # --- attention ---
    attention_type: str = "global"  # the decoder accepts "global" or "local"
    window_size: int = 10           # local attention: positions taken on each side of the centre
    sigma_ratio: float = 2.0        # local attention: sigma = window_size / sigma_ratio

    # Concatenate the previous attention context to the decoder input embedding.
    do_input_feeding: bool = True

'''
Decoder 의 현재 hidden state 가 쿼리
인코더의 모든 출력이 key, value 사용해 가중합
'''

class GlobalAttention(nn.Module):
    """Scaled dot-product attention over every encoder position.

    The decoder hidden state is the query; the encoder outputs are projected
    into keys and values. Padded encoder positions are masked out before the
    softmax, and the weighted sum of values is projected to
    ``decoder_hidden_dim``.
    """

    def __init__(self, config: ModelConfig):
        super().__init__()
        self.config = config

        # nn.Linear(in_features, out_features, bias); the encoder is bidirectional,
        # hence the factor 2 on the key/value input width.
        self.query_proj = nn.Linear(config.decoder_hidden_dim, config.hidden_dim, bias=False)
        self.key_proj = nn.Linear(config.encoder_hidden_dim * 2, config.hidden_dim, bias=False)
        self.value_proj = nn.Linear(config.encoder_hidden_dim * 2, config.hidden_dim, bias=False)
        self.output_proj = nn.Linear(config.hidden_dim, config.decoder_hidden_dim, bias=False)

        self.dropout = nn.Dropout(config.dropout)
        # Plain Python float so the division below never changes the tensor dtype.
        self.scale = math.sqrt(config.hidden_dim)

    def forward(self, decoder_hidden_query, encoder_outputs, encoder_attention_mask):
        """Attend over all encoder outputs.

        Args:
            decoder_hidden_query: decoder states; last dim is ``decoder_hidden_dim``
                (used as (batch, 1, decoder_hidden_dim) by the decoding loop).
            encoder_outputs: (batch, src_len, 2 * encoder_hidden_dim).
            encoder_attention_mask: (batch, src_len), 0 on padded positions.

        Returns:
            Attention context projected to ``decoder_hidden_dim``, with the same
            leading dimensions as the query.
        """
        query = self.query_proj(decoder_hidden_query)
        key = self.key_proj(encoder_outputs)
        value = self.value_proj(encoder_outputs)

        attn_scores = torch.matmul(query, key.transpose(-2, -1)) / self.scale

        # Mask padding with the most negative value representable in the score dtype.
        # A fixed constant such as -1e9 cannot be represented in float16.
        is_padding = encoder_attention_mask.unsqueeze(1) == 0
        attn_scores = attn_scores.masked_fill(is_padding, torch.finfo(attn_scores.dtype).min)

        attn_weights = F.softmax(attn_scores, dim=-1)
        attn_weights = self.dropout(attn_weights)
        context = torch.matmul(attn_weights, value)

        return self.output_proj(context)

class LocalAttention(GlobalAttention):
    """Local attention with a predicted alignment position.

    For each decoding step a centre position ``p_t`` inside the source sentence
    is predicted from the decoder state. Only the ``2 * window_size + 1``
    encoder positions around ``floor(p_t)`` are attended to, and their softmax
    weights are scaled by a Gaussian centred on ``p_t`` with
    ``sigma = window_size / sigma_ratio``.

    This module handles one decoding step at a time: the query must have shape
    (batch, 1, decoder_hidden_dim).
    """

    def __init__(self, config: ModelConfig):
        super().__init__(config)
        self.window_size = config.window_size  # positions taken on each side of the centre
        self.location_proj_up = nn.Linear(config.decoder_hidden_dim, config.hidden_dim, bias=False)
        self.location_proj_down = nn.Linear(config.hidden_dim, 1, bias=False)
        self.sigma = self.window_size / config.sigma_ratio

    def forward(self, decoder_hidden_query, encoder_outputs, encoder_attention_mask):
        """Attend over the local window and return the projected context.

        Args:
            decoder_hidden_query: (batch, 1, decoder_hidden_dim).
            encoder_outputs: (batch, src_len, 2 * encoder_hidden_dim).
            encoder_attention_mask: (batch, mask_len), 0 on padded positions.

        Returns:
            (batch, 1, decoder_hidden_dim) attention context.
        """
        key, value, attn_mask, gaussian_penalty = self._gather_local_context(
            decoder_hidden_query, encoder_outputs, encoder_attention_mask
        )
        query = self.query_proj(decoder_hidden_query)  # (batch, 1, hidden_dim)
        key = self.key_proj(key)                        # (batch, window, hidden_dim)
        value = self.value_proj(value)                  # (batch, window, hidden_dim)

        attn_scores = torch.matmul(query, key.transpose(-2, -1)) / self.scale  # (batch, 1, window)

        # The mask is (batch, window): add the query axis explicitly so it lines up
        # with the scores instead of broadcasting against the batch axis.
        is_masked = attn_mask.unsqueeze(1) == 0
        attn_scores = attn_scores.masked_fill(is_masked, torch.finfo(attn_scores.dtype).min)

        # Softmax over the window, so the weights sum to 1.
        attn_weights = F.softmax(attn_scores, dim=-1)

        # Gaussian weighting around p_t. It is computed in float32, so cast it to the
        # weight dtype to keep the matmul below dtype-consistent (e.g. under bfloat16).
        attn_weights = attn_weights * gaussian_penalty.unsqueeze(1).to(attn_weights.dtype)

        # Renormalise so the weights sum to 1 again; the epsilon avoids division by zero.
        attn_weights = attn_weights / (attn_weights.sum(dim=-1, keepdim=True) + 1e-10)

        context = torch.matmul(attn_weights, value)

        return self.output_proj(context)

    def _gather_local_context(self, decoder_hidden_query, encoder_outputs, encoder_attention_mask):
        """Select the encoder states inside the window around the predicted position.

        Returns:
            local_key:        (batch, 2 * window_size + 1, 2 * encoder_hidden_dim)
            local_value:      same tensor as ``local_key`` (projected separately by the caller)
            local_attn_mask:  (batch, 2 * window_size + 1); 0 for padded slots and for
                              slots that fall outside the sentence
            gaussian_penalty: (batch, 2 * window_size + 1), float32, differentiable
                              with respect to the predicted position
        """
        device = encoder_outputs.device
        batch_size, src_seq_len, hidden_dim = encoder_outputs.shape

        # Number of real tokens per sample, shaped (batch, 1, 1) so it multiplies the
        # (batch, 1, 1) sigmoid output element-wise instead of broadcasting across the batch.
        src_len = encoder_attention_mask.sum(dim=-1).view(batch_size, 1, 1)

        # p_t = src_len * sigmoid(v^T tanh(W h_t)); float32 keeps the position precise
        # even when the model itself runs in a low-precision dtype.
        location_hidden = torch.tanh(self.location_proj_up(decoder_hidden_query))
        pt = torch.sigmoid(self.location_proj_down(location_hidden).float()) * src_len
        pt = pt.reshape(batch_size, 1)

        window_offsets = torch.arange(-self.window_size, self.window_size + 1, device=device)

        # Integer source positions covered by the window: floor(p_t) - D ... floor(p_t) + D.
        positions = pt.floor().long() + window_offsets.unsqueeze(0)  # (batch, window)

        # Slots outside the sentence must be masked, not treated as extra copies of the
        # first/last token; the index is clamped only to keep gather() in bounds.
        in_range = (positions >= 0) & (positions < src_seq_len)
        gather_index = positions.clamp(0, src_seq_len - 1)

        # torch.gather(input, dim, index) collects the entries of `input` at `index` along `dim`.
        local_key = torch.gather(
            encoder_outputs, dim=1, index=gather_index.unsqueeze(-1).expand(-1, -1, hidden_dim)
        )
        local_value = local_key

        local_attn_mask = torch.gather(encoder_attention_mask, dim=1, index=gather_index)
        local_attn_mask = local_attn_mask.masked_fill(~in_range, 0)

        # exp(-(s - p_t)^2 / (2 sigma^2)) with the real-valued p_t, so gradients reach
        # the location predictor; invalid slots get weight 0.
        distance = positions.to(pt.dtype) - pt
        gaussian_penalty = torch.exp(-(distance ** 2) / (2 * self.sigma ** 2))
        gaussian_penalty = gaussian_penalty * (local_attn_mask != 0)

        return local_key, local_value, local_attn_mask, gaussian_penalty

class Encoder(nn.Module):
    """Bidirectional LSTM encoder.

    Shapes:
        input_embeds:   (batch, src_seq_len, embed_dim)
        attention_mask: (batch, src_seq_len), 1 on real tokens and 0 on padding
        encoder_output: (batch, src_seq_len, 2 * encoder_hidden_dim)
        h_enc, c_enc:   (n_layers, batch, decoder_hidden_dim)
    """

    def __init__(self, config: ModelConfig):
        super().__init__()
        self.config = config

        self.encoder = nn.LSTM(
            input_size=config.embed_dim,
            hidden_size=config.encoder_hidden_dim,
            num_layers=config.n_layers,
            dropout=config.dropout if config.n_layers > 1 else 0,  # inter-layer dropout needs > 1 layer
            bidirectional=True,
            batch_first=True,
        )

        # Map the concatenated forward/backward final states to the decoder state size.
        self.h_dec_proj = nn.Linear(config.encoder_hidden_dim * 2, config.decoder_hidden_dim)
        self.c_dec_proj = nn.Linear(config.encoder_hidden_dim * 2, config.decoder_hidden_dim)

    def forward(self, input_embeds, attention_mask):
        """Encode a padded batch.

        Padding is skipped by packing the sequences with their true lengths
        (``attention_mask.sum(dim=1)``).

        Returns:
            encoder_output: per-token states, zero on padded positions and padded
                back to the full ``src_seq_len`` so it stays aligned with
                ``attention_mask``.
            (h_enc, c_enc): initial decoder hidden / cell states.
        """
        # pack_padded_sequence expects the lengths on the CPU.
        lengths = attention_mask.sum(dim=1).cpu()

        packed_input = nn.utils.rnn.pack_padded_sequence(
            input_embeds, lengths, batch_first=True, enforce_sorted=False
        )
        packed_output, (h_enc, c_enc) = self.encoder(packed_input)

        # Without total_length the output is only as long as the longest real sequence,
        # which is shorter than the attention mask whenever the batch was padded further
        # (e.g. to a multiple of 64) and then breaks the masking in the attention module.
        encoder_output, _ = nn.utils.rnn.pad_packed_sequence(
            packed_output, batch_first=True, total_length=input_embeds.size(1)
        )

        # LSTM final states are (n_layers * 2, batch, encoder_hidden_dim): split the
        # direction axis using the ENCODER width, then concatenate both directions.
        n_layers = self.config.n_layers
        enc_dim = self.config.encoder_hidden_dim
        h_enc = h_enc.view(n_layers, 2, -1, enc_dim)
        c_enc = c_enc.view(n_layers, 2, -1, enc_dim)

        h_enc = torch.cat([h_enc[:, 0], h_enc[:, 1]], dim=-1)
        c_enc = torch.cat([c_enc[:, 0], c_enc[:, 1]], dim=-1)

        # Project to the (unidirectional) decoder state size.
        h_enc = self.h_dec_proj(h_enc)
        c_enc = self.c_dec_proj(c_enc)

        return encoder_output, (h_enc, c_enc)

class Decoder(nn.Module):
    def __init__(self, config: ModelConfig):
        super().__init__()
        self.config = config

        self.decoder = nn.LSTM(
            input_size=config.embed_dim + config.hidden_dim if config.do_input_feeding else config.embed_dim,
            hidden_size=config.decoder_hidden_dim,
            num_layers=config.n_layers,
            dropout=config.dropout if config.n_layers > 1 else 0,
            batch_first=True
        )
        match config.attention_type:
            case "local":
                self.attention = LocalAttention(config)
            case "global":
                self.attention = GlobalAttention(config)
            case _:
                raise ValueError(f"Unknown attention type: {config.attention_type}")
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, input_embeds, encoder_outputs, h_enc, c_enc, attention_mask):
        decoder_output, (h_dec, c_dec) = self.decoder(input_embeds, (h_enc, c_enc))
        attention_context = self.attention(decoder_output, encoder_outputs, attention_mask)
        decoder_output = decoder_output + attention_context

        return decoder_output, attention_context, (h_dec, c_dec)

class Seq2Seq(nn.Module):
    """Encoder-decoder translation model with attention and optional input feeding.

    One embedding table is shared by the source and target side. The encoder
    output and final states initialise the decoder, which is unrolled one
    target position at a time; ``lm_head`` maps decoder outputs to vocabulary
    logits.
    """

    def __init__(self, config: ModelConfig):
        super().__init__()
        self.config = config

        self.embedding = nn.Embedding(config.vocab_size, config.embed_dim, padding_idx=config.pad_idx)

        self.encoder = Encoder(config)
        self.decoder = Decoder(config)

        # The decoder output (LSTM output + projected attention context) has
        # decoder_hidden_dim features, so that is the input width of the head.
        self.lm_head = nn.Linear(config.decoder_hidden_dim, config.vocab_size)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, encoder_input_ids, encoder_attention_mask, decoder_input_ids, labels=None, cache=None):
        """Teacher-forced forward pass (training) or incremental step (generation).

        Args:
            encoder_input_ids: (batch, src_len) source token ids; ignored when
                ``cache`` is given.
            encoder_attention_mask: (batch, src_len), 0 on padded source positions.
            decoder_input_ids: (batch, tgt_len) target-side input token ids.
            labels: optional (batch, tgt_len) target ids. Position ``t + 1`` of
                ``labels`` is predicted from position ``t`` of the logits. Entries
                equal to -100 or to ``config.pad_idx`` are excluded from the loss.
            cache: optional tuple ``(encoder_outputs, h_dec, c_dec, prev_attn_context)``
                returned by a previous call; skips re-encoding the source.

        Returns:
            The scalar cross-entropy loss when ``labels`` is given, otherwise
            ``(lm_logits, cache)``.
        """
        if cache is None:
            encoder_input_embeds = self.embedding(encoder_input_ids)
            encoder_outputs, (h_enc, c_enc) = self.encoder(encoder_input_embeds, encoder_attention_mask)

            current_h_dec, current_c_dec = h_enc, c_enc
            prev_attn_context = None
        else:
            encoder_outputs, current_h_dec, current_c_dec, prev_attn_context = cache

        batch_size, tgt_len = decoder_input_ids.shape
        decoder_input_embeds = self.embedding(decoder_input_ids)

        if prev_attn_context is None:
            # First step: there is no previous attention context yet, feed zeros.
            prev_attn_context = torch.zeros((batch_size, 1, self.config.decoder_hidden_dim)).to(decoder_input_embeds)

        outputs = []

        for t in range(tgt_len):
            # Embedding of the current time step, kept as (batch, 1, embed_dim).
            current_embed = decoder_input_embeds[:, t:t+1, :]

            # Input feeding: concatenate the previous attention context to the input.
            if self.config.do_input_feeding:
                current_embed = torch.cat([current_embed, prev_attn_context], dim=-1)

            decoder_output, attn_context, (h_dec, c_dec) = self.decoder(
                current_embed, encoder_outputs, current_h_dec, current_c_dec, encoder_attention_mask
            )

            current_h_dec = h_dec
            current_c_dec = c_dec
            prev_attn_context = attn_context
            outputs.append(decoder_output)

        outputs = torch.cat(outputs, dim=1)

        lm_logits = self.lm_head(outputs)

        if labels is None:
            return lm_logits, (encoder_outputs, current_h_dec, current_c_dec, prev_attn_context)

        # Next-token objective: logits at step t are scored against the label at t + 1.
        shifted_labels = labels[:, 1:].contiguous()
        shifted_logits = lm_logits[:, :-1, :].contiguous()

        # Padded targets carry no information; map them to cross_entropy's default
        # ignore_index (-100) so they do not dominate the averaged loss. Skipped when
        # the pad id doubles as the EOS id, because EOS must stay a real target.
        pad_idx = self.config.pad_idx
        if pad_idx is not None and pad_idx != self.config.eos_idx:
            shifted_labels = shifted_labels.masked_fill(shifted_labels == pad_idx, -100)

        # The loss must be a scalar.
        loss = F.cross_entropy(
            shifted_logits.view(-1, self.config.vocab_size),
            shifted_labels.view(-1),
        )
        return loss

    @torch.no_grad()
    def generate(
        self,
        encoder_input_ids: torch.LongTensor,
        encoder_attention_mask: torch.LongTensor,
        max_new_tokens: int = 256,
    ):
        """Greedy decoding.

        Starts every sequence with ``config.sos_idx`` and appends the arg-max token
        one step at a time, reusing the cache returned by ``forward``. Sequences
        that already produced ``config.eos_idx`` are continued with
        ``config.pad_idx``. Stops after ``max_new_tokens`` steps or once every
        sequence is finished.

        Returns:
            (batch, 1 + generated_len) token ids, including the leading start token.
        """
        batch_size, _ = encoder_input_ids.shape
        device = encoder_input_ids.device
        eos = self.config.eos_idx

        # 1 = still generating, 0 = already emitted EOS.
        unfinish_flag = torch.ones(batch_size, dtype=torch.long, device=device)
        cache = None

        # Every sequence starts with the start-of-sentence token.
        decoder_input_ids = torch.full((batch_size, 1), self.config.sos_idx, dtype=torch.long, device=device)

        for _ in range(max_new_tokens):
            # Only the newest token is fed; earlier context lives in the cache.
            lm_logits, cache = self.forward(
                encoder_input_ids, encoder_attention_mask, decoder_input_ids[:, -1:], cache=cache
            )
            next_token = lm_logits[:, -1, :].argmax(dim=-1, keepdim=True)

            # Finished sequences receive the padding token instead of the prediction.
            still_running = unfinish_flag.unsqueeze(-1)
            next_token = next_token * still_running + self.config.pad_idx * (1 - still_running)

            decoder_input_ids = torch.cat([decoder_input_ids, next_token], dim=1)

            unfinish_flag = unfinish_flag * (next_token.squeeze(-1) != eos).long()

            if unfinish_flag.sum() == 0:
                break

        return decoder_input_ids


In [25]:
def train(model, train_dataset, valid_dataset, collate_fn, train_args, prefix):
    """Train ``model`` and keep the checkpoint with the lowest validation loss.

    Args:
        model: module whose forward, called with the collated batch as keyword
            arguments, returns the training loss.
        train_dataset, valid_dataset: datasets consumed through ``DataLoader``.
        collate_fn: batch collation function passed to both data loaders.
        train_args: dict with the keys "lr", "batch_size", "epochs",
            "warmup_steps", "device" and "gradient_accumulate_steps".
        prefix: name of the sub-directory of ``output/`` that receives
            ``train_args.json`` and ``best_model.pth``.
    """
    accum_steps = train_args['gradient_accumulate_steps']
    device = train_args['device']

    optimizer = optim.Adam(model.parameters(), lr=train_args["lr"])

    train_dataloader = DataLoader(train_dataset, batch_size=train_args['batch_size'], shuffle=True, collate_fn=collate_fn, num_workers=0)
    valid_dataloader = DataLoader(valid_dataset, batch_size=train_args['batch_size'], shuffle=False, collate_fn=collate_fn, num_workers=0)

    batches_per_epoch = len(train_dataloader)
    total_steps = batches_per_epoch * train_args['epochs']  # progress-bar ticks (one per batch)

    # Optimizer updates per epoch: one per full accumulation group plus one for a
    # trailing partial group, matching the stepping condition in the loop below.
    updates_per_epoch = math.ceil(batches_per_epoch / accum_steps)
    num_training_steps = train_args['epochs'] * updates_per_epoch
    scheduler = transformers.get_scheduler(
        name="cosine",
        optimizer=optimizer,
        num_warmup_steps=train_args['warmup_steps'],
        num_training_steps=num_training_steps
    )

    best_loss = float("inf")
    optimizer.zero_grad()

    output_path = os.path.join("output", prefix)
    os.makedirs(output_path, exist_ok=True)
    with open(os.path.join(output_path, "train_args.json"), "w") as f:
        json.dump(train_args, f)

    pbar = tqdm.tqdm(total=total_steps, desc="training")
    for epoch in range(train_args['epochs']):
        pbar.set_description(f"Epoch {epoch+1}/{train_args['epochs']}")
        # Moving average over the most recent 100 batches.
        recent_losses = collections.deque(maxlen=100)
        model.train()
        for i, batch in enumerate(train_dataloader):
            batch = {k:v.to(device) if isinstance(v,torch.Tensor) else v for k,v in batch.items()}

            loss = model(**batch)
            if loss.dim() > 0:
                loss = loss.mean()
            # Scale only the gradient; the logged value stays the real per-batch loss.
            (loss / accum_steps).backward()

            # Also step on the last batch of the epoch so a partial accumulation group
            # is neither carried into the next epoch nor dropped after the final one.
            is_last_batch = (i + 1) == batches_per_epoch
            if (i + 1) % accum_steps == 0 or is_last_batch:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()

            recent_losses.append(loss.item())
            pbar.set_postfix_str(f"loss: {sum(recent_losses)/len(recent_losses):.04f} lr: {optimizer.param_groups[0]['lr']:.2e}")
            pbar.update(1)

        model.eval()
        with torch.no_grad():
            eval_loss = 0
            for i, batch in enumerate(valid_dataloader):
                batch = {k:v.to(device) if isinstance(v,torch.Tensor) else v for k,v in batch.items()}
                loss_val = model(**batch)
                if loss_val.dim() > 0:
                    loss_val = loss_val.mean()
                eval_loss += loss_val.item()
                pbar.set_postfix_str(f"val_loss: {eval_loss / (i+1):.04f}")
        # max(..., 1) keeps an empty validation loader from raising ZeroDivisionError.
        eval_loss /= max(len(valid_dataloader), 1)
        pbar.write(f"Validation Loss: {eval_loss:.04f}")

        if eval_loss < best_loss:
            best_loss = eval_loss

            torch.save(model.state_dict(), os.path.join(output_path,"best_model.pth"))
            pbar.write(f"Model Saved best loss: {best_loss:.04f}")

    pbar.close()

def evaluate(model, dataset, tokenizer, collate_fn, train_args):
    """Greedy-decode ``dataset`` with ``model`` and report the corpus BLEU score.

    Args:
        model: module exposing ``generate(encoder_input_ids, encoder_attention_mask,
            max_new_tokens)``.
        dataset: dataset consumed through ``DataLoader``.
        tokenizer: tokenizer used to turn generated and reference ids into text.
        collate_fn: collation function; its batches must contain
            "encoder_input_ids", "encoder_attention_mask" and "labels".
        train_args: dict providing "batch_size" and "device".

    Returns:
        The result dict of the "bleu" metric (the score is also printed).
    """
    model.eval()
    dataloader = DataLoader(dataset, batch_size=train_args['batch_size'], shuffle=False, collate_fn=collate_fn, num_workers=0)

    answers = []
    predicts = []
    for batch in tqdm.tqdm(dataloader, desc="Evaluating"):
        batch = {k:v.to(train_args['device']) if isinstance(v,torch.Tensor) else v for k,v in batch.items()}
        gen_output = model.generate(
            encoder_input_ids=batch["encoder_input_ids"],
            encoder_attention_mask=batch["encoder_attention_mask"],
            max_new_tokens=512
        )

        # Labels may carry the loss ignore index (-100) on padded positions, which is
        # not a valid token id; map negative ids to the pad token before decoding.
        reference_ids = batch["labels"]
        if tokenizer.pad_token_id is not None:
            reference_ids = reference_ids.masked_fill(reference_ids < 0, tokenizer.pad_token_id)

        predicts.extend(tokenizer.batch_decode(gen_output, skip_special_tokens=True))
        answers.extend(tokenizer.batch_decode(reference_ids, skip_special_tokens=True))

    bleu = load("bleu")
    result = bleu.compute(predictions=predicts, references=answers)
    print(f"BLEU: {result['bleu']:.4f}")
    return result

In [None]:
# Experiment: global attention, input feeding disabled.
config = ModelConfig(
    vocab_size=len(tokenizer),
    pad_idx=tokenizer.pad_token_id,
    sos_idx=tokenizer.bos_token_id,
    eos_idx=tokenizer.eos_token_id,
    n_layers=2,
    dropout=0.1,
    attention_type="global",
    do_input_feeding=False,
)

# Build the model on the configured device, then cast it to bfloat16.
model = Seq2Seq(config).to(training_config["device"]).to(torch.bfloat16)
print(model)

train(
    model=model,
    train_dataset=train_set,
    valid_dataset=valid_set,
    collate_fn=collate_fn,
    train_args=training_config,
    prefix="seq2seq_global_attention_no_input_feeding",
)

# Score the best checkpoint written by train(), not the weights of the last epoch.
model.load_state_dict(
    torch.load(os.path.join("output", "seq2seq_global_attention_no_input_feeding", "best_model.pth"))
)
evaluate(
    model=model,
    dataset=valid_set,
    tokenizer=tokenizer,
    collate_fn=collate_fn,
    train_args=training_config,
)

# Drop the model and release cached GPU memory before the next experiment.
del model
torch.cuda.empty_cache()

Seq2Seq(
  (embedding): Embedding(65002, 512, padding_idx=65000)
  (encoder): Encoder(
    (encoder): LSTM(512, 512, num_layers=2, batch_first=True, dropout=0.1, bidirectional=True)
    (h_dec_proj): Linear(in_features=1024, out_features=512, bias=True)
    (c_dec_proj): Linear(in_features=1024, out_features=512, bias=True)
  )
  (decoder): Decoder(
    (decoder): LSTM(512, 512, num_layers=2, batch_first=True, dropout=0.1)
    (attention): GlobalAttention(
      (query_proj): Linear(in_features=512, out_features=512, bias=False)
      (key_proj): Linear(in_features=1024, out_features=512, bias=False)
      (value_proj): Linear(in_features=1024, out_features=512, bias=False)
      (output_proj): Linear(in_features=512, out_features=512, bias=False)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (lm_head): Linear(in_features=512, out_features=65002, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
)





[A[A[A


[A[A[A



[A[A[A


[A[A[A

In [21]:
# Experiment: global attention with input feeding (the ModelConfig default).
config = ModelConfig(
    vocab_size=len(tokenizer),
    pad_idx=tokenizer.pad_token_id,
    sos_idx=tokenizer.bos_token_id,
    eos_idx=tokenizer.eos_token_id,
    n_layers=2,
    dropout=0.1,
    attention_type="global",
)

# Build the model on the configured device, then cast it to bfloat16.
model = Seq2Seq(config).to(training_config["device"]).to(torch.bfloat16)
print(model)

train(
    model=model,
    train_dataset=train_set,
    valid_dataset=valid_set,
    collate_fn=collate_fn,
    train_args=training_config,
    prefix="seq2seq_global_attention",
)

# Score the best checkpoint written by train(), not the weights of the last epoch.
model.load_state_dict(
    torch.load(os.path.join("output", "seq2seq_global_attention", "best_model.pth"))
)
evaluate(
    model=model,
    dataset=valid_set,
    tokenizer=tokenizer,
    collate_fn=collate_fn,
    train_args=training_config,
)

# Drop the model and release cached GPU memory before the next experiment.
del model
torch.cuda.empty_cache()

Seq2Seq(
  (embedding): Embedding(65002, 512, padding_idx=65000)
  (encoder): Encoder(
    (encoder): LSTM(512, 512, num_layers=2, batch_first=True, dropout=0.1, bidirectional=True)
    (h_dec_proj): Linear(in_features=1024, out_features=512, bias=True)
    (c_dec_proj): Linear(in_features=1024, out_features=512, bias=True)
  )
  (decoder): Decoder(
    (decoder): LSTM(1024, 512, num_layers=2, batch_first=True, dropout=0.1)
    (attention): GlobalAttention(
      (query_proj): Linear(in_features=512, out_features=512, bias=False)
      (key_proj): Linear(in_features=1024, out_features=512, bias=False)
      (value_proj): Linear(in_features=1024, out_features=512, bias=False)
      (output_proj): Linear(in_features=512, out_features=512, bias=False)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (lm_head): Linear(in_features=512, out_features=65002, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
)




InductorError: RuntimeError: Compiler: cl is not found.

Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"


In [22]:
# Experiment: local attention with input feeding (the ModelConfig default).
config = ModelConfig(
    vocab_size=len(tokenizer),
    pad_idx=tokenizer.pad_token_id,
    sos_idx=tokenizer.bos_token_id,
    eos_idx=tokenizer.eos_token_id,
    n_layers=2,
    dropout=0.1,
    attention_type="local",
)

# Build the model on the configured device, then cast it to bfloat16.
model = Seq2Seq(config).to(training_config["device"]).to(torch.bfloat16)
print(model)

train(
    model=model,
    train_dataset=train_set,
    valid_dataset=valid_set,
    collate_fn=collate_fn,
    train_args=training_config,
    prefix="seq2seq_local_attention",
)

# Score the best checkpoint written by train(), not the weights of the last epoch.
model.load_state_dict(
    torch.load(os.path.join("output", "seq2seq_local_attention", "best_model.pth"))
)
evaluate(
    model=model,
    dataset=valid_set,
    tokenizer=tokenizer,
    collate_fn=collate_fn,
    train_args=training_config,
)

# Drop the model and release cached GPU memory.
del model
torch.cuda.empty_cache()

Seq2Seq(
  (embedding): Embedding(65002, 512, padding_idx=65000)
  (encoder): Encoder(
    (encoder): LSTM(512, 512, num_layers=2, batch_first=True, dropout=0.1, bidirectional=True)
    (h_dec_proj): Linear(in_features=1024, out_features=512, bias=True)
    (c_dec_proj): Linear(in_features=1024, out_features=512, bias=True)
  )
  (decoder): Decoder(
    (decoder): LSTM(1024, 512, num_layers=2, batch_first=True, dropout=0.1)
    (attention): LocalAttention(
      (query_proj): Linear(in_features=512, out_features=512, bias=False)
      (key_proj): Linear(in_features=1024, out_features=512, bias=False)
      (value_proj): Linear(in_features=1024, out_features=512, bias=False)
      (output_proj): Linear(in_features=512, out_features=512, bias=False)
      (dropout): Dropout(p=0.1, inplace=False)
      (location_proj_up): Linear(in_features=512, out_features=512, bias=False)
      (location_proj_down): Linear(in_features=512, out_features=1, bias=False)
    )
    (dropout): Dropout(p=0.1,


[A
[A

InductorError: RuntimeError: Compiler: cl is not found.

Set TORCHDYNAMO_VERBOSE=1 for the internal stack trace (please do this especially if you're reporting a bug to PyTorch). For even more developer context, set TORCH_LOGS="+dynamo"
