transformer에서 바뀐 내용
- Encoder, Cross-Attention 모두 삭제 → Decoder-only 스택만 유지

- PositionalEncoding(sinusoidal) → 학습형 pos embedding

- ReLU → GELU

- Post-LN(잔차 후 Norm) → Pre-LN(Norm 후 서브레이어, 잔차) + 최종 LayerNorm

- 출력은 LM head 하나, weight tying 옵션 **적용**

- 학습은 언어모델링 목표 + Causal mask로 통일

In [32]:
!pip -q install sentencepiece torchinfo

import os, re, math, random, zipfile, urllib.request, io
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from torchinfo import summary


In [33]:
SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED); torch.cuda.manual_seed_all(SEED)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device


device(type='cuda')

In [34]:
url = 'http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip'
zip_filename = 'cornell_movie_dialogs.zip'

if not os.path.exists(zip_filename):
    print("Downloading dataset ...")
    urllib.request.urlretrieve(url, zip_filename)
    print("Done.")

with zipfile.ZipFile(zip_filename, 'r') as zf:
    zf.extractall()

path_to_dataset = os.path.join("cornell movie-dialogs corpus")
path_to_movie_lines = os.path.join(path_to_dataset, 'movie_lines.txt')
path_to_movie_conversations = os.path.join(path_to_dataset, 'movie_conversations.txt')

print(path_to_dataset)
print(os.path.exists(path_to_movie_lines), os.path.exists(path_to_movie_conversations))


cornell movie-dialogs corpus
True True


In [35]:
def preprocess_sentence(sentence: str) -> str:
    s = sentence.lower().strip()
    s = re.sub(r"([?.!,])", r" \1 ", s)
    s = re.sub(r'[" "]+', " ", s)
    s = re.sub(r"[^a-zA-Z?.!,]+", " ", s).strip()
    return s

def read_cornell_data(path_to_movie_lines, path_to_movie_conversations, max_samples=50000):
    id2line = {}
    with open(path_to_movie_lines, 'r', errors='ignore') as f:
        for line in f:
            parts = line.strip().split(" +++$+++ ")
            if len(parts) >= 5:
                id2line[parts[0]] = parts[4]

    pairs = []
    with open(path_to_movie_conversations, 'r', errors='ignore') as f:
        for line in f:
            parts = line.strip().split(" +++$+++ ")
            if len(parts) < 4:
                continue
            conv_list = parts[3][1:-1].split(", ")
            conv_list = [c.strip("'") for c in conv_list]
            for i in range(len(conv_list) - 1):
                q = preprocess_sentence(id2line.get(conv_list[i], ""))
                a = preprocess_sentence(id2line.get(conv_list[i+1], ""))
                if q and a:
                    pairs.append((q, a))
                    if len(pairs) >= max_samples:
                        return pairs
    return pairs

MAX_SAMPLES = 50000
pairs = read_cornell_data(path_to_movie_lines, path_to_movie_conversations, MAX_SAMPLES)
len(pairs), pairs[0]


(50000,
 ('can we make this quick ? roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad . again .',
  'well , i thought we d start with pronunciation , if that s okay with you .'))

In [36]:
def preprocess_sentence(sentence: str) -> str:
    s = sentence.lower().strip()
    s = re.sub(r"([?.!,])", r" \1 ", s)
    s = re.sub(r'[" "]+', " ", s)
    s = re.sub(r"[^a-zA-Z?.!,]+", " ", s).strip()
    return s

def read_cornell_data(path_to_movie_lines, path_to_movie_conversations, max_samples=50000):
    id2line = {}
    with open(path_to_movie_lines, 'r', errors='ignore') as f:
        for line in f:
            parts = line.strip().split(" +++$+++ ")
            if len(parts) >= 5:
                id2line[parts[0]] = parts[4]

    pairs = []
    with open(path_to_movie_conversations, 'r', errors='ignore') as f:
        for line in f:
            parts = line.strip().split(" +++$+++ ")
            if len(parts) < 4:
                continue
            conv_list = parts[3][1:-1].split(", ")
            conv_list = [c.strip("'") for c in conv_list]
            for i in range(len(conv_list) - 1):
                q = preprocess_sentence(id2line.get(conv_list[i], ""))
                a = preprocess_sentence(id2line.get(conv_list[i+1], ""))
                if q and a:
                    pairs.append((q, a))
                    if len(pairs) >= max_samples:
                        return pairs
    return pairs

MAX_SAMPLES = 50000
pairs = read_cornell_data(path_to_movie_lines, path_to_movie_conversations, MAX_SAMPLES)
len(pairs), pairs[0]


(50000,
 ('can we make this quick ? roxanne korrine and andrew barrett are having an incredibly horrendous public break up on the quad . again .',
  'well , i thought we d start with pronunciation , if that s okay with you .'))

In [37]:
import sentencepiece as spm

VOCAB_SIZE = 8000
SPM_PREFIX = "cornell_bpe"

# 학습용 코퍼스 파일 생성
corpus_path = "spm_corpus.txt"
if not os.path.exists(corpus_path):
    with open(corpus_path, "w", encoding="utf-8") as f:
        for q, a in pairs:
            f.write(q + "\n")
            f.write(a + "\n")

# 모델 학습 (pad/bos/eos/unk id 고정)
if not os.path.exists(SPM_PREFIX + ".model"):
    spm.SentencePieceTrainer.Train(
        input=corpus_path,
        model_prefix=SPM_PREFIX,
        vocab_size=VOCAB_SIZE,
        model_type="bpe",
        character_coverage=1.0,
        pad_id=0, pad_piece="<pad>",
        bos_id=1, bos_piece="<s>",
        eos_id=2, eos_piece="</s>",
        unk_id=3, unk_piece="<unk>"
    )

sp = spm.SentencePieceProcessor()
sp.load(SPM_PREFIX + ".model")

PAD_ID = sp.pad_id()
BOS_ID = sp.bos_id()
EOS_ID = sp.eos_id()
sp.GetPieceSize(), PAD_ID, BOS_ID, EOS_ID


(8000, 0, 1, 2)

In [38]:
class LMDataset(Dataset):
    def __init__(self, pairs, tokenizer, max_seq_len=128, bos_id=None, eos_id=None):
        self.tokenizer = tokenizer
        self.max_seq_len = max_seq_len
        self.bos_id = tokenizer.bos_id() if bos_id is None else bos_id
        self.eos_id = tokenizer.eos_id() if eos_id is None else eos_id
        self.examples = []
        for q, a in pairs:
            ids = [self.bos_id] + tokenizer.encode(q) + [self.eos_id] + tokenizer.encode(a) + [self.eos_id]
            if len(ids) >= 2:
                self.examples.append(ids[:max_seq_len])

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, idx):
        return torch.tensor(self.examples[idx], dtype=torch.long)

def lm_collate_fn(batch, pad_id):
    max_len = max(len(x) for x in batch)
    xs, ys = [], []
    for x in batch:
        x = torch.tensor(x, dtype=torch.long)
        pad_len = max_len - x.size(0)
        x = torch.cat([x, torch.full((pad_len,), pad_id, dtype=torch.long)], dim=0)
        xs.append(x[:-1])
        ys.append(x[1:])
    return torch.stack(xs, 0), torch.stack(ys, 0)

# 미니 테스트
tmp_ds = LMDataset(pairs[:10], sp, max_seq_len=32)
tmp_loader = DataLoader(tmp_ds, batch_size=2, collate_fn=lambda b: lm_collate_fn(b, PAD_ID))
xb, yb = next(iter(tmp_loader))
xb.shape, yb.shape


  x = torch.tensor(x, dtype=torch.long)


(torch.Size([2, 31]), torch.Size([2, 31]))

In [39]:
# ===== GPT1: Decoder-only Transformer =====

class CausalSelfAttention(nn.Module):
    def __init__(self, d_model=768, n_head=12, attn_p=0.1, resid_p=0.1):
        super().__init__()
        assert d_model % n_head == 0
        self.n_head = n_head
        self.head_dim = d_model // n_head
        self.qkv = nn.Linear(d_model, 3*d_model, bias=True)
        self.proj = nn.Linear(d_model, d_model, bias=True)
        self.attn_drop = nn.Dropout(attn_p)
        self.resid_drop = nn.Dropout(resid_p)

    def forward(self, x, attn_mask=None):
        B, T, C = x.size()
        qkv = self.qkv(x).view(B, T, 3, self.n_head, self.head_dim).permute(2,0,3,1,4)
        q, k, v = qkv[0], qkv[1], qkv[2]  # (B, nh, T, hd)
        att = (q @ k.transpose(-2, -1)) / math.sqrt(self.head_dim)  # (B, nh, T, T)

        # causal mask
        causal = torch.tril(torch.ones(T, T, device=x.device)).bool().unsqueeze(0).unsqueeze(0)  # (1,1,T,T)
        att = att.masked_fill(~causal, float('-inf'))

        # optional padding mask (B,1,1,T) where True(mask)=ignore
        if attn_mask is not None:
            att = att.masked_fill(attn_mask, float('-inf'))

        att = F.softmax(att, dim=-1)
        att = self.attn_drop(att)
        y = att @ v
        y = y.transpose(1,2).contiguous().view(B, T, C)
        y = self.resid_drop(self.proj(y))
        return y

class MLP(nn.Module):
    def __init__(self, d_model=768, mlp_dim=3072, p=0.1):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(d_model, mlp_dim),
            nn.GELU(),
            nn.Linear(mlp_dim, d_model),
            nn.Dropout(p)
        )
    def forward(self, x): return self.net(x)

class GPTBlock(nn.Module):
    def __init__(self, d_model=768, n_head=12, p=0.1):
        super().__init__()
        self.ln1 = nn.LayerNorm(d_model, eps=1e-5)
        self.attn = CausalSelfAttention(d_model, n_head, p, p)
        self.ln2 = nn.LayerNorm(d_model, eps=1e-5)
        self.mlp  = MLP(d_model, 4*d_model, p)
    def forward(self, x, attn_mask=None):
        x = x + self.attn(self.ln1(x), attn_mask=attn_mask)
        x = x + self.mlp(self.ln2(x))
        return x

class GPT1(nn.Module):
    def __init__(self, vocab_size, n_layer=12, d_model=768, n_head=12, block_size=512, p=0.1, tie_weights=True):
        super().__init__()
        self.block_size = block_size
        self.tok_emb = nn.Embedding(vocab_size, d_model)
        self.pos_emb = nn.Embedding(block_size, d_model)  # learned pos emb
        self.drop = nn.Dropout(p)
        self.blocks = nn.ModuleList([GPTBlock(d_model, n_head, p) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(d_model, eps=1e-5)
        self.head = nn.Linear(d_model, vocab_size, bias=False)
        if tie_weights:
            self.head.weight = self.tok_emb.weight
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, (nn.Linear, nn.Embedding)):
            nn.init.normal_(m.weight, mean=0.0, std=0.02)
        if isinstance(m, nn.Linear) and m.bias is not None:
            nn.init.zeros_(m.bias)

    def forward(self, idx, targets=None, pad_id=None):
        B, T = idx.size()
        if T > self.block_size:
            raise ValueError(f"seq len {T} > block_size {self.block_size}")
        pos = torch.arange(0, T, device=idx.device).unsqueeze(0)
        x = self.tok_emb(idx) + self.pos_emb(pos)
        x = self.drop(x)

        # padding mask for attention: True means "mask"
        attn_mask = None
        if pad_id is not None:
            attn_mask = (idx == pad_id).unsqueeze(1).unsqueeze(2)  # (B,1,1,T)

        for blk in self.blocks:
            x = blk(x, attn_mask=attn_mask)
        x = self.ln_f(x)
        logits = self.head(x)

        loss = None
        if targets is not None:
            ignore = -100 if pad_id is None else pad_id
            loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=ignore)
        return logits, loss


In [40]:
# --- 스모크 테스트용(빠르게) ---
BLOCK_SIZE = 128
N_LAYER    = 4
D_MODEL    = 256
N_HEAD     = 8
DROPOUT    = 0.1
BATCH_SIZE = 64

# --- 평가 스펙(GPT-1) 예시 ---
# BLOCK_SIZE = 512; N_LAYER=12; D_MODEL=768; N_HEAD=12; DROPOUT=0.1; BATCH_SIZE=64

gpt = GPT1(
    vocab_size=sp.GetPieceSize(),
    n_layer=N_LAYER,
    d_model=D_MODEL,
    n_head=N_HEAD,
    block_size=BLOCK_SIZE,
    p=DROPOUT,
    tie_weights=True
).to(device)

print(f"Params: {sum(p.numel() for p in gpt.parameters())/1e6:.2f}M")


Params: 5.24M


In [41]:
base_lr = 6e-4
optimizer = AdamW(gpt.parameters(), lr=base_lr, betas=(0.9, 0.999), weight_decay=0.01)

def build_cosine_scheduler(optimizer, warmup_steps=2000, total_steps=200000, min_lr=0.0):
    def lr_lambda(step):
        step = max(step, 1)
        if step < warmup_steps:
            return step / float(warmup_steps)
        progress = (step - warmup_steps) / float(max(1, total_steps - warmup_steps))
        cosine = 0.5 * (1.0 + math.cos(math.pi * progress))
        return max(min_lr / base_lr, cosine)
    return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

scheduler = build_cosine_scheduler(optimizer, warmup_steps=2000, total_steps=200000)


In [42]:
train_dataset = LMDataset(pairs, sp, max_seq_len=BLOCK_SIZE)
train_loader  = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,
                           collate_fn=lambda b: lm_collate_fn(b, PAD_ID), drop_last=True)

xb, yb = next(iter(train_loader))
print("input:", xb.shape, "target:", yb.shape)
print("shift check:", xb[0,1:10].tolist(), "==", yb[0,:9].tolist())


input: torch.Size([64, 98]) target: torch.Size([64, 98])
shift check: [974, 1340, 227, 160, 5, 349, 1756, 1260, 4067] == [974, 1340, 227, 160, 5, 349, 1756, 1260, 4067]


  x = torch.tensor(x, dtype=torch.long)


In [43]:
summary(gpt, input_size=(1, BLOCK_SIZE-1), dtypes=[torch.long])


Layer (type:depth-idx)                        Output Shape              Param #
GPT1                                          [1, 127, 8000]            --
├─Embedding: 1-1                              [1, 127, 256]             2,048,000
├─Embedding: 1-2                              [1, 127, 256]             32,768
├─Dropout: 1-3                                [1, 127, 256]             --
├─ModuleList: 1-4                             --                        --
│    └─GPTBlock: 2-1                          [1, 127, 256]             --
│    │    └─LayerNorm: 3-1                    [1, 127, 256]             512
│    │    └─CausalSelfAttention: 3-2          [1, 127, 256]             263,168
│    │    └─LayerNorm: 3-3                    [1, 127, 256]             512
│    │    └─MLP: 3-4                          [1, 127, 256]             525,568
│    └─GPTBlock: 2-2                          [1, 127, 256]             --
│    │    └─LayerNorm: 3-5                    [1, 127, 256]             

In [50]:
import math
from contextlib import nullcontext

def train_epoch(
    model, loader, optimizer, scheduler, device, pad_id,
    grad_accum_steps: int = 1,     # 유효 배치 키우고 싶을 때 >1로
    use_amp: bool = True,          # CUDA일 때 자동혼합정밀도 사용
    log_every: int = 100
):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=(use_amp and device.type == "cuda"))

    total_loss = 0.0
    total_tok  = 0

    optimizer.zero_grad(set_to_none=True)

    for step, (inp, tgt) in enumerate(loader, start=1):
        inp = inp.to(device, non_blocking=True)
        tgt = tgt.to(device, non_blocking=True)

        ctx = torch.cuda.amp.autocast(enabled=(scaler.is_enabled()))
        with ctx:
            # next-token LM loss (PAD 무시)
            _, loss = model(inp, targets=tgt, pad_id=pad_id)
            # gradient accumulation 시 손실을 나눠서 스케일 유지
            loss_for_backward = loss / grad_accum_steps

        # backward
        if scaler.is_enabled():
            scaler.scale(loss_for_backward).backward()
        else:
            loss_for_backward.backward()

        # optimizer / scheduler step (accumulation 주기마다)
        if step % grad_accum_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            if scaler.is_enabled():
                scaler.step(optimizer)
                scaler.update()
            else:
                optimizer.step()
            optimizer.zero_grad(set_to_none=True)
            if scheduler is not None:
                scheduler.step()

        # 통계 (PAD 제외 토큰 수 기준 평균)
        ntok = (tgt != pad_id).sum().item()
        total_loss += loss.item() * max(ntok, 1)
        total_tok  += max(ntok, 1)

        if step % log_every == 0:
            curr_lr = scheduler.get_last_lr()[0] if scheduler is not None else optimizer.param_groups[0]["lr"]
            # 로그에는 실제 손실(loss) 출력 (accum 나누기 전 값)
            print(f"[step {step:4d}] loss={loss.item():.4f}  lr={curr_lr:.8f}")

    return total_loss / max(total_tok, 1)

# ----- 학습 루프 -----
EPOCHS = 10
for ep in range(1, EPOCHS + 1):
    avg = train_epoch(gpt, train_loader, optimizer, scheduler, device, PAD_ID,
                      grad_accum_steps=1, use_amp=True, log_every=100)
    ppl = math.exp(min(avg, 20))  # overflow 방지
    print(f"[Epoch {ep}] avg_token_loss={avg:.4f}  ppl={ppl:.2f}")


  scaler = torch.cuda.amp.GradScaler(enabled=(use_amp and device.type == "cuda"))
  x = torch.tensor(x, dtype=torch.long)
  ctx = torch.cuda.amp.autocast(enabled=(scaler.is_enabled()))


[step  100] loss=3.8064  lr=0.00059868
[step  200] loss=3.6714  lr=0.00059864
[step  300] loss=3.8123  lr=0.00059859
[step  400] loss=3.5980  lr=0.00059854
[step  500] loss=3.6293  lr=0.00059850
[step  600] loss=3.8236  lr=0.00059845
[step  700] loss=3.7628  lr=0.00059840
[Epoch 1] avg_token_loss=3.7331  ppl=41.81
[step  100] loss=3.6507  lr=0.00059831
[step  200] loss=3.8333  lr=0.00059826
[step  300] loss=3.6719  lr=0.00059821
[step  400] loss=3.4927  lr=0.00059816
[step  500] loss=3.8156  lr=0.00059810
[step  600] loss=3.5673  lr=0.00059805
[step  700] loss=3.6043  lr=0.00059799
[Epoch 2] avg_token_loss=3.6749  ppl=39.45
[step  100] loss=3.5703  lr=0.00059789
[step  200] loss=3.6324  lr=0.00059784
[step  300] loss=3.6922  lr=0.00059778
[step  400] loss=3.4600  lr=0.00059772
[step  500] loss=3.6262  lr=0.00059766
[step  600] loss=3.5728  lr=0.00059760
[step  700] loss=3.6511  lr=0.00059754
[Epoch 3] avg_token_loss=3.6205  ppl=37.36
[step  100] loss=3.5930  lr=0.00059743
[step  200] l

In [51]:
@torch.no_grad()
def generate(model, tokenizer, prompt, max_new_tokens=40, temperature=1.0, top_k=50, top_p=None, device='cpu'):
    model.eval()
    text = preprocess_sentence(prompt)
    ids = [tokenizer.bos_id()] + tokenizer.encode(text)
    x = torch.tensor([ids], dtype=torch.long, device=device)

    for _ in range(max_new_tokens):
        if x.size(1) > model.block_size:
            x = x[:, -model.block_size:]
        logits, _ = model(x)
        logits = logits[:, -1, :] / max(temperature, 1e-8)
        probs = F.softmax(logits, dim=-1)

        if top_k is not None:
            v, ix = torch.topk(probs, top_k, dim=-1)
            mask = torch.ones_like(probs, dtype=torch.bool).scatter(1, ix, False)
            probs = probs.masked_fill(mask, 0)
            probs = probs / probs.sum(dim=-1, keepdim=True)

        if top_p is not None:
            sorted_probs, sorted_idx = torch.sort(probs, descending=True)
            cum = torch.cumsum(sorted_probs, dim=-1)
            mask = cum > top_p
            mask[..., 0] = False
            sorted_probs = sorted_probs.masked_fill(mask, 0)
            probs = torch.zeros_like(probs).scatter(1, sorted_idx, sorted_probs)
            probs = probs / probs.sum(dim=-1, keepdim=True)

        next_id = torch.multinomial(probs, num_samples=1)
        x = torch.cat([x, next_id], dim=1)
        if next_id.item() == tokenizer.eos_id():
            break

    out = x[0].tolist()
    out = [t for t in out if t not in (tokenizer.bos_id(), tokenizer.eos_id(), tokenizer.pad_id())]
    return tokenizer.decode(out)

print(generate(gpt, sp, "Where have you been?", device=device))


where have you been ?


In [52]:
# === GPT-1용 챗봇 생성 함수 (seq2seq 인퍼런스 대체 버전) ===
import torch
import torch.nn.functional as F

def decoder_inference_gpt1(model, sentence, tokenizer, device='cpu',
                           max_new_tokens=40, temperature=1.0, top_k=50, top_p=None):
    """
    - seq2seq의 enc/dec 입력을 쓰지 않고, GPT-1처럼 '프롬프트 이어쓰기'로 생성
    - 반환: 프롬프트 이후에 생성된 토큰 id 리스트
    """
    BOS = tokenizer.bos_id()
    EOS = tokenizer.eos_id()
    PAD = tokenizer.pad_id()

    # 전처리 후 프롬프트 토크나이즈 (BOS만 앞에 붙임)
    prompt = preprocess_sentence(sentence)
    prompt_ids = [BOS] + tokenizer.encode(prompt)

    # (1, T) 텐서
    x = torch.tensor([prompt_ids], dtype=torch.long, device=device)

    model.eval()
    with torch.no_grad():
        for _ in range(max_new_tokens):
            # 컨텍스트 길이 제한 (모델에 block_size 속성이 없으면 1024로 가정)
            max_ctx = getattr(model, 'block_size', 1024)
            if x.size(1) > max_ctx:
                x = x[:, -max_ctx:]

            # 마지막 토큰의 로짓으로 다음 토큰 샘플링
            logits, _ = model(x)                    # (B, T, V)
            logits = logits[:, -1, :] / max(temperature, 1e-8)
            probs = F.softmax(logits, dim=-1)

            # Top-k 필터링
            if top_k is not None:
                k = min(top_k, probs.size(-1))
                v, ix = torch.topk(probs, k, dim=-1)
                mask = torch.ones_like(probs, dtype=torch.bool).scatter(1, ix, False)
                probs = probs.masked_fill(mask, 0)
                probs = probs / probs.sum(dim=-1, keepdim=True)

            # Top-p(nucleus) 필터링 (선택)
            if top_p is not None:
                sorted_probs, sorted_idx = torch.sort(probs, descending=True)
                cum = torch.cumsum(sorted_probs, dim=-1)
                mask = cum > top_p
                mask[..., 0] = False
                sorted_probs = sorted_probs.masked_fill(mask, 0)
                probs = torch.zeros_like(probs).scatter(1, sorted_idx, sorted_probs)
                probs = probs / probs.sum(dim=-1, keepdim=True)

            next_id = torch.multinomial(probs, num_samples=1)  # (1,1)
            x = torch.cat([x, next_id], dim=1)

            # EOS면 조기 종료
            if next_id.item() == EOS:
                break

    # 프롬프트 이후 생성분만 반환
    full_ids = x[0].tolist()
    gen_ids = full_ids[len(prompt_ids):]
    # BOS/PAD 제거
    gen_ids = [t for t in gen_ids if t not in (BOS, PAD)]
    return gen_ids


def sentence_generation_gpt1(model, sentence, tokenizer, device='cpu',
                             max_new_tokens=40, temperature=1.0, top_k=50, top_p=None):
    """
    - decoder_inference_gpt1()를 호출해 문장을 생성하고 decode까지 수행
    - 출력: 디코딩된 문자열
    """
    out_ids = decoder_inference_gpt1(
        model, sentence, tokenizer, device=device,
        max_new_tokens=max_new_tokens, temperature=temperature,
        top_k=top_k, top_p=top_p
    )
    text = tokenizer.decode(out_ids) if out_ids else ""
    print("입력 :", sentence)
    print("출력 :", text)
    return text


In [56]:
sentence = "Where have you been?"

sentence_generation_gpt1(gpt, sentence, sp, device, max_new_tokens=40, temperature=1.0, top_k=50)



입력 : Where have you been?
출력 : 


''