# BERT, GPT-2 실습 노트북

**작성일**: 2025-07-19 by Youngwoo Kimh (Credit : DSAIL Lab, SNU)

**목표**:  
- BERT 및 GPT-2의 구조 이해
- BERT의 학습 목표 (MLM, NSP) 이해

---
> 본 노트북은 HD현대 실습을 위해 교육용 자료로서 준비되었으며, PyTorch와 HuggingFace Transformers 라이브러리를 사용합니다.

### 1. BERT

MLM (Masked Language Modeling)

In [None]:
from transformers import pipeline, BertTokenizer, BertForMaskedLM
import torch, os
print('Transformers version:', __import__('transformers').__version__)

In [None]:
# Load the tokenizer and the masked-LM model from the same pretrained checkpoint.
bert_checkpoint = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertForMaskedLM.from_pretrained(bert_checkpoint)

In [None]:
# Dump the module tree, then the weight shape of each embedding table.
print(model)
print("\nEmbedding matrices:")

bert_embeddings = model.bert.embeddings
for label, table in (
    ("Token embeddings  :", bert_embeddings.word_embeddings),         # token embeddings
    ("Position embeddings:", bert_embeddings.position_embeddings),    # position embeddings
    ("Segment(Type) embeddings:", bert_embeddings.token_type_embeddings),  # segment embeddings
):
    print(label, table.weight.shape)

In [None]:
def show_top_k(prompt, k = 5):
    """Print the k most probable fillers for each mask token in `prompt`.

    Uses the module-level `tokenizer` and `model`. For a prompt with exactly
    one mask token the printed output is a 'Top k predicted tokens:' header
    followed by one ranked line per candidate. When the prompt holds several
    mask tokens, the same listing is printed once per mask, each preceded by
    a line naming the mask's token position.

    Args:
        prompt: Sentence containing at least one `tokenizer.mask_token`.
        k: Number of candidates to list per mask position.

    Raises:
        ValueError: If `prompt` contains no mask token.
    """
    # tokenize sentence
    inputs = tokenizer(prompt, return_tensors='pt')

    # sequence positions (column indices of the single input row) of every mask token
    mask_positions = (inputs.input_ids == tokenizer.mask_token_id).nonzero(as_tuple=True)[1]
    if mask_positions.numel() == 0:
        # Without this check the code below fails with an opaque IndexError.
        raise ValueError(
            f'prompt must contain at least one {tokenizer.mask_token} token: {prompt!r}'
        )

    # inference
    with torch.no_grad():
        logits = model(**inputs).logits

    # one row of vocabulary scores per mask position
    mask_logits = logits[0, mask_positions]

    # rank on the raw scores, report the softmax-normalised probabilities
    top_ids = torch.topk(mask_logits, k).indices
    probs = torch.softmax(mask_logits, dim=-1)

    several_masks = mask_positions.numel() > 1
    for row, position in enumerate(mask_positions.tolist()):
        if several_masks:
            print(f'{tokenizer.mask_token} at token position {position}')
        print('Top', k, 'predicted tokens:')
        for rank, token_id in enumerate(top_ids[row].tolist(), start=1):
            token = tokenizer.decode([token_id])
            probability = probs[row, token_id].item()
            print(f'{rank}: {token} ({probability:.4f})')

In [None]:
text = 'The capital of France is [MASK].'
inputs = tokenizer(text, return_tensors='pt')

# Show what the tokenizer produced, in order:
# tokenized words, token type, attention mask
for field in ('input_ids', 'token_type_ids', 'attention_mask'):
    print(inputs[field])

# index of the [MASK] token (column index within the single input row)
is_mask = inputs.input_ids == tokenizer.mask_token_id
mask_idx = is_mask.nonzero(as_tuple=True)[1]
print(mask_idx)

In [None]:
# Run the masked-LM forward pass without tracking gradients.
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits

# vocabulary scores at the [MASK] position(s) of the first (only) sequence
predicted_token_id = logits[0, mask_idx]

k = 5
top_k_tokens = torch.topk(predicted_token_id, k)
top_k_probs = torch.softmax(predicted_token_id, dim=-1)

print('Top', k, 'predicted tokens:')
for rank, token_id in enumerate(top_k_tokens.indices[0], start=1):
    token = tokenizer.decode([token_id])
    probability = top_k_probs[0, token_id].item()
    print(f'{rank}: {token} ({probability:.4f})')

In [None]:
# Example: list the top candidates for the single [MASK] in this sentence.
text = 'I went to [MASK] yesterday.'
show_top_k(text)

In [None]:
# Example: same helper, with the [MASK] at the end of a longer sentence.
text = 'The most important thing while bouldering is [MASK].'
show_top_k(text)

In [None]:
text = '' # any sentence you want; put [MASK] where the model should fill in a word

# show_top_k needs a mask token to predict; skip the call until one is provided
# so that running the untouched placeholder does not fail.
if tokenizer.mask_token in text:
    show_top_k(text)
else:
    print('Set `text` to a sentence containing', tokenizer.mask_token, 'and re-run this cell.')

NSP (Next Sentence Prediction)

In [None]:
from transformers import BertForNextSentencePrediction

# Next-sentence-prediction head on the same pretrained checkpoint name.
nsp_model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')

# Positive pair: the second sentence follows on from the first.
sent_a = "The Eiffel Tower is located in Paris."
sent_b = "It is one of the most famous landmarks in the world."

# Encode both sentences together as one (text, text_pair) input.
encoding = tokenizer(text=sent_a, text_pair=sent_b, return_tensors='pt')

In [None]:
# Model structure, followed by each tensor of the sentence-pair encoding.
print(nsp_model)
for field in ('input_ids', 'token_type_ids', 'attention_mask'):
    print(encoding[field])

# token number for [CLS] and [SEP]
special_tokens = (
    (tokenizer.cls_token, tokenizer.cls_token_id),
    (tokenizer.sep_token, tokenizer.sep_token_id),
)
for special_token, special_token_id in special_tokens:
    print(special_token, ':', special_token_id)

In [None]:
# inference: no gradients are needed here, and disabling them keeps
# `grad_fn=...` out of the printed probability tensor
with torch.no_grad():
    logits = nsp_model(**encoding).logits

# softmax over the two NSP classes. Per the Hugging Face documentation for
# BertForNextSentencePrediction, index 0 means "sent_b continues sent_a" and
# index 1 means "sent_b is a random sentence".
prob = torch.softmax(logits, dim=1)
print("IsNext prob (positive):", prob)
# predicted class index (0 = is next, 1 = not next)
print(torch.argmax(prob, dim=1).item())

In [None]:
def is_next_sentence(sent_1, sent_2):
    """Print the NSP class probabilities and predicted class for a sentence pair.

    Uses the module-level `tokenizer` and `nsp_model`. Prints the softmax
    over the model's two output logits, then the argmax index. Per the
    Hugging Face documentation for BertForNextSentencePrediction, index 0
    means `sent_2` continues `sent_1` and index 1 means it is a random
    sentence.

    Args:
        sent_1: First sentence of the pair.
        sent_2: Candidate follow-up sentence.

    Returns:
        None. Results are printed.
    """
    encoding = tokenizer(sent_1, sent_2, return_tensors='pt')

    # Inference only: skip autograd bookkeeping, which also keeps
    # `grad_fn=...` out of the printed tensor.
    with torch.no_grad():
        logits = nsp_model(**encoding).logits

    prob = torch.softmax(logits, dim=1)
    print("IsNext prob (positive):", prob)
    print(torch.argmax(prob, dim=1).item())

In [None]:
# Negative pair: sent_c is on a different topic from the Eiffel Tower
# sentence assigned to sent_a in the NSP setup cell.
sent_c = "Deep learning models require large datasets."
is_next_sentence(sent_a, sent_c)

In [None]:
# Exercise: replace both empty strings with your own sentence pair.
# NOTE: this rebinds sent_a / sent_b, which the earlier NSP cells also use.
sent_a = '' # fill your own sentence
sent_b = ''
is_next_sentence(sent_a, sent_b)

### 2. GPT-2

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch
import pandas as pd

In [None]:
# load tokenizer and model for gpt-2
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Use the GPT-2 tokenizer's own EOS id as the pad id. The BERT `tokenizer`
# from the previous section must not be used here: it belongs to a different
# vocabulary.
gpt2_model = GPT2LMHeadModel.from_pretrained("gpt2", pad_token_id=gpt2_tokenizer.eos_token_id)
print(gpt2_model)

In [None]:
# tokenize input sentence
sentence = 'In the future, artificial intelligence will'
# return_tensors='pt' asks for the prompt's token ids as a PyTorch tensor
input_ids = gpt2_tokenizer.encode(sentence, return_tensors='pt')
print(input_ids)

In [None]:
# generate a continuation of the input sentence (not just one word):
# sampling continues until EOS is produced or max_length tokens are reached
generated_ids = gpt2_model.generate(
    input_ids,
    # the prompt is a single, unpadded sequence, so every position is a real token
    attention_mask=torch.ones_like(input_ids),
    max_length=50,        # total tokens (prompt + generated)
    do_sample=True,       # switch on sampling
    top_k=50,             # top-k sampling
    top_p=0.95,           # nucleus sampling
    temperature=0.8,      # softness of probabilities
    eos_token_id=gpt2_tokenizer.eos_token_id,
    # pass the pad id explicitly (reusing EOS) so generation does not depend
    # on how the model config was set up when the model was loaded
    pad_token_id=gpt2_tokenizer.eos_token_id,
)

# decode the first (only) generated sequence back to text
generated_text = gpt2_tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(generated_text)

### 3. Attention Mask comparison

In [None]:
text = "The quick brown fox jumps."

# Short aliases for the tokenizers and models loaded in the earlier sections.
bert_tok, gpt2_tok = tokenizer, gpt2_tokenizer
bert, gpt2 = model, gpt2_model

# Encode the same sentence with each tokenizer.
bert_ids = bert_tok(text, return_tensors="pt")
gpt2_ids = gpt2_tok(text, return_tensors="pt")

# attention mask for decoder-only transformer based gpt-2:
# the `bias` buffer of the first block's attention module, cast to float
first_block_attention = gpt2.transformer.h[0].attn
causal_mask = first_block_attention.bias.float()
print("GPT-2 attention mask:")
print(causal_mask)
