In [1]:
from transformers import GPT2LMHeadModel

from LSTM import max_length

model = GPT2LMHeadModel.from_pretrained("gpt2")

for main_name, main_module in model.named_modules():
    print(main_name)
    for sub_name, sub_module in main_module.named_children():
        print("L", sub_name)
        for ssub_name, ssub_module in sub_module.named_children():
            print("| L", ssub_name)
            for sssub_name, sssub_module in ssub_module.named_children():
                print("| | L", sssub_name)

  from .autonotebook import tqdm as notebook_tqdm



L transformer
| L wte
| L wpe
| L drop
| L h
| | L 0
| | L 1
| | L 2
| | L 3
| | L 4
| | L 5
| | L 6
| | L 7
| | L 8
| | L 9
| | L 10
| | L 11
| L ln_f
L lm_head
transformer
L wte
L wpe
L drop
L h
| L 0
| | L ln_1
| | L attn
| | L ln_2
| | L mlp
| L 1
| | L ln_1
| | L attn
| | L ln_2
| | L mlp
| L 2
| | L ln_1
| | L attn
| | L ln_2
| | L mlp
| L 3
| | L ln_1
| | L attn
| | L ln_2
| | L mlp
| L 4
| | L ln_1
| | L attn
| | L ln_2
| | L mlp
| L 5
| | L ln_1
| | L attn
| | L ln_2
| | L mlp
| L 6
| | L ln_1
| | L attn
| | L ln_2
| | L mlp
| L 7
| | L ln_1
| | L attn
| | L ln_2
| | L mlp
| L 8
| | L ln_1
| | L attn
| | L ln_2
| | L mlp
| L 9
| | L ln_1
| | L attn
| | L ln_2
| | L mlp
| L 10
| | L ln_1
| | L attn
| | L ln_2
| | L mlp
| L 11
| | L ln_1
| | L attn
| | L ln_2
| | L mlp
L ln_f
transformer.wte
transformer.wpe
transformer.drop
transformer.h
L 0
| L ln_1
| L attn
| | L c_attn
| | L c_proj
| | L attn_dropout
| | L resid_dropout
| L ln_2
| L mlp
| | L c_fc
| | L c_proj
| | L act
| | 

In [6]:
from transformers import pipeline
import torch

# 장치 설정
device = "cuda" if torch.cuda.is_available() else "cpu"

# 텍스트 생성 파이프라인 초기화
generator = pipeline(task="text-generation", model="gpt2", device=0 if device == "cuda" else -1)

# 텍스트 생성
outputs = generator(
    text_inputs="machine learning is",
    max_length=20,
    num_return_sequences=3,
    pad_token_id=generator.tokenizer.eos_token_id  # 수정된 부분
)

# 출력 결과
for idx, output in enumerate(outputs):
    print(f"Generated Text {idx + 1}: {output['generated_text']}")


Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Generated Text 1: machine learning is likely to run its course on what's happening and how it might play out in a
Generated Text 2: machine learning is something new, and I don't mind using it if it works well for me.
Generated Text 3: machine learning is already pretty big in Russia and you have quite a number of other things you've got


In [7]:
#!pip install torch==2.3.0 torchtext==0.18.0 portalocker torchdata torchvision==0.18.0

import torch

from torchtext.datasets import CoLA
from transformers import AutoTokenizer
from torch.utils.data import DataLoader


def collator(batch, tokenizer, device):
    source, labels, texts = zip(*batch)
    tokenized = tokenizer(
        texts,
        padding="longest",
        truncation=True,
        return_tensors="pt"
    )
    input_ids = tokenized["input_ids"].to(device)
    attention_mask = tokenized["attention_mask"].to(device)
    labels = torch.tensor(labels, dtype=torch.long).to(device)
    return input_ids, attention_mask, labels


train_data = list(CoLA(split="train"))
valid_data = list(CoLA(split="dev"))
test_data = list(CoLA(split="test"))

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

epochs = 3
batch_size = 16
device = "cuda" if torch.cuda.is_available() else "cpu"

train_dataloader = DataLoader(
    train_data,
    batch_size=batch_size,
    collate_fn=lambda x: collator(x, tokenizer, device),
    shuffle=True,
)
valid_dataloader = DataLoader(
    valid_data, batch_size=batch_size, collate_fn=lambda x: collator(x, tokenizer, device)
)
test_dataloader = DataLoader(
    test_data, batch_size=batch_size, collate_fn=lambda x: collator(x, tokenizer, device)
)

print("Train Dataset Length :", len(train_data))
print("Valid Dataset Length :", len(valid_data))
print("Test Dataset Length :", len(test_data))

################################################################################
The 'datapipes', 'dataloader2' modules are deprecated and will be removed in a
future torchdata release! Please see https://github.com/pytorch/data/issues/1196
to learn more and leave feedback.
################################################################################



Train Dataset Length : 8550
Valid Dataset Length : 526
Test Dataset Length : 515


In [8]:
from torch import optim
from transformers import GPT2ForSequenceClassification

model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=2)
model.config.pad_token_id = model.config.eos_token_id
optimizer = optim.Adam(model.parameters(), lr=5e-5)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
import numpy as np
from torch import nn


def calc_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

def train(model, optimizer, dataloader):
    model.train()
    train_loss = 0.0

    for input_ids, attention_mask, labels in dataloader:
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        train_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    train_loss = train_loss / len(dataloader)
    return train_loss

def evaluation(model, dataloader):
    with torch.no_grad():
        model.eval()
        criterion = nn.CrossEntropyLoss()
        val_loss, val_accuracy = 0.0, 0.0

        for input_ids, attention_mask, labels in dataloader:
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            logits = outputs.logits

            loss = criterion(logits, labels)
            logits = logits.detach().cpu().numpy()
            label_ids = labels.to("cpu").numpy()
            accuracy = calc_accuracy(logits, label_ids)

            val_loss += loss
            val_accuracy += accuracy

    val_loss = val_loss/len(dataloader)
    val_accuracy = val_accuracy/len(dataloader)
    return val_loss, val_accuracy


best_loss = 10000
for epoch in range(epochs):
    train_loss = train(model, optimizer, train_dataloader)
    val_loss, val_accuracy = evaluation(model, valid_dataloader)
    print(f"Epoch {epoch + 1}: Train Loss: {train_loss:.4f} Val Loss: {val_loss:.4f} Val Accuracy {val_accuracy:.4f}")

    if val_loss < best_loss:
        best_loss = val_loss
        torch.save(model.state_dict(), "../models/GPT2ForSequenceClassification.pt")
        print("Saved the model weights")

Epoch 1: Train Loss: 0.6394 Val Loss: 0.5318 Val Accuracy 0.7370
Saved the model weights
Epoch 2: Train Loss: 0.5279 Val Loss: 0.4934 Val Accuracy 0.7827
Saved the model weights
Epoch 3: Train Loss: 0.4056 Val Loss: 0.4697 Val Accuracy 0.7841
Saved the model weights


In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=2).to(device)
model.config.pad_token_id = model.config.eos_token_id
model.load_state_dict(torch.load("../models/GPT2ForSequenceClassification.pt"))

test_loss, test_accuracy = evaluation(model, test_dataloader)
print(f"Test Loss : {test_loss:.4f}")
print(f"Test Accuracy : {test_accuracy:.4f}")

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Test Loss : 0.5447
Test Accuracy : 0.7241
