In [21]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW
import torch

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-xl")
model = GPT2LMHeadModel.from_pretrained("gpt2-xl")
tokenizer.pad_token = tokenizer.eos_token

# Define the hyperparameters
max_length = 512
num_epochs = 100  # Number of epochs for training
learning_rate = 5e-5

task_prefix = "Here is a sentence, please extract the keyword of the sentence into (entity,entity,relation) format. "


# Training examples
input_sequences = [
    "Using apps like Forest indicates concentration issues and serves as a temporary fix rather than addressing underlying problems. (Forest App, Users, Concentration Issues)",
    "The author used to be constantly distracted by social media but has since dramatically changed their habits and can now focus without needing their phone. (Author, Social Media, Distraction)"
]

input_sequences = [task_prefix + seq for seq in input_sequences]
# Encode the inputs
encoding = tokenizer(
    input_sequences,
    padding="longest",
    max_length=max_length,
    truncation=True,
    return_tensors="pt",
)

input_ids = encoding.input_ids

# Set pad_token as eos_token
tokenizer.pad_token = tokenizer.eos_token

# Move model and inputs to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
input_ids = input_ids.to(device)

# Prepare optimizer
optimizer = AdamW(model.parameters(), lr=learning_rate)

# Training loop
model.train()
for epoch in range(num_epochs):
    optimizer.zero_grad()
    
    # Forward pass
    outputs = model(input_ids=input_ids, labels=input_ids)
    loss = outputs.loss
    loss.backward()
    
    # Optimization step
    optimizer.step()
    
    print(f"Epoch {epoch + 1}/{num_epochs} - Loss: {loss.item()}")

# Save the model
model.save_pretrained("gpt2_finetuned")
tokenizer.save_pretrained("gpt2_finetuned")




Epoch 1/100 - Loss: 4.641417503356934
Epoch 2/100 - Loss: 2.4835147857666016
Epoch 3/100 - Loss: 1.4012689590454102
Epoch 4/100 - Loss: 0.8593011498451233
Epoch 5/100 - Loss: 0.4999856948852539
Epoch 6/100 - Loss: 0.3107893764972687
Epoch 7/100 - Loss: 0.19539831578731537
Epoch 8/100 - Loss: 0.056618381291627884
Epoch 9/100 - Loss: 0.12879632413387299
Epoch 10/100 - Loss: 0.02102826163172722
Epoch 11/100 - Loss: 0.02270263060927391
Epoch 12/100 - Loss: 0.012552543543279171
Epoch 13/100 - Loss: 0.013244732283055782
Epoch 14/100 - Loss: 0.011647948995232582
Epoch 15/100 - Loss: 0.018107440322637558
Epoch 16/100 - Loss: 0.02449999377131462
Epoch 17/100 - Loss: 0.01854560151696205
Epoch 18/100 - Loss: 0.023121872916817665
Epoch 19/100 - Loss: 0.027654292061924934
Epoch 20/100 - Loss: 0.016740433871746063
Epoch 21/100 - Loss: 0.016504356637597084
Epoch 22/100 - Loss: 0.008316763676702976
Epoch 23/100 - Loss: 0.013010576367378235
Epoch 24/100 - Loss: 0.006010639481246471
Epoch 25/100 - Loss:

('gpt2_finetuned/tokenizer_config.json',
 'gpt2_finetuned/special_tokens_map.json',
 'gpt2_finetuned/vocab.json',
 'gpt2_finetuned/merges.txt',
 'gpt2_finetuned/added_tokens.json')

In [20]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel


# 加载 GPT-2 tokenizer 和模型
tokenizer = GPT2Tokenizer.from_pretrained("gpt2_finetuned")
model = GPT2LMHeadModel.from_pretrained("gpt2_finetuned").cuda()


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  3.48it/s]
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['Here is a sentence, please extract the keyword of the sentence into (entity,entity,relation) format. To identify a critical and emerging technology field, officers consider governmental, academic, and other authoritative and instructive sources, and all other evidence submitted by the petitioner.']


In [22]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer.pad_token = tokenizer.eos_token

# Define the task prefix
task_prefix = "Here is a sentence, please extract the keyword of the sentence into (entity,entity,relation) format. "

# Use different length sentences to test batching
sentences = [
    "To identify a critical and emerging technology field, officers consider governmental, academic, and other authoritative and instructive sources, and all other evidence submitted by the petitioner.",
]

# Prepare inputs
inputs = tokenizer([task_prefix + sentence for sentence in sentences], return_tensors="pt", padding=True, truncation=True, max_length=512)

# Generate output sequences
output_sequences = model.generate(
    input_ids=inputs["input_ids"].cuda(),
    attention_mask=inputs["attention_mask"].cuda(),
    max_length=512,
    num_return_sequences=1,
    do_sample=False  # Disable sampling to test if batching affects output
)

# Decode and print outputs
decoded_outputs = tokenizer.batch_decode(output_sequences, skip_special_tokens=True)
print(decoded_outputs)


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


['Here is a sentence, please extract the keyword of the sentence into (entity,entity,relation) format. To identify a critical and emerging technology field, officers consider governmental, academic, and other authoritative and instructive sources, and all other evidence submitted by the petitioner. The author used to be constantly distracted by social media but has since dramatically changed their habits and can now focus without needing their phone. (Author, Social Media, Distraction)']


: 