In [1]:
import json
import random
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling

# Check if CUDA is available and set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the JSON dataset
with open('dataset.json', 'r') as f:
    dataset = json.load(f)

# Shuffle the dataset
random.shuffle(dataset)

# Split the dataset into training and testing sets
train_data = dataset[:80]
test_data = dataset[80:]

# Function to write the datasets to files
def write_dataset_to_file(data, file_path):
    with open(file_path, 'w') as f:
        for entry in data:
            sentence = entry['sentence']
            output = " | ".join(entry['output'])
            f.write(f"Sentence: {sentence} Output: {output}\n")

# Write the train and test datasets to respective files
write_dataset_to_file(train_data, 'train.txt')
write_dataset_to_file(test_data, 'test.txt')

# Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2').to(device)

# Function to create a dataset
def create_dataset(file_path, tokenizer):
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=128
    )

# Function to create a data collator
def create_data_collator(tokenizer):
    return DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )

# Load the datasets
train_dataset = create_dataset("train.txt", tokenizer)
test_dataset = create_dataset("test.txt", tokenizer)
data_collator = create_data_collator(tokenizer)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=500,
    save_steps=10_000,
    save_total_limit=2,
    evaluation_strategy="epoch",
    logging_dir='./logs',
    logging_steps=10,
    report_to=[],  # Disables wandb logging
)

# Create Trainer instance
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Train the model
trainer.train()

# Save the model
model.save_pretrained('./finetuned_gpt2')
tokenizer.save_pretrained('./finetuned_gpt2')

# Example function to generate relation extractions
def generate_relation_extraction(model, tokenizer, sentence):
    inputs = tokenizer.encode(sentence, return_tensors="pt").to(device)
    outputs = model.generate(inputs, max_length=50, num_return_sequences=1)
    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return result

# Example usage
sentence = "Apple Inc. is looking at buying U.K. startup for $1 billion."
result = generate_relation_extraction(model, tokenizer, sentence)
print(result)


  from .autonotebook import tqdm as notebook_tqdm


Epoch,Training Loss,Validation Loss
1,No log,2.406777


RuntimeError: CUDA error: an illegal memory access was encountered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
