In [1]:
from transformers import GPT2Tokenizer, LineByLineTextDataset, DataCollatorForLanguageModeling, GPT2LMHeadModel, TrainingArguments, Trainer
import torch 

# Initialize the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Reuse the end-of-sequence token as the padding token so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token

# Define the size of the text blocks (passed to the dataset as `block_size`)
block_size = 128

# Load the dataset: one training example per line of the corpus file
train_dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="medical_corpus.txt",
    block_size=block_size,
)

# DataCollator handles padding; mlm=False selects causal (next-token) language modelling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Initialize the model
# The weights are deliberately kept in fp32. Building the model under
# torch.set_default_dtype(torch.half) produced fp16 *trainable* parameters, which
# makes optimisation numerically unstable and is rejected by AMP gradient scaling
# ("Attempting to unscale FP16 gradients"). Half precision is requested through
# the `fp16` training argument below instead (mixed precision on fp32 weights).
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=16,
    save_steps=100,
    save_total_limit=2,
    # fp16 mixed precision is only supported on CUDA devices; stay in fp32 on CPU.
    fp16=torch.cuda.is_available(),
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Training (left disabled in this cell)
#trainer.train()


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
from transformers import GPT2Tokenizer, LineByLineTextDataset, DataCollatorForLanguageModeling, GPT2LMHeadModel
from transformers import TrainingArguments, Trainer
# Accelerator / DeepSpeedPlugin are exported by `accelerate` and DeepSpeedConfig by
# `deepspeed`. `transformers` exports none of them, so importing them from there
# raised ImportError before anything else in this cell could run.
from accelerate import Accelerator, DeepSpeedPlugin
from deepspeed import DeepSpeedConfig
import torch
import deepspeed

# Initialize the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Reuse the end-of-sequence token as the padding token so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token

# Define the size of the text blocks (passed to the dataset as `block_size`)
block_size = 128

# Load the dataset: one training example per line of the corpus file
train_dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="medical_corpus.txt",
    block_size=block_size,
)

# DataCollator handles padding; mlm=False selects causal (next-token) language modelling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Initialize the model
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Define training arguments
# NOTE(review): the DeepSpeed config path is absolute and machine-specific.
# NOTE(review): when this runs inside a notebook rather than under the `deepspeed`
# launcher, the distributed environment variables presumably have to be set by
# hand first - confirm against the Transformers DeepSpeed notebook instructions.
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=16,
    save_steps=100,
    save_total_limit=2,
    deepspeed="/home/medical-llama/ds_config.json",  # path to deepspeed configuration file
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Training
trainer.train()


In [None]:
from transformers import GPT2Tokenizer, LineByLineTextDataset, DataCollatorForLanguageModeling, GPT2LMHeadModel, TrainingArguments, Trainer
import torch 
from accelerate import Accelerator

# No standalone Accelerator is created here: Trainer does not accept an
# `accelerator=` keyword (passing one raises TypeError) and sets up its own
# device placement / distributed handling from TrainingArguments.

# Initialize the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
# Reuse the end-of-sequence token as the padding token so batches can be padded.
tokenizer.pad_token = tokenizer.eos_token

# Define the size of the text blocks (passed to the dataset as `block_size`)
block_size = 128

# Load the dataset: one training example per line of the corpus file
train_dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="medical_corpus.txt",
    block_size=block_size,
)

# DataCollator handles padding; mlm=False selects causal (next-token) language modelling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

# Initialize the model
# The weights are deliberately kept in fp32. Building the model under
# torch.set_default_dtype(torch.half) produced fp16 *trainable* parameters, which
# makes optimisation numerically unstable and is rejected by AMP gradient scaling.
# Mixed precision is requested through the `fp16` training argument instead.
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=16,
    save_steps=100,
    save_total_limit=2,
    # fp16 mixed precision is only supported on CUDA devices; stay in fp32 on CPU.
    fp16=torch.cuda.is_available(),
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Training
trainer.train()


In [5]:
from transformers import AdamW
import torch
from torch.cuda.amp import GradScaler, autocast
from torch.nn.utils.rnn import pad_sequence

# Define device
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Cast to fp32 and move the model to the device *before* building the optimizer:
# - GradScaler expects fp32 parameters (it refuses to unscale fp16 gradients), and
#   `model` may have been created in half precision by an earlier cell;
# - PyTorch recommends constructing the optimizer after the model has been moved.
model.float()
model.to(device)

# Initialize the optimizer
# NOTE(review): `AdamW` here is the one imported from `transformers`, which is
# deprecated in favour of `torch.optim.AdamW` - consider switching the import.
optimizer = AdamW(model.parameters(),lr=0.0001)

# For fp16 training: loss scaling is only meaningful on CUDA, so it is switched
# off explicitly on CPU instead of relying on GradScaler's own fallback warning.
scaler = GradScaler(enabled=(device.type == 'cuda'))

# Convert dataset to DataLoader
from torch.utils.data import DataLoader

# NOTE(review): not referenced by any of the visible cells - confirm whether it
# is still needed.
max_seq_length = 10

def pad_collate_fn(batch, pad_token_id=0):
    """Collate tokenized examples into a padded batch for causal-LM training.

    Args:
        batch: list of examples. Each example is either a mapping with an
            ``"input_ids"`` entry or a bare sequence/tensor of token ids.
        pad_token_id: id used to right-pad shorter sequences (default 0, the
            value the original implementation padded with). Padded positions
            are masked out, so the exact id does not influence the loss.

    Returns:
        dict with three ``(batch, max_len)`` long tensors:
            ``input_ids``      - right-padded token ids,
            ``attention_mask`` - 1 for real tokens, 0 for padding,
            ``labels``         - copy of ``input_ids`` with padding set to -100
                                 so those positions are ignored by the LM loss.
    """
    sequences = []
    for example in batch:
        # Accept both dict-style items ({"input_ids": ...}) and raw id sequences.
        token_ids = example["input_ids"] if isinstance(example, dict) else example
        sequences.append(torch.as_tensor(token_ids, dtype=torch.long))

    lengths = torch.tensor([seq.size(0) for seq in sequences])
    input_ids = pad_sequence(sequences, batch_first=True, padding_value=pad_token_id)

    # Position j of row i is a real token iff j < length of sequence i.
    positions = torch.arange(input_ids.size(1)).unsqueeze(0)
    attention_mask = (positions < lengths.unsqueeze(1)).long()

    # Returning labels lets the model compute a loss; -100 is the ignore index.
    labels = input_ids.masked_fill(attention_mask == 0, -100)

    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}


dataloader = DataLoader(train_dataset, shuffle=True, batch_size=16, collate_fn=pad_collate_fn)

# Training loop
# Each batch must be a dict of tensors that the model accepts as keyword
# arguments; it has to include `labels`, otherwise the model returns no loss.
model.train()
for epoch in range(10):  # num of epochs
    for i, batch in enumerate(dataloader):
        # Move batch to device
        batch = {k: v.to(device) for k, v in batch.items()}

        # Forward pass under autocast
        with autocast():
            outputs = model(**batch)
            loss = outputs.loss

        if loss is None:
            raise ValueError("Model returned no loss; the batch must contain 'labels'.")

        # Backward pass with gradient scaling
        scaler.scale(loss).backward()

        # Gradient clipping: the gradients are still multiplied by the loss scale
        # at this point, so they must be unscaled first or the 1.0 threshold would
        # be compared against scaled norms.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Optimizer step and update scale
        scaler.step(optimizer)
        scaler.update()

        # Zero the gradients
        optimizer.zero_grad()

        # Log the loss
        if i % 10 == 0:  # print every 10 batches
            print(f'Epoch: {epoch}, Batch: {i}, Loss: {loss.item()}')

    # Save the model after each epoch
    #torch.save(model.state_dict(), f'./results/model_{epoch}.bin')




In [7]:
# Show the DataLoader object's repr as the cell output.
dataloader

<torch.utils.data.dataloader.DataLoader at 0x7f93709f8d90>

In [6]:
# Peek at the first collated batch only; nothing is printed if the loader is empty.
first_batch = next(iter(dataloader), None)
if first_batch is not None:
    print(first_batch)