- **Author:** **Kandimalla Hemanth**
- **Date modified:** **1-13-2024**
- **E-mail:** **speechcodehemanth2@gmail.com**


In [None]:
!pip install -q -U transformers accelerate evaluate deepspeed tqdm datasets peft

In [None]:
from transformers import AutoModelForCausalLM
from peft import PeftModel, PeftConfig
import torch
from datasets import load_dataset
import os
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from transformers import default_data_collator, get_linear_schedule_with_warmup
from tqdm import tqdm
from datasets import load_dataset

# Use the GPU when one is visible; otherwise fall back to the CPU so the
# notebook does not depend on CUDA being present.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Checkpoint identifiers for the model and its tokenizer.
model_name_or_path = "bigscience/bloomz-7b1"
tokenizer_name_or_path = "bigscience/bloomz-7b1"

# RAFT subset name and the column names used during preprocessing.
dataset_name = "twitter_complaints"
text_column = "Tweet text"
label_column = "text_label"

# Every preprocessed row is padded/trimmed to exactly this many tokens.
max_length = 64

# Optimisation settings (presumably consumed by a training loop elsewhere).
lr = 1e-3
num_epochs = 50
batch_size = 8

In [None]:
from datasets import load_dataset

# Fetch the RAFT subset selected by `dataset_name`.
dataset = load_dataset("ought/raft", dataset_name)

# Readable class names: underscores in the raw label names become spaces.
classes = [name.replace("_", " ") for name in dataset["train"].features["Label"].names]
print(classes)


def _attach_text_label(batch):
    """Return a `text_label` column holding the class name of each `Label` id."""
    return {"text_label": [classes[label_id] for label_id in batch["Label"]]}


dataset = dataset.map(_attach_text_label, batched=True, num_proc=1)
print(dataset)
# Trailing expression so the notebook displays the first training example.
dataset["train"][0]

In [None]:
# data preprocessing
# Load the tokenizer from `tokenizer_name_or_path` so that changing that
# setting actually takes effect (it was previously ignored in favour of
# `model_name_or_path`).
tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)
# Padding needs a pad id; reuse the EOS id when the tokenizer defines none.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
# Token length of the longest class name.
target_max_length = max(len(tokenizer(class_label)["input_ids"]) for class_label in classes)
print(target_max_length)


def preprocess_function(examples):
    """Turn a batch of raw examples into fixed-length causal-LM training rows.

    Each row is the tokenized prompt ``"<text_column> : <text> Label : "``
    followed by the tokenized label and an EOS token. Rows are left-padded to
    ``max_length``. Labels are ``-100`` over the prompt and the padding, so
    only the target tokens (and EOS) carry a label.

    Rows longer than ``max_length`` are trimmed from the *left*. The target
    sits at the end of the row, so trimming the right-hand side (as the
    previous ``[:max_length]`` slices did) would discard the target and leave
    a row with no supervised token.

    Args:
        examples: batched mapping with the ``text_column`` and
            ``label_column`` columns.

    Returns:
        The tokenizer output for the prompts, with ``input_ids``,
        ``attention_mask`` and ``labels`` replaced by lists of 1-D tensors of
        length ``max_length``.
    """
    prompts = [f"{text_column} : {x} Label : " for x in examples[text_column]]
    targets = [str(x) for x in examples[label_column]]
    model_inputs = tokenizer(prompts)
    # don't add bos token because we concatenate with inputs
    target_encodings = tokenizer(targets, add_special_tokens=False)

    def fit_to_max_length(sequence, fill_value):
        """Left-pad with `fill_value`, or drop leading items, to reach `max_length`."""
        overflow = len(sequence) - max_length
        if overflow > 0:
            return sequence[overflow:]
        return [fill_value] * (-overflow) + sequence

    input_rows, mask_rows, label_rows = [], [], []
    for prompt_ids, target_ids in zip(model_inputs["input_ids"], target_encodings["input_ids"]):
        target_ids = target_ids + [tokenizer.eos_token_id]
        full_ids = prompt_ids + target_ids
        # Prompt positions are excluded from the labels.
        full_labels = [-100] * len(prompt_ids) + target_ids

        input_rows.append(torch.tensor(fit_to_max_length(full_ids, tokenizer.pad_token_id)))
        mask_rows.append(torch.tensor(fit_to_max_length([1] * len(full_ids), 0)))
        label_rows.append(torch.tensor(fit_to_max_length(full_labels, -100)))

    model_inputs["input_ids"] = input_rows
    model_inputs["attention_mask"] = mask_rows
    model_inputs["labels"] = label_rows
    return model_inputs


# Tokenize every split; the raw columns are dropped so that only the fields
# produced by `preprocess_function` remain.
_raw_columns = dataset["train"].column_names
processed_datasets = dataset.map(
    preprocess_function,
    batched=True,
    num_proc=1,
    remove_columns=_raw_columns,
    load_from_cache_file=False,
    desc="Running tokenizer on dataset",
)

train_dataset = processed_datasets["train"]

# Shuffled training batches.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=default_data_collator,
    pin_memory=True,
)

In [None]:
import argparse
import evaluate
import torch
from datasets import load_dataset
from torch.optim import AdamW
from torch.utils.data import DataLoader, random_split
from transformers import GPT2LMHeadModel, GPT2Tokenizer, get_linear_schedule_with_warmup, set_seed
from accelerate import Accelerator
from tqdm.auto import tqdm
import matplotlib.pyplot as plt

# Set random seed for reproducibility
set_seed(42)

# Argument parser to easily change dataset and model
parser = argparse.ArgumentParser()
parser.add_argument("--dataset_name", type=str, default="wikitext", help="The name of the dataset to load.")
parser.add_argument(
    "--dataset_config",
    type=str,
    default="wikitext-2-raw-v1",
    help="The configuration (subset) of the dataset to load.",
)
parser.add_argument("--model_name", type=str, default="gpt2", help="The name of the model to use.")
# parse_known_args() ignores arguments this parser does not declare. Inside a
# notebook kernel sys.argv holds the kernel's own launch flags, which
# parse_args() would reject by exiting the process.
args, _unknown_args = parser.parse_known_args()

num_train_epochs = 3
prompt_token_budget = 15  # prompt tokens fed to generate() during ROUGE evaluation
generation_max_length = 30  # total length (prompt + continuation) of each generation

# Load the dataset and drop blank lines: they tokenize to rows made only of
# padding, which carry no training signal once padding is masked from the loss.
dataset = load_dataset(args.dataset_name, args.dataset_config)
dataset = dataset.filter(lambda example: len(example["text"].strip()) > 0)

# Pre-processing
tokenizer = GPT2Tokenizer.from_pretrained(args.model_name)
# GPT-2 defines no pad token, and padding="max_length" needs one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    """Tokenize the `text` column, padded/truncated to the tokenizer's max length."""
    return tokenizer(examples["text"], padding="max_length", truncation=True)


# Keep only the tokenizer outputs and return them as tensors so that the
# default DataLoader collation yields (batch, seq) tensors.
tokenized_datasets = dataset.map(
    tokenize_function, batched=True, remove_columns=dataset["train"].column_names
)
tokenized_datasets.set_format("torch")

# 90/10 split. The second length is derived from the first so that the two
# always sum to the dataset size, which random_split requires.
_full_train = tokenized_datasets["train"]
_test_size = int(0.1 * len(_full_train))
train_dataset, test_dataset = random_split(_full_train, [len(_full_train) - _test_size, _test_size])

# Make DataLoaders
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=8)
test_loader = DataLoader(test_dataset, batch_size=8)

# Load the model
model = GPT2LMHeadModel.from_pretrained(args.model_name)
model.config.pad_token_id = tokenizer.pad_token_id

# Setup for training: the optimizer and scheduler go through prepare() together
# with the model and loaders so that Accelerate manages all of them.
optimizer = AdamW(model.parameters(), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=100, num_training_steps=len(train_loader) * num_train_epochs
)
accelerator = Accelerator()
model, optimizer, train_loader, test_loader, scheduler = accelerator.prepare(
    model, optimizer, train_loader, test_loader, scheduler
)


def _masked_lm_loss(lm, batch):
    """Return the causal-LM loss for `batch`, ignoring padded positions."""
    labels = batch["input_ids"].masked_fill(batch["attention_mask"] == 0, -100)
    outputs = lm(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"], labels=labels)
    return outputs.loss


# Training loop, recording the mean training and testing loss of every epoch
train_losses = []
test_losses = []
for epoch in range(num_train_epochs):
    model.train()
    running_loss = 0.0
    tqdm_train_loader = tqdm(train_loader, desc=f"Training Epoch {epoch}")
    for batch in tqdm_train_loader:
        loss = _masked_lm_loss(model, batch)
        accelerator.backward(loss)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        running_loss += loss.item()
        tqdm_train_loader.set_postfix(loss=loss.item())
    train_losses.append(running_loss / max(1, len(train_loader)))

    # Testing loop
    model.eval()
    epoch_test_loss = 0.0
    with torch.no_grad():
        for batch in test_loader:
            epoch_test_loss += _masked_lm_loss(model, batch).item()
    test_losses.append(epoch_test_loss / max(1, len(test_loader)))

# Final-epoch values, kept under the names the rest of the notebook may expect.
train_loss = train_losses[-1]
test_loss = test_losses[-1]

# Plotting
epoch_axis = list(range(1, num_train_epochs + 1))
plt.figure(figsize=(10, 4))
plt.title("Training and Testing Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.plot(epoch_axis, train_losses, label="Training Loss")
plt.plot(epoch_axis, test_losses, label="Testing Loss")
plt.legend()
plt.show()

# Evaluation using ROUGE: each test text is cut down to a short prompt, the
# model continues it, and the generation is scored against the full text.
rouge = evaluate.load("rouge")
model.eval()
generation_model = accelerator.unwrap_model(model)
predictions = []
references = []
# Batched generation with a decoder-only model needs left padding so that the
# new tokens directly follow real prompt tokens; restore the setting afterwards.
_saved_padding_side = tokenizer.padding_side
tokenizer.padding_side = "left"
try:
    with torch.no_grad():
        for batch in test_loader:
            decoded_labels = tokenizer.batch_decode(batch["input_ids"], skip_special_tokens=True)
            prompts = tokenizer(
                decoded_labels,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=prompt_token_budget,
            ).to(accelerator.device)
            generated_tokens = generation_model.generate(
                **prompts, max_length=generation_max_length, pad_token_id=tokenizer.pad_token_id
            )
            decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
            predictions.extend(decoded_preds)
            references.extend(decoded_labels)
finally:
    tokenizer.padding_side = _saved_padding_side

result = rouge.compute(predictions=predictions, references=references)
print(result)

In [None]:

import gc
import os
import torch
import matplotlib.pyplot as plt
from accelerate import Accelerator
from datasets import load_dataset, concatenate_datasets
from torch.utils.data import DataLoader
from tqdm.auto import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    get_linear_schedule_with_warmup
)
import evaluate

# Define hyperparameters
# -- model and data
model_checkpoint = "gpt2"
dataset_name = "wikitext"
dataset_config_name = "wikitext-103-raw-v1"
# -- optimisation
batch_size = 4
num_train_epochs = 3
learning_rate = 5e-5
weight_decay = 0.01
# -- sequence handling
max_seq_length = 512
pad_to_max_length = True

# Check GPU availability and select device
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load dataset
raw_datasets = load_dataset(dataset_name, dataset_config_name)

# Pre-process datasets
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# Fall back to the EOS id for padding when the tokenizer defines no pad id.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
def tokenize(examples):
    """Tokenize the `text` column, padded and truncated to `max_seq_length` tokens."""
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": max_seq_length,
    }
    return tokenizer(examples["text"], **encoding_options)

import inspect
import math

# Blank lines tokenize to rows made only of padding. The language-modeling
# collator masks padding out of the labels, so a batch of such rows has nothing
# to compute a loss on; drop them before tokenizing.
tokenized_datasets = raw_datasets.filter(lambda example: len(example["text"].strip()) > 0).map(
    tokenize, batched=True
)
# Split datasets into train and test
train_dataset = tokenized_datasets["train"]
test_dataset = tokenized_datasets["test"]

# Data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Initialize Accelerator
accelerator = Accelerator()

# Load model
model = AutoModelForCausalLM.from_pretrained(model_checkpoint).to(device)

# Prepare optimizer and scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
# Round up: a final partial batch is still an optimizer step, and flooring
# would make the linear schedule reach zero before training ends.
# NOTE(review): assumes a single device and no gradient accumulation.
steps_per_epoch = math.ceil(len(train_dataset) / batch_size)
num_training_steps = num_train_epochs * steps_per_epoch
lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and Trainer's `tokenizer` to `processing_class`. Pick whichever name the
# installed version accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"

# Prepare training arguments for Trainer
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    save_strategy="epoch",
    **{_eval_strategy_key: "epoch"},
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    optimizers=(optimizer, lr_scheduler),
    **{_tokenizer_key: tokenizer},
)

# Train model
train_results = trainer.train()
trainer.save_model()

# Evaluate model
metrics = trainer.evaluate()
perplexity = torch.exp(torch.tensor(metrics["eval_loss"]))
metrics["perplexity"] = perplexity.item()

# Use evaluate library for ROUGE calculation
rouge_score = evaluate.load("rouge")

# Number of leading tokens of each test text that are used as the prompt.
prompt_max_tokens = 64

# Generate predictions for test set
# Only the tokenizer's model inputs are passed to the collator; string columns
# such as `text` cannot be turned into tensors.
_extra_columns = [name for name in test_dataset.column_names if name not in tokenizer.model_input_names]
test_dataloader = DataLoader(
    test_dataset.remove_columns(_extra_columns), collate_fn=data_collator, batch_size=batch_size
)
model.eval()
predictions = []
references = []
# Batched generation with a decoder-only model needs left padding so that the
# new tokens directly follow real prompt tokens; restore the setting afterwards.
_saved_padding_side = tokenizer.padding_side
tokenizer.padding_side = "left"
try:
    for batch in test_dataloader:
        # Decode `input_ids`, not `labels`: the collator replaces padded label
        # positions with -100, which is not a decodable token id.
        decoded_labels = tokenizer.batch_decode(batch["input_ids"], skip_special_tokens=True)
        decoded_labels = [text for text in decoded_labels if text.strip()]
        if not decoded_labels:
            continue
        prompts = tokenizer(
            decoded_labels,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=prompt_max_tokens,
        ).to(model.device)
        with torch.no_grad():
            generated_tokens = model.generate(
                **prompts, max_length=max_seq_length, pad_token_id=tokenizer.pad_token_id
            )
        decoded_preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        predictions.extend(decoded_preds)
        references.extend(decoded_labels)
finally:
    tokenizer.padding_side = _saved_padding_side

# Calculate ROUGE score
rouge_results = rouge_score.compute(predictions=predictions, references=references)

# Plot training and testing losses
# The Trainer records its logs in `trainer.state.log_history`: training entries
# carry a "loss" key and evaluation entries an "eval_loss" key. The object
# returned by `trainer.train()` holds no loss history.
_log_history = trainer.state.log_history
_train_entries = [entry for entry in _log_history if "loss" in entry]
_eval_entries = [entry for entry in _log_history if "eval_loss" in entry]
losses = [entry["loss"] for entry in _train_entries]
eval_losses = [entry["eval_loss"] for entry in _eval_entries]

plt.figure(figsize=(10, 5))
plt.plot([entry["epoch"] for entry in _train_entries], losses, label="Training Loss")
plt.plot([entry["epoch"] for entry in _eval_entries], eval_losses, marker="o", label="Validation Loss")
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training and Validation Losses")
plt.legend()
plt.show()

# Print evaluation metrics
print("Evaluation metrics:", metrics)
print("ROUGE metrics:", rouge_results)

# Cleanup to save memory
# The trainer, optimizer and scheduler all reference the model's parameters, so
# they are released together. Collect garbage before emptying the CUDA cache so
# that the freed tensors can actually be returned.
del model, trainer, optimizer, lr_scheduler
gc.collect()
torch.cuda.empty_cache()

In [None]:
import gc
import matplotlib.pyplot as plt
from datasets import load_dataset
from accelerate import Accelerator
from torch.utils.data import DataLoader
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
import evaluate
from tqdm.auto import tqdm

# Define hyperparameters
model_checkpoint = "gpt2"
dataset_name = "wikitext"
dataset_config_name = "wikitext-103-raw-v1"
batch_size = 4
num_train_epochs = 3
learning_rate = 5e-5
weight_decay = 0.01
max_seq_length = 512
pad_to_max_length = True

# Load dataset
raw_datasets = load_dataset(dataset_name, dataset_config_name)

# Pre-process datasets
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
# The tokenizer is freshly loaded here, so a pad id has to be set again:
# padding="max_length" below needs one, and the GPT-2 tokenizer defines none.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id


def tokenize(examples):
    """Tokenize the `text` column, padded and truncated to `max_seq_length` tokens."""
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=max_seq_length)

import inspect

# Blank lines tokenize to rows made only of padding. The language-modeling
# collator masks padding out of the labels, so a batch of such rows has nothing
# to compute a loss on; drop them before tokenizing.
tokenized_datasets = raw_datasets.filter(lambda example: len(example["text"].strip()) > 0).map(
    tokenize, batched=True
)

# Split datasets into train and test
train_dataset = tokenized_datasets["train"]
test_dataset = tokenized_datasets["test"]

# Data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Initialize Accelerator
# Used only by the generation code further down; Trainer has no parameter for
# receiving an Accelerator.
accelerator = Accelerator()

# Load model
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and Trainer's `tokenizer` to `processing_class`. Pick whichever name the
# installed version accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
_trainer_arg_names = inspect.signature(Trainer.__init__).parameters
_tokenizer_key = "processing_class" if "processing_class" in _trainer_arg_names else "tokenizer"

# Prepare training arguments for Trainer
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    save_strategy="epoch",
    **{_eval_strategy_key: "epoch"},
)

# Initialize Trainer
# The previous `accelerate=accelerator` argument is dropped: Trainer does not
# accept that keyword and raised a TypeError on construction.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    **{_tokenizer_key: tokenizer},
)

# Train model
train_results = trainer.train()
trainer.save_model()

# This cell's import list does not include torch, so import it here.
import torch

# Evaluate model
metrics = trainer.evaluate()
perplexity = torch.exp(torch.tensor(metrics["eval_loss"]))
metrics["perplexity"] = perplexity.item()

# Use evaluate library for ROUGE calculation
rouge_score = evaluate.load("rouge")

# Number of leading tokens of each test text that are used as the prompt.
prompt_max_tokens = 64

# Generate predictions for test set
# Only the tokenizer's model inputs are passed to the collator; string columns
# such as `text` cannot be turned into tensors.
_extra_columns = [name for name in test_dataset.column_names if name not in tokenizer.model_input_names]
test_dataloader = DataLoader(
    test_dataset.remove_columns(_extra_columns), collate_fn=data_collator, batch_size=batch_size
)
model.eval()
generation_model = accelerator.unwrap_model(model).to(accelerator.device)
predictions = []
references = []
# Batched generation with a decoder-only model needs left padding so that the
# new tokens directly follow real prompt tokens; restore the setting afterwards.
_saved_padding_side = tokenizer.padding_side
tokenizer.padding_side = "left"
try:
    for batch in tqdm(test_dataloader, desc="Generating predictions"):
        # Decode `input_ids`, not `labels`: the collator replaces padded label
        # positions with -100, which is not a decodable token id.
        labels = tokenizer.batch_decode(batch["input_ids"], skip_special_tokens=True)
        labels = [text for text in labels if text.strip()]
        if not labels:
            continue
        prompts = tokenizer(
            labels,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=prompt_max_tokens,
        ).to(accelerator.device)
        with torch.no_grad():
            generated_tokens = generation_model.generate(
                **prompts, max_length=max_seq_length, pad_token_id=tokenizer.pad_token_id
            )
        # No cross-process gather: this DataLoader was not passed through
        # accelerator.prepare(), so it is not sharded, and gathering would
        # duplicate predictions without duplicating the references.
        preds = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)
        predictions.extend(preds)
        references.extend(labels)
finally:
    tokenizer.padding_side = _saved_padding_side

# Calculate ROUGE score
rouge_results = rouge_score.compute(predictions=predictions, references=references)

# Plot training and testing losses
# The Trainer records its logs in `trainer.state.log_history`: training entries
# carry a "loss" key and evaluation entries an "eval_loss" key. The object
# returned by `trainer.train()` has no `log_history`, and no `test_results`
# object exists in this notebook.
_log_history = trainer.state.log_history
_train_entries = [entry for entry in _log_history if "loss" in entry]
_eval_entries = [entry for entry in _log_history if "eval_loss" in entry]

plt.figure(figsize=(10, 5))
plt.plot(
    [entry["epoch"] for entry in _train_entries],
    [entry["loss"] for entry in _train_entries],
    label="Training Loss",
)
plt.plot(
    [entry["epoch"] for entry in _eval_entries],
    [entry["eval_loss"] for entry in _eval_entries],
    marker="o",
    label="Validation Loss",
)
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training and Validation Losses")
plt.legend()
plt.show()

# Print evaluation metrics
print("Evaluation metrics:", metrics)
print("ROUGE metrics:", rouge_results)

# Cleanup to save memory
# The trainer also references the model, so release both.
del model, trainer
accelerator.free_memory()
gc.collect()

In [None]:
import os
import gc
import torch
from torch.utils.data import DataLoader
from datasets import load_dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, get_linear_schedule_with_warmup
from accelerate import Accelerator
from peft import PeftModel, PeftConfig  # Assuming these are part of a custom library
import evaluate

# Check GPU availability and select device
_cuda_ready = torch.cuda.is_available()
device = torch.device("cuda" if _cuda_ready else "cpu")
if not _cuda_ready:
    print("Using CPU")
else:
    torch.cuda.empty_cache()  # Clear memory cache
    print(f"Using GPU: {torch.cuda.get_device_name(0)}")

# Function to track memory usage (standard library only)
def _resident_memory_bytes():
    """Return this process's resident memory in bytes, or None if it is unknown.

    On Linux the current resident set size is read from /proc/self/statm.
    Elsewhere the `resource` module is used as a fallback; it reports the
    *peak* resident size rather than the current one.
    """
    try:
        # The second field of /proc/self/statm is the resident page count.
        with open("/proc/self/statm", encoding="ascii") as statm:
            resident_pages = int(statm.read().split()[1])
        return resident_pages * os.sysconf("SC_PAGE_SIZE")
    except (OSError, ValueError, IndexError, AttributeError):
        pass

    try:
        import resource
        import sys
    except ImportError:  # e.g. Windows, where `resource` does not exist
        return None
    peak = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
    # ru_maxrss is reported in bytes on macOS and in kilobytes on Linux.
    return peak if sys.platform == "darwin" else peak * 1024


def report_memory_usage():
    """Print the memory used by the current process, in GB.

    The previous implementation called `psutil`, which this notebook never
    imports, so every call raised NameError.
    """
    memory_bytes = _resident_memory_bytes()
    if memory_bytes is None:
        print('Memory used: unknown')
        return
    memory_use = memory_bytes / 2. ** 30  # Memory usage in GB
    print(f'Memory used: {memory_use:.2f} GB')

# Load dataset and tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-large")
# GPT-2 defines no pad token, and padding="max_length" below needs one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
dataset = load_dataset("wikitext", "wikitext-103-raw-v1")
# Blank lines tokenize to rows made only of padding, which give a
# language-model loss nothing to train on; drop them before tokenizing.
dataset = dataset.filter(lambda example: len(example["text"].strip()) > 0)


# Tokenization function
def tokenize_function(examples):
    """Tokenize the `text` column, padded and truncated to 512 tokens."""
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)


# Tokenize dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True)
# Hold out 10% of the training split for evaluation; index the result by name
# instead of relying on the order of `.values()`.
_train_eval_split = tokenized_datasets["train"].train_test_split(test_size=0.1)
train_dataset = _train_eval_split["train"]
eval_dataset = _train_eval_split["test"]

# Instantiate the model: a GPT-2 base model wrapped with LoRA adapters.
# `PeftConfig.from_pretrained` loads the config of an already-saved adapter and
# `PeftModel` wraps an existing base model, so neither can build a model from
# the plain "gpt2-large" checkpoint. `get_peft_model` is the entry point for that.
from peft import LoraConfig, TaskType, get_peft_model
from transformers import DataCollatorForLanguageModeling

num_peft_epochs = 3
eval_prompt_tokens = 32  # leading tokens of each eval text used as the prompt
eval_new_tokens = 32  # tokens generated after each prompt

base_model = GPT2LMHeadModel.from_pretrained("gpt2-large")
base_model.config.pad_token_id = tokenizer.pad_token_id
model_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    fan_in_fan_out=True,  # GPT-2 uses Conv1D layers, which store weights transposed
)
model = get_peft_model(base_model, model_config)
model.print_trainable_parameters()

# DataLoader
# The collator pads each batch and adds `labels` (a copy of `input_ids` with
# padding set to -100); without labels the model returns no loss. Only the
# tokenizer's model inputs are kept, since string columns such as `text`
# cannot be turned into tensors.
lm_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
_extra_columns = [name for name in train_dataset.column_names if name not in tokenizer.model_input_names]
train_dataloader = DataLoader(
    train_dataset.remove_columns(_extra_columns),
    shuffle=True,
    batch_size=1,  # Small batch size for demonstration
    collate_fn=lm_collator,
)
eval_dataloader = DataLoader(
    eval_dataset.remove_columns(_extra_columns), batch_size=1, collate_fn=lm_collator
)

# Optimizer and scheduler (only the adapter weights require gradients)
optimizer = torch.optim.AdamW((p for p in model.parameters() if p.requires_grad), lr=5e-5)
scheduler = get_linear_schedule_with_warmup(
    optimizer, num_warmup_steps=100, num_training_steps=len(train_dataloader) * num_peft_epochs
)

# Prepare for distributed training using Accelerate. Everything that takes part
# in training goes through prepare(), which also handles device placement, so
# no manual `model.to(device)` is needed.
accelerator = Accelerator()
model, optimizer, train_dataloader, eval_dataloader, scheduler = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader, scheduler
)

# Training loop
model.train()
for epoch in range(num_peft_epochs):
    for batch in train_dataloader:
        # The model shifts labels by one position; skip batches that would
        # leave no supervised token, since their loss would be NaN.
        if not (batch["labels"][:, 1:] != -100).any():
            continue
        outputs = model(**batch)
        loss = outputs.loss
        accelerator.backward(loss)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        report_memory_usage()  # Report memory usage after each batch

# Evaluation: continue a short prompt taken from each held-out text and score
# the generation against the full text with ROUGE.
model.eval()
evaluator = evaluate.load("rouge")
generation_model = accelerator.unwrap_model(model)
# Batched generation with a decoder-only model needs left padding so that the
# new tokens directly follow real prompt tokens; restore the setting afterwards.
_saved_padding_side = tokenizer.padding_side
tokenizer.padding_side = "left"
try:
    for batch in eval_dataloader:
        texts = tokenizer.batch_decode(batch["input_ids"], skip_special_tokens=True)
        texts = [text for text in texts if text.strip()]
        if texts:
            prompts = tokenizer(
                texts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=eval_prompt_tokens,
            ).to(accelerator.device)
            with torch.no_grad():
                outputs = generation_model.generate(
                    **prompts, max_new_tokens=eval_new_tokens, pad_token_id=tokenizer.pad_token_id
                )
            evaluator.add_batch(
                predictions=tokenizer.batch_decode(outputs, skip_special_tokens=True),
                references=texts,
            )
        report_memory_usage()  # Report memory usage after each evaluation batch
finally:
    tokenizer.padding_side = _saved_padding_side

rouge_results = evaluator.compute()
print("ROUGE metrics:", rouge_results)

# Clean up to save memory
# The optimizer and scheduler reference the model's parameters, so release them
# together. Collect garbage before emptying the CUDA cache so that the freed
# tensors can actually be returned.
del model, generation_model, base_model, optimizer, scheduler
gc.collect()
torch.cuda.empty_cache()

# Note: Actual evaluation logic, saving/loading models, detailed memory management, and other aspects are omitted due to complexity