In [None]:
# ! pip install datasets

In [None]:
import torch  # PyTorch for tensor operations and ML.
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments  # Tools for loading, training, and fine-tuning GPT-2.
from datasets import load_metric  # To load and calculate evaluation metrics.
from datasets import load_dataset, load_from_disk  # For loading and managing datasets.
from torch.quantization import quantize_dynamic  # For applying dynamic quantization to the GPT-2.


In [None]:

# Pick the compute device: CUDA when PyTorch can see a GPU, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the GPT-2 tokenizer and register a dedicated "[PAD]" token for padding.
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens({"pad_token": "[PAD]"})

# Load the pretrained weights, move them to the chosen device, then resize the
# embedding matrix so it matches the tokenizer's vocabulary size (which now
# includes the added pad token).
model = AutoModelForCausalLM.from_pretrained(model_name)
model = model.to(device)
model.resize_token_embeddings(len(tokenizer))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Embedding(50258, 768)

 Model Fine-Tuning

In [None]:


def tokenize_function(examples, max_length=128):
    """Tokenize a batch of text rows for causal language-model fine-tuning.

    Args:
        examples: Batch mapping whose "text" entry is a list of strings (the
            form passed in by ``dataset.map(..., batched=True)``).
        max_length: Length every sequence is truncated or padded to.

    Returns:
        The tokenizer output holding three tensors: "input_ids",
        "attention_mask" (1 for real tokens, 0 for padding) and "labels"
        (a copy of "input_ids" in which padding positions are set to -100).
    """
    tokenized = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=max_length,
        return_tensors="pt",
    )

    is_padding = tokenized["input_ids"] == tokenizer.pad_token_id
    tokenized["attention_mask"] = (~is_padding).long()

    # For language modeling the labels are the inputs themselves, but padding
    # must not be scored: -100 is the index the cross-entropy loss ignores, so
    # the model is not trained (or evaluated) on predicting pad tokens.
    labels = tokenized["input_ids"].clone()
    labels[is_padding] = -100
    tokenized["labels"] = labels
    return tokenized

# Load and preprocess dataset
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

# Drop rows whose text is empty or whitespace-only before tokenizing: such a
# row would become a sequence made entirely of padding, which costs compute
# without contributing any language-modeling signal.
dataset = dataset.filter(lambda example: example["text"].strip() != "")

# Tokenize every split in batches; adds input_ids, attention_mask and labels.
dataset = dataset.map(tokenize_function, batched=True)





Downloading readme:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/733k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/6.36M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

Map:   0%|          | 0/4358 [00:00<?, ? examples/s]

Map:   0%|          | 0/36718 [00:00<?, ? examples/s]

Map:   0%|          | 0/3760 [00:00<?, ? examples/s]

Training Arguments:

Device: The model was trained on a GPU to leverage faster computation.

Learning Rate: A learning rate of 1e-5 was chosen to ensure more gradual updates, helping to mitigate overfitting and stabilize training.

Batch Size: Both training and evaluation batch sizes were set to 16, which efficiently utilizes the available GPU memory.

Epochs: The model was trained for 3 epochs, balancing training time and performance while reducing the risk of overfitting.

Weight Decay: A weight decay of 0.1 was applied as regularization; this value was settled on after several test runs to help minimize overfitting.

Mixed Precision Training: Using fp16 for mixed precision training was a strategic choice to speed up training and reduce memory usage without significant accuracy loss.

In [None]:
# Define training arguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # run evaluation once at the end of each epoch
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    save_steps=1000,
    weight_decay=0.1,
    logging_dir="./logs",
    logging_steps=10,
    # Mixed precision is only enabled when a CUDA device is present, so the
    # notebook still runs when `device` falls back to "cpu".
    fp16=torch.cuda.is_available(),
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    # Monitor training on the validation split; the test split stays held out
    # so it can serve as an unbiased final check.
    eval_dataset=dataset["validation"],
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [None]:
# Run fine-tuning with the Trainer built in the previous cell. The call is the
# last expression of the cell, so its TrainOutput is what the cell displays.
trainer.train()


Epoch,Training Loss,Validation Loss
1,0.8004,1.352406
2,0.9433,1.345479
3,0.9461,1.336387


TrainOutput(global_step=6885, training_loss=0.870061821061455, metrics={'train_runtime': 419.3909, 'train_samples_per_second': 262.652, 'train_steps_per_second': 16.417, 'total_flos': 7195590623232000.0, 'train_loss': 0.870061821061455, 'epoch': 3.0})

The validation loss falls slightly from epoch to epoch (1.352 → 1.345 → 1.336), which suggests that the model is still improving, but only very slowly. Such small gains between epochs indicate that the model may be nearing convergence.


In [None]:
# Save the fine-tuned model
# The weights and the tokenizer files are written to the same directory so
# that both can later be reloaded from "./fine_tuned_model".
model.save_pretrained("./fine_tuned_model")
# Last expression of the cell: the tuple of written tokenizer file paths is
# what gets displayed.
tokenizer.save_pretrained("./fine_tuned_model")

('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.json',
 './fine_tuned_model/merges.txt',
 './fine_tuned_model/added_tokens.json',
 './fine_tuned_model/tokenizer.json')

Prepare for Quantization after Fine-tuning our model


In [None]:
# Load the fine-tuned model for quantization
# PyTorch's dynamic quantization targets CPU execution, so the copy that is
# about to be quantized is kept on the CPU instead of being moved to `device`
# (which may be "cuda").
model = AutoModelForCausalLM.from_pretrained("./fine_tuned_model").to("cpu")

Apply dynamic Quantization


In [None]:
# Apply dynamic quantization
# Requests int8 (torch.qint8) dynamic quantization for the model's
# torch.nn.Linear modules and binds the result to `quantized_model`.
# NOTE(review): PyTorch documents dynamic quantization as CPU-only; confirm
# that `model` lives on the CPU at this point.
# NOTE(review): Hugging Face's GPT-2 appears to implement most of its
# projections as `Conv1D` rather than `nn.Linear`, so this call may quantize
# little more than the output head; inspect `quantized_model` to confirm.
quantized_model = quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)


In [None]:
# Save the quantized model
# Only the state_dict is written, not the module itself.
# NOTE(review): reloading presumably requires rebuilding the same quantized
# architecture before calling load_state_dict; confirm before relying on this file.
torch.save(quantized_model.state_dict(), "./quantized_model.pth")

In [None]:
# Evaluation helper: tokenize a prompt, generate a continuation, decode it.
def generate_text(model, tokenizer, prompt, max_length=50):
    """Generate a continuation of `prompt` and return it as readable text.

    Args:
        model: Model exposing `.device` and `.generate(...)`.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        prompt: Text the generation starts from.
        max_length: Forwarded to `generate` as `max_length`.

    Returns:
        The first generated sequence, decoded with special tokens skipped.
    """
    inputs = tokenizer(prompt, return_tensors='pt').to(model.device)
    outputs = model.generate(
        inputs['input_ids'],
        # Passing the mask and a pad id explicitly avoids the "attention mask
        # and the pad token id were not set" warning. EOS is used as the pad id
        # (the same fallback that warning reports) rather than the tokenizer's
        # added pad token, whose id may not exist in a model whose embeddings
        # were never resized.
        attention_mask=inputs['attention_mask'],
        pad_token_id=tokenizer.eos_token_id,
        max_length=max_length,
        num_return_sequences=1,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Perplexity helper: lower values mean the model finds the text less surprising.
def calculate_perplexity(model, tokenizer, text):
    """Return the perplexity `model` assigns to `text` as a Python float.

    The text is tokenized, moved to the model's device and fed to the model
    with the token ids used as their own labels; the perplexity is the
    exponential of the loss the model reports for that call.
    """
    token_ids = tokenizer(text, return_tensors='pt')['input_ids'].to(model.device)

    # Gradients are not needed for scoring, so skip building the autograd graph.
    with torch.no_grad():
        reported_loss = model(token_ids, labels=token_ids).loss

    return torch.exp(reported_loss).item()

In [None]:
# Baseline for comparison: the stock pretrained checkpoint, on the active device.
original_model = AutoModelForCausalLM.from_pretrained(model_name)
original_model = original_model.to(device)

# The checkpoint saved after fine-tuning, reloaded from disk onto the same device.
fine_tuned_model = AutoModelForCausalLM.from_pretrained("./fine_tuned_model")
fine_tuned_model = fine_tuned_model.to(device)

In [None]:
# Define evaluation prompts
# A single short prompt is used; the perplexity cell further down reuses it.
prompt = "once upon the time"

# Generate text
# Both models share the same tokenizer and prompt, so the only difference
# between the two outputs is the set of weights.
original_text = generate_text(original_model, tokenizer, prompt)
fine_tuned_text = generate_text(fine_tuned_model, tokenizer, prompt)

# Print generated texts
print("Original Text:", original_text)

print("Fine-Tuned Text:", fine_tuned_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Original Text: once upon the time of the first of the three, the first of the three, and the first of the three, and the first of the three, and the first of the three, and the first of the three, and the first of the
Fine-Tuned Text: once upon the time jump, the player assumes the role of a young lawyer who is hired by a law firm to represent clients in a trial. He is assisted by his partner, who is hired by the firm's client to represent the client in


Original Model Output:
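Content: The response is highly repetitive and lacks coherence. This kind of looping is a common artifact of greedy decoding (`generate_text` sets neither sampling nor a repetition penalty), so it is more likely caused by the decoding strategy than by overfitting, where the model memorizes patterns rather than generalizing effectively.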

Fine-Tuned Model Output:
Content: The response is more structured and coherent compared to the original model output.

In [None]:
# Calculate perplexity
# Both models are scored on the short `prompt` string itself, so each value
# rests on only a handful of tokens and should be read as a rough indication.
original_perplexity = calculate_perplexity(original_model, tokenizer, prompt)
fine_tuned_perplexity = calculate_perplexity(fine_tuned_model, tokenizer, prompt)

In [None]:
# Print perplexity results
# Reports the two scores computed in the previous cell (lower is better).
print("Original Model Perplexity:", original_perplexity)
print("Fine-Tuned Model Perplexity:", fine_tuned_perplexity)


Original Model Perplexity: 723.1328125
Fine-Tuned Model Perplexity: 1384.2364501953125


In [None]:
# NOTE(review): this import repeats names already imported in the notebook's
# first import cell, and neither name is used in this cell.
from transformers import AutoTokenizer, AutoModelForCausalLM

# Save the fine-tuned model and tokenizer
# `model` is the module-level variable, last assigned when the fine-tuned
# checkpoint was reloaded ahead of quantization.
# NOTE(review): "/content/..." is an absolute path that presumably assumes a
# Google Colab runtime; confirm before running elsewhere.
model.save_pretrained("/content/fine_tuned_model")
tokenizer.save_pretrained("/content/fine_tuned_model")


('/content/fine_tuned_model/tokenizer_config.json',
 '/content/fine_tuned_model/special_tokens_map.json',
 '/content/fine_tuned_model/vocab.json',
 '/content/fine_tuned_model/merges.txt',
 '/content/fine_tuned_model/added_tokens.json',
 '/content/fine_tuned_model/tokenizer.json')

In [None]:
# Mount Google Drive at /content/drive. The `google.colab` module is imported
# here, so this cell is only expected to run inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')