In [19]:
!pip install torch transformers[torch] accelerate -U
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

# Load pre-trained model and tokenizer (GPT2 from Hugging_Face)
model_name = 'gpt2'
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)



In [20]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [21]:
# Prepare dataset for training and testing
train_path = '/content/drive/My Drive/Colab_Notebooks/datasets/train_data.jsonl'
test_path = '/content/drive/My Drive/Colab_Notebooks/datasets/test_data.jsonl'

In [22]:
# Create a dataset from the training and test files
# TextDataset is a utility from transformers that reads and formats the data file for training
train_dataset = TextDataset(
  tokenizer=tokenizer, # The tokenizer used to process the text
  file_path=train_path, # The path to the training data
  block_size=128 # The size of the blocks the dataset will be split into for training
)

test_dataset = TextDataset(
  tokenizer=tokenizer, # The tokenizer used to process the text
  file_path=test_path, # The path to the testing data
  block_size=128 # The size of the blocks the dataset will be split into for testing
)

# Set up the data collator
# Data collator is responsible for batching the data and preparing it for input into the model
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, # The tokenizer used for preparing the data
    mlm=False, # Indicates whether masked language modeling is used. False for causal (autoregressive) language modeling with GPT-2
)

# Define the training arguments
# These specify various settings and hyperparameters for the training process
training_args = TrainingArguments(
    output_dir='./results', # Directory where the training outputs (like model checkpoints) will be saved
    overwrite_output_dir=True, # If True, overwrite the contents of the output directory if it already exists
    num_train_epochs=3, # The total number of training epochs (complete passes through the dataset)
    per_device_train_batch_size=4, # Batch size per device during training
    per_device_eval_batch_size=4, # Batch size for evaluation
    eval_steps=400, # Perform an evaluation every `eval_steps` steps
    save_steps=800, # Save a model checkpoint every `save_steps` steps
    warmup_steps=500, # Number of warmup steps for learning rate scheduler
    prediction_loss_only=True, # When True, only return the loss; otherwise, also return logits and more during evaluation
)

# Initialize the Trainer
trainer = Trainer(
    model=model, # The model to be trained
    args=training_args, # The training arguments
    data_collator=data_collator, # The data collator
    train_dataset=train_dataset, # The training dataset
    eval_dataset=test_dataset, # The evaluation (testing) dataset
)

# Start the training process
trainer.train()



Step,Training Loss


TrainOutput(global_step=6, training_loss=2.9371121724446616, metrics={'train_runtime': 68.5845, 'train_samples_per_second': 0.262, 'train_steps_per_second': 0.087, 'total_flos': 1175814144000.0, 'train_loss': 2.9371121724446616, 'epoch': 3.0})

In [23]:
# Save the fine-tuned model and tokenizer for future use
model_save_path = "/content/drive/My Drive/Colab_Notebooks/fine_tuned_model_gpt2"
tokenizer_save_path = "/content/drive/My Drive/Colab_Notebooks/fine_tuned_tokenizer_gpt2"

# Saving the model
model.save_pretrained(model_save_path)
# Saving the tokenizer associated with the model
tokenizer.save_pretrained(tokenizer_save_path)

('/content/drive/My Drive/Colab_Notebooks/fine_tuned_tokenizer_gpt2/tokenizer_config.json',
 '/content/drive/My Drive/Colab_Notebooks/fine_tuned_tokenizer_gpt2/special_tokens_map.json',
 '/content/drive/My Drive/Colab_Notebooks/fine_tuned_tokenizer_gpt2/vocab.json',
 '/content/drive/My Drive/Colab_Notebooks/fine_tuned_tokenizer_gpt2/merges.txt',
 '/content/drive/My Drive/Colab_Notebooks/fine_tuned_tokenizer_gpt2/added_tokens.json')

In [24]:
#After saving the fine-tuned model and tokenizer, you can load them back using the from_pretrained method. Here’s how you do it:

from transformers import GPT2LMHeadModel, GPT2Tokenizer

model_save_path = "/content/drive/My Drive/Colab_Notebooks/fine_tuned_model_gpt2"
tokenizer_save_path = "/content/drive/My Drive/Colab_Notebooks/fine_tuned_tokenizer_gpt2"

# Load the fine-tuned model
model = GPT2LMHeadModel.from_pretrained(model_save_path)

# Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_save_path)


In [30]:
#Performing Inference

# User query
user_query = "What's the temperature of mars?"

# Encode the user query using the tokenizer. Add the EOS token as GPT-2 requires.
input_ids = tokenizer.encode(user_query + tokenizer.eos_token, return_tensors='pt')

# Generate a response using the model. Adjust `max_length` as needed.
output = model.generate(input_ids, max_length=100, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)

# Decode the generated response
response = tokenizer.decode(output[0], skip_special_tokens=True)

print("Response:", response)

A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


Response: What's the temperature of mars?The first time I saw the new "The Walking Dead" trailer, I was so excited. I was so excited to see the first trailer for the upcoming season of the show. I was so excited to see the first trailer for the upcoming season of the show.

I was so excited to see the first trailer for the upcoming season of the show. I was so excited to see the first trailer for the upcoming season of the show.

I
