In [None]:
!pip install torch transformers[torch] accelerate -U
from transformers import GPT2LMHeadModel, GPT2Tokenizer, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

# Load pre-trained model and tokenizer (GPT2 from Hugging_Face)
model_name = 'gpt2'
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Prepare dataset for training and testing
train_path = '/content/drive/My Drive/Colab_Notebooks/datasets/train_data.jsonl'
test_path = '/content/drive/My Drive/Colab_Notebooks/datasets/test_data.jsonl'

In [4]:
# Create a dataset from the training and test files
# TextDataset is a utility from transformers that reads and formats the data file for training
train_dataset = TextDataset(
  tokenizer=tokenizer, # The tokenizer used to process the text
  file_path=train_path, # The path to the training data
  block_size=128 # The size of the blocks the dataset will be split into for training
)

test_dataset = TextDataset(
  tokenizer=tokenizer, # The tokenizer used to process the text
  file_path=test_path, # The path to the testing data
  block_size=128 # The size of the blocks the dataset will be split into for testing
)

# Set up the data collator
# Data collator is responsible for batching the data and preparing it for input into the model
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, # The tokenizer used for preparing the data
    mlm=False, # Indicates whether masked language modeling is used. False for causal (autoregressive) language modeling with GPT-2
)

# Define the training arguments
# These specify various settings and hyperparameters for the training process
training_args = TrainingArguments(
    output_dir='./results', # Directory where the training outputs (like model checkpoints) will be saved
    overwrite_output_dir=True, # If True, overwrite the contents of the output directory if it already exists
    num_train_epochs=3, # The total number of training epochs (complete passes through the dataset)
    per_device_train_batch_size=4, # Batch size per device during training
    per_device_eval_batch_size=4, # Batch size for evaluation
    eval_steps=400, # Perform an evaluation every `eval_steps` steps
    save_steps=800, # Save a model checkpoint every `save_steps` steps
    warmup_steps=500, # Number of warmup steps for learning rate scheduler
    prediction_loss_only=True, # When True, only return the loss; otherwise, also return logits and more during evaluation
)

# Initialize the Trainer
trainer = Trainer(
    model=model, # The model to be trained
    args=training_args, # The training arguments
    data_collator=data_collator, # The data collator
    train_dataset=train_dataset, # The training dataset
    eval_dataset=test_dataset, # The evaluation (testing) dataset
)

# Start the training process
trainer.train()



Step,Training Loss


TrainOutput(global_step=6, training_loss=2.9471543629964194, metrics={'train_runtime': 1.197, 'train_samples_per_second': 15.038, 'train_steps_per_second': 5.013, 'total_flos': 1175814144000.0, 'train_loss': 2.9471543629964194, 'epoch': 3.0})

In [5]:
# Save the fine-tuned model and tokenizer for future use
model_save_path = "/content/drive/My Drive/Colab_Notebooks/fine_tuned_model_gpt2"
tokenizer_save_path = "/content/drive/My Drive/Colab_Notebooks/fine_tuned_tokenizer_gpt2"

# Saving the model
model.save_pretrained(model_save_path)
# Saving the tokenizer associated with the model
tokenizer.save_pretrained(tokenizer_save_path)

('/content/drive/My Drive/Colab_Notebooks/fine_tuned_tokenizer_gpt2/tokenizer_config.json',
 '/content/drive/My Drive/Colab_Notebooks/fine_tuned_tokenizer_gpt2/special_tokens_map.json',
 '/content/drive/My Drive/Colab_Notebooks/fine_tuned_tokenizer_gpt2/vocab.json',
 '/content/drive/My Drive/Colab_Notebooks/fine_tuned_tokenizer_gpt2/merges.txt',
 '/content/drive/My Drive/Colab_Notebooks/fine_tuned_tokenizer_gpt2/added_tokens.json')

In [10]:
#After saving the fine-tuned model and tokenizer, you can load them back using the from_pretrained method. Here’s how you do it:

from transformers import GPT2LMHeadModel, GPT2Tokenizer

model_save_path = "/content/drive/My Drive/Colab_Notebooks/fine_tuned_model_gpt2"
tokenizer_save_path = "/content/drive/My Drive/Colab_Notebooks/fine_tuned_tokenizer_gpt2"

# Load the fine-tuned model
model = GPT2LMHeadModel.from_pretrained(model_save_path)

# Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_save_path)

# Ensure the tokenizer uses the correct padding token
tokenizer.pad_token = tokenizer.eos_token

In [15]:
# User query
user_query = "What's the temperature of the sun?"

input_ids = tokenizer.encode(user_query, return_tensors='pt', add_special_tokens=True)

# Generate a response using the model with adjusted parameters
output = model.generate(
    input_ids,
    max_length=200,  # Adjust as needed for longer responses
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id,
    temperature=0.3,  # Adjust for creativity
    top_k=50,
    top_p=0.92,
    repetition_penalty=1.2,
    no_repeat_ngram_size=2,
    do_sample=True  # Important for using temperature, top_k, and top_p
)

# Decode the generated response
response = tokenizer.decode(output[0], skip_special_tokens=True)

print("Response:", response)

Response: What's the temperature of the sun?
The temperatures are not exactly what you'd expect from a planet with an average annual mean surface area (AAP). The Aap is just one measure that measures how much heat energy gets absorbed by Earth. So, for example if we were to take our current climate and put it into perspective: If there was no solar activity at all in 2012-2013 then this would be about 1/3 as hot per year compared between 2011 & 2013 - which means 2 times more CO2 than today! This isn't even close enough... but let's go back further because now I'm sure some people will say "well why don' t they have better data?" Well yes indeed!! We can see here on NASA website http://www1.nasaearthquakes.org/. And look up their page where most scientists agree when looking at global warming rates over time based upon satellite measurements or other sources such like weather stations etc.. It seems pretty clear these
