In [None]:
# Note: the following cells were run on Google Colab

In [None]:
# Check GPU
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [None]:
# Check memory: report the runtime's RAM and whether it counts as "high-RAM"
from psutil import virtual_memory

# psutil's `total` field, converted from bytes to decimal gigabytes
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# Runtimes with 20 GB or more are reported as high-RAM
is_high_ram = ram_gb >= 20
print('You are using a high-RAM runtime!' if is_high_ram else 'Not using a high-RAM runtime')

In [None]:
# intsall packages for colab
! pip install datasets transformers
! apt install git-lfs

In [None]:
# Authenticate this session with the Hugging Face Hub.
# NOTE(review): presumably needed because the training arguments further down
# set push_to_hub=True — confirm.
from huggingface_hub import notebook_login

# Displays the interactive login prompt in the notebook output
notebook_login()

In [None]:
# Standard library
import os

# Third-party
import transformers
from nltk.tokenize import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer
from sklearn.model_selection import train_test_split

# Record the library version used for this run
print(transformers.__version__)
# Last expression of the cell: displays the current working directory
os.getcwd()

In [None]:
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, TextDataset

# Tokenizer published alongside the checkpoint that is fine-tuned below.
# NOTE(review): `truncation` / `max_length` are usually per-call encoding
# arguments — verify they have the intended effect when passed here.
tokenizer = AutoTokenizer.from_pretrained(
    "HScomcom/gpt2-game-of-thrones",
    truncation=True,
    max_length=512,
)

# Register explicit begin/end-of-sequence markers as the tokenizer's bos/eos tokens.
# NOTE(review): presumably the corpus files contain these markers — confirm.
SPECIAL_TOKENS = dict(bos_token="[BOS]", eos_token="[EOS]")
tokenizer.add_special_tokens(SPECIAL_TOKENS)

# Text files used for training and evaluation (relative paths)
train_path = 'GOT_Train_Final1.txt'
test_path = 'GOT_Test_Final1.txt'

def load_dataset(train_path, test_path, tokenizer, block_size=50):
    """Build the train/eval datasets and the batch collator for fine-tuning.

    Args:
        train_path: Path of the text file used for training.
        test_path: Path of the text file used for evaluation.
        tokenizer: Tokenizer handed to both datasets and to the collator.
        block_size: ``block_size`` forwarded to each ``TextDataset``.
            Defaults to 50, the value that used to be hard-coded.

    Returns:
        A tuple ``(train_dataset, test_dataset, data_collator)``.
    """
    # NOTE(review): TextDataset is flagged as deprecated by recent transformers
    # releases — consider migrating to the `datasets` library.
    # Both splits are built the same way; the training split is constructed first.
    train_dataset, test_dataset = (
        TextDataset(tokenizer=tokenizer, file_path=path, block_size=block_size)
        for path in (train_path, test_path)
    )

    # mlm=False turns off masked-language-model masking in the collator
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

    return train_dataset, test_dataset, data_collator


# Build both datasets and the collator from the corpus paths and tokenizer defined above
train_dataset, test_dataset, data_collator = load_dataset(
    train_path=train_path,
    test_path=test_path,
    tokenizer=tokenizer,
)

In [None]:
from transformers import AutoModelForCausalLM, AutoModelWithLMHead, Trainer, TrainingArguments

# AutoModelWithLMHead is deprecated upstream; AutoModelForCausalLM is its
# replacement for causal language models. The old name stays imported in case
# other cells still reference it.
# (A stock checkpoint such as "gpt2-xl" can be substituted here.)
model = AutoModelForCausalLM.from_pretrained("HScomcom/gpt2-game-of-thrones")
# Special tokens were added to the tokenizer after loading, so resize the
# embedding matrix to the tokenizer's current vocabulary size.
model.resize_token_embeddings(len(tokenizer))

# Hyper-parameters for the fine-tuning run
training_args = TrainingArguments(
    # Output location
    output_dir="GPT2-large-GOTfinetuned_v5",  # checkpoints are written under this directory
    overwrite_output_dir=True,  # overwrite the content of the output directory
    # Optimisation
    num_train_epochs=3,
    learning_rate=9e-6,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # Evaluate and log every 50 update steps
    evaluation_strategy="steps",
    eval_steps=50,
    logging_strategy='steps',
    logging_steps=50,
    # Save a checkpoint every 300 steps and push to the Hugging Face Hub
    save_steps=300,
    push_to_hub=True,
)


# Bundle the model, hyper-parameters, datasets and collator into a Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,  # the held-out split is used for the periodic evaluations
    data_collator=data_collator,
)

In [None]:
# Run the fine-tuning loop configured on `trainer`.
# The commented block below is the captured log and result of one such run.
trainer.train()

# ***** Running training *****
#   Num examples = 378
#   Num Epochs = 3
#   Instantaneous batch size per device = 2
#   Total train batch size (w. parallel, distributed & accumulation) = 2
#   Gradient Accumulation steps = 1
#   Total optimization steps = 567
#   Number of trainable parameters = 774032640
#  [567/567 11:23, Epoch 3/3]
# Step	Training Loss	Validation Loss
# 50	7.001300	5.906636
# 100	5.125400	4.208858
# 150	3.968200	3.765974
# 200	3.614600	3.627266
# 250	3.252600	3.604956
# 300	3.242000	3.568900
# 350	3.140900	3.545990
# 400	3.062200	3.538033
# 450	2.877700	3.557756
# 500	2.877600	3.549932
# 550	2.848400	3.549331
# ***** Running Evaluation *****
#   Num examples = 98
#   Batch size = 2
# ***** Running Evaluation *****
#   Num examples = 98
#   Batch size = 2
# ***** Running Evaluation *****
#   Num examples = 98
#   Batch size = 2
# ***** Running Evaluation *****
#   Num examples = 98
#   Batch size = 2
# ***** Running Evaluation *****
#   Num examples = 98
#   Batch size = 2
# ***** Running Evaluation *****
#   Num examples = 98
#   Batch size = 2
# Saving model checkpoint to GPT2-large-GOTfinetuned_v5/checkpoint-300
# Configuration saved in GPT2-large-GOTfinetuned_v5/checkpoint-300/config.json
# Model weights saved in GPT2-large-GOTfinetuned_v5/checkpoint-300/pytorch_model.bin
# Several commits (3) will be pushed upstream.
# WARNING:huggingface_hub.repository:Several commits (3) will be pushed upstream.
# ***** Running Evaluation *****
#   Num examples = 98
#   Batch size = 2
# ***** Running Evaluation *****
#   Num examples = 98
#   Batch size = 2
# ***** Running Evaluation *****
#   Num examples = 98
#   Batch size = 2
# ***** Running Evaluation *****
#   Num examples = 98
#   Batch size = 2
# ***** Running Evaluation *****
#   Num examples = 98
#   Batch size = 2


# Training completed. Do not forget to share your model on huggingface.co/models =)


# TrainOutput(global_step=567, training_loss=3.7041593884664867, metrics={'train_runtime': 685.3742, 'train_samples_per_second': 1.655, 'train_steps_per_second': 0.827, 'total_flos': 240994414080000.0, 'train_loss': 3.7041593884664867, 'epoch': 3.0})