In [None]:
!pip install transformers[torch]
!pip install datasets

Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.25.0
Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, TrainerCallback
import string
import numpy as np
import torch

In [None]:
# Mount Google Drive at /content/drive (Colab only); the data, tokenizer and model
# paths used in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Work from the CSV folder so the relative file name passed to pd.read_csv resolves
%cd '/content/drive/MyDrive/ECE1786/Project/CSVs'

/content/drive/.shortcut-targets-by-id/1dzrXydfa8tV9U7malP1p3mQtrDhnia76/Project/CSVs


In [None]:
# Read the CSV into a Pandas DataFrame.
# header=None means the first row of the file is treated as data, and `names`
# supplies the two column labels: the masked sentence and the word behind the mask.
data_df = pd.read_csv("trainingdata_gpt2.csv", header=None, names=["sentence", "label"])

In [None]:
# Remove all punctuation characters (not only trailing ones) from masked token labels

def remove_punctuation(text):
  """Return `text` with every character found in `string.punctuation` deleted.

  Characters are removed wherever they occur in the string, not only at the end.
  """
  # Map each punctuation code point to None, which str.translate treats as "delete".
  drop_table = dict.fromkeys(map(ord, string.punctuation))
  return text.translate(drop_table)

# Overwrite the 'label' column in place with its punctuation-free form
data_df['label'] = data_df['label'].apply(remove_punctuation)

In [None]:
data_df

Unnamed: 0,sentence,label
0,"MEMORANDUM FOR: Inspector General, [MASK]",CIA
1,document that would [MASK] CIA and the IC impr...,help
2,document that would help [MASK] and the IC imp...,CIA
3,sets a precedent for demands that [MASK] CIA also,the
4,sets a precedent for demands that the [MASK] also,CIA
...,...,...
15942,"James Locher, Victory on [MASK] Potomac: The G...",the
15943,"James Locher, Victory on the Potomac: The Gold...",Pentagon
15944,8. For a history of the DCI’s authority over t...,see
15945,8. For a history of the DCI’s authority over t...,CIA


In [None]:
# model_name selects the pretrained checkpoint that is loaded for fine-tuning further down.
# NOTE: the tokenizer is loaded from a saved directory, not from model_name, so changing
# model_name alone does not switch the tokenizer.
model_name = "gpt2" # Just change this to try out different models
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/ECE1786/Project/Tokenizer Files/gpt2_tokenizer") #load just the tokenizer


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Add masked token labels to vocabulary (cannot be split into multiple tokens)
# only needs to be run when trying out a new model or training set

# for i, word in enumerate(data_df["label"].values):
#   if word not in tokenizer.vocab:
#     tokenizer.add_tokens(word)

In [None]:
#tokenizer.save_pretrained("/content/drive/MyDrive/ECE1786/Project/Tokenizer Files/gpt2_tokenizer") #save the tokenizer

In [None]:
# tokenizer = AutoTokenizer.from_pretrained("drive/MyDrive/Project/Tokenizer Files/gpt2_tokenizer")

In [None]:
# data_df = data_df[:20] # For testing purposes

# Convert Pandas DataFrame to HuggingFace Dataset
full_dataset = Dataset.from_pandas(data_df)
full_dataset = full_dataset.shuffle(seed=42)
# Hold out 30% of the rows; the remaining 70% is the training set.
# Both splits are seeded so that train/val/test membership is identical on every run;
# otherwise a model saved in one session could later be scored on rows it was trained on.
train_testvalid = full_dataset.train_test_split(test_size=0.3, seed=42)
# Divide the 30% hold-out into 65% test and 35% validation
test_valid = train_testvalid['test'].train_test_split(test_size=0.65, seed=42)
# Gather the three splits into a single DatasetDict
data_ds = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'val': test_valid['train']})
data_ds

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 11162
    })
    test: Dataset({
        features: ['sentence', 'label'],
        num_rows: 3111
    })
    val: Dataset({
        features: ['sentence', 'label'],
        num_rows: 1674
    })
})

In [None]:
# Apply tokenizer

def tokenize_function(batch):
  """Tokenize a batch of sentences and build a per-token label row for each one.

  Sentences are padded/truncated to 64 tokens. Each label row is -100 at every
  position except the first occurrence of the tokenizer's mask token, where it
  holds the id of the first token of that example's label word (or the id that
  the tokenizer returns for " " when the label tokenizes to nothing). A row
  stays all -100 if the mask token does not appear in the encoded sentence.

  Args:
    batch: mapping with "sentence" and "label" columns of equal length.

  Returns:
    The tokenizer's batch encoding with an added "label" entry.
  """
  encoded_batch = tokenizer(batch["sentence"], padding="max_length", truncation=True, max_length=64)
  mask_id = tokenizer.mask_token_id
  encoded_labels = []
  for word, sent_ids in zip(batch["label"], encoded_batch.input_ids):
    row = [-100] * len(sent_ids)
    # Only one mask per sentence, so the first match is the one to supervise.
    mask_pos = next((pos for pos, tok in enumerate(sent_ids) if tok == mask_id), None)
    if mask_pos is not None:
      pieces = tokenizer.tokenize(word)
      target = pieces[0] if pieces else " "
      row[mask_pos] = tokenizer.convert_tokens_to_ids(target)
    encoded_labels.append(row)
  encoded_batch["label"] = encoded_labels
  return encoded_batch

# Tokenize every split; with batched=True tokenize_function receives columns of many rows at once
tokenized_data_ds = data_ds.map(tokenize_function, batched= True)

Map:   0%|          | 0/11162 [00:00<?, ? examples/s]

Map:   0%|          | 0/3111 [00:00<?, ? examples/s]

Map:   0%|          | 0/1674 [00:00<?, ? examples/s]

In [None]:
tokenized_data_ds["train"]["label"][0] # All -100 except 1

[-100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 50276,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100]

In [None]:
eval_len = tokenized_data_ds['val'].num_rows

In [None]:
def compute_metrics(eval_preds, k=10):
  """Top-k accuracy of the prediction at each sample's supervised (masked) position.

  A sample counts as correct when its label id is among the k highest-scoring
  vocabulary ids of the logits at the first position whose label is not -100.

  Args:
    eval_preds: pair (logits, labels); logits indexable as [sample, position, vocab]
      and labels as [sample, position], with -100 marking unsupervised positions.
    k: number of top predictions that may contain the label (default 10).

  Returns:
    {'accuracy': float} computed over the samples that have a supervised position;
    0.0 when no sample has one.
  """
  logits, labels = eval_preds
  labels = np.asarray(labels)

  # Locate the supervised position per sample. Samples with no such position
  # (e.g. the mask was truncated away) are dropped here, together with their row
  # index, so the remaining rows and positions always stay paired.
  is_target = labels != -100
  rows = np.nonzero(is_target.any(axis=1))[0]
  if rows.size == 0:
    return {'accuracy': 0.0}
  cols = is_target[rows].argmax(axis=1)  # first supervised position in each kept row

  # Gather only the logits at the supervised positions before ranking, so top-k
  # runs on (samples, vocab) instead of the full (samples, positions, vocab) tensor.
  # NOTE(review): the Hugging Face causal-LM loss shifts labels by one position
  # internally; confirm that scoring the logits at the label's own index (rather
  # than the preceding index) is the intended evaluation.
  target_logits = torch.as_tensor(np.asarray(logits)[rows, cols])
  target_ids = torch.as_tensor(labels[rows, cols])
  top_k = torch.topk(target_logits, min(k, target_logits.shape[-1]), dim=-1).indices

  hits = (top_k == target_ids.unsqueeze(-1)).any(dim=-1)
  accuracy = hits.sum().item() / rows.size
  return {'accuracy': accuracy}

In [None]:
# Initialize model and training arguments

# Load the pretrained causal-LM checkpoint selected by model_name
initial_model = AutoModelForCausalLM.from_pretrained(model_name)
# NOTE(review): resize_token_embeddings below presumably updates config.vocab_size
# itself, which would make this assignment redundant — confirm before removing.
initial_model.config.vocab_size = tokenizer.vocab_size
# Resize the embedding matrix to the tokenizer's full length so that every id the
# tokenizer can emit (including tokens added to its vocabulary) has an embedding row
initial_model.resize_token_embeddings(len(tokenizer))
# model.config.pad_token_id = model.config.eos_token_id

# Training configuration: checkpoints go to the Drive model folder, evaluation and
# logging happen at fixed fractions of the run, and one checkpoint is kept per epoch.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/ECE1786/Project/Models/GPT",
    # output_dir="models",
    overwrite_output_dir=True,
    evaluation_strategy = "steps",
    per_device_eval_batch_size= 100,
    eval_steps = 1/20, # A value below 1 is a ratio of the total training steps (evaluate every 5% of the run)
    logging_steps = 1/20, # Same ratio for logging, so each evaluation row also has a training loss
    save_total_limit=1,
    save_strategy="epoch",
    # num_train_epochs = 3,
    per_device_train_batch_size = 32,
    learning_rate = 5e-4,
    eval_accumulation_steps = 2 # Prevents GPU memory overflow during evaluation by unloading accumulated eval results to CPU memory
)
class EvalCallback(TrainerCallback):
    """Trainer callback that toggles the model between eval and train mode.

    Both hooks read the model from the keyword arguments they are called with.
    """

    def on_evaluate(self, args, state, control, logs=None, **kwargs):
        """Put the model into eval mode when the evaluate event fires."""
        model = kwargs['model']
        model.eval()

    def on_log(self, args, state, control, logs=None, **kwargs):
        """Put the model back into train mode when the log event fires."""
        model = kwargs['model']
        model.train()
# Callback instance that switches the model's eval/train mode around evaluation and logging
ee = EvalCallback()
# Fine-tune on the train split; the val split is what the periodic evaluations
# (and therefore the accuracy from compute_metrics) are computed on.
trainer = Trainer(
    model = initial_model,
    args = training_args,
    train_dataset=tokenized_data_ds["train"],
    eval_dataset=tokenized_data_ds["val"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks = [ee]
)

# Train model
trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy
53,6.3418,4.30401,0.454002
106,4.3872,4.022881,0.544205
159,3.979,4.051895,0.381123
212,3.7111,3.572565,0.477897
265,3.446,3.224833,0.555556
318,3.1155,3.038158,0.591398
371,2.8915,3.054635,0.641577
424,2.8088,2.905063,0.614098
477,2.6448,2.824748,0.645161
530,2.6661,2.826975,0.649343


TrainOutput(global_step=1047, training_loss=2.83234122056788, metrics={'train_runtime': 1440.2217, 'train_samples_per_second': 23.251, 'train_steps_per_second': 0.727, 'total_flos': 1093703122944000.0, 'train_loss': 2.83234122056788, 'epoch': 3.0})

In [None]:
trainer.save_model("/content/drive/MyDrive/ECE1786/Project/Models/GPT")

In [None]:
def test(model_dir, training_args, tokenized_data_ds, val_size):
  """Evaluate a saved model on the 'test' split in chunks and report the mean accuracy.

  The split is evaluated in consecutive chunks of at most `val_size` rows, a size
  known to fit in memory, so that a single evaluate() call never has to hold the
  predictions for the whole split (which overflowed RAM). Chunk accuracies are
  combined as a row-weighted average.

  Args:
    model_dir: path or name passed to AutoModelForCausalLM.from_pretrained.
    training_args: TrainingArguments reused for the evaluation-only Trainer.
    tokenized_data_ds: mapping with a tokenized 'test' split.
    val_size: maximum number of rows evaluated per chunk (positive integer).

  Returns:
    The string "accuracy: <value>".

  Raises:
    ValueError: if val_size is smaller than 1 or the 'test' split is empty.
  """
  val_size = int(val_size)
  if val_size < 1:
    raise ValueError("val_size must be a positive number of rows")
  test_set = tokenized_data_ds['test']
  n = test_set.num_rows
  if n == 0:
    raise ValueError("the 'test' split is empty")

  model = AutoModelForCausalLM.from_pretrained(model_dir)
  model.eval()

  tester = Trainer(
    model = model,
    args = training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
  )

  weighted_accuracy = 0.0
  # Walk the split in contiguous slices: deterministic, no reshuffling, and each
  # chunk is weighted by the number of rows it really contains (a float
  # test_size ratio can round to a different chunk size than val_size).
  for start in range(0, n, val_size):
    print(n - start)  # rows still to be evaluated, including this chunk
    chunk = test_set.select(range(start, min(start + val_size, n)))
    weighted_accuracy += chunk.num_rows*tester.evaluate(chunk)['eval_accuracy']
  return "accuracy: "+str(weighted_accuracy/n)
print(test("/content/drive/MyDrive/ECE1786/Project/Models/GPT", training_args, tokenized_data_ds, 1674))

3111


1437
accuracy: 0.6219864995178399


In [None]:
def get_samples(model_dir, tokenized_data_ds, num_samples=10):
  """Print the model's ten most likely tokens for the first examples of the 'test' split.

  For each example the input sentence (without padding), the correct label
  token, the ten highest probabilities and the matching tokens are printed.

  Args:
    model_dir: path or name passed to AutoModelForCausalLM.from_pretrained.
    tokenized_data_ds: mapping with a 'test' split that has 'input_ids' and 'label' columns.
    num_samples: number of leading test examples to show (default 10, capped at the split size).

  Returns:
    None; everything is printed.
  """
  model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path = model_dir)
  model.eval()

  test_split = tokenized_data_ds['test']
  pad_id = tokenizer.pad_token_id
  for i in range(min(num_samples, test_split.num_rows)):
    example = test_split[i]  # one row only, instead of materialising whole columns each iteration
    input_ids = torch.tensor([example['input_ids']])

    # The label row is -100 everywhere except at the masked position; fall back to a
    # placeholder so a row without any label cannot reuse the previous sample's label.
    label = "(no masked token in input)"
    for label_id in example['label']:
      if label_id != -100:
        label = tokenizer.convert_ids_to_tokens(label_id)

    # Strip padding for display only. Compare against the tokenizer's pad id:
    # the pad token is not id 0 here, so filtering non-zero ids leaves [PAD] in place.
    shown_ids = input_ids[0]
    if pad_id is not None:
      shown_ids = shown_ids[shown_ids != pad_id]
    input_sentence = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(shown_ids.tolist()))

    with torch.no_grad():  # inference only, no autograd graph needed
      output = model(input_ids)
    # NOTE(review): this reads the distribution at the LAST position of the padded
    # sequence, whereas compute_metrics scores the logits at the masked position —
    # confirm which position these samples are meant to illustrate.
    logits = output.logits[0][-1]
    probs, ranked_ids = torch.sort(torch.softmax(logits, dim = 0), dim = 0, descending = True)

    print("Input Sentence: "+ input_sentence)
    print("Correct Label: "+label)
    print("Probabilities: "+str(probs[:10].detach().numpy()))
    print("Top 10 Guesses: "+str(tokenizer.convert_ids_to_tokens(ranked_ids[:10].tolist())))
    print("\n")
  return

In [None]:
get_samples("/content/drive/MyDrive/ECE1786/Project/Models/GPT", tokenized_data_ds)

Input Sentence: Omar and other [MASK] leaders. Apparently employing a mixture of possible[PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
Correct Label: Taliban
Probabilities: [0.13015407 0.09395175 0.05937878 0.05213116 0.04918284 0.0454994
 0.03944989 0.03884535 0.0386872  0.0335014 ]
Top 10 Guesses: ['al', 'Binalshibh', 'Hanjour', 'Mihdhar', 'Berger', 'Ahmed', 'Rice', 'Khallad', 'Jarrah', 'Abu']


Input Sentence: the ZOB [Cleveland Center] radio.” [MASK] audio file, Cleveland Center, Lorain Radar position; [MASK] memo, “Full[PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD][PAD]
Correct Label: FAA
Probabilities: [0.32237107 0.2305458  0.1309529  0.03713027 0.02876003 0.02619231
 0.