In [1]:
!pip install transformers[torch]
!pip install datasets



In [2]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer
import string
import numpy as np

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Read CSV as Pandas DataFrame
data_df = pd.read_csv("trainingdata.csv", header=None, names=["sentence", "label"])

In [5]:
# Remove trailing punctuation from masked token labels

def remove_punctuation(text):
  return text.translate(str.maketrans('', '', string.punctuation))

data_df['label'] = data_df['label'].apply(remove_punctuation)

In [6]:
data_df

Unnamed: 0,sentence,label
0,"MEMORANDUM FOR: Inspector General, [MASK]",CIA
1,document that would help [MASK] and the IC imp...,CIA
2,sets a precedent for demands that the [MASK] also,CIA
3,failures of the [MASK] and might have led to,CIA
4,constraints that governed [MASK] decisions and...,CTC
...,...,...
10065,"[MASK] or Hezbollah worldwide, overseas, and i...",Qaeda
10066,"some sheer bad luck.” [MASK] Schelling, forewo...",Thomas
10067,"James Locher, Victory on the Potomac: The Gold...",Pentagon
10068,8. For a history of the DCI’s authority over t...,CIA


In [7]:
model_name = "gpt2" # Just change this to try out different models
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [8]:
# Add mask token
tokenizer.add_special_tokens({"mask_token": "[MASK]"})
tokenizer.special_tokens_map
tokenizer.pad_token = tokenizer.eos_token

In [9]:
# # with open('/content/drive/MyDrive/ECE1786/Project/Textfiles/master', 'r') as master:

# with open('master', 'r') as master:
#   m = master.read()

# worddict = {}
# lines = m.split("\n")
# for line in lines:
#   words = line.split()
#   for word in words:
#     if word not in worddict:
#       worddict[word] = 1
#     else:
#       worddict[word] += 1
# s = sorted(worddict, key=worddict.get, reverse=True)

# Add masked token labels to vocabulary (cannot be split into multiple tokens)
add_to_vocab = []
for word in data_df["label"].values:
  if word not in tokenizer.vocab:
    print(word)
    tokenizer.add_tokens(word)

CTC
Clarke
FAA
suicide
Usama
Bin
Ladin
Laden
Berger
Rice
Ramzi
Mihdhar
Hamburg
Pentagon
Islamist
Sheikh
Samuel
Mohamed
Ahmed
Shehhi
Ghamdi
Hanjour
Nawaf
Hazmi
Saeed
Jarrah
Condoleezza
Rumsfeld
Hadley
Abdullah
Omar
Sudan
Sudanese
Abu
Yousef
Taliban
Shelton
interrogation
Nashiri
Hambali
Khallad
Binalshibh
Ressam
Ashcroft
Bayoumi
Moussaoui
Cressey
NYPD
DOJ
DOD
Battalion


In [10]:
tokenizer.tokenize("MEMORANDUM FOR: Inspector General, [MASK]	")

['M',
 'EM',
 'OR',
 'AND',
 'UM',
 'ĠFOR',
 ':',
 'ĠInspector',
 'ĠGeneral',
 ',',
 'Ġ',
 '[MASK]',
 'ĉ']

In [11]:
# data_df = data_df[:20] # For testing purposes

# Convert Pandas DataFrame to HuggingFace Dataset
data_ds = Dataset.from_pandas(data_df)
# Split dataset into train-test sets
data_ds = data_ds.train_test_split(test_size=0.01, shuffle=False)
data_ds

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 9969
    })
    test: Dataset({
        features: ['sentence', 'label'],
        num_rows: 101
    })
})

In [12]:
# Apply tokenizer

def tokenize_function(batch):
  # Tokenize sentences
  encoded_batch = tokenizer(batch["sentence"], padding="max_length", truncation=True, max_length=64)
  # Create label for each sentence: -100 for unmasked token, actual ID for masked token (list)
  tokenized_sentences = encoded_batch.input_ids.copy()
  encoded_labels = []
  for i, sent in enumerate(tokenized_sentences):
    encoded_labels += [[-100]*len(sent)]
    for j, token_id in enumerate(sent):
      if token_id == tokenizer.mask_token_id:
        label = tokenizer.tokenize(batch["label"][i])[0]
        encoded_labels[-1][j] = tokenizer.convert_tokens_to_ids(label)
        break # Can break as only one mask per sentence
  encoded_batch["label"] = encoded_labels
  return encoded_batch

tokenized_data_ds = data_ds.map(tokenize_function, batched=True)

Map:   0%|          | 0/9969 [00:00<?, ? examples/s]

Map:   0%|          | 0/101 [00:00<?, ? examples/s]

In [13]:
tokenized_data_ds["train"]["label"][0] # All -100 except 1

[-100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 49732,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100]

In [14]:
def compute_metrics(eval_preds):
  logits, labels = eval_preds
  # Get ground truths of masked tokens and predicted tokens
  masked_tokens_gts = []
  masked_tokens_inds = []
  for label in labels:
    for i in range(len(label)):
      if label[i] != -100:
        masked_tokens_inds += [i]
        masked_tokens_gts += [label[i]]
        break
  # Get predicted masked token (index = token_id)
  predicted_sent = logits.argmax(axis=2)
  predicted_masked_tokens = []
  for i, ind in enumerate(masked_tokens_inds):
    predicted_masked_tokens += [predicted_sent[i][ind]]
  # Compute accuracy
  accuracy = sum(masked_tokens_gts==np.array(predicted_masked_tokens))/len(masked_tokens_gts)
  return {'accuracy': accuracy}

In [15]:
# Initialize model and training arguments

model = AutoModelForCausalLM.from_pretrained(model_name)
model.config.vocab_size = tokenizer.vocab_size
model.resize_token_embeddings(len(tokenizer))
# model.config.pad_token_id = model.config.eos_token_id

training_args = TrainingArguments(
    output_dir="models",
    overwrite_output_dir=True,
    logging_steps = 129,
    evaluation_strategy = "steps",
    per_device_eval_batch_size=len(tokenized_data_ds["test"]),
    save_total_limit=1,
    save_strategy="epoch"
)

In [16]:
# Initialize trainer
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_data_ds["train"],
    eval_dataset=tokenized_data_ds["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Train model
trainer.train()

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy
129,6.0506,6.609126,0.049505
258,3.8284,4.841456,0.029703
387,3.225,3.923324,0.029703
516,2.8887,4.002119,0.029703
645,2.9512,3.911538,0.029703
774,2.9299,3.940371,0.029703
903,2.7226,3.583596,0.029703
1032,2.7006,3.689245,0.029703
1161,2.6066,3.426747,0.029703
1290,2.5379,3.632341,0.039604


TrainOutput(global_step=3741, training_loss=2.426716302557127, metrics={'train_runtime': 711.7553, 'train_samples_per_second': 42.019, 'train_steps_per_second': 5.256, 'total_flos': 976807600128000.0, 'train_loss': 2.426716302557127, 'epoch': 3.0})

In [17]:
trainer.save_model("drive/MyDrive/Project/Models/GPT")