In [None]:
!pip install transformers[torch]
!pip install datasets



In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForMaskedLM, TrainingArguments, Trainer, TrainerCallback, PreTrainedModel
import string
import numpy as np
import torch

In [None]:
# Mount Google Drive at /content/drive so the project files referenced below are reachable
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Work from the CSV folder so the relative file name passed to pd.read_csv resolves
%cd '/content/drive/MyDrive/ECE1786/Project/CSVs'

/content/drive/MyDrive/ECE1786/Project/CSVs


In [None]:
# Read CSV as Pandas DataFrame with two named columns: the masked sentence and its answer
# NOTE(review): header=None makes pandas treat the first line of the file as data, and only
# two column names are supplied — confirm the CSV has no header row and exactly which
# column (if any) ends up as the index.
data_df = pd.read_csv("trainingdata_bert-base-cased.csv", header=None, names=["sentence", "label"])

In [None]:
# Remove all ASCII punctuation characters (not only trailing ones) from masked token labels

def remove_punctuation(text):
  """Return `text` with every character listed in `string.punctuation` deleted.

  Punctuation is removed wherever it occurs in the string, not only at the end.
  """
  # Mapping a character to None in a translation table deletes it
  deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
  return text.translate(deletion_table)

# Overwrite the label column with its punctuation-free version (row by row via apply)
data_df['label'] = data_df['label'].apply(remove_punctuation)

In [None]:
# Display the sentence/label pairs for a quick visual check
data_df

Unnamed: 0,sentence,label
0,"MEMORANDUM FOR: Inspector General, [MASK]",CIA
1,document that would [MASK] CIA and the IC impr...,help
2,document that would help [MASK] and the IC imp...,CIA
3,sets a precedent for demands that [MASK] CIA also,the
4,sets a precedent for demands that the [MASK] also,CIA
...,...,...
16338,"James Locher, Victory on [MASK] Potomac: The G...",the
16339,"James Locher, Victory on the Potomac: The Gold...",Pentagon
16340,8. For a history of the DCI’s authority over t...,see
16341,8. For a history of the DCI’s authority over t...,CIA


In [None]:
# Checkpoint name used later when the masked-LM model is instantiated
model_name = "bert-base-cased" # Just change this to try out different models
# Load the tokenizer from the files saved on Drive instead of from the hub checkpoint
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/ECE1786/Project/Tokenizer Files/BERT_tokenizer") #load just the tokenizer
#tokenizer = AutoTokenizer.from_pretrained(model_name)
# Add mask token
# NOTE(review): no code follows the "Add mask token" comment above — presumably the saved
# tokenizer already contains the extra tokens; confirm.

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Sanity check: show how the tokenizer splits an example sentence into tokens
print(tokenizer.tokenize("MEMORANDUM FOR: Inspector General, CIA"))

['ME', '##MO', '##RA', '##ND', '##UM', 'F', '##OR', ':', 'Inspector', 'General', ',', 'CIA']


In [None]:
# data_df = data_df[:20] # For testing purposes

# Convert Pandas DataFrame to HuggingFace Dataset
full_dataset = Dataset.from_pandas(data_df)
full_dataset = full_dataset.shuffle(seed=42)
# Hold out 30% of the rows. The split is seeded so that train/val/test membership is
# identical after a kernel restart; otherwise a saved model could later be evaluated on
# rows it was trained on.
train_testvalid = full_dataset.train_test_split(test_size=0.3, seed=42)
# Split the 30% hold-out in half: 15% test, 15% validation
test_valid = train_testvalid['test'].train_test_split(test_size=0.5, seed=42)
# Gather the three splits in a single DatasetDict
data_ds = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'val': test_valid['train']})
# Show the first training sentence as a quick check
data_ds['train']['sentence'][0]

'After the Hamburg recruits joined the 9/11 conspiracy, al [MASK] began giv-'

In [None]:
# Apply tokenizer
def tokenize_function(batch):
  """Tokenize a batch of masked sentences and attach per-token labels.

  Args:
    batch: mapping with a "sentence" list (each expected to contain one mask token)
      and a parallel "label" list holding the answer word for that mask.

  Returns:
    The tokenizer output (padded/truncated to 64 tokens) with an extra "label"
    entry: one list per sentence holding -100 at every position except the first
    mask token, which holds the id of the first token of the answer word. A
    sentence without a mask token gets a label list of only -100.
  """
  encoded_batch = tokenizer(batch["sentence"], padding="max_length", truncation=True, max_length=64)
  mask_id = tokenizer.mask_token_id
  answers = batch["label"]
  encoded_labels = []
  for row, token_ids in enumerate(encoded_batch.input_ids):
    row_labels = [-100] * len(token_ids)
    if mask_id in token_ids:
      pieces = tokenizer.tokenize(answers[row])
      # An answer that tokenizes to nothing falls back to the " " token lookup
      target = pieces[0] if pieces else " "
      # Only the first mask is labelled: one mask per sentence is expected
      row_labels[token_ids.index(mask_id)] = tokenizer.convert_tokens_to_ids(target)
    encoded_labels.append(row_labels)
  encoded_batch["label"] = encoded_labels
  return encoded_batch

# Tokenize and label every split; batched=True hands tokenize_function lists of rows at a time
tokenized_data_ds = data_ds.map(tokenize_function, batched=True)

Map:   0%|          | 0/11440 [00:00<?, ? examples/s]

Map:   0%|          | 0/2452 [00:00<?, ? examples/s]

Map:   0%|          | 0/2451 [00:00<?, ? examples/s]

In [None]:
# Inspect the attention mask of the first training example (padded to max_length=64)
tokenized_data_ds['train']['attention_mask'][0]

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [None]:
def compute_metrics(eval_preds, top_k=10):
  """Compute top-k accuracy of the masked-token predictions.

  A row counts as correct when its ground-truth token id is among the `top_k`
  highest-scoring vocabulary entries at the masked position.

  Args:
    eval_preds: pair `(logits, labels)`; logits is indexed as
      [row, position, vocab id] and labels as [row, position], with -100 at every
      position that is not the masked token.
    top_k: number of highest-scoring candidates considered (default 10, the
      value previously hard-coded).

  Returns:
    `{'accuracy': float}` — the fraction of rows that have a masked label and
    whose label is in the top-k predictions; 0.0 when no row has a masked label.
  """
  logits, labels = eval_preds
  logits = np.asarray(logits)
  labels = np.asarray(labels)

  is_target = labels != -100
  # Keep the real row index of every labelled row: a row whose mask was lost
  # (e.g. truncated) must not shift the rows that follow it.
  rows = np.flatnonzero(is_target.any(axis=1))
  if rows.size == 0:
    return {'accuracy': 0.0}
  # argmax on a boolean row gives the first True, i.e. the first labelled position
  cols = is_target[rows].argmax(axis=1)

  targets = torch.as_tensor(labels[rows, cols])
  # Select the masked position first so top-k runs on (rows, vocab) only,
  # instead of on the full (rows, positions, vocab) array.
  masked_logits = torch.as_tensor(logits[rows, cols])
  k = min(top_k, masked_logits.shape[-1])
  top_ids = torch.topk(masked_logits, k, dim=-1).indices

  hits = (top_ids == targets.unsqueeze(-1)).any(dim=-1)
  return {'accuracy': hits.sum().item() / rows.size}

# Initialize the pretrained masked-LM model from the checkpoint named by model_name

initial_model = AutoModelForMaskedLM.from_pretrained(model_name)
# NOTE(review): tokenizer.vocab_size and len(tokenizer) can differ when tokens were added;
# the resize call below presumably sets config.vocab_size itself, which would make this
# assignment redundant — confirm before removing.
initial_model.config.vocab_size = tokenizer.vocab_size
# Resize the embedding matrix to len(tokenizer); as the last expression of the cell, the
# returned embedding module is what gets displayed.
initial_model.resize_token_embeddings(len(tokenizer))
# model.config.pad_token_id = model.config.eos_token_id

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Embedding(29036, 768)

In [None]:
# Training configuration: 3 epochs, batches of 50 with 6 gradient-accumulation steps,
# periodic evaluation/logging and no intermediate checkpoints (save_strategy="no").
# NOTE(review): eval_steps/logging_steps are given as the fraction 1/9 — presumably read as
# a ratio of the total number of training steps (the logged run evaluates every 13 of 114
# steps); confirm against the installed transformers version.
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/ECE1786/Project/Models/BERT",
    overwrite_output_dir=True,
    evaluation_strategy = "steps",
    per_device_eval_batch_size= 50, #fine tuned: larger = faster but uses more RAM. Must run with T4 high RAM
    eval_steps = 1/9,
    logging_steps = 1/9,
    save_strategy="no",
    num_train_epochs = 3,
    per_device_train_batch_size = 50,
    gradient_accumulation_steps = 6,
    eval_accumulation_steps = 1, #unloads val results to CPU memory after this many steps. larger = faster but uses more RAM
    learning_rate = 5e-5
)

In [None]:
# Initialize trainer
# Callback that switches the model to eval mode on the evaluate event and back to train
# mode on the log event.
# NOTE(review): the stated intent was to stop the model from accumulating gradients during
# evaluation to prevent RAM overflow. model.eval() only changes layer behaviour such as
# dropout; it does not disable autograd — confirm whether this callback is still needed.
class EvalCallback(TrainerCallback):
    def on_evaluate(self, args, state, control, logs=None, **kwargs):
      # The model is read from the keyword arguments supplied to the callback
      kwargs['model'].eval()
      pass
    def on_log(self, args, state, control, logs=None, **kwargs):
      # Restore training mode whenever a log event fires
      kwargs['model'].train()
      pass
ee = EvalCallback()
# Train on the "train" split and evaluate on the "val" split with compute_metrics
trainer = Trainer(
    model = initial_model,
    args = training_args,
    train_dataset=tokenized_data_ds["train"],
    eval_dataset=tokenized_data_ds["val"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks = [ee]
)
# Train model
trainer.train()
#trainer.save_model("drive/MyDrive/Project/Models/Bert")


Step,Training Loss,Validation Loss,Accuracy
13,4.0771,2.791864,0.733986
26,2.5247,2.107329,0.79029
39,1.9489,1.625219,0.871073
52,1.2846,1.455683,0.889025
65,1.2518,1.342881,0.900449
78,1.1111,1.259048,0.910649
91,0.8971,1.228698,0.914321
104,0.8682,1.211087,0.918401


TrainOutput(global_step=114, training_loss=1.6641042357996891, metrics={'train_runtime': 1954.1835, 'train_samples_per_second': 17.562, 'train_steps_per_second': 0.058, 'total_flos': 1124523058759680.0, 'train_loss': 1.6641042357996891, 'epoch': 2.99})

In [None]:
# Persist the fine-tuned model to Drive; the evaluation and demo cells below load it from this path
trainer.save_model("/content/drive/MyDrive/ECE1786/Project/Models/Bert")

In [None]:
#test a saved model from model_dir, with training arguments from training set to RAM doesn't overflow
#with known maxmimum val batch size to prevent RAM overflow
def test(model_dir, training_args, tokenized_data_ds, val_size):
  model = AutoModelForMaskedLM.from_pretrained(model_dir)
  model.eval()

  tester = Trainer(
    model = initial_model,
    args = training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
  )
  accuracy = 0.0
  total_samples = 0
  test_set = tokenized_data_ds['test']
  n = test_set.num_rows
  while n > 0:
    print(n)
    if n > val_size:
      total_samples += val_size
      test_sets = test_set.train_test_split(test_size = val_size/n)
      n = test_sets['train'].num_rows
      accuracy += val_size*tester.evaluate(test_sets['test'])['eval_accuracy']
      test_set = test_sets['train']
    else:
      total_samples += test_set.num_rows
      accuracy += test_set.num_rows*tester.evaluate(test_set)['eval_accuracy']
      n = 0
  return "accuracy: "+str(accuracy/total_samples)
# Evaluate the saved model on the test split in chunks sized by the last argument (2000)
print(test("/content/drive/MyDrive/ECE1786/Project/Models/Bert", training_args, tokenized_data_ds, 2000))

2452


451
accuracy: 0.9298535146951209


In [None]:
def get_samples(model_dir, tokenized_data_ds, num_samples=10):
  """Print the model's top guesses for the first few test sentences.

  For each of the first `num_samples` rows of the 'test' split this prints the
  input sentence (padding removed), the correct label and the ten most
  probable tokens at the mask position together with their probabilities.

  Args:
    model_dir: directory of the saved masked-LM checkpoint to load.
    tokenized_data_ds: mapping with a tokenized 'test' split whose rows hold
      'input_ids', 'attention_mask' and 'label'.
    num_samples: how many rows to show (default 10, the value previously
      hard-coded); capped at the size of the split.

  Returns:
    None. Output is printed.
  """
  model = AutoModelForMaskedLM.from_pretrained(model_dir)
  model.eval()
  test_split = tokenized_data_ds['test']

  for i in range(min(num_samples, len(test_split))):
    # Fetch one row instead of materialising whole columns on every iteration
    row = test_split[i]
    input_ids = torch.tensor([row['input_ids']])
    attention_mask = torch.tensor([row['attention_mask']])

    # The ground truth is the label entry that is not the -100 ignore value
    label_id = next((t for t in row['label'] if t != -100), None)
    label = tokenizer.convert_ids_to_tokens(label_id) if label_id is not None else "<no label>"

    mask_positions = (input_ids[0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
    if len(mask_positions) == 0:
      print("Sample "+str(i)+" contains no mask token; skipping.")
      print("\n")
      continue
    mask_position = mask_positions[0].item()

    # Drop padded positions for demonstration purposes
    visible_ids = input_ids[0][attention_mask[0].bool()].tolist()
    input_sentence = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(visible_ids))

    with torch.no_grad():  # inference only, no autograd graph needed
      output = model(input_ids=input_ids, attention_mask=attention_mask)
    # Read the prediction at the mask position; the last position of a padded
    # sequence is padding and says nothing about the masked word.
    logits = output.logits[0][mask_position]
    probs, top_ids = torch.topk(torch.softmax(logits, dim = 0), min(10, logits.shape[0]))

    print("Input Sentence: "+ input_sentence)
    print("Correct Label: "+label)
    print("Probabilities: "+str(probs.detach().numpy()))
    print("Top 10 Guesses: "+str(tokenizer.convert_ids_to_tokens(top_ids.tolist())))
    print("\n")
  return

In [None]:
# Print sample predictions of the saved model on the test split
get_samples("/content/drive/MyDrive/ECE1786/Project/Models/Bert", tokenized_data_ds)

In [None]:
# Inspect the token ids of the first test example
tokenized_data_ds['test']['input_ids'][0]

In [None]:
def guess(model_dir, sentence, rounds=10):
  """Interactively extend a sentence one word at a time using the masked-LM.

  Each round prints the ten most probable tokens for the mask in the current
  sentence and reads an index from standard input. Entering -1 shows the next
  ten candidates; any other number selects the candidate at that position of
  the full ranking. The chosen word replaces the mask and a new mask is
  inserted right after it.

  Args:
    model_dir: directory of the saved masked-LM checkpoint to load.
    sentence: starting text containing the tokenizer's mask token.
    rounds: number of words to generate (default 10, the value previously
      hard-coded).

  Returns:
    None. All interaction happens through print/input.

  Raises:
    ValueError: if the current sentence has no mask token, or the mask is lost
      when the sentence is truncated to 64 tokens.
  """
  model = AutoModelForMaskedLM.from_pretrained(model_dir)
  model.eval()
  mask_token = tokenizer.mask_token
  s = sentence
  print(s)

  for _ in range(rounds):
    words = s.split()
    print(words)
    if mask_token not in s:
      raise ValueError("sentence must contain the mask token "+mask_token)

    # Encode the sentence built so far, not the original input, so that each
    # round actually conditions on the words already chosen.
    encoded_sentence = tokenizer(s, padding="max_length", truncation=True, max_length=64)
    input_ids = torch.tensor([encoded_sentence['input_ids']])
    attention_mask = torch.tensor([encoded_sentence['attention_mask']])
    mask_positions = (input_ids[0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
    if len(mask_positions) == 0:
      raise ValueError("mask token was lost during truncation to 64 tokens")

    with torch.no_grad():  # inference only
      output = model(input_ids=input_ids, attention_mask=attention_mask)
    # Read the prediction at the mask position rather than at the last (padding) position
    logits = output.logits[0][mask_positions[0]]
    probs, ids = torch.sort(torch.softmax(logits, dim = 0), dim = 0, descending = True)

    print("Sentence so far: "+s)
    print("Probabilities: "+str(probs[:10].detach().numpy()))
    print("Top 10 Guesses: "+str(tokenizer.convert_ids_to_tokens(ids[:10].tolist())))
    print("Enter an index to select a word and generate the next. Enter -1 to produce more guesses.")
    idx = int(input())
    page = 0
    while idx == -1:
      page += 1
      print("Probabilities: "+str(probs[10*page:10*(page+1)].detach().numpy()))
      print("Next 10 Guesses: "+str(tokenizer.convert_ids_to_tokens(ids[10*page:10*(page+1)].tolist())))
      print("Enter an index to select a word and generate the next. Enter -1 to produce more guesses.")
      idx = int(input())

    chosen = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens([ids[idx].item()]))
    # Put the chosen word where the mask was and move the mask one slot to the right
    s = ' '.join(words).replace(mask_token, chosen+' '+mask_token, 1)

In [None]:
# Start an interactive generation session from the prompt s2
# NOTE(review): s2 is only defined in a later cell, so this cell fails on a fresh
# "Restart & Run All" — run the prompt-definition cell first or move it above this one.
guess("/content/drive/MyDrive/ECE1786/Project/Models/Bert",s2)

Renditions Branch helped capture and render [MASK] 
['Renditions', 'Branch', 'helped', 'capture', 'and', 'render', '[MASK]']
Sentence so far: Renditions Branch helped capture and render [MASK] 
Probabilities: [0.3241892  0.19985558 0.0786012  0.04776481 0.03368598 0.02126401
 0.02037051 0.01927282 0.01836933 0.01731136]
Top 10 Guesses: ['CTC', 'FBI', 'NYPD', 'CIA', 'DOJ', 'the', 'Binalshibh', 'Bin', 'Moussaoui', 'The']
Enter an index to select a word and generate the next. Enter -1 to produce more guesses.
6
['Renditions', 'Branch', 'helped', 'capture', 'and', 'render', 'Binalshibh', '[MASK]']
Sentence so far: Renditions Branch helped capture and render Binalshibh [MASK]
Probabilities: [0.3241892  0.19985558 0.0786012  0.04776481 0.03368598 0.02126401
 0.02037051 0.01927282 0.01836933 0.01731136]
Top 10 Guesses: ['CTC', 'FBI', 'NYPD', 'CIA', 'DOJ', 'the', 'Binalshibh', 'Bin', 'Moussaoui', 'The']
Enter an index to select a word and generate the next. Enter -1 to produce more guesses.
-1

KeyboardInterrupt: ignored

In [None]:
# Example prompts for guess(); each contains a single [MASK] placeholder.
# NOTE(review): s2 is used by the guess(...) cell above, so this cell has to be executed first.
s1 = "But even smaller groups, such as the [MASK] posed threats to US interests."
s2 = "Renditions Branch helped capture and render [MASK] "