In [None]:
!pip install transformers[torch]
!pip install datasets

Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.25.0-py3-none-any.whl (265 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m265.7/265.7 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.25.0
Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForMaskedLM, TrainingArguments, Trainer, TrainerCallback
import string
import numpy as np
import torch

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the project files
# referenced by the later cells are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Switch the working directory to the project's CSV folder on Drive; the relative file name
# passed to pd.read_csv below is resolved against this directory.
%cd '/content/drive/MyDrive/ECE1786/Project/CSVs'

/content/drive/.shortcut-targets-by-id/1dzrXydfa8tV9U7malP1p3mQtrDhnia76/Project/CSVs


In [None]:
# Read CSV as Pandas DataFrame.
# header=None: the file's first row is read as data rather than as column names, and the two
# columns are named explicitly: "sentence" (text containing a [MASK] placeholder) and
# "label" (the word that was masked out).
data_df = pd.read_csv("trainingdata_deberta.csv", header=None, names=["sentence", "label"])

In [None]:
# Remove punctuation from masked token labels (every ASCII punctuation character is deleted, not only trailing ones)

def remove_punctuation(text):
  """Return `text` with every ASCII punctuation character deleted.

  All characters listed in `string.punctuation` are removed wherever they occur in the
  string (not only at its end); every other character is kept in its original order.
  """
  # A translation table that maps a character to None deletes that character.
  deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
  cleaned = text.translate(deletion_table)
  return cleaned

# Overwrite the label column with its punctuation-free version (applied row by row).
data_df['label'] = data_df['label'].apply(remove_punctuation)

In [None]:
# Display the DataFrame as the cell output to eyeball the sentence/label pairs.
data_df

Unnamed: 0,sentence,label
0,"MEMORANDUM FOR: Inspector General, [MASK]",CIA
1,document that would [MASK] CIA and the IC impr...,help
2,document that would help [MASK] and the IC imp...,CIA
3,sets a precedent for demands that [MASK] CIA also,the
4,sets a precedent for demands that the [MASK] also,CIA
...,...,...
15942,"James Locher, Victory on [MASK] Potomac: The G...",the
15943,"James Locher, Victory on the Potomac: The Gold...",Pentagon
15944,8. For a history of the DCI’s authority over t...,see
15945,8. For a history of the DCI’s authority over t...,CIA


In [None]:
# Hub checkpoint name of the base model; it is passed to from_pretrained when the model is loaded later on.
model_name = "microsoft/deberta-base"
# The tokenizer is loaded from the project's saved tokenizer directory on Drive, not from model_name.
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/ECE1786/Project/Tokenizer Files/DeBERTa_tokenizer") #load just the tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Sanity check: print the sub-word tokens the tokenizer produces for one sample sentence.
print(tokenizer.tokenize("MEMORANDUM FOR: Inspector General, CIA"))

['M', 'EM', 'OR', 'AND', 'UM', 'ĠFOR', ':', 'ĠInspector', 'ĠGeneral', ',', 'ĠCIA']


In [None]:
# data_df = data_df[:20] # For testing purposes

# Convert Pandas DataFrame to HuggingFace Dataset
full_dataset = Dataset.from_pandas(data_df)
full_dataset = full_dataset.shuffle(seed=42)
# Hold out 30% of the rows for evaluation; the remaining 70% is the training set.
# train_test_split shuffles again by default, so it is seeded as well: without a seed the
# train/val/test membership changes on every run, and a model saved in one session could be
# tested on rows it was trained on in another.
train_testvalid = full_dataset.train_test_split(test_size=0.3, seed=42)
# Split the held-out 30% into 60% test and 40% validation (18% / 12% of the full data)
test_valid = train_testvalid['test'].train_test_split(test_size=0.6, seed=42)
# Gather the three splits in a single DatasetDict
data_ds = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'val': test_valid['train']})
data_ds

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 11162
    })
    test: Dataset({
        features: ['sentence', 'label'],
        num_rows: 2871
    })
    val: Dataset({
        features: ['sentence', 'label'],
        num_rows: 1914
    })
})

In [None]:
# Apply tokenizer
def tokenize_function(batch):
  """Tokenize a batch of sentences and build the per-token masked-LM labels.

  `batch["sentence"]` is encoded with padding/truncation to 64 tokens. For every encoded
  sentence a label row of the same length is created, filled with -100 except at the first
  mask-token position, which receives the id of the first token of `batch["label"][i]`
  (or the id looked up for " " when the label tokenizes to nothing). Sentences in which no
  mask token is found keep an all -100 row.

  Returns the encoding with its "label" entry set to the list of label rows.
  """
  encoded_batch = tokenizer(batch["sentence"], padding="max_length", truncation=True, max_length=64)
  mask_id = tokenizer.mask_token_id
  encoded_labels = []
  for row_ids, raw_label in zip(encoded_batch.input_ids, batch["label"]):
    row_labels = [-100] * len(row_ids)
    if mask_id in row_ids:
      # Only the first mask position is labelled (one mask per sentence)
      label_pieces = tokenizer.tokenize(raw_label)
      target_token = label_pieces[0] if label_pieces else " "
      row_labels[row_ids.index(mask_id)] = tokenizer.convert_tokens_to_ids(target_token)
    encoded_labels.append(row_labels)
  encoded_batch["label"] = encoded_labels
  return encoded_batch

# Run tokenize_function over every split, one batch of rows per call (batched=True).
tokenized_data_ds = data_ds.map(tokenize_function, batched=True)

Map:   0%|          | 0/11162 [00:00<?, ? examples/s]

Map:   0%|          | 0/2871 [00:00<?, ? examples/s]

Map:   0%|          | 0/1914 [00:00<?, ? examples/s]

In [None]:
# Row count of the tokenized training split.
tokenized_data_ds['train'].num_rows

11162

In [None]:
def compute_metrics(eval_preds, k=10):
  """Top-k accuracy of the prediction at the masked position.

  Args:
    eval_preds: pair `(logits, labels)`. `logits` is indexed [example, position, token_id];
      `labels` is indexed [example, position] and holds -100 everywhere except at the
      masked position, where it holds the true token id.
    k: how many of the highest-scoring token ids count as a hit (default 10).

  Returns:
    {'accuracy': share of labelled examples whose true token id is among the k best-scoring
    ids at the masked position}. Examples without any labelled position are left out of the
    average; the accuracy is 0.0 when no example is labelled.
  """
  logits, labels = eval_preds
  logits = np.asarray(logits)
  labels = np.asarray(labels)
  # (example, position) of every labelled token, in row-major order
  example_idx, position_idx = np.nonzero(labels != -100)
  # Keep the first labelled position of each example. Carrying the example index along keeps
  # predictions aligned with their ground truth even when some example has no label at all.
  example_idx, first_occurrence = np.unique(example_idx, return_index=True)
  position_idx = position_idx[first_occurrence]
  if example_idx.size == 0:
    return {'accuracy': 0.0}
  true_ids = labels[example_idx, position_idx]
  # Pick out the masked-position logits BEFORE ranking, so top-k runs on an
  # (examples, vocab) array instead of the full (examples, positions, vocab) one
  masked_logits = torch.as_tensor(logits[example_idx, position_idx])
  k = min(k, masked_logits.shape[-1])
  top_ids = torch.topk(masked_logits, k, dim=-1).indices.numpy()
  # Top-k ids are distinct, so each example contributes at most one hit
  hits = (top_ids == true_ids[:, None]).any(axis=1)
  return {'accuracy': float(hits.mean())}

#Initialize model and training arguments

# Load the pretrained masked-LM model for the checkpoint named in model_name.
initial_model = AutoModelForMaskedLM.from_pretrained(model_name)
# Copy the tokenizer's vocabulary size onto the model config.
# NOTE(review): tokenizer.vocab_size may not count tokens added to the tokenizer, whereas the
# resize below uses len(tokenizer) -- confirm config.vocab_size ends up matching the embedding size.
initial_model.config.vocab_size = tokenizer.vocab_size
# Resize the token embedding matrix to len(tokenizer) rows so every id the tokenizer can emit has an embedding.
initial_model.resize_token_embeddings(len(tokenizer))
# model.config.pad_token_id = model.config.eos_token_id

config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/559M [00:00<?, ?B/s]

Some weights of DebertaForMaskedLM were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(50326, 768)

In [None]:
# Grid search for hyperparameter testing: train_batch_size = [50,100], learning_rate = [5e-5, 2e-5]
# NOTE(review): the values set below (per-device batch size 64, learning rate 5e-4) are not among
# the grid values listed above -- confirm which configuration the saved model was trained with.

training_args = TrainingArguments(
    # Directory on Drive that the Trainer writes to (existing contents may be overwritten, see next argument)
    output_dir="/content/drive/MyDrive/ECE1786/Project/Models/Deberta",
    # output_dir="models",
    overwrite_output_dir=True,
    # Evaluate every `eval_steps` steps rather than once per epoch
    evaluation_strategy = "steps",
    per_device_eval_batch_size= 125, #fine tuned: larger = faster but uses more RAM. Must run with T4 high RAM
    # 1/20 = 0.05; presumably read by the Trainer as a fraction of the total training steps
    # (i.e. about 20 evaluations / log entries per run) -- verify against the installed transformers version
    eval_steps = 1/20,
    logging_steps = 1/20,
    # No intermediate checkpoints are written
    save_strategy="no",
    # num_train_epochs = 6,
    # 64 rows per device with 4 accumulation steps = 256 rows per optimizer update on each device
    per_device_train_batch_size = 64,
    gradient_accumulation_steps = 4,
    eval_accumulation_steps = 1, #unloads val results to CPU memory after this many steps. larger = faster but uses more RAM
    learning_rate = 5e-4
)
# Initialize trainer
class EvalCallback(TrainerCallback):
    """Trainer callback that toggles the model between evaluation and training mode.

    `on_evaluate` puts the model into eval mode and `on_log` puts it back into train mode.
    The original author added it with the stated aim of keeping evaluation from overflowing RAM.
    NOTE(review): `eval()` / `train()` only switch the module mode; confirm that this, rather
    than the evaluation settings in TrainingArguments, is what keeps memory in check.
    """

    def on_evaluate(self, args, state, control, logs=None, **kwargs):
      """Switch the model handed over in `kwargs['model']` to evaluation mode."""
      model = kwargs['model']
      model.eval()

    def on_log(self, args, state, control, logs=None, **kwargs):
      """Switch the model handed over in `kwargs['model']` back to training mode."""
      model = kwargs['model']
      model.train()
# Callback instance that flips the model between eval and train mode around evaluation/logging.
ee = EvalCallback()
# Trainer wired to the train split for optimisation and the 'val' split for the periodic
# evaluations; compute_metrics supplies the accuracy reported at each evaluation.
trainer = Trainer(
    model = initial_model,
    args = training_args,
    train_dataset=tokenized_data_ds["train"],
    eval_dataset=tokenized_data_ds["val"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks = [ee]
)
# Train model
# trainer.train()
# trainer.save_model("/content/drive/MyDrive/Project/Models/Bert")
# trainer.save_model("/content/drive/MyDrive/ECE1786/Project/Models/Deberta")

In [None]:
#test a saved model from model_dir, with training arguments from training set to RAM doesn't overflow
#with known maxmimum val batch size to prevent RAM overflow
def test(model_dir, training_args, tokenized_data_ds, val_size):
  """Evaluate a saved model on the 'test' split in chunks of at most `val_size` rows.

  The split is walked in consecutive slices so that no single evaluation call has to hold
  more than `val_size` rows of predictions. Each slice's accuracy is weighted by the number
  of rows actually in that slice, so the result is the accuracy over the whole split.

  Args:
    model_dir: directory of the saved model, passed to `from_pretrained`.
    training_args: TrainingArguments reused for the evaluation (batch sizes etc.).
    tokenized_data_ds: dataset dict that contains a tokenized 'test' split.
    val_size: maximum number of rows evaluated per call.

  Returns:
    The string "accuracy: <value>".

  Raises:
    ValueError: if `val_size` is not a positive row count or the test split is empty.
  """
  chunk_rows = int(val_size)
  if chunk_rows <= 0:
    raise ValueError("val_size must be a positive number of rows")

  model = AutoModelForMaskedLM.from_pretrained(model_dir)
  model.eval()

  tester = Trainer(
    model = model,
    args = training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
  )
  test_set = tokenized_data_ds['test']
  total_samples = test_set.num_rows
  if total_samples == 0:
    raise ValueError("the 'test' split is empty")

  accuracy = 0.0
  for start in range(0, total_samples, chunk_rows):
    print(total_samples - start) # rows still to be evaluated
    stop = min(start + chunk_rows, total_samples)
    # Integer slicing gives chunks of exactly the requested size; a fractional test_size can
    # round to a different row count and skew the weighting below
    chunk = test_set.select(range(start, stop))
    accuracy += chunk.num_rows*tester.evaluate(chunk)['eval_accuracy']
  return "accuracy: "+str(accuracy/total_samples)
# print(test("/content/drive/MyDrive/ECE1786/Project/Models/Deberta", training_args, tokenized_data_ds, 1914))

In [None]:
def get_samples(model_dir, tokenized_data_ds, num_samples=10, top_k=10):
  """Print the model's best guesses for the first examples of the 'test' split.

  For each example the input sentence (pad tokens removed), the correct label token, and the
  `top_k` most probable tokens at the masked position with their probabilities are printed.

  Args:
    model_dir: directory of the saved model, passed to `from_pretrained`.
    tokenized_data_ds: dataset dict with a tokenized 'test' split providing 'input_ids',
      'attention_mask' and 'label' for each row.
    num_samples: number of examples to show (default 10); capped at the size of the split.
    top_k: number of guesses to show per example (default 10).

  Returns:
    None; the results are printed.
  """
  model = AutoModelForMaskedLM.from_pretrained(model_dir)
  model.eval()

  test_split = tokenized_data_ds['test']
  for i in range(min(num_samples, test_split.num_rows)):
    # Fetch one row; indexing a column first would load that whole column for every sample
    example = test_split[i]
    input_ids = example['input_ids']
    label_ids = example['label']

    labelled_positions = [j for j, label_id in enumerate(label_ids) if label_id != -100]
    if not labelled_positions:
      # No labelled mask position in this row, so there is nothing to compare against
      print("Skipping test example "+str(i)+": no labelled mask position")
      print("\n")
      continue
    mask_pos = labelled_positions[0]
    label = tokenizer.convert_ids_to_tokens(label_ids[mask_pos])

    # Get rid of pad tokens for demonstration purposes
    visible_ids = [token_id for token_id in input_ids if token_id != tokenizer.pad_token_id]
    input_sentence = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(visible_ids))

    # Inference only: no autograd graph is needed
    with torch.no_grad():
      outputs = model(input_ids=torch.tensor([input_ids]),
                      attention_mask=torch.tensor([example['attention_mask']]))
    probs = torch.softmax(outputs.logits[0][mask_pos], dim = 0)
    shown = min(top_k, probs.shape[0])
    top_probs, top_ids = torch.topk(probs, shown)

    print("Input Sentence: "+ input_sentence)
    print("Correct Label: "+label)
    print("Probabilities: "+str(top_probs.numpy()))
    print("Top "+str(top_k)+" Guesses: "+str(tokenizer.convert_ids_to_tokens(top_ids.tolist())))
    print("\n")


In [None]:
# Print sample predictions from the fine-tuned model saved on Drive, using the tokenized test split.
get_samples("/content/drive/MyDrive/ECE1786/Project/Models/Deberta", tokenized_data_ds)

Input Sentence: [CLS]Jan. 31, 1997; Intelligence report, Cooperation [MASK] Usama Bin Ladin’s Islamic Army, Iran, and the NIF Jan. 31[SEP]
Correct Label: Among
Probabilities: [0.43145758 0.2556131  0.07712317 0.02376668 0.01986501 0.01929748
 0.01297779 0.01029907 0.00985409 0.00826528]
Top 10 Guesses: ['to', 'of', 'on', 'from', 'and', 'for', 'against', 'with', 'in', 'at']


Input Sentence: [CLS]Intelligence reports made available to the Commission. The information is puzzling, since Bin [MASK] left Sudan for[SEP]
Correct Label: Ladin
Probabilities: [9.8851985e-01 2.9001944e-04 2.4873056e-04 2.4595187e-04 2.1924368e-04
 1.9627853e-04 1.2918253e-04 1.1752348e-04 1.0804707e-04 1.0604166e-04]
Top 10 Guesses: ['Ladin', 'Clarke', 'Laden', 'Mihdhar', 'Omar', 'Binalshibh', 'Bin', 'Jarrah', 'Hazmi', 'Ressam']


Input Sentence: [CLS]attack to make a policy decision.” NSC email, [MASK] to Hadley, “Need for Terrorism DC Next Week,” Mar. 22,[SEP]
Correct Label: Cressey
Probabilities: [0.8835033  0