In [1]:
!pip install transformers[torch]
!pip install datasets

Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.24.1
Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.5-py3-none-any.whl (7.8 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m

In [2]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModelForMaskedLM, TrainingArguments, Trainer
import string
import numpy as np

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Read CSV as Pandas DataFrame
# The file is read without a header row (header=None); its two columns are
# named "sentence" (text containing a [MASK] placeholder) and "label".
data_df = pd.read_csv("trainingdata.csv", header=None, names=["sentence", "label"])

In [5]:
# Remove all punctuation characters (not only trailing ones) from masked token labels

def remove_punctuation(text):
  """Return `text` with every character of `string.punctuation` deleted."""
  # Mapping a code point to None tells str.translate to drop that character.
  drop_table = dict.fromkeys(map(ord, string.punctuation))
  return text.translate(drop_table)

data_df['label'] = data_df['label'].apply(remove_punctuation)

In [6]:
data_df

Unnamed: 0,sentence,label
0,"MEMORANDUM FOR: Inspector General, [MASK]",CIA
1,document that would help [MASK] and the IC imp...,CIA
2,sets a precedent for demands that the [MASK] also,CIA
3,failures of the [MASK] and might have led to,CIA
4,constraints that governed [MASK] decisions and...,CTC
...,...,...
10065,"[MASK] or Hezbollah worldwide, overseas, and i...",Qaeda
10066,"some sheer bad luck.” [MASK] Schelling, forewo...",Thomas
10067,"James Locher, Victory on the Potomac: The Gold...",Pentagon
10068,8. For a history of the DCI’s authority over t...,CIA


In [7]:
# Checkpoint identifier; the same value is passed to
# AutoModelForMaskedLM.from_pretrained further down, so tokenizer and model
# always come from the same checkpoint.
model_name = "bert-base-cased" # Just change this to try out different models
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

In [8]:
# # with open('/content/drive/MyDrive/ECE1786/Project/Textfiles/master', 'r') as master:

# with open('master', 'r') as master:
#   m = master.read()

# worddict = {}
# lines = m.split("\n")
# for line in lines:
#   words = line.split()
#   for word in words:
#     if word not in worddict:
#       worddict[word] = 1
#     else:
#       worddict[word] += 1
# s = sorted(worddict, key=worddict.get, reverse=True)

# Add masked token labels to vocabulary (cannot be split into multiple tokens)
# Snapshot the vocabulary once instead of reading `tokenizer.vocab` for every
# row, and remember labels already queued so each one is handled a single time.
known_tokens = set(tokenizer.vocab)
add_to_vocab = []
for word in data_df["label"].values:
  if word not in known_tokens:
    print(word)
    add_to_vocab.append(word)
    known_tokens.add(word)
# Register all new labels with one call, in first-seen order, so the assigned
# token IDs follow the order in which the labels were printed.
if add_to_vocab:
  tokenizer.add_tokens(add_to_vocab)

CTC
Usama
Ladin
Laden
Ramzi
Mihdhar
Islamist
Shehhi
Ghamdi
Hanjour
Nawaf
Hazmi
Saeed
Jarrah
Condoleezza
Rumsfeld
Sudanese
Yousef
Nashiri
Hambali
Khallad
Binalshibh
Ressam
Ashcroft
Bayoumi
Moussaoui
Cressey
NYPD
DOJ
DOD


In [9]:
tokenizer.tokenize("MEMORANDUM FOR: Inspector General, [MASK]	")

['ME',
 '##MO',
 '##RA',
 '##ND',
 '##UM',
 'F',
 '##OR',
 ':',
 'Inspector',
 'General',
 ',',
 '[MASK]']

In [10]:
# data_df = data_df[:20] # For testing purposes

# Convert Pandas DataFrame to HuggingFace Dataset
data_ds = Dataset.from_pandas(data_df)
# Split dataset into train-test sets
# NOTE(review): shuffle=False keeps the CSV row order, so the 1% test split is
# presumably a contiguous slice from the end of the file rather than a random
# sample -- confirm those rows are representative before trusting the
# reported accuracy.
data_ds = data_ds.train_test_split(test_size=0.01, shuffle=False)
data_ds

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 9969
    })
    test: Dataset({
        features: ['sentence', 'label'],
        num_rows: 101
    })
})

In [11]:
# Apply tokenizer

def tokenize_function(batch):
  """Tokenize a batch of sentences and attach per-token masked-LM labels.

  Every sentence is padded/truncated to 64 token IDs. Its label row is -100
  everywhere except at the first [MASK] position, which receives the ID of
  the first token produced by tokenizing that sentence's label word. A row
  in which no [MASK] ID is found stays entirely -100.

  Returns the tokenizer output with the label rows stored under "label".
  """
  encoded_batch = tokenizer(batch["sentence"], padding="max_length", truncation=True, max_length=64)
  mask_id = tokenizer.mask_token_id
  encoded_labels = []
  for row, token_ids in enumerate(encoded_batch.input_ids):
    row_labels = [-100] * len(token_ids)
    if mask_id in token_ids:
      # Only the first mask is labelled (one mask per sentence is expected).
      first_piece = tokenizer.tokenize(batch["label"][row])[0]
      row_labels[token_ids.index(mask_id)] = tokenizer.convert_tokens_to_ids(first_piece)
    encoded_labels.append(row_labels)
  encoded_batch["label"] = encoded_labels
  return encoded_batch

tokenized_data_ds = data_ds.map(tokenize_function, batched=True)

Map:   0%|          | 0/9969 [00:00<?, ? examples/s]

Map:   0%|          | 0/101 [00:00<?, ? examples/s]

In [12]:
tokenized_data_ds["train"]["label"][0] # All -100 except 1

[-100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 9878,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100,
 -100]

In [13]:
def compute_metrics(eval_preds):
  """Compute accuracy of the predicted token at each sentence's masked position.

  Args:
    eval_preds: pair `(logits, labels)`. `logits` is indexed as
      [sentence, position, token_id]; `labels` is indexed as
      [sentence, position] and holds -100 at every position that is not scored.

  Returns:
    {'accuracy': float} -- fraction of scored sentences whose highest-scoring
    token ID at the first labelled position equals the label. 0.0 when no
    sentence has a labelled position.
  """
  logits, labels = eval_preds
  logits = np.asarray(logits)
  labels = np.asarray(labels)
  is_target = labels != -100
  # Sentences without any labelled position (e.g. the [MASK] was cut off by
  # truncation) are skipped. Their row indices are tracked explicitly so that
  # each prediction is read from the same sentence as its ground truth.
  rows = np.flatnonzero(is_target.any(axis=1))
  if rows.size == 0:
    return {'accuracy': 0.0}
  # argmax over booleans yields the first True, i.e. the first labelled position.
  cols = is_target[rows].argmax(axis=1)
  masked_tokens_gts = labels[rows, cols]
  # Only the masked positions are reduced over the vocabulary axis.
  predicted_masked_tokens = logits[rows, cols].argmax(axis=-1)
  accuracy = float(np.mean(predicted_masked_tokens == masked_tokens_gts))
  return {'accuracy': accuracy}

In [14]:
# Initialize model and training arguments

model = AutoModelForMaskedLM.from_pretrained(model_name)
# Grow the embedding matrix to cover the label tokens added to the tokenizer.
# len(tokenizer) includes added tokens (tokenizer.vocab_size does not), and
# resize_token_embeddings updates model.config.vocab_size itself, so the
# config is not assigned by hand.
model.resize_token_embeddings(len(tokenizer))
# model.config.pad_token_id = model.config.eos_token_id

training_args = TrainingArguments(
    output_dir="models",
    overwrite_output_dir=True,
    # Log every 129 steps; evaluation is also scheduled by step count.
    logging_steps = 129,
    evaluation_strategy = "steps",
    # Evaluate the whole test split as a single batch per device.
    per_device_eval_batch_size=len(tokenized_data_ds["test"]),
    # Keep at most one checkpoint on disk, written at the end of each epoch.
    save_total_limit=1,
    save_strategy="epoch"
)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [15]:
# Build the trainer from the model, its training arguments, both dataset
# splits, the tokenizer and the accuracy metric.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data_ds["train"],
    eval_dataset=tokenized_data_ds["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Run fine-tuning; the returned training summary is the cell output.
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy
129,3.1903,3.197066,0.366337
258,1.6972,2.631854,0.376238
387,1.4327,2.284537,0.50495
516,1.2371,1.852929,0.534653
645,1.2179,1.704285,0.584158
774,1.1187,1.488977,0.643564
903,1.0217,1.752078,0.633663
1032,1.0664,1.554542,0.623762
1161,0.9773,1.4525,0.693069
1290,0.8141,1.401094,0.693069


TrainOutput(global_step=3741, training_loss=0.7051140194556127, metrics={'train_runtime': 556.0825, 'train_samples_per_second': 53.782, 'train_steps_per_second': 6.727, 'total_flos': 983941111556352.0, 'train_loss': 0.7051140194556127, 'epoch': 3.0})

In [16]:
# Persist the fine-tuned model to the given directory. The path is relative,
# so it resolves against the current working directory.
trainer.save_model("drive/MyDrive/Project/Models/Bert")