In [1]:
from transformers import BertTokenizer, DataCollatorForLanguageModeling, BertForMaskedLM, TrainingArguments, Trainer
import torch, torchvision
import tensorflow as tf
import pandas as pd
from datasets import Dataset
import multiprocessing
import numpy as np
import evaluate

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Set up GPU backend
# https://pytorch.org/docs/main/notes/mps.html
# https://stackoverflow.com/questions/63423463/using-pytorch-cuda-on-macbook-pro
if torch.backends.mps.is_available():
    mps_device = torch.device("mps")
    x = torch.ones(1, device=mps_device)
    print (x)
    # output expected:
    # tensor([1.], device='mps:0')

else:
    print ("MPS device not found.")

tensor([1.], device='mps:0')


In [3]:
# Read in events, clean and store as dataset
# def clean_text(event):
#     event = re.sub(r'-+',' ',event)
#     event = re.sub(r'[^a-zA-Z, ]+'," ",event)
#     event = re.sub(r'[ ]+'," ",event)
#     line += "."
#     return line
df = pd.read_csv('eid_eventText.csv', usecols=['eid','event_text']).dropna(how='any',axis=0)
print(len(df))
df.drop_duplicates(subset=['event_text'],keep='first',inplace=True)
print(len(df))
df = df[df['event_text'].str.split().str.len().gt(20)] # drops events with fewer than 20 words  
df.rename(columns={"event_text":"text"},inplace=True)
print(len(df))
dataset = Dataset.from_pandas(df).shuffle(seed=242)

516259
448453
387683


In [5]:
# Tokenization
# model = BertForPreTraining.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(reports):
    return tokenizer(reports["text"], return_tensors="np", truncation=True, padding="max_length")

tokenized_ds = dataset.map(tokenize_function, batched=True, num_proc=multiprocessing.cpu_count())

Map (num_proc=24): 100%|██████████| 387683/387683 [01:14<00:00, 5201.55 examples/s] 


In [6]:
# Setup model
# https://github.com/google-research/bert#pre-training-with-bert
# https://huggingface.co/learn/nlp-course/chapter3/3
# https://huggingface.co/blog/pretraining-bert
# https://huggingface.co/transformers/v3.0.2/model_doc/bert.html#overview
# https://www.kaggle.com/code/thierryneusius/pretraining-bert-with-hugging-face-transformers
# https://www.analyticsvidhya.com/blog/2022/09/fine-tuning-bert-with-masked-language-modeling/
# https://huggingface.co/docs/transformers/en/training
# https://huggingface.co/docs/transformers/en/tasks/masked_language_modeling


# Model configuration
# model = BertForMaskedLM.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('./event_trainer/checkpoint-43500') 

# Evaluation 
metric = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

# Data
train_eval_ds = tokenized_ds.train_test_split(test_size=0.1, shuffle=True,seed=42)
# tokenizer.pad_token = tokenizer.eos_token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

# Trainer
training_args = TrainingArguments(
    output_dir="event_trainer", 
    eval_strategy="epoch")

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_eval_ds["train"],
    eval_dataset= train_eval_ds["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


In [None]:
trainer.train()

  0%|          | 0/130845 [00:00<?, ?it/s]

  0%|          | 500/130845 [03:18<14:19:14,  2.53it/s]

{'loss': 1.368, 'grad_norm': 7.638306140899658, 'learning_rate': 4.9808934235163746e-05, 'epoch': 0.01}


  1%|          | 1000/130845 [06:38<14:22:25,  2.51it/s]

{'loss': 1.3508, 'grad_norm': 7.813417911529541, 'learning_rate': 4.961786847032749e-05, 'epoch': 0.02}


  1%|          | 1500/130845 [09:58<14:18:21,  2.51it/s]

{'loss': 1.3534, 'grad_norm': 6.1073503494262695, 'learning_rate': 4.942680270549123e-05, 'epoch': 0.03}


  2%|▏         | 2000/130845 [13:17<14:08:31,  2.53it/s]

{'loss': 1.3235, 'grad_norm': 4.402304649353027, 'learning_rate': 4.9235736940654976e-05, 'epoch': 0.05}


  2%|▏         | 2500/130845 [16:35<13:56:24,  2.56it/s]

{'loss': 1.3254, 'grad_norm': 7.6424713134765625, 'learning_rate': 4.904467117581872e-05, 'epoch': 0.06}


  2%|▏         | 2923/130845 [19:24<14:03:03,  2.53it/s]