In [1]:
import pandas as pd
import numpy as np
import random
import os, pickle5 as pickle, json, requests

from datasets import load_dataset, load_from_disk
from pathlib import Path
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM
import torch

In [3]:
data_files = [
  "/gpfs/data/geraslab/Jan/data/rad_reports/2022.01.combined_rad_reports/ultrasound.xlsx",
  "/gpfs/data/geraslab/Jan/data/rad_reports/2022.01.combined_rad_reports/MRI_no_biopsies.xlsx",
  "/gpfs/data/geraslab/Jan/data/rad_reports/2022.01.combined_rad_reports/MG.pkl",
]
report_path = "/gpfs/data/geraslab/ekr6072/projects/study_indication/data/mlm_dataset"

In [4]:
def load_data(path):
  path = Path(path)
  if path.suffix == ".xlsx":
    data = pd.read_excel(path, engine='openpyxl', index_col=0)
    data = data.drop_duplicates(subset='Acc', keep='first')
    return data
  elif path.suffix == ".pkl":
    with open(path, 'rb') as f:
      data = pickle.load(f)
    return data

In [5]:
all_data = []
for data_file in data_files:
  data = load_data(data_file)
  if Path(data_file).suffix == '.pkl':
    data = list(data['RawReport'].values)
  else:
    data = list(data['Report'].values)
  all_data.extend(data)

In [6]:
text_data = []
for i, data in enumerate(all_data):
  if isinstance(data, str):
    text_data.append(data)
del all_data

In [7]:
df = pd.DataFrame({'text': text_data})

In [8]:
from datasets import Dataset
dataset = Dataset.from_pandas(df)

In [5]:
tokenizer = AutoTokenizer.from_pretrained("yikuan8/Clinical-Longformer")

In [6]:
model = AutoModelForMaskedLM.from_pretrained("yikuan8/Clinical-Longformer")

In [7]:
def tokenize_and_chunk(texts):
  return tokenizer(
                  texts["text"],
                  truncation=True,
                  max_length=1024,
                  return_overflowing_tokens=True
                  )

In [12]:
tokenized_datasets = dataset.map(tokenize_and_chunk, batched=True, remove_columns=["text"])

  0%|          | 0/1773 [00:00<?, ?ba/s]

In [8]:
# tokenized_datasets.save_to_disk(report_path)
tokenized_datasets = load_from_disk(report_path)

In [9]:
tokenized_datasets

Dataset({
    features: ['input_ids', 'attention_mask', 'overflow_to_sample_mapping'],
    num_rows: 1799431
})

In [10]:
dataset = tokenized_datasets.train_test_split(test_size=5000)

In [11]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'overflow_to_sample_mapping'],
        num_rows: 1794431
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'overflow_to_sample_mapping'],
        num_rows: 5000
    })
})

In [12]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [13]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./models",
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    num_train_epochs=1,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=1000,
    logging_strategy="steps",
    logging_steps=1,
    logging_first_step=True,
    save_strategy="steps",
    save_steps=2000,
    evaluation_strategy = "steps",
    eval_steps=2000,
    learning_rate=2e-5,
    weight_decay=0.01,
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test']
)

Using amp half precision backend


In [14]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `LongformerForMaskedLM.forward` and have been ignored: overflow_to_sample_mapping. If overflow_to_sample_mapping are not expected by `LongformerForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1794431
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 56076


Step,Training Loss,Validation Loss
10,3.7467,3.378708
20,3.2909,3.219486
30,3.1562,2.93049
40,2.6071,2.623046
50,2.4932,2.30386
60,2.2215,2.020538
70,1.9749,1.764351
80,1.728,1.563342
90,1.5689,1.39544
100,1.3977,1.263626


Input ids are automatically padded from 838 to 1024 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 838 to 1024 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 838 to 1024 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 838 to 1024 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 698 to 1024 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 698 to 1024 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 698 to 1024 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 698 to 1024 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 835 to 1024 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 835 to 1024 to be a multiple of `

KeyboardInterrupt: 

In [None]:
trainer.save_model("./EsperBERTo")