<a href="https://colab.research.google.com/github/danielsaggau/IR_LDC/blob/main/model/MIMIC/mimic_bregman_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install sentence_transformers transformers datasets

In [2]:
# Mount Google Drive at /content/drive; later cells read the zipped corpus and the
# fine-tuned checkpoint from /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
from datasets import load_dataset
import json

In [None]:
# Clone the IR_LDC repository; a later cell loads model/MIMIC/mimic-dataset.py from this checkout.
!git clone https://github.com/danielsaggau/IR_LDC.git

In [None]:
# Extract the zipped corpus from Drive into ./content (relative to the current working directory).
!unzip /content/drive/MyDrive/mimic.jsonl.zip -d content
# Parse the JSON Lines file: every line is decoded as one JSON document.
# NOTE(review): `data` is not referenced by any other cell visible in this notebook — confirm it is still needed.
with open('content/mimic.jsonl') as f:
    data = [json.loads(line) for line in f]

In [None]:
import shutil
# Move the extracted corpus into the cloned repository's MIMIC folder.
# presumably mimic-dataset.py expects mimic.jsonl in its own directory — verify against the script
shutil.move("/content/content/mimic.jsonl", "/content/IR_LDC/model/MIMIC")
# Build the dataset through the repository's loading script.
dataset = load_dataset("/content/IR_LDC/model/MIMIC/mimic-dataset.py")

In [12]:
# Carve 80/10/10 train/validation/test partitions out of the original 'train' split.
# A fixed seed makes the shuffled partitioning repeatable across kernel restarts.
# NOTE(review): no cell visible in this notebook consumes dataset_train / dataset_validation /
# dataset_test; the tokenization and training cells index `dataset` directly — confirm intent.
SPLIT_SEED = 42
dataset_train_test = dataset['train'].train_test_split(test_size=0.1, seed=SPLIT_SEED)
dataset_test = dataset_train_test['test']
# 0.1 / 0.9 of the remaining 90% corresponds to 10% of the original examples.
dataset_sp = dataset_train_test['train'].train_test_split(test_size=0.1/0.9, seed=SPLIT_SEED)
dataset_train = dataset_sp['train']
dataset_validation = dataset_sp['test']

In [14]:
from getpass import getpass

from huggingface_hub.hf_api import HfFolder
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Ask for the Hugging Face access token interactively instead of embedding it in the
# notebook source: a hard-coded token ends up in version control and in shared copies.
# The token is stored with the same HfFolder.save_token call the notebook used before.
HfFolder.save_token(getpass("Hugging Face access token: "))

# Load the tokenizer that was saved alongside the fine-tuned checkpoint on Drive.
tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/bregman_mimic_FT", use_auth_token=True, use_fast=True)

In [22]:
# Load the checkpoint from Drive with a 19-output multi-label classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    "/content/drive/MyDrive/bregman_mimic_FT",
    problem_type='multi_label_classification',
    num_labels=19,
    use_auth_token=True,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/bregman_mimic_FT and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
from transformers import DataCollatorWithPadding

# Collator that pads each batch to a length that is a multiple of 8
# (the original author tagged this setting "fp16").
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, pad_to_multiple_of=8)

In [16]:
# Keep a handle on the training split; the label schema is read from its features below.
train_dataset=dataset['train']

In [17]:
# Number of classes declared by the 'labels' feature, and the matching integer ids 0..num_labels-1.
num_labels = train_dataset.features['labels'].feature.num_classes
label_list = list(range(num_labels))

# Class names from the same feature; `label_ids` is kept as an alias of the same list.
label_names = train_dataset.features['labels'].feature.names
label_ids = label_names

In [18]:
   def preprocess_function(examples):
        """Tokenize a batch of documents and attach multi-hot label vectors.

        Args:
            examples: a batch mapping with a "text" entry (the documents) and a
                "labels" entry (one collection of class ids per document).

        Returns:
            The tokenizer output for the batch with an extra "label_ids" entry that
            holds, per document, one float (1.0 or 0.0) for every id in `label_list`.
        """
        # One tokenizer call truncates and pads every text to exactly 512 tokens.
        # The former follow-up `tokenizer.pad(...)` pass targeted the same length
        # (512 is already a multiple of 8), so it changed nothing and was dropped.
        batch = tokenizer(
            examples["text"],
            padding='max_length',
            max_length=512,
            truncation=True)

        multi_hot = []
        for labels in examples["labels"]:
            # assumes the class ids are hashable (e.g. ints) — a set gives O(1) membership tests
            present = set(labels)
            multi_hot.append([1.0 if label in present else 0.0 for label in label_list])
        batch["label_ids"] = multi_hot
        return batch

In [None]:
# Run the preprocessing over the dataset in batches; the original 'labels' column is
# removed, leaving the "label_ids" vectors produced by preprocess_function.
tokenized_data = dataset.map(
    preprocess_function,
    remove_columns=['labels'],
    batched=True,
)

In [23]:
   # Imported here so this cell does not depend on a later cell having been run first.
   from scipy.special import expit
   from sklearn.metrics import f1_score
   from transformers import EvalPrediction
   def compute_metrics(p: EvalPrediction):
        """Compute macro- and micro-averaged F1 for multi-label predictions.

        Args:
            p: evaluation output; `p.predictions` holds the logits (the first element
                is used when it is a tuple) and `p.label_ids` the target vectors.

        Returns:
            dict with the keys 'macro_f1' and 'micro_f1'.
        """
        logits = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
        # Sigmoid of the logits, thresholded at 0.5, gives the predicted label set.
        preds = (expit(logits) > 0.5).astype(int)
        # Local name chosen so it does not shadow the notebook-level `label_ids` list.
        y_true = (p.label_ids > 0.5).astype(int)
        # zero_division=0 scores a class with no positives as 0 instead of warning.
        macro_f1 = f1_score(y_true=y_true, y_pred=preds, average='macro', zero_division=0)
        micro_f1 = f1_score(y_true=y_true, y_pred=preds, average='micro', zero_division=0)
        return {'macro_f1': macro_f1, 'micro_f1': micro_f1}

In [20]:
from transformers import TrainingArguments, EarlyStoppingCallback

training_args = TrainingArguments(
    # Where checkpoints go and how the run is reported / published.
    output_dir="/clongformer_mimic_classification_bregman",
    run_name="mimic_bregman",
    report_to="wandb",
    push_to_hub=True,
    # Optimisation settings.
    num_train_epochs=10,
    learning_rate=3e-5,
    weight_decay=0.01,
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    fp16=True,
    # Evaluate and checkpoint once per epoch; keep the checkpoint with the highest micro-F1.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="micro_f1",
    greater_is_better=True,
)

In [None]:
# Display the tokenized 'test' split as the cell output (a quick look at its columns and size).
tokenized_data['test']

In [24]:
import torch
# Ask PyTorch to release cached GPU memory it is no longer using before training starts.
torch.cuda.empty_cache()

In [25]:
#Bert pooling
import torch
from torch import nn
class BertMeanPooler(nn.Module):
    """Pooler that averages the hidden states along dim 1.

    The averaged vector is passed through a square linear layer
    (hidden_size -> hidden_size) followed by tanh.
    """

    def __init__(self, config):
        """Create the projection and activation; `config` must expose `hidden_size`."""
        super().__init__()
        width = config.hidden_size
        self.dense = nn.Linear(width, width)
        self.activation = nn.Tanh()

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Return tanh(dense(mean of `hidden_states` over dim 1))."""
        # No mask is applied here, so every position along dim 1 contributes
        # equally to the average.
        averaged = hidden_states.mean(dim=1)
        return self.activation(self.dense(averaged))
# Replace the encoder's pooler with a BertMeanPooler built from the model's own config.
model.bert.pooler = BertMeanPooler(model.config)
print('model mean pooler loaded')

model mean pooler loaded


In [None]:
!pip install wandb

In [27]:
from transformers import Trainer
from scipy.special import expit
from sklearn.metrics import f1_score

# NOTE(review): the per-epoch evaluation that drives early stopping and best-checkpoint
# selection runs on the 'test' split — confirm this choice of split is intentional.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data["train"],
    eval_dataset=tokenized_data['test'],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    # Early stopping configured with a patience of 3.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)
trainer.train()

Cloning https://huggingface.co/danielsaggau/clongformer_mimic_classification_bregman into local empty directory.
Using cuda_amp half precision backend
The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: summary_id, text. If summary_id, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 30000
  Num Epochs = 10
  Instantaneous batch size per device = 6
  Total train batch size (w. parallel, distributed & accumulation) = 6
  Gradient Accumulation steps = 1
  Total optimization steps = 50000
  Number of trainable parameters = 109496851
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
ERROR:wandb.jupyter:Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit: 

··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss,Macro F1,Micro F1
1,0.3684,0.367972,0.643399,0.710382
2,0.3499,0.358627,0.66155,0.714283
3,0.3239,0.361013,0.684809,0.725553
4,0.3064,0.370703,0.681337,0.723355
5,0.272,0.388913,0.687537,0.722388
6,0.2427,0.409592,0.682667,0.717118


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: summary_id, text. If summary_id, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 6
Saving model checkpoint to /clongformer_mimic_classification_bregman/checkpoint-5000
Configuration saved in /clongformer_mimic_classification_bregman/checkpoint-5000/config.json
Model weights saved in /clongformer_mimic_classification_bregman/checkpoint-5000/pytorch_model.bin
tokenizer config file saved in /clongformer_mimic_classification_bregman/checkpoint-5000/tokenizer_config.json
Special tokens file saved in /clongformer_mimic_classification_bregman/checkpoint-5000/special_tokens_map.json
tokenizer config file saved in /clongformer_mimic_classification_bregman/tokenizer_config.json
Special tokens file saved in /clongformer_mimic_class

TrainOutput(global_step=30000, training_loss=0.31416226857503254, metrics={'train_runtime': 9117.1754, 'train_samples_per_second': 32.905, 'train_steps_per_second': 5.484, 'total_flos': 4.736721881088e+16, 'train_loss': 0.31416226857503254, 'epoch': 6.0})

In [28]:
# Score the trained model on the 'validation' split.
# NOTE(review): the Trainer above was given the 'test' split as its eval_dataset, so this
# 'validation' split acts as the held-out set here — confirm the split roles are intended.
trainer.evaluate(eval_dataset=tokenized_data['validation'])

The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: summary_id, text. If summary_id, text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 10000
  Batch size = 6


{'eval_loss': 0.35984161496162415,
 'eval_macro_f1': 0.6777420171475501,
 'eval_micro_f1': 0.7257478529594579,
 'eval_runtime': 126.0732,
 'eval_samples_per_second': 79.319,
 'eval_steps_per_second': 13.222,
 'epoch': 6.0}