In [4]:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow
from sklearn.model_selection import train_test_split
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, 
                             classification_report, confusion_matrix)
import transformers
from transformers import (AutoTokenizer , AutoModelForSequenceClassification, TrainingArguments, 
                          Trainer, pipeline, DataCollatorWithPadding, AutoModelForSeq2SeqLM, 
                          EarlyStoppingCallback, IntervalStrategy)
from datasets import load_dataset, Dataset, load_metric
import torch
from GPUtil import showUtilization as gpu_usage
from numba import cuda
from torch.utils.tensorboard import SummaryWriter
import random
import wandb
import collections
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from sklearn.model_selection import StratifiedKFold
import collections
import mlflow
from mlflow.tracking import MlflowClient

2023-05-30 10:07:15.517645: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-30 10:07:16.188139: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-05-30 10:07:16.188176: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-05-30 10:07:16.293789: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-05-30 10:07:17.693705: W tensorflow/stream_executor/pla

In [6]:
def free_gpu_cache():
    print("Initial GPU Usage")
    gpu_usage()                             

    torch.cuda.empty_cache()

    cuda.select_device(0)
    cuda.close()
    cuda.select_device(0)

    print("GPU Usage after emptying the cache")
    gpu_usage()

free_gpu_cache()                           

Initial GPU Usage
| ID | GPU | MEM |
------------------
|  0 | 11% | 13% |
GPU Usage after emptying the cache
| ID | GPU | MEM |
------------------
|  0 | 24% | 14% |


In [7]:
max_length = 256
def load_data(train_file_path, valid_file_path):
    if train_file_path and valid_file_path:
        dataset = load_dataset("csv", data_files = {'train': train_file_path, 'validation': valid_file_path})
    elif not valid_file_path:
        dataset = load_dataset("csv", data_files = {'train': train_file_path})
    return dataset
model_checkpoint = 'microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)
def tokenize_data(example):
    return tokenizer(preprocess(example['text']),  padding = 'max_length', truncation=True, max_length=max_length)

#training and validation - 60/40
#dataset = load_data(['RT_train.csv'],  ['RT_test.csv'])
#train_tokenized = dataset['train'].map(tokenize_data, batched = True) 
#valid_tokenized = dataset['validation'].map(tokenize_data, batched = True)


In [8]:
def model_init():
    model = AutoModelForSequenceClassification.from_pretrained(
        model_checkpoint,
        num_labels=4,
        #id2label={index: label for index, label in enumerate(labels.names)},
        #label2id={label: index for index, label in enumerate(labels.names)}
    )
    return model
model = model_init()

Some weights of the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Ber

In [None]:
sweep_config = {
    'method': 'random'
}


# hyperparameters
parameters_dict = {
    'epochs': {
        'values': [1,2,4]
        },
    'batch_size': {
        'values': [4, 8, 16]
        },
    'learning_rate': {
        'distribution': 'log_uniform_values',
        'min': 2e-7,
        'max': 2e-4
    },
    'weight_decay': {
        'values': [0.0, 0.01, 0.005, 0.001]
    },
}


sweep_config['parameters'] = parameters_dict


In [None]:
sweep_id = wandb.sweep(sweep_config, project='ft_all+PK_PubMedBERT3')

In [9]:
#tokenize/process title and abstract separately
def customized_data_collator(examples):
    titles = [example['Title'] for example in examples]
    abstracts = [example['Abstract'] for example in examples]
    
    tokenized_title = tokenizer(titles , padding = 'max_length', truncation=True, max_length=30)
    tokenized_abstract = tokenizer(abstracts, padding = 'max_length', truncation=True, max_length=256)
    
    #inputs = {
        #'input_ids' : torch.cat([tokenized_title['input_ids'] , tokenized_abstract['input_ids']], dim =1),
        #'attention_mask': torch.cat([tokenized_title['attention_mask'] , tokenized_abstract['attention_mask']], dim =1),
        #'token_type_ids' : torch.cat([tokenized_title['token_type_ids'] , tokenized_abstract['token_type_ids']], dim =1)
    #}
    
    inputs = {
        'input_ids' : [a+b for a,b in zip(tokenized_title['input_ids'],tokenized_abstract['input_ids'])],
        'attention_mask': [a+b for a,b in zip(tokenized_title['attention_mask'],tokenized_abstract['attention_mask'])],
        'token_type_ids' : [a+b for a,b in zip(tokenized_title['token_type_ids'],tokenized_abstract['token_type_ids'])]
    }
    
    label = [example['labels'] for example in examples]
    batch = {
        'input_ids': torch.tensor(inputs['input_ids']),
        'attention_mask': torch.tensor(inputs['attention_mask']),
        'token_type_ids': torch.tensor(inputs['token_type_ids']),
        'labels': torch.tensor(label)
    }
    
    return batch


In [10]:
# define function to compute metrics
def compute_metrics_fn(eval_preds):
    
    res = dict()

    accuracy_metric = load_metric('accuracy')
    precision_metric = load_metric('precision')
    recall_metric = load_metric('recall')
    f1_metric = load_metric('f1')
    
    logits = eval_preds.predictions
    labels = eval_preds.label_ids
    preds = np.argmax(logits, axis=-1)  
    
    report = classification_report(labels, preds)
    cm = confusion_matrix(labels, preds)
    print(report)
    print('confusion matrix: ', cm)
    
    res.update(accuracy_metric.compute(predictions=preds, references=labels))
    res.update(precision_metric.compute(predictions=preds, references=labels, average='macro'))
    res.update(recall_metric.compute(predictions=preds, references=labels, average='macro'))
    res.update(f1_metric.compute(predictions=preds, references=labels, average='macro'))
    
    return res 


In [9]:
#experiments using 60/40 - hyperparam tuning
def train(config=None):
    with wandb.init(config=config):
        config = wandb.config


        training_args = TrainingArguments(
            output_dir='ft_all+PK_PubMedBERT3',
            report_to='wandb', 
            num_train_epochs=config.epochs,
            learning_rate=config.learning_rate,
            weight_decay=config.weight_decay,
            per_device_train_batch_size=config.batch_size,
            per_device_eval_batch_size=16,
            save_strategy='epoch',
            evaluation_strategy='epoch',
            logging_strategy='epoch',
            load_best_model_at_end=True,
            remove_unused_columns=False,
            fp16=True,
            save_total_limit = 1,
            run_name= 'ft_all+PK_PubMedBERT3',
        )
        

        trainer = Trainer(
            model_init=model_init,
            args=training_args,
            train_dataset=train_tokenized,
            eval_dataset=valid_tokenized,
            compute_metrics=compute_metrics_fn
        )


        trainer.train()


In [None]:
wandb.agent(sweep_id, train, count=15)

In [40]:
#train final model with all data - test with early stopping , return to normal if did not work, add class weights
torch.manual_seed(42)
class CustomTrainer(Trainer):
    def calculate_class_weights(training_set):
        print("hi")
        labels = [set(training_set)]
        class_distribution = [0]*len(labels)
        for i in labels:
            class_distribution[i] = training_set.count(i)
        weights = []
        class_distribution = np.array(class_distribution)
        num_classes = len(labels)
        weight = np.sum(class_distribution)/(num_classes * class_distribution)
        return weight
    def compute_custom_loss(model, inputs, return_outputs=False):
        target = inputs.get('labels')
        weights = calculate_class_weights(target)
        print(weights)
        print("hi2")
        ce_loss = nn.CrossEntropyLoss(weight=torch.tensor(weights))
        outputs = model(*inputs)
        logits = outputs.get('logits')
        probs = nn.functional.softmax(logits, dim = -1)
        
        loss = ce_loss(probs.view(-1, model.config.num_labels), target.view(-1))
        return (loss, outputs) if return_outputs else loss
def train(train_set, validation_set, learning_rate, weight_decay, run_name, epochs, batch_size):
    mlflow.start_run()

    # Log training parameters
    mlflow.log_param("learning_rate", learning_rate)
    mlflow.log_param("batch_size", batch_size)
    mlflow.log_param("num_epochs", epochs)

    training_args = TrainingArguments(
        output_dir= run_name,
        num_train_epochs= epochs,
        learning_rate= learning_rate,
        weight_decay= weight_decay,
        per_device_train_batch_size= batch_size,
        save_strategy= 'epoch' ,#IntervalStrategy.STEPS,
        logging_strategy= 'epoch', #IntervalStrategy.STEPS,
        remove_unused_columns=False,
        save_total_limit = 1,
        run_name= run_name,
        fp16 = True,
        load_best_model_at_end=True,
        evaluation_strategy = 'epoch', #IntervalStrategy.STEPS,
        eval_steps = 1,
        metric_for_best_model = 'eval_f1',
        seed = 42
    )


    trainer = CustomTrainer(
        model_init= model_init,
        args= training_args,
        #train_tokenized for joined title and abstract, dataset['train'] for seperate title and abstract
        train_dataset= train_set,
        #use to create seperate title and abstract
        data_collator = customized_data_collator,
        compute_metrics= compute_metrics_fn,
        #valid_tokenized for joined title and abstract, dataset['validation'] for seperate title and abstract
        eval_dataset = validation_set,
        callbacks = [EarlyStoppingCallback(early_stopping_patience=2)]
        
    )


    trainer.train()
    eval_result = trainer.evaluate()
    mlflow.end_run()

    return eval_result


#final model training with all data
#train(1.098e-4, 0.01, "final_PubMedBERT_longer_longerepoch", 5, 8)
#final model training with all data + 50% coded 2021 data, joined abstract and title
#train(1.098e-4, 0.01, "final_PubMedBERT_2021",4, 8)
#final model training with all data + 50% coded 2021 data , separete title and abstract
#train(train_set, test_set, 1.098e-4, 0.01, "final_PubMedBERT_2021_separateInput", 5, 8)

#final model training with all data + 50% coded 2021 data , separate title and abstract
#final = train(train_set, test_set, 1.098e-4, 0.01, "final_PubMedBERT_2021_separateInput_v3", 6, 8)
#final = train(train_set, test_set, 1.098e-4, 0.01, "final_PubMedBERT_2021_separateInput_test2", 20, 8)


In [16]:
#stratified sampling to make sure enough sample from each class exists in the classification
mlflow.end_run()
# Set the experiment path
experiment_path = "PubClassifier"

# Set the experiment
mlflow.set_experiment(experiment_path)
#stratified k_fold cross validation for imbalanced dataset
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

#heldout_set
k_fold , test = next(skf.split(dataset['train'], dataset['train']['labels']))
#kfold training and validation
eval_result = []
i = 0
for train_idx , valid_idx in skf.split(dataset['train'][k_fold]['Title'], dataset['train'][k_fold]['labels']):
    train_set = [dataset['train'][int(i)] for i in train_idx]
    validation_set = [dataset['train'][int(i)] for i in valid_idx]
    eval_result.append(train(train_set, validation_set, 3.098e-5, 0.001, "PubMedBERT_2021_lr35"+str(i), 10, 16))
    i += 1
    


In [10]:
#adding class weights to imptove IRGT class performance
class CustomTrainer(Trainer):
    def calculate_class_weights(training_set):
        print("hi")
        labels = [set(training_set)]
        class_distribution = [0]*len(labels)
        for i in labels:
            class_distribution[i] = training_set.count(i)
        weights = []
        class_distribution = np.array(class_distribution)
        num_classes = len(labels)
        weight = np.sum(class_distribution)/(num_classes * class_distribution)
        return weight
    def compute_custom_loss(model, inputs, return_outputs=False):
        target = inputs.get('labels')
        weights = calculate_class_weights(target)
        print(weights)
        print("hi2")
        ce_loss = nn.CrossEntropyLoss(weight=torch.tensor(weights))
        outputs = model(*inputs)
        logits = outputs.get('logits')
        probs = nn.functional.softmax(logits, dim = -1)
        
        loss = ce_loss(probs.view(-1, model.config.num_labels), target.view(-1))
        return (loss, outputs) if return_outputs else loss
    
def train(train_set, validation_set, learning_rate, weight_decay, run_name, epochs, batch_size):
    training_args = TrainingArguments(
        output_dir= run_name,
        num_train_epochs= epochs,
        learning_rate= learning_rate,
        weight_decay= weight_decay,
        per_device_train_batch_size= batch_size,
        save_strategy='epoch',
        logging_strategy='epoch',
        remove_unused_columns=False,
        save_total_limit = 1,
        run_name= run_name,
        fp16 = True
    )


    trainer = CustomTrainer(
        model_init= model_init,
        args= training_args,
        #train_tokenized for joined title and abstract, dataset['train'] for seperate title and abstract
        train_dataset= train_set,
        #use to create seperate title and abstract
        data_collator = customized_data_collator,
        compute_metrics= compute_metrics_fn,
        #valid_tokenized for joined title and abstract, dataset['validation'] for seperate title and abstract
        eval_dataset = validation_set,
        
    )


    trainer.train()
    eval_result = trainer.evaluate()
    return eval_result



NameError: name 'train_set' is not defined

In [17]:
#stratified train and test set
test_set = [dataset['train'][int(i)] for i in test]
train_set = [dataset['train'][int(i)] for i in k_fold]

In [28]:
#evaluate candidate model on test set
train(train_set, test_set, 3.098e-5, 0.001, "candidate_PubMedBERT_2021_test2", 4, 16)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
loading configuration file https://huggingface.co/microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext/resolve/main/config.json from cache at /home/elahehaa/.cache/huggingface/transformers/76e7b0967140f134278c3209cffe98f69eb013b9de505a434b3359c057aedaa3.2411d0fafcf181e9b95d9cb7972d93b27c57a2cb75819924f8fc7ec848b708f2
Model config BertConfig {
  "_name_or_path": "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1,0,1,2,3
1,0.4485,0.257617,0.905738,0.68692,0.70927,0.696922,0.648148,0.209677,1.0,0.5
2,0.1809,0.283669,0.919399,0.833679,0.856207,0.844052,0.647059,0.369565,0.421053,0.5
3,0.0896,0.311875,0.938525,0.883923,0.874187,0.878921,0.432432,0.552632,0.538462,0.5
4,0.0472,0.313196,0.935792,0.889207,0.875452,0.881781,0.575,0.4,0.583333,0.5


***** Running Evaluation *****
  Num examples = 732
  Batch size = 8


              precision    recall  f1-score   support

           0       0.96      0.92      0.94       439
           1       0.82      0.95      0.88       238
           2       0.00      0.00      0.00        20
           3       0.97      0.97      0.97        35

    accuracy                           0.91       732
   macro avg       0.69      0.71      0.70       732
weighted avg       0.89      0.91      0.89       732



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to PubMedBERT_2021_test2/checkpoint-183
Configuration saved in PubMedBERT_2021_test2/checkpoint-183/config.json
Model weights saved in PubMedBERT_2021_test2/checkpoint-183/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 732
  Batch size = 8


              precision    recall  f1-score   support

           0       0.96      0.92      0.94       439
           1       0.88      0.93      0.91       238
           2       0.52      0.60      0.56        20
           3       0.97      0.97      0.97        35

    accuracy                           0.92       732
   macro avg       0.83      0.86      0.84       732
weighted avg       0.92      0.92      0.92       732



Saving model checkpoint to PubMedBERT_2021_test2/checkpoint-366
Configuration saved in PubMedBERT_2021_test2/checkpoint-366/config.json
Model weights saved in PubMedBERT_2021_test2/checkpoint-366/pytorch_model.bin
Deleting older checkpoint [PubMedBERT_2021_test2/checkpoint-183] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 732
  Batch size = 8


              precision    recall  f1-score   support

           0       0.95      0.96      0.96       439
           1       0.93      0.91      0.92       238
           2       0.68      0.65      0.67        20
           3       0.97      0.97      0.97        35

    accuracy                           0.94       732
   macro avg       0.88      0.87      0.88       732
weighted avg       0.94      0.94      0.94       732



Saving model checkpoint to PubMedBERT_2021_test2/checkpoint-549
Configuration saved in PubMedBERT_2021_test2/checkpoint-549/config.json
Model weights saved in PubMedBERT_2021_test2/checkpoint-549/pytorch_model.bin
Deleting older checkpoint [PubMedBERT_2021_test2/checkpoint-366] due to args.save_total_limit
***** Running Evaluation *****
  Num examples = 732
  Batch size = 8


              precision    recall  f1-score   support

           0       0.96      0.95      0.95       439
           1       0.90      0.93      0.92       238
           2       0.72      0.65      0.68        20
           3       0.97      0.97      0.97        35

    accuracy                           0.94       732
   macro avg       0.89      0.88      0.88       732
weighted avg       0.94      0.94      0.94       732



Saving model checkpoint to PubMedBERT_2021_test2/checkpoint-732
Configuration saved in PubMedBERT_2021_test2/checkpoint-732/config.json
Model weights saved in PubMedBERT_2021_test2/checkpoint-732/pytorch_model.bin
Deleting older checkpoint [PubMedBERT_2021_test2/checkpoint-549] due to args.save_total_limit


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from PubMedBERT_2021_test2/checkpoint-732 (score: 0.8817807275030399).
***** Running Evaluation *****
  Num examples = 732
  Batch size = 8


              precision    recall  f1-score   support

           0       0.96      0.95      0.95       439
           1       0.90      0.93      0.92       238
           2       0.72      0.65      0.68        20
           3       0.97      0.97      0.97        35

    accuracy                           0.94       732
   macro avg       0.89      0.88      0.88       732
weighted avg       0.94      0.94      0.94       732



{'eval_loss': 0.31319618225097656,
 'eval_accuracy': 0.9357923497267759,
 'eval_precision': 0.8892072120160331,
 'eval_recall': 0.8754524702819625,
 'eval_f1': 0.8817807275030399,
 'eval_0': 0.575,
 'eval_1': 0.4,
 'eval_2': 0.5833333333333334,
 'eval_3': 0.5,
 'eval_runtime': 14.6419,
 'eval_samples_per_second': 49.993,
 'eval_steps_per_second': 6.283,
 'epoch': 4.0}

In [9]:
#deploy
torch.manual_seed(42)
class CustomTrainer(Trainer):
    def calculate_class_weights(training_set):
        labels = [set(training_set)]
        class_distribution = [0]*len(labels)
        for i in labels:
            class_distribution[i] = training_set.count(i)
        weights = []
        class_distribution = np.array(class_distribution)
        num_classes = len(labels)
        weight = np.sum(class_distribution)/(num_classes * class_distribution)
        return weight
    def compute_custom_loss(model, inputs, return_outputs=False):
        target = inputs.get('labels')
        weights = calculate_class_weights(target)
        ce_loss = nn.CrossEntropyLoss(weight=torch.tensor(weights))
        outputs = model(*inputs)
        logits = outputs.get('logits')
        probs = nn.functional.softmax(logits, dim = -1)
        
        loss = ce_loss(probs.view(-1, model.config.num_labels), target.view(-1))
        return (loss, outputs) if return_outputs else loss
def train(train_set, learning_rate, weight_decay, run_name, epochs, batch_size):
    mlflow.start_run()

    # Log training parameters
    mlflow.log_param("learning_rate", learning_rate)
    mlflow.log_param("batch_size", batch_size)
    mlflow.log_param("num_epochs", epochs)

    training_args = TrainingArguments(
        output_dir= run_name,
        num_train_epochs= epochs,
        learning_rate= learning_rate,
        weight_decay= weight_decay,
        per_device_train_batch_size= batch_size,
        save_strategy= 'epoch' ,#IntervalStrategy.STEPS,
        logging_strategy= 'epoch', #IntervalStrategy.STEPS,
        remove_unused_columns=False,
        save_total_limit = 1,
        run_name= run_name,
        fp16 = True,
        #load_best_model_at_end=True,
        #evaluation_strategy = 'epoch', #IntervalStrategy.STEPS,
        metric_for_best_model = 'eval_f1',
        seed = 42
    )


    trainer = CustomTrainer(
        model_init= model_init,
        args= training_args,
        #train_tokenized for joined title and abstract, dataset['train'] for seperate title and abstract
        train_dataset= train_set,
        #use to create seperate title and abstract
        data_collator = customized_data_collator,
        compute_metrics= compute_metrics_fn,
        #valid_tokenized for joined title and abstract, dataset['validation'] for seperate title and abstract
        
    )


    trainer.train()
    mlflow.end_run()

mlflow.end_run()
# Set the experiment path
experiment_path = "PubClassifier"
# Set the experiment
mlflow.set_experiment(experiment_path)

#train model on all available data to classify unseen 2022 data
train(dataset['train'], 3.098e-5, 0.001, "candidate_PubMedBERT_2021_5/27/23", 4, 16)

loading configuration file https://huggingface.co/microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext/resolve/main/config.json from cache at /home/elahehaa/.cache/huggingface/transformers/76e7b0967140f134278c3209cffe98f69eb013b9de505a434b3359c057aedaa3.2411d0fafcf181e9b95d9cb7972d93b27c57a2cb75819924f8fc7ec848b708f2
Model config BertConfig {
  "_name_or_path": "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "nu

Step,Training Loss
229,0.3872
458,0.1697
687,0.0925
916,0.0447


Saving model checkpoint to candidate_PubMedBERT_2021_5/27/23/checkpoint-229
Configuration saved in candidate_PubMedBERT_2021_5/27/23/checkpoint-229/config.json
Model weights saved in candidate_PubMedBERT_2021_5/27/23/checkpoint-229/pytorch_model.bin
Saving model checkpoint to candidate_PubMedBERT_2021_5/27/23/checkpoint-458
Configuration saved in candidate_PubMedBERT_2021_5/27/23/checkpoint-458/config.json
Model weights saved in candidate_PubMedBERT_2021_5/27/23/checkpoint-458/pytorch_model.bin
Deleting older checkpoint [candidate_PubMedBERT_2021_5/27/23/checkpoint-229] due to args.save_total_limit
Saving model checkpoint to candidate_PubMedBERT_2021_5/27/23/checkpoint-687
Configuration saved in candidate_PubMedBERT_2021_5/27/23/checkpoint-687/config.json
Model weights saved in candidate_PubMedBERT_2021_5/27/23/checkpoint-687/pytorch_model.bin
Deleting older checkpoint [candidate_PubMedBERT_2021_5/27/23/checkpoint-458] due to args.save_total_limit
Saving model checkpoint to candidate_P

In [None]:
#fine-tuning for final classification with whole data
run_name = "ftw_PubMedBert_GRT_256_lr77_13_2"
def fine_tune(model_checkpoint, output_dir, lr, batch_size, epochs, weight_decay):
    seed = 123
    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels = 2)
    training_args = TrainingArguments(
        output_dir = output_dir,
        learning_rate = lr,
        per_device_train_batch_size = batch_size,
        per_device_eval_batch_size = batch_size,
        num_train_epochs= epochs,
        weight_decay = weight_decay,
        #report_to='wandb',
        #logging_steps = 'epoch',
        #load_best_model_at_end = True,
        #evaluation_strategy = 'epoch',
        save_strategy= 'epoch',
        save_total_limit = 1,
        #run_name= run_name,
        
        )

    trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    #eval_dataset=valid_tokenized,
    tokenizer=tokenizer,
    data_collator=data_collator,
    #callbacks = [early_stopping],
     )
    a = trainer.train()
    #wandb.finish()
    return a

fine_tune(model_checkpoint, "ftw_PubMedBert_GRT_256_lr77_13_2", 7e-7, 4, 12, 0.001)

In [None]:
#experiments
ft_model = AutoModelForSequenceClassification.from_pretrained("ft_all_PubMedBERT_selected32/checkpoint-465")
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
clf = pipeline("text-classification", model = ft_model, tokenizer = tokenizer)

In [None]:
#experiments
def res(example):
    return clf(example['text'],  truncation = True)
#predictions = dataset['test'].map(res, batched = True)
#print(predictions)
predictions = {}
for i in range(len(dataset['validation'])):
    predictions[i] = clf(dataset['validation']['text'][i], truncation = 'longest_first', max_length = max_length)[0]['label']
#for i in range(12, len(dataset['validation'])):
    #print(i)
    #predictions[i] = clf(dataset['validation']['text'][i], truncation = 'longest_first', max_length = 20)[0]['label']
print (predictions)

In [28]:
from sklearn.metrics import multilabel_confusion_matrix
import numpy as np

def multi_class_performance(y_true, y_pred):

    mcm = multilabel_confusion_matrix(y_true, y_pred)

    tps = mcm[:, 1, 1]
    tns = mcm[:, 0, 0]

    recall      = tps / (tps + mcm[:, 1, 0])         # Sensitivity
    specificity = tns / (tns + mcm[:, 0, 1])         # Specificity
    precision   = tps / (tps + mcm[:, 0, 1])         # PPV
    return recall, specificity, precision

recall , specificity, precision = multi_class_performance(y_true, y_pred)
print(recall, specificity, precision)

[0.96583144 0.90756303 0.6        0.97142857] [0.92491468 0.96963563 0.98876404 0.99856528] [0.95067265 0.93506494 0.6        0.97142857]


In [None]:
#validation of joined abstract and title
max_length = 256
ft_model = "PubMedBERT_2021_2epoch/checkpoint-714"
#ft_model = 'final_PubMedBERT/checkpoint-771'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
clf = pipeline("text-classification", model = ft_model, tokenizer = tokenizer)

y_pred_true = collections.defaultdict(list)
y_pred = []
y_true = []
for i in range(len(dataset['validation'])):
    p = clf(dataset['validation']['text'][i], truncation = True, max_length = max_length)[0]['label']
    if p == 'LABEL_0':
        y_pred_true[dataset['validation'][i]['PMID']].append(0)
        y_pred.append(0)
    elif p == 'LABEL_1':
        y_pred_true[dataset['validation'][i]['PMID']].append(1)
        y_pred.append(1)
    elif p == 'LABEL_2':
        print('hi')
        y_pred_true[dataset['validation'][i]['PMID']].append(2)
        y_pred.append(2)
    else:
        print('hi')
        y_pred_true[dataset['validation'][i]['PMID']].append(3)
        y_pred.append(3)
    y_pred_true[dataset['validation'][i]['PMID']].append(dataset['validation'][i]['labels'])
    y_true.append(dataset['validation'][i]['labels'])

In [None]:
#deployment
y_pred = collections.defaultdict(list)
for i in range(len(dataset['unknown'])):
    p = clf(dataset['unknown']['text'][i], truncation = True, max_length = max_length)[0]
    
    if p['label'] == 'LABEL_0':
        y_pred[dataset['unknown'][i]['PMID']].append(0)
    elif p['label'] == 'LABEL_1':
        y_pred[dataset['unknown'][i]['PMID']].append(1)
    elif p['label'] == 'LABEL_2':
        y_pred[dataset['unknown'][i]['PMID']].append(2)
    else:
        y_pred[dataset['unknown'][i]['PMID']].append(3)
    y_pred[dataset['unknown'][i]['PMID']].append(p['score'])
