# Cross-validation comparison
Compare the performance between Bio-BERT, Pubmed-BERT, and COVID-Twitter-BERT models for our classification task

In [1]:
import os
os.environ['TRANSFORMERS_CACHE'] = '/home/david.yang1/.cache/huggingface/'
os.environ['HF_HOME'] = '/home/david.yang1/.cache/huggingface/'

In [2]:
from transformers import AutoTokenizer, AutoModel, pipeline, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from collections import defaultdict
import pandas as pd
from datasets import Dataset, DatasetDict
from huggingface_hub import login
import evaluate
from huggingface_hub import login
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

There was a problem when trying to write in your cache folder (/home/david.yang/.cache/huggingface/hub). You should set the environment variable TRANSFORMERS_CACHE to a writable directory.
2024-06-17 11:09:36.498296: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-17 11:09:37.559171: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-06-17 11:09:40.016566: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-17 11:09:40.016609: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin c

In [3]:
# login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Split data into K-folds

In [3]:
# Load dataset consisting of 309 papers that talk about viral variants and 309 papers that do not
df = pd.read_csv('bert_dataset.csv')

# Check class distribution
print(df['label'].value_counts())

# Balance classes if needed
df = df.groupby('label').sample(n=min(df['label'].value_counts()), random_state=42)

# Shuffle the dataset
df = df.sample(frac=1, random_state=42)
df = df[["text", "label"]]

label
0    316
1    309
Name: count, dtype: int64


In [5]:
# Create 5 folds for cross-validation
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=1111)

# Tokenize training data

In [7]:
# Tokenization with maximum of 512 tokens (padding and truncation)
def tokenize_function(df):
    return tokenizer(
        df['text'],
        padding="longest",
        truncation=True,
        max_length = 512
    )

In [8]:
# Create DatasetDict for train and validation split
def create_dataset(tds, vds, tokenize_function):
    # Apply the tokenizer to the datasets
    tds = tds.map(tokenize_function, batched=True)
    vds = vds.map(tokenize_function, batched=True)
    
    # Set the format of the datasets to include only the required columns
    tds = tds.rename_column('__index_level_0__', 'index').remove_columns(['text', 'index'])
    vds = vds.rename_column('__index_level_0__', 'index').remove_columns(['text', 'index'])
    
    # Define DatasetDict
    ds = DatasetDict({
        "train": tds,
        "validation": vds
    })

    return ds

In [9]:
# Compute accuracy, f1, precision, and recall
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [10]:
# BERT models that we will be using
bert_models = ["NeuML/pubmedbert-base-embeddings", "digitalepidemiologylab/covid-twitter-bert", "dmis-lab/biobert-v1.1"]

# Specify tokenizer and sequence classification model
tokenizer = AutoTokenizer.from_pretrained(bert_models)

def model_init():
        return AutoModelForSequenceClassification.from_pretrained(bert_models, num_labels=2)



In [11]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy = "steps",
    eval_steps=500,
    num_train_epochs=3,    # number of training epochs
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_ratio=0.01,
    weight_decay=0.01,
    logging_dir='./logs',
)

In [None]:
# Variables for data
train_results = []
validation_results = pd.DataFrame()

In [13]:
# For each BERT model, perform cross-validation with 5-folds
for bert in bert_models:
    # Specify tokenizer
    tokenizer = AutoTokenizer.from_pretrained(bert_models)

    # Specify sequence classification model
    def model_init():
            return AutoModelForSequenceClassification.from_pretrained(bert_models, num_labels=2)
    
    # Variable to track CV fold
    trial = 0
    
    # Cross-validation
    for train_indices, valid_indices in kfold.split(df, df['label']):
        trial += 1
        df_train = df.iloc[train_indices]
        df_val = df.iloc[valid_indices]
    
        tds = Dataset.from_pandas(df_train)
        vds = Dataset.from_pandas(df_val)
    
        ds = create_dataset(tds, vds, tokenize_function)
    
        # Create the Trainer and start training
        trainer = Trainer(
            args=training_args,
            train_dataset=ds["train"],
            eval_dataset=ds["validation"],
            model_init=model_init,
            compute_metrics=compute_metrics,
        )
    
        train = trainer.train()
        train_results.append(train.metrics["train_loss"])
    
        # Evaluate the model
        eval = trainer.evaluate(ds["validation"])

        # Save model metrics
        eval_df = pd.DataFrame(eval, index=[trial,])
        eval_df["model"] = bert
        eval_df["fold"] = str(trial)

        validation_results = pd.concat([validation_results, eval_df])

    path = "./" + bert
    trainer.save_model(path)

Map:   0%|          | 0/312 [00:00<?, ? examples/s]

Map:   0%|          | 0/78 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at NeuML/pubmedbert-base-embeddings and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at NeuML/pubmedbert-base-embeddings and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Map:   0%|          | 0/312 [00:00<?, ? examples/s]

Map:   0%|          | 0/78 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at NeuML/pubmedbert-base-embeddings and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at NeuML/pubmedbert-base-embeddings and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Map:   0%|          | 0/312 [00:00<?, ? examples/s]

Map:   0%|          | 0/78 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at NeuML/pubmedbert-base-embeddings and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at NeuML/pubmedbert-base-embeddings and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Map:   0%|          | 0/312 [00:00<?, ? examples/s]

Map:   0%|          | 0/78 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at NeuML/pubmedbert-base-embeddings and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at NeuML/pubmedbert-base-embeddings and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


Map:   0%|          | 0/312 [00:00<?, ? examples/s]

Map:   0%|          | 0/78 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at NeuML/pubmedbert-base-embeddings and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at NeuML/pubmedbert-base-embeddings and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


In [14]:
print(validation_results)

   eval_loss  eval_accuracy  eval_f1  eval_precision  eval_recall  \
1   0.000468            1.0      1.0             1.0          1.0   
2   0.000511            1.0      1.0             1.0          1.0   
3   0.000440            1.0      1.0             1.0          1.0   
4   0.000430            1.0      1.0             1.0          1.0   
5   0.000469            1.0      1.0             1.0          1.0   

   eval_runtime  eval_samples_per_second  eval_steps_per_second  epoch  
1       18.3824                    4.243                  0.109    3.0  
2       17.7110                    4.404                  0.113    3.0  
3       18.1452                    4.299                  0.110    3.0  
4       17.0197                    4.583                  0.118    3.0  
5       17.0268                    4.581                  0.117    3.0  
