In [1]:
import os
# Point both Hugging Face cache variables at the same directory. This cell runs
# before the cell that imports the Hugging Face libraries.
# NOTE(review): the path is an absolute, user-specific location.
for _cache_variable in ('TRANSFORMERS_CACHE', 'HF_HOME'):
    os.environ[_cache_variable] = '/home/david.yang1/.cache/huggingface/'

In [2]:
from transformers import AutoTokenizer, AutoModel, pipeline, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from collections import defaultdict
import pandas as pd
from datasets import Dataset, DatasetDict
from huggingface_hub import login
import evaluate
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from nltk.tokenize import sent_tokenize
# from ray.tune.search.hyperopt import HyperOptSearch
# from ray.tune.schedulers import ASHAScheduler

2024-07-02 14:06:01.889587: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-02 14:06:01.892940: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-02 14:06:01.932883: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-02 14:06:01.932923: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-02 14:06:01.932948: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to regi

In [None]:
# login()

# Load and inspect training and validation data

In [3]:
df = pd.read_csv('bert_dataset.csv')

# Show how many rows each label has before balancing
label_counts = df['label'].value_counts()
print(label_counts)

# Downsample every class to the size of the rarest one
df = df.groupby('label').sample(n=label_counts.min(), random_state=42)

# Shuffle the rows and keep only the two columns used downstream
df = df.sample(frac=1, random_state=42)[["text", "label"]]

# The first 128 shuffled rows form the validation split; the rest is training data
df_val = df.iloc[:128]
df_train = df.iloc[128:]

tds = Dataset.from_pandas(df_train)
vds = Dataset.from_pandas(df_val)

label
0    309
1    309
Name: count, dtype: int64


In [None]:
# Check the Python type of one text entry.
# NOTE(review): on the shuffled frame `[0]` looks like an index-label lookup rather than "first row" — confirm.
type(df["text"][0])

https://medium.com/@fhirfly/fine-tuning-biobert-v1-1-on-a-large-dataset-classifying-medical-queries-c33b4d08ec6a

# Preprocess text dataset

In [4]:
# Load the tokenizer of the "NeuML/pubmedbert-base-embeddings" checkpoint.
# Only the tokenizer is created here; the BioBERT model load below is disabled.
tokenizer = AutoTokenizer.from_pretrained("NeuML/pubmedbert-base-embeddings")
# model = AutoModel.from_pretrained('dmis-lab/biobert-v1.1')

# Tokenize the data
def tokenize_function(df):
    """Tokenize the ``'text'`` entry of ``df`` with the notebook-level ``tokenizer``.

    Args:
        df: Mapping with a ``'text'`` key. Despite the name, the notebook passes
            this function to ``Dataset.map(..., batched=True)``.

    Returns:
        Whatever ``tokenizer`` returns for the texts, using padding to the
        longest sequence, truncation enabled and a 512-token maximum length.
    """
    encode_options = {
        "padding": "longest",
        "truncation": True,
        "max_length": 512,
    }
    return tokenizer(df['text'], **encode_options)



In [6]:
# Tokenize both splits in batches
tds = tds.map(tokenize_function, batched=True)
vds = vds.map(tokenize_function, batched=True)

# Drop the raw text and the index column that came along from pandas,
# leaving only the label and the tokenizer outputs
tds = tds.remove_columns(['text', '__index_level_0__'])
vds = vds.remove_columns(['text', '__index_level_0__'])

# Bundle the two splits under the names used by the Trainer cell
ds = DatasetDict({"train": tds, "validation": vds})

Map:   0%|          | 0/490 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

# Model fine tuning
Hyperparameter tuning references:
https://kaitchup.substack.com/p/a-guide-on-hyperparameters-and-training
https://medium.com/distributed-computing-with-ray/hyperparameter-optimization-for-transformers-a-guide-c4e32c6c989b
https://huggingface.co/blog/ray-tune


In [9]:
# Load the pre-trained model
# def model_init():
#     return AutoModelForSequenceClassification.from_pretrained("NeuML/pubmedbert-base-embeddings", num_labels=2)

# metric = evaluate.load("confusion_matrix")

def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for a prediction object.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores; the arg-max over the last axis is the predicted class).

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``; the last three use ``average='binary'``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # The tuple is (precision, recall, f1, support); support is not reported.
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')

    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': prf[2],
        'precision': prf[0],
        'recall': prf[1],
    }

In [10]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    # Evaluate once per epoch. The previous step-based schedule (every 500 steps)
    # never fired: 490 training examples at batch size 16 for 3 epochs is only
    # about 93 optimizer steps in total.
    evaluation_strategy="epoch",
    num_train_epochs=3,    # number of training epochs
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_ratio=0.01,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Create the Trainer (training itself is started in a separate cell).
# NOTE: `model_init` must already be defined when this cell runs; in this
# notebook its definition sits in a later cell, so run that cell first.
trainer = Trainer(
    args=training_args,
    train_dataset=ds["train"],
    eval_dataset=ds["validation"],
    model_init=model_init,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
# Train the model; `train` keeps whatever `trainer.train()` returns
train = trainer.train()

In [None]:
# from local folder
# train = AutoModelForSequenceClassification.from_pretrained("./chunks-pubmed-bert")

In [11]:
# Evaluate the model on the `eval_dataset` given to the Trainer (the validation split).
# NOTE(review): the name `eval` shadows Python's built-in `eval()` from here on.
eval = trainer.evaluate()

In [12]:
# Display the evaluation metrics returned by `trainer.evaluate()`
eval

{'eval_loss': 0.3069172203540802,
 'eval_accuracy': 0.8984375,
 'eval_f1': 0.8959999999999999,
 'eval_precision': 0.9180327868852459,
 'eval_recall': 0.875,
 'eval_runtime': 19.0202,
 'eval_samples_per_second': 6.73,
 'eval_steps_per_second': 0.105}

In [None]:
# Save the model to the local folder that `model_init` loads from
trainer.save_model("./chunks-pubmed-bert")

In [8]:
def model_init():
    """Return a two-label sequence-classification model loaded from ``./chunks-pubmed-bert``.

    The Trainer cell passes this function as its ``model_init`` argument.
    NOTE(review): the path is the folder written by ``trainer.save_model``, so it
    presumably has to exist before this is called — confirm.
    """
    checkpoint_dir = "./chunks-pubmed-bert"
    classifier = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir, num_labels=2)
    return classifier

# Predictions

In [5]:
def split_text_into_chunks(text, tokenizer, max_tokens=512, overlap_sentences=2):
    """Split ``text`` into sentence-aligned chunks of at most ``max_tokens`` tokens.

    Sentences come from ``sent_tokenize`` and are measured with
    ``tokenizer.tokenize``. Consecutive chunks share up to ``overlap_sentences``
    trailing sentences of the previous chunk.

    Args:
        text: The document to split.
        tokenizer: Object with a ``tokenize(str) -> list`` method used to count tokens.
        max_tokens: Token budget per chunk. Special tokens added later by the
            model tokenizer are not counted here.
        overlap_sentences: Maximum number of preceding sentences repeated at the
            start of the next chunk.

    Returns:
        list[str]: The chunks, each made of whole sentences joined by a single
        space. A single sentence longer than ``max_tokens`` becomes its own
        (oversized) chunk. Empty text yields an empty list.
    """
    sentences = sent_tokenize(text)

    chunks = []
    token_counts = []  # token count per sentence, indexed like `sentences`
    current_chunk = []
    current_chunk_len = 0

    for i, sentence in enumerate(sentences):
        token_count = len(tokenizer.tokenize(sentence))
        token_counts.append(token_count)

        # Close the current chunk when this sentence would not fit. The
        # `current_chunk` guard avoids emitting an empty chunk when the very
        # first sentence is already over budget.
        if current_chunk and current_chunk_len + token_count > max_tokens:
            chunks.append(" ".join(current_chunk))

            # Seed the next chunk with the trailing sentences, measured in
            # tokens (not in number of sentences).
            overlap_start = max(0, i - overlap_sentences)
            overlap_len = sum(token_counts[overlap_start:i])

            # Drop the oldest overlap sentences until the new sentence fits,
            # so the overlap never pushes a chunk over the budget.
            while overlap_start < i and overlap_len + token_count > max_tokens:
                overlap_len -= token_counts[overlap_start]
                overlap_start += 1

            current_chunk = list(sentences[overlap_start:i])
            current_chunk_len = overlap_len

        current_chunk.append(sentence)
        current_chunk_len += token_count

    # Flush whatever is left
    if current_chunk:
        chunks.append(" ".join(current_chunk))

    return chunks

In [10]:
# Create a dataset consisting of chunks of text
df_test = df[64:128]
# sample = df_test.head(15)
sample = df_test.sample(n=15, random_state=42)
test_df = df_test.drop(sample.index)

# Chunk every sampled document. One frame is built per document and all frames
# are concatenated once at the end, instead of growing a DataFrame (starting
# from an empty one) inside the loop.
chunk_frames = []
for text in sample["text"]:
    chunks = split_text_into_chunks(text, tokenizer)
    temp = pd.DataFrame(chunks, columns=["text"])
    temp['position'] = temp.index  # position of the chunk within its document
    chunk_frames.append(temp)

df_test_chunks = pd.concat(chunk_frames, ignore_index=True)

df_test_chunks.to_csv('test_chunks_positions.csv')

In [None]:
# Load the three labelled chunk files
pred_chunks_0, pred_chunks_1, pred_chunks_2 = map(
    pd.read_csv,
    ("0_chunks_labelled.csv", "1_chunks_labelled.csv", "2_chunks_labelled.csv"),
)

# Stack them into one frame (each file's own row index is kept)
df_test = pd.concat([pred_chunks_0, pred_chunks_1, pred_chunks_2])

# Convert to a Dataset and tokenize it in batches
test = Dataset.from_pandas(df_test).map(tokenize_function, batched=True)

# Drop the raw text and the index column carried over from pandas
test = test.remove_columns(['text', '__index_level_0__'])

# Wrap the split in a DatasetDict
ds_test = DatasetDict({"test": test})

In [None]:
# Run the model on the labelled chunks and store the arg-max class per row
pred = trainer.predict(ds_test["test"])
predicted_classes = pred.predictions.argmax(axis=-1)
df_test["prediction"] = predicted_classes

In [None]:
# Accuracy / F1 / precision / recall of the chunk-level predictions
compute_metrics(pred)

In [None]:
# Save the labelled chunks together with the model's predictions
df_test.to_csv("df_test_chunks.csv")

# View model prediction on chunks

In [None]:


# Given a chunk, provide prediction of the chunk
def predict_chunk(chunk, trainer, tokenizer):
    """Print and return the model's predicted class for one text chunk.

    The chunk is encoded with ``tokenizer`` (truncated to 512 tokens) and
    wrapped in a one-row ``Dataset``, because ``trainer.predict`` needs a
    dataset of encoded examples rather than a list of token strings.

    Args:
        chunk: The text to classify.
        trainer: Object whose ``predict(dataset)`` result exposes ``predictions``.
        tokenizer: Callable that encodes a string into a mapping of model inputs.

    Returns:
        int: Index of the highest-scoring class.
    """
    encoding = tokenizer(chunk, truncation=True, max_length=512)
    # Each encoded field becomes a column holding exactly one row.
    dataset = Dataset.from_dict({key: [value] for key, value in encoding.items()})

    pred = trainer.predict(dataset)
    predicted_labels = pred.predictions.argmax(-1)

    # Print results
    print("------------------------------------------------")
    print("GIVEN TEXT: ")
    print(chunk)
    print("================================================")
    print(predicted_labels)

    return int(predicted_labels[0])

In [None]:
# Chunk one document for manual inspection.
# NOTE(review): `df["text"][0]` looks like an index-label lookup on the shuffled frame, not "first row" — confirm.
chunks = split_text_into_chunks(df["text"][0], tokenizer)

In [None]:
# Show the sixth chunk; fails with IndexError if the document produced fewer than six chunks
chunks[5]

In [None]:
# Put the chunks into a one-column DataFrame
chunk_df = pd.DataFrame(chunks, columns=["text"])

In [None]:
# Check the Python type of the first chunk
type(chunk_df["text"][0])

In [None]:
# Give every chunk the label of the document it was cut from, then display the frame
chunk_df["label"] = df["label"][0]
chunk_df

In [None]:
# Convert the chunk frame to a Dataset and tokenize it in one go
t = Dataset.from_pandas(chunk_df).map(tokenize_function, batched=True)
# t = t.rename_column('__index_level_0__', 'index').remove_columns(['text', 'index'])

# Wrap the single split in a DatasetDict
ds_t = DatasetDict({"test": t})

In [None]:
# Predict on the chunks of the single document and show the arg-max class index per chunk
pred = trainer.predict(ds_t["test"])
pred.predictions.argmax(-1)

In [None]:
def prediction_chunks(df, max_docs=4):
    """Chunk documents, predict a class per chunk and return all chunks in one frame.

    Args:
        df: DataFrame with ``"text"`` and ``"label"`` columns; any index is accepted.
        max_docs: Number of leading rows of ``df`` to process (default 4, the
            limit this function has always applied). ``None`` processes every row.

    Returns:
        DataFrame with one row per chunk and the columns ``text``, ``label``
        (label of the source document), ``position`` (chunk index within its
        document) and ``prediction`` (arg-max class). Empty if nothing was processed.
    """
    docs = df if max_docs is None else df.iloc[:max_docs]

    frames = []
    # Pair texts with labels row by row. Looking the label up with a positional
    # counter (`df["label"][i]`) is a label-based lookup and returns the wrong
    # row, or raises KeyError, on a shuffled index.
    for text, label in zip(docs["text"], docs["label"]):
        chunks = split_text_into_chunks(text, tokenizer)
        if not chunks:
            # Nothing to predict for an empty document
            continue

        chunks_df = pd.DataFrame(chunks, columns=["text"])
        chunks_df["label"] = label
        chunks_df["position"] = chunks_df.index

        t = Dataset.from_pandas(chunks_df)
        t = t.map(tokenize_function, batched=True)

        pred = trainer.predict(t)
        chunks_df["prediction"] = pred.predictions.argmax(-1)
        frames.append(chunks_df)
        print(chunks_df)
        print("=========")

    if not frames:
        return pd.DataFrame()
    # Concatenate once instead of growing the output inside the loop
    return pd.concat(frames, ignore_index=True)

In [None]:
# Chunk-level predictions for the leading documents of `df` (prediction_chunks stops after four)
data = prediction_chunks(df)

In [None]:
# Save the chunk-level predictions to CSV
data.to_csv("chunk_labelled.csv")

In [None]:
# Set 15 rows aside, then draw 10 of the remaining rows.
# NOTE: this reassigns `df_test`, so re-running the cell shrinks it further.
sample = df_test.sample(n=15, random_state=42)
df_test = df_test.drop(sample.index)

new_sample = df_test.sample(n=10, random_state=42)

# Print the model's prediction for every chunk of every sampled text.
# (The loop body used to be fully commented out, which is a SyntaxError, and
# its `predict_chunk` call was missing the trainer and tokenizer arguments.)
for text in new_sample["text"]:
    chunks = split_text_into_chunks(text, tokenizer)
    for chunk in chunks:
        predict_chunk(chunk, trainer, tokenizer)

# Hyperparameter search

In [None]:
# Default objective is the sum of all metrics
# when metrics are provided, so we have to maximize it.
# NOTE(review): the return value is not assigned, so the outcome is only visible as cell output.
trainer.hyperparameter_search(
    direction="maximize", 
    backend="ray", 
    n_trials=10 # number of trials
)

In [None]:
# Imported in this cell because the matching imports at the top of the notebook
# are commented out; without them this cell fails with NameError.
from ray.tune.search.hyperopt import HyperOptSearch
from ray.tune.schedulers import ASHAScheduler

# Search with HyperOpt suggestions and ASHA early stopping, both maximizing the
# "objective" metric; the search result is kept in `best_trial`.
best_trial = trainer.hyperparameter_search(
    direction="maximize",
    backend="ray",
    search_alg=HyperOptSearch(metric="objective", mode="max"),
    scheduler=ASHAScheduler(metric="objective", mode="max")
)

# ARCHIVE

In [None]:
# def encode_data(tokenizer, text, max_length):
#     encoded = tokenizer.batch_encode_plus(
#         text,
#         truncation=True,
#         padding='longest',
#         max_length=max_length,
#         return_tensors='pt'  # return PyTorch tensors
#     )
#     return encoded["input_ids"], encoded["attention_mask"]
# # Use an appropriate max_length 
# input_ids_train, attention_mask_train = encode_data(tokenizer, df_train['text'].tolist(), max_length=512)
# input_ids_val, attention_mask_val = encode_data(tokenizer, df_val['text'].tolist(), max_length=512)