In [1]:
import os
os.environ['TRANSFORMERS_CACHE'] = '/home/david.yang1/.cache/huggingface/'
os.environ['HF_HOME'] = '/home/david.yang1/.cache/huggingface/'

In [2]:
from transformers import AutoTokenizer, AutoModel, pipeline, AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from collections import defaultdict
import pandas as pd
from datasets import Dataset, DatasetDict
# from huggingface_hub import login
import evaluate
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from nltk.tokenize import sent_tokenize
from torch.nn import functional as F
import torch
# from ray.tune.search.hyperopt import HyperOptSearch
# from ray.tune.schedulers import ASHAScheduler

2024-07-18 09:51:03.705982: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-18 09:51:03.749374: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-18 09:51:03.749412: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-18 09:51:03.749448: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-07-18 09:51:03.758072: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: A

# Load training and validation data

In [3]:
# Load BioBERT model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("NeuML/pubmedbert-base-embeddings")

# Tokenize the data
def tokenize_function(df):
    return tokenizer(
        df['text'],
        padding="longest",
        truncation=True,
        max_length = 512
    )



In [4]:
def ds_preparation(df, val_count=0):
    # Balance classes if needed
    df = df.groupby('label').sample(n=min(df['label'].value_counts()), random_state=42)
    
    # Shuffle the dataset
    df = df.sample(frac=1, random_state=42)
    df = df[["text", "label"]]
    
    # Split dataset into test & train
    df_train = df[val_count:]
    df_val = df[:val_count]
    
    tds = Dataset.from_pandas(df_train)
    vds = Dataset.from_pandas(df_val)

    # Apply the tokenizer to the datasets
    tds = tds.map(tokenize_function, batched=True)
    vds = vds.map(tokenize_function, batched=True)
    
    # Set the format of the datasets to include only the required columns
    tds = tds.rename_column('__index_level_0__', 'index').remove_columns(['text', 'index'])
    vds = vds.rename_column('__index_level_0__', 'index').remove_columns(['text', 'index'])
    
    # Define DatasetDict
    ds = DatasetDict({
        "train": tds,
        "validation": vds
    })

    return ds

# Model fine tuning
Parameter tuning: 
https://kaitchup.substack.com/p/a-guide-on-hyperparameters-and-training
https://medium.com/distributed-computing-with-ray/hyperparameter-optimization-for-transformers-a-guide-c4e32c6c989b
https://huggingface.co/blog/ray-tune


In [5]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    acc = accuracy_score(labels, preds)

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [6]:
checkpoint = True

# Load initial fine-tuned model
if checkpoint:
    def model_init():
        return AutoModelForSequenceClassification.from_pretrained("./chunks-pubmed-bert", num_labels=2)
else:
    def model_init():
        return AutoModelForSequenceClassification.from_pretrained("NeuML/pubmedbert-base-embeddings", num_labels=2)
        

In [7]:
# Fine tune model
def fine_tune_model(ds, model_init, train=False):
    # Define the training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        evaluation_strategy = "steps",
        eval_steps=500,
        num_train_epochs=3,    # number of training epochs
        per_device_train_batch_size=16,
        per_device_eval_batch_size=64,
        warmup_ratio=0.01,
        weight_decay=0.01,
        logging_dir='./logs',
    )

    # Create the Trainer and start training
    trainer = Trainer(
        args=training_args,
        train_dataset=ds["train"],
        eval_dataset=ds["validation"],
        model_init=model_init,
        compute_metrics=compute_metrics,
    )

    if train:
        trainer.train()

    if ds["validation"]:
        trainer.evaluate()

    return trainer

# Predictions

In [8]:
# Split text into <512 token chunks
def split_text_into_chunks(text, tokenizer, max_tokens=512, overlap_sentences=2):
    # Tokenize the text into sentences
    sentences = sent_tokenize(text)
    
    # Initialize variables
    chunks = []
    current_chunk = []
    current_chunk_len = 0

    for i, sentence in enumerate(sentences):
        # Tokenize the sentence using BERT Tokenizer
        tokens = tokenizer.tokenize(sentence)
        token_count = len(tokens)

        # Finalize the current chunk if adding this sentence exceed token limit
        if current_chunk_len + token_count > max_tokens:
            text_chunk = "".join(current_chunk)
            chunks.append(text_chunk)

            # Create the next chunk with overlap
            overlap_start = max(0, i-overlap_sentences)
            current_chunk = []
            for j in range(overlap_start, i):
                current_chunk.append(sentences[j])
            current_chunk_len = len(current_chunk)

        # Add the current sentence tokens to the chunk
        current_chunk.append(sentence)
        current_chunk_len += token_count

    # Add the last chunk if it has content
    if current_chunk:
        text_chunk = "".join(current_chunk)
        chunks.append(text_chunk)

    return chunks

In [9]:
# Predict label of dataframe
def prediction_chunks(df, tokenizer, trainer):
    output = pd.DataFrame()
    for i, text in enumerate(df["text"]):
        chunks = split_text_into_chunks(text, tokenizer)
        
        chunks_df = pd.DataFrame(chunks, columns=["text"])
        # chunks_df["label"] = df["label"][i]
        chunks_df["position"] = chunks_df.index
        chunks_df["paper"] = i
        
        t = Dataset.from_pandas(chunks_df)
        t = t.map(tokenize_function, batched=True)
        ds_t = DatasetDict({
            "test": t
        })

        pred = trainer.predict(ds_t["test"])
        chunks_df["prediction"] = pred.predictions.argmax(-1)

        # convert logit score to torch array
        torch_logits = torch.from_numpy(pred.predictions)

        # get probabilities using softmax from logit score and convert it to numpy array
        probabilities_scores = F.softmax(torch_logits, dim = -1).numpy()

        chunks_df["probability"] = probabilities_scores.max(-1)

        # save into output
        output = pd.concat([output, chunks_df], ignore_index=True)
        
    return output, pred

# Dataset to validate chunk prediction results

In [10]:
# Load prediction chunks
pred_chunks_0 = pd.read_csv("../data/pipeline_data/paper_flagging_data/0_chunks_labelled.csv")
pred_chunks_1 = pd.read_csv("../data/pipeline_data/paper_flagging_data/1_chunks_labelled.csv")
pred_chunks_2 = pd.read_csv("../data/pipeline_data/paper_flagging_data/2_chunks_labelled.csv")
pred_chunks_3 = pd.read_csv("../data/pipeline_data/paper_flagging_data/3_chunks_labelled.csv")
pred_chunks_4 = pd.read_csv("../data/pipeline_data/paper_flagging_data/4_chunks_labelled.csv")
pred_chunks_5 = pd.read_csv("../data/pipeline_data/paper_flagging_data/5_chunks_labelled.csv")
pred_chunks_6 = pd.read_csv("../data/pipeline_data/paper_flagging_data/6_chunks_labelled.csv")
pred_chunks_7 = pd.read_csv("../data/pipeline_data/paper_flagging_data/7_chunks_labelled.csv")
pred_chunks_8 = pd.read_csv("../data/pipeline_data/paper_flagging_data/8_chunks_labelled.csv")
pred_chunks_9 = pd.read_csv("../data/pipeline_data/paper_flagging_data/9_chunks_labelled.csv")
pred_chunks_10 = pd.read_csv("../data/pipeline_data/paper_flagging_data/10_chunks_labelled.csv")
pred_chunks_11 = pd.read_csv("../data/pipeline_data/paper_flagging_data/11_chunks_labelled.csv")


# Concatenate data
df_test = pd.concat([pred_chunks_0, pred_chunks_1, pred_chunks_2, pred_chunks_3, pred_chunks_4, pred_chunks_5, pred_chunks_6, pred_chunks_7, 
                     pred_chunks_8, pred_chunks_9, pred_chunks_10, pred_chunks_11])

# Load dataframe as dataset
test = Dataset.from_pandas(df_test)

# Tokenize test dataset
test = test.map(tokenize_function, batched=True)

# Set the format of the datasets to include only the required columns
test = test.rename_column('__index_level_0__', 'index').remove_columns(['text', 'index'])

# Define DatasetDict
ds_test = DatasetDict({
    "test": test
})

Map:   0%|          | 0/125 [00:00<?, ? examples/s]

In [11]:
# from evaluate import load
# perplexity = load("perplexity", module_type="metric")

# pred = trainer.predict(ds_test["test"])

# results = perplexity.compute(predictions=predictions, model_id='gpt2')

In [12]:
def validate_model(trainer):
    # Test performance of the model on labeled chunks
    pred = trainer.predict(ds_test["test"])
    
    df_test["prediction"] = pred.predictions.argmax(-1)
    
    metrics = compute_metrics(pred)

    # df_test.to_csv("df_test_chunks.csv")

    return metrics    

In [13]:
# probabilities_scores.max(-1)

In [14]:
# # convert logit score to torch array
# torch_logits = torch.from_numpy(pred.predictions)

# # get probabilities using softmax from logit score and convert it to numpy array
# probabilities_scores = F.softmax(torch_logits, dim = -1).numpy()

In [15]:
# pred.predictions

In [16]:
# probabilities_scores

# View model prediction on chunks

In [17]:
# chunks = split_text_into_chunks(df["text"][0], tokenizer)

In [18]:
# chunks[5]

In [19]:
# chunk_df = pd.DataFrame(chunks, columns=["text"])

In [20]:
# type(chunk_df["text"][0])

In [21]:
# chunk_df["label"] = df["label"][0]
# chunk_df

In [22]:
# t = Dataset.from_pandas(chunk_df)
# t = t.map(tokenize_function, batched=True)
# # t = t.rename_column('__index_level_0__', 'index').remove_columns(['text', 'index'])

# # Define DatasetDict
# ds_t = DatasetDict({
#     "test": t
# })

In [23]:
# pred = trainer.predict(ds_t["test"])
# pred.predictions.argmax(-1)

data = prediction_chunks(df, tokenizer)

In [24]:
# data.to_csv("chunk_labelled.csv")

# Perform prediction and retraining

**Documentation**: June 26 - Transfer learning of PubmedBERT with Pokay dataset at ./chunks-pubmed-bert

In [25]:
# Load model
def model_init():
    return AutoModelForSequenceClassification.from_pretrained("NeuML/pubmedbert-base-embeddings", num_labels=2)

# Load data
df = pd.read_csv('../data/pipeline_data/paper_flagging_data/bert_dataset.csv')
ds = ds_preparation(df, val_count=128)

# Train model
trainer = fine_tune_model(ds, model_init, train=True)

# Checkpoint model
trainer.save_model("../models/chunks-pubmed-bert")

# Validate current model
metrics = validate_model(trainer)

print(metrics)

Map:   0%|          | 0/490 [00:00<?, ? examples/s]

Map:   0%|          | 0/128 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at NeuML/pubmedbert-base-embeddings and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at NeuML/pubmedbert-base-embeddings and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss,Validation Loss


{'accuracy': 0.656, 'f1': 0.6666666666666665, 'precision': 0.5119047619047619, 'recall': 0.9555555555555556}


In [26]:
# metrics

In [None]:
# Load model
def model_init():
    return AutoModelForSequenceClassification.from_pretrained("../models/chunks-pubmed-bert", num_labels=2)

trainer = fine_tune_model(ds, model_init, train=False)

# Check performance
metrics = validate_model(trainer)
print(metrics)
# Save predictions 
df_test.to_csv("test_predictions_model_1.csv")

## Load retrain data

In [None]:
# Load retrain data
retrain_df = pd.read_csv('chunks_dataset.csv')
# print(retrain_df)
retrain_df = retrain_df.sample(frac=1, random_state=42)

In [None]:
# # Load model
# def model_init():
#     return AutoModelForSequenceClassification.from_pretrained("../models/chunks-pubmed-bert", num_labels=2)

# # Load data
# df = pd.read_csv('bert_dataset.csv')
# ds = ds_preparation(df)

# # Train model
# trainer = fine_tune_model(ds, model_init, train=False)

retrain_df_1 = retrain_df[:100]
retrain_df_1, _ = prediction_chunks(retrain_df_1, tokenizer, trainer)
# retrain_df_1

retrain_df_1.describe()

# Rename column
retrain_df_1 = retrain_df_1.rename(columns={'prediction': 'label'})

print(retrain_df_1[retrain_df_1["probability"] < 0.7].count())

# Filter for predictions by the decision boundary
retrain_df_1 = retrain_df_1[retrain_df_1["probability"] < 0.7]

# Balance out 0 and 1 labels 
retrain_df_1 = retrain_df_1.groupby('label').sample(n=min(retrain_df_1['label'].value_counts()), random_state=42)

In [None]:
retrain_df_1.describe()

In [None]:
# Save dataset locally to verify validity of predictions
retrain_df_1.to_csv("retrain_df_1.csv")

In [None]:
# Load modified dataset
retrain_df_1 = pd.read_csv("../data/processed/train_data/retrain_df_1.csv")

In [None]:
# remove german paper
retrain_df_1 = retrain_df_1.drop(retrain_df_1[retrain_df_1['paper'] == 43].index)

# balance out classes
retrain_df_1 = retrain_df_1.groupby('label').sample(n=min(retrain_df_1['label'].value_counts()), random_state=42)

# Drop unneccessary columns
retrain_df = retrain_df_1.drop(['position', 'paper', 'probability'], axis=1)

In [None]:
retrain_df.describe()

**Documentation**: June 28 - Retrain of model #1 saved at ./chunks-pubmed-bert-v2

In [None]:
# Load data
retrain_ds = ds_preparation(retrain_df)

# Load model 
def model_init():
    return AutoModelForSequenceClassification.from_pretrained("../models/chunks-pubmed-bert", num_labels=2)

# Train model
trainer = fine_tune_model(retrain_ds, model_init, train=True)

# Checkpoint model
trainer.save_model("./chunks-pubmed-bert-v2")

# Validate current model
metrics = validate_model(trainer)

In [None]:
metrics

In [None]:
# Load model
def model_init():
    return AutoModelForSequenceClassification.from_pretrained("chunks-pubmed-bert-v2", num_labels=2)

trainer = fine_tune_model(ds, model_init, train=False)

# Check performance
metrics = validate_model(trainer)
print(metrics)

# Save predictions 
df_test.to_csv("test_predictions_model_v2.csv")

# Create dataset for chunking model

In [None]:
full_text_data = pd.read_csv("bert_dataset.csv")

In [None]:
# Load model
def model_init():
    return AutoModelForSequenceClassification.from_pretrained("../models/chunks-pubmed-bert-v2", num_labels=2)

trainer = fine_tune_model(ds, model_init, train=False)

In [None]:
chunked_data = prediction_chunks(full_text_data, tokenizer, trainer)

In [None]:
chunked_data_df, chunked_data_pred = chunked_data

In [None]:
chunked_data_df.to_csv("../data/pipeline_data/paper_flagging_data/lightgbm.csv")

In [None]:
import pandas as pd

In [None]:
# Load new papers
data = pd.read_csv("../data/pipeline_data/paper_flagging_data/new_papers_dataset.csv")

In [None]:
# Load model
def model_init():
    return AutoModelForSequenceClassification.from_pretrained("../models/chunks-pubmed-bert-v2", num_labels=2)

trainer = fine_tune_model(ds, model_init, train=False)

In [None]:
chunked_data_df, chunked_data_pred = prediction_chunks(data[:10], tokenizer, trainer)

In [None]:
import lightgbm as lgb

In [None]:
# Load lightbgm model
bst = lgb.Booster(model_file='../models/lightbgm_model.txt')

In [None]:
data = chunked_data_df

# Format data for lightBGM
grouped = data.groupby('paper')

# Maximum number of data points in any group
# max_len = max(grouped.size())
max_len = 133
print(max_len)

# Create DataFrame with appropriate number of columns
columns = [f'prediction_{i}' for i in range(max_len)]
columns.append("paper")
print(columns)

df = pd.DataFrame(columns=columns)

for name, group in grouped:
    predictions = group["prediction"].values.astype(float)
    entry = np.pad(predictions, (0, max_len - len(predictions)), constant_values=np.nan)
    # entry = np.pad(predictions, (0, 133 - len(predictions)), constant_values=np.nan)
    entry = np.append(entry, name)
    df.loc[name] = entry

In [None]:
predictions = bst.predict(df.drop(columns="paper"), num_iteration=bst.best_iteration)

In [None]:
pred = np.where(predictions < 0.5, 0, 1)

In [None]:
df["prediction"] = pred.T

In [None]:
df.head(5)

In [None]:
df['paper'] = df['paper'].astype(int)
flagged_papers = df[df['prediction'] == 1]['paper']

In [None]:
p = flagged_papers.tolist()

In [None]:
flagged = chunked_data_df[chunked_data_df["paper"].isin(p)]

In [None]:
flagged["text"][0] 

In [None]:
import requests

def query_plain(text, url="http://localhost:8888/plain"):
    return requests.post(url, json={'text': text}).json()

port = "http://172.19.5.205:8888/plain"

# if __name__ == '__main__':
#     text = "Autophagy maintains tumour growth through circulating arginine."
#     print(query_plain(text))

In [None]:
grouped_papers = flagged.groupby('paper')

for name, group in grouped_papers:
    # print(group["text"])
    for text in group["text"]:
        # print((query_plain(text, url = "http://bern2.korea.ac.kr/plain")))
        print((query_plain(text)))