In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datasets import Dataset, DatasetDict

from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer

import torch
from torch.utils.data import DataLoader
import logging
import numpy as np
import re

from tqdm.auto import tqdm
from tqdm.notebook import tqdm
from transformers import AutoModel, AutoTokenizer

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [5]:
#set the data folder
data_folder = "/home/gckc123/Documents/Testing/data"

In [6]:
#set the file name
dev_file_name = "20240827_dev_set.csv"

In [7]:
dev_file = pd.read_csv(data_folder + "/" + dev_file_name)

In [8]:
dev_file["obj_sel"] = "[RTI] " + dev_file["Review_Title"].fillna("") + "[BG] " + dev_file["Background"].fillna("") + "[OBJ] " + dev_file["Objective"].fillna("") + "[SEL] " + dev_file["Selection_criteria"].fillna("")
dev_file["tit_abs"] = "[TIT] " + dev_file["Title"].fillna("") + "[ABS] " + dev_file["Abstract_clean"].fillna("")

In [238]:
model_ckpt = "dmis-lab/biobert-large-cased-v1.1"

#model_ckpt = "dmis-lab/biobert-base-cased-v1.1"

#model_ckpt = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

new_tokens = ["[RTI]", "[BG]", "[OBJ]", "[SEL]", "[TIT]", "[ABS]"]



In [240]:
dev_set = Dataset.from_pandas(dev_file)

In [241]:
for part in ["obj_sel", "tit_abs"]:
    dev_set = dev_set.map(
        lambda x: tokenizer(
            x[part], max_length = 512, padding = True, truncation = True),
        batched = True, batch_size = None
    )
    for col in ['input_ids', 'attention_mask']:
        dev_set = dev_set.rename_column(
            col, part+"_"+col
        )

all_cols = ['label', 'obj_sel_input_ids', 'obj_sel_attention_mask', 'tit_abs_input_ids', 'tit_abs_attention_mask']
dev_set.set_format(type = 'torch', columns = all_cols)

Map:   0%|          | 0/466745 [00:00<?, ? examples/s]

Map:   0%|          | 0/466745 [00:00<?, ? examples/s]

In [242]:
batch_size = 64
loader = torch.utils.data.DataLoader(
    dev_set, batch_size = batch_size, shuffle = True)

In [243]:
def mean_pool(token_embeds, attention_mask):
    in_mask = attention_mask.unsqueeze(-1).expand(token_embeds.size()).float()
    pool = torch.sum(token_embeds * in_mask, 1) / torch.clamp(
        in_mask.sum(1), min=1e-9
    )
    return pool

In [244]:
ffnn = torch.nn.Linear(1024*3, 1)

#ffnn = torch.nn.Linear(768*3, 1)
loss_func = torch.nn.MSELoss()

In [245]:
from transformers.optimization import get_linear_schedule_with_warmup

optim = torch.optim.Adam(model.parameters(), lr = 2e-5)
total_steps = int(len(dev_set) / batch_size)
warmup_steps = 500
scheduler = get_linear_schedule_with_warmup(
    optim, num_warmup_steps=warmup_steps,
    num_training_steps=total_steps - warmup_steps
)



In [246]:
import torch.nn as nn

model = nn.DataParallel(model)
ffnn = nn.DataParallel(ffnn)

model = model.to(device)
ffnn = ffnn.to(device)

In [247]:
for epoch in range(1):
    model.train()
    loop = tqdm(loader, leave= True)
    step = 0
    for batch in loop:
        optim.zero_grad()
        inputs_ids_a = batch['obj_sel_input_ids'].to(device)
        inputs_ids_b = batch['tit_abs_input_ids'].to(device)
        attention_a = batch['obj_sel_attention_mask'].to(device)
        attention_b = batch['tit_abs_attention_mask'].to(device)
        label = batch['label'].to(device)
        u = model(inputs_ids_a, attention_mask = attention_a)[0]
        v = model(inputs_ids_b, attention_mask = attention_b)[0]
        u = mean_pool(u, attention_a)
        v = mean_pool(v, attention_b)
        uv = torch.sub(u, v)
        uv_abs = torch.abs(uv)
        x = torch.cat([u, v, uv_abs], dim = -1)
        x = ffnn(x)
        x = x.squeeze()
        loss = loss_func(x, label)
        optim.zero_grad()
        loss.backward()
        optim.step()
        scheduler.step()

        if step % 100 == 0:
            print(f"Step {step}, Loss: {loss.item()}")

        step = step + 1
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())

  0%|          | 0/7293 [00:00<?, ?it/s]

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Step 0, Loss: 0.6500090956687927
Step 100, Loss: 0.12030091136693954
Step 200, Loss: 0.06970946490764618
Step 300, Loss: 0.053962960839271545
Step 400, Loss: 0.06465943902730942
Step 500, Loss: 0.052706994116306305
Step 600, Loss: 0.05833008885383606
Step 700, Loss: 0.06320291012525558
Step 800, Loss: 0.04279603436589241
Step 900, Loss: 0.058049559593200684
Step 1000, Loss: 0.06046520173549652
Step 1100, Loss: 0.024503251537680626
Step 1200, Loss: 0.08358778059482574
Step 1300, Loss: 0.059533871710300446
Step 1400, Loss: 0.04453762620687485
Step 1500, Loss: 0.05321890860795975
Step 1600, Loss: 0.06352779269218445
Step 1700, Loss: 0.05680865794420242
Step 1800, Loss: 0.04784209653735161
Step 1900, Loss: 0.062095992267131805
Step 2000, Loss: 0.04491308331489563
Step 2100, Loss: 0.06464462727308273
Step 2200, Loss: 0.0373610258102417
Step 2300, Loss: 0.05767640843987465
Step 2400, Loss: 0.029944488778710365
Step 2500, Loss: 0.06343723833560944
Step 2600, Loss: 0.04289017990231514
Step 270

In [21]:
######################################################################################################