In [1]:
import json
import re
from tqdm import tqdm
import pandas as pd
import torch
import numpy as np
import random
from itertools import islice
from random import randint
from model import Encoder1
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import BigBirdTokenizer, BigBirdForSequenceClassification
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler

In [2]:
def same_seeds(seed):
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.set_device(3)
    np.random.seed(seed)
    random.seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
same_seeds(42)

In [3]:
# device setting
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

In [4]:
train_ans = pd.read_csv('dataset/train_ans.csv')
train_ans.head() # 7913

Unnamed: 0,topic,doc
0,2,2740164 2759595 2759599 2767143 2911928 291756...
1,4,1459158 2529301 2700561 2769425 2835690 283699...
2,6,1533809 1910597 2503992 2577108 2703746 272215...
3,9,1090616 1679810 2627448 2629012 2645689 271457...
4,11,1892007 2275746 2503962 2516438 2688349 276296...


In [5]:
train_ans['doc'] = train_ans['doc'].str.strip(',').str.split(' ')
train_ans.head()

Unnamed: 0,topic,doc
0,2,"[2740164, 2759595, 2759599, 2767143, 2911928, ..."
1,4,"[1459158, 2529301, 2700561, 2769425, 2835690, ..."
2,6,"[1533809, 1910597, 2503992, 2577108, 2703746, ..."
3,9,"[1090616, 1679810, 2627448, 2629012, 2645689, ..."
4,11,"[1892007, 2275746, 2503962, 2516438, 2688349, ..."


In [6]:
document = pd.read_csv('document.csv')
document.head()

Unnamed: 0,doc,document
0,13915,Background: Resistance to mammary tumorigenesi...
1,15027,Background: Standard archival sequence databas...
2,17824,Our results show that cytokines derived from m...
3,28992,Background: This study evaluated the feasibili...
4,28996,Background: In order to test the hypothesis th...


In [7]:
document['short_doc'] = 0
for i in range(len(document['document'])):
    document['short_doc'][i] = document['document'][i][0:500]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [8]:
document.head()

Unnamed: 0,doc,document,short_doc
0,13915,Background: Resistance to mammary tumorigenesi...,Background: Resistance to mammary tumorigenesi...
1,15027,Background: Standard archival sequence databas...,Background: Standard archival sequence databas...
2,17824,Our results show that cytokines derived from m...,Our results show that cytokines derived from m...
3,28992,Background: This study evaluated the feasibili...,Background: This study evaluated the feasibili...
4,28996,Background: In order to test the hypothesis th...,Background: In order to test the hypothesis th...


In [9]:
train_query = pd.read_csv('train_query.csv')
train_query.head()

Unnamed: 0,topic,train_query
0,2,An elderly female with past medical history of...
1,4,"An 87 yo woman with h/o osteoporosis, DM2, dem..."
2,6,"A 94 year old female with hx recent PE/DVT, at..."
3,9,Infant with respiratory distress syndrome and ...
4,11,80 yo male with demantia and past medical hist...


In [10]:
com_topic = []
com_topic_doc = []
com_doc = []
com_doc_doc = []
for i in range(len(train_query['topic'])):
    com_topic.extend([train_query['topic'][i]]*document.shape[0])
    com_topic_doc.extend([train_query['train_query'][i]]*document.shape[0])
    com_doc.extend(document['doc'])
    com_doc_doc.extend(document['short_doc'])
print(len(com_topic))
print(len(com_doc))
print(len(com_topic_doc))
print(len(com_doc_doc))

1500000
1500000
1500000
1500000


In [11]:
combine_train = pd.DataFrame({'topic':com_topic, 'doc':com_doc, 'query':com_topic_doc, 'document':com_doc_doc})
combine_train['label'] = 0
combine_train.head()

Unnamed: 0,topic,doc,query,document,label
0,2,13915,An elderly female with past medical history of...,Background: Resistance to mammary tumorigenesi...,0
1,2,15027,An elderly female with past medical history of...,Background: Standard archival sequence databas...,0
2,2,17824,An elderly female with past medical history of...,Our results show that cytokines derived from m...,0
3,2,28992,An elderly female with past medical history of...,Background: This study evaluated the feasibili...,0
4,2,28996,An elderly female with past medical history of...,Background: In order to test the hypothesis th...,0


In [12]:
for i in tqdm(range(len(combine_train['topic']))):
    if combine_train['doc'][i] in list(map(int, train_ans['doc'][list(train_ans['topic'].values).index(combine_train['topic'][i])])):
        combine_train['label'][i] = 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
100%|██████████| 1500000/1500000 [01:00<00:00, 24724.76it/s]


In [13]:
combine_train['combine'] = combine_train['query'] + ' [SEP] ' + combine_train['document']

In [14]:
combine_train.head()

Unnamed: 0,topic,doc,query,document,label,combine
0,2,13915,An elderly female with past medical history of...,Background: Resistance to mammary tumorigenesi...,0,An elderly female with past medical history of...
1,2,15027,An elderly female with past medical history of...,Background: Standard archival sequence databas...,0,An elderly female with past medical history of...
2,2,17824,An elderly female with past medical history of...,Our results show that cytokines derived from m...,0,An elderly female with past medical history of...
3,2,28992,An elderly female with past medical history of...,Background: This study evaluated the feasibili...,0,An elderly female with past medical history of...
4,2,28996,An elderly female with past medical history of...,Background: In order to test the hypothesis th...,0,An elderly female with past medical history of...


In [15]:
test_query = pd.read_csv('test_query.csv')
test_query.head()

Unnamed: 0,topic,test_query
0,1,A 78 year old male presents with frequent stoo...
1,3,A 75F found to be hypoglycemic with hypotensio...
2,5,An 82 man with multiple chronic conditions and...
3,7,A 41-year-old male patient with medical histor...
4,8,"A 26 year-old diabetic woman, estimated to 10 ..."


In [16]:
tokenizer = BigBirdTokenizer.from_pretrained('google/bigbird-roberta-base')
model = BigBirdForSequenceClassification.from_pretrained('google/bigbird-roberta-base', num_labels = 2)

Some weights of the model checkpoint at google/bigbird-roberta-base were not used when initializing BigBirdForSequenceClassification: ['cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BigBirdForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BigBirdForSequenceClassifica

In [17]:
def tokenize(text, tokenizer, label=None):

    input_ids = []
    attention_masks = []

    for sent in tqdm(text):

        encoded_dict = tokenizer.encode_plus(sent,                      # Sentence to encode.
                            add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                            max_length = 1024,           # Pad & truncate all sentences.
                            pad_to_max_length = True,
                            return_attention_mask = True,   # Construct attn. masks.
                            return_tensors = 'pt',     # Return pytorch tensors.
                    )
        input_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
        
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])
    if label is not None:  
        labels = torch.tensor(label)
        return input_ids, attention_masks, labels
    return input_ids, attention_masks

In [18]:
label_dict = {0: 0, 1: 1}
label_dict_inverse = {v: k for k, v in label_dict.items()}
label_dict_inverse

{0: 0, 1: 1}

In [None]:
train_input_ids, train_attention_masks, train_labels = tokenize(combine_train['combine'], tokenizer, combine_train['label'])

  0%|          | 0/1500000 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
 57%|█████▋    | 857527/1500000 [11:17<09:49, 1089.11it/s]

In [None]:
input_ids = torch.cat(train_input_ids, dim=0)
attention_masks = torch.cat(train_attention_masks, dim=0)

In [None]:
# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, train_labels)

# Create a train-validation split.
train_size = int(0.2 * len(dataset))
valid_size = len(dataset) - train_size

# Divide the dataset by randomly selecting samples.
train_dataset, valid_dataset = random_split(dataset, [train_size, valid_size])

batch_size = 4

In [None]:
train_dataloader = DataLoader(train_dataset, sampler = RandomSampler(train_dataset), batch_size = 4)
valid_dataloader = DataLoader(valid_dataset,sampler = SequentialSampler(valid_dataset),batch_size = 4)

In [None]:
EPOCHS = 10

optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)#, betas=BETAS, eps=EPS)     
# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_dataloader)*EPOCHS
)

In [None]:
loss_values, validation_loss_values, valid_acc = [], [100], [0]
patient = 0
for _ in range(EPOCHS):

    model.train()
    total_loss = 0

    # Training loop
    for step, batch in tqdm(enumerate(train_dataloader)):

        # add batch to gpu
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch
        model.zero_grad()

        outputs = model(b_input_ids, token_type_ids=None,
                        attention_mask=b_input_mask, labels=b_labels)
        # get the loss
        loss = outputs[0]
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # track train loss
        total_loss += loss.item()
        # Clip the norm of the gradient
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=MAX_GRAD_NORM)
        # update parameters
        optimizer.step()
        # Update the learning rate.
        scheduler.step()

    avg_train_loss = total_loss / len(train_dataloader)

    model.eval()

    # Reset the validation loss for this epoch.
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    predictions , true_labels = [], []

    for batch in valid_dataloader:
        batch = tuple(t.to(device) for t in batch)
        b_input_ids, b_input_mask, b_labels = batch

        # saving memory and speeding up validation
        with torch.no_grad():

            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask, labels=b_labels)

        logits = outputs[1].detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        eval_loss += outputs[0].item()
        predictions.extend(np.argmax(logits, axis=1))
        true_labels.extend(label_ids)

    eval_loss = eval_loss / len(valid_dataloader)

    print(f'''Epoch [{_+1}/{EPOCHS}] total loss complete. Train Loss: {avg_train_loss:.5f}. Val Loss: {eval_loss:.5}''')

    pred_tags = [label_dict_inverse[int(p)] for p in predictions]
    valid_tags = [label_dict_inverse[int(l)] for l in true_labels]
    print("Validation Accuracy: {}".format(accuracy_score(pred_tags, valid_tags)))

    # condition setting (model saved)
    if accuracy_score(pred_tags, valid_tags)>max(valid_acc):
        max_acc = accuracy_score(pred_tags, valid_tags)
        min_lss = eval_loss
        patient = 0
        print("saving state dict")
        torch.save(model.state_dict(), f"qa_model.pt")
    else:
        # early stopping
        patient += 1
        if patient == 3:
            print(f'Early Stop. Best Acc {max_acc}')
            break

    validation_loss_values.append(eval_loss)     
    valid_acc.append(accuracy_score(pred_tags, valid_tags))

    print()