In [1]:
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn import metrics, model_selection
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

config = dict(
    seed = 2020,
    num_labels=2,
    num_folds=1,
    fold_to_train = [0],
    device = 'cuda' if torch.cuda.is_available() else 'cpu',
    model_checkpoint = '../input/d/datasets/ferroxrocks/robertalarge/RoBERTa-large-PM-M3-Voc/RoBERTa-large-PM-M3-Voc-hf', 
    learning_rate = 1e-5,
    weight_decay = 1e-2,
    max_length = 512,
    train_batch_size = 4,
    valid_batch_size = 8,
    epochs_to_train = 4,
    total_epochs = 4,
    grad_acc_steps = 4,
    num_cycles=0.5,
    scheduler='linear', # ['linear', 'cosine']
    output_dir = '',
    debug = None,
    precompute_tokens = True
)

tokenizers.__version__: 0.11.6
transformers.__version__: 4.16.2
env: TOKENIZERS_PARALLELISM=true


# Importing required libraries

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from ast import literal_eval
import ast
from itertools import chain
from sklearn.metrics import precision_recall_fscore_support
from tqdm.notebook import tqdm, trange
from sklearn.model_selection import StratifiedKFold

import torch
from transformers import AutoModel, AutoTokenizer

In [3]:
pn_df=pd.read_csv('/kaggle/input/nbme-score-clinical-patient-notes/patient_notes.csv')
feat_df=pd.read_csv('/kaggle/input/nbme-score-clinical-patient-notes/features.csv')

In [4]:
train_df=pd.read_csv('/kaggle/input/nbme-score-clinical-patient-notes/train.csv')
# train_df['annotation']=train_df['annotation'].apply(ast.literal_eval)
# train_df['location']=train_df['location'].apply(ast.literal_eval)
# train_df.head()

# Extra Data

In [5]:
#pn_df,feat_df,train_df
pn_dict = {}
for idx, row in pn_df.iterrows():
    pn_dict[row['pn_num']] = row['pn_history']
    
new_annotation = []
for case_id in feat_df['case_num'].unique():
    
    all_pn_id = set(pn_df[pn_df['case_num']==case_id]['pn_num'].tolist())
    
    for feature_id in feat_df[feat_df['case_num']==case_id]['feature_num'].unique():
        # get all the pn_num that have already been annotated
        annotated_pn = set(train_df[train_df['feature_num']==feature_id]['pn_num'].tolist())
        # get all the pn_num that have NOT been annotated
        pn_to_annotate = all_pn_id-annotated_pn
        
        # get all current annotations
        # we will use them to find more annotations
        annotations = train_df[train_df['feature_num']==feature_id]['annotation'].tolist()
        annotation_texts = set()
        for a in annotations:
            anns = eval(a)
            for at in anns:
                annotation_texts.add(at)
                
        # annotate       
        for pn_id in pn_to_annotate:
            new_annotation_pn, new_location_pn = [], []
            pn_text = pn_dict[pn_id]
            for at in annotation_texts:
                start = pn_text.find(at)
                if start>=0:
                    new_annotation_pn.append(at)
                    new_location_pn.append(f'{start} {start+len(at)}')
            if len(new_annotation_pn)>0:
                new_annotation.append((
                    f'{pn_id:04d}_{feature_id:03d}',
                    case_id,
                    pn_id,
                    feature_id,
                    new_annotation_pn,
                    new_location_pn
                ))
        break
    break
    # break to get sample results quickly
#pn_dict

In [6]:
#Using Literal-eval
train_df['annotation']=train_df['annotation'].apply(ast.literal_eval)
train_df['location']=train_df['location'].apply(ast.literal_eval)
train_df.head()

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724]
1,00016_001,0,16,1,"[mom with ""thyroid disease]",[668 693]
2,00016_002,0,16,2,[chest pressure],[203 217]
3,00016_003,0,16,3,"[intermittent episodes, episode]","[70 91, 176 183]"
4,00016_004,0,16,4,[felt as if he were going to pass out],[222 258]


In [7]:
new_annotation_df=pd.DataFrame(new_annotation,columns=train_df.columns)
new_annotation_df.head()
train_df1 = pd.concat([train_df, new_annotation_df], axis=0).reset_index(drop=True)

In [8]:
merged_data=train_df1.merge(pn_df,on=['case_num','pn_num']).merge(feat_df,on=['case_num','feature_num'])
len(merged_data)

15176

In [9]:
def preprocessing_features(text):
    return text.replace('-OR-',' or ').replace('-',' ')
def clean_backslah_chars(text):
    return text.replace('\n',' ').replace('\s',' ').replace('\t',' ')
merged_data['feature_text']=merged_data['feature_text'].apply(preprocessing_features)
merged_data['pn_clean_history']=merged_data['pn_history'].apply(lambda x: x.strip()).apply(clean_backslah_chars)
skf = StratifiedKFold(n_splits = 5)
merged_data["stratify_on"] = merged_data["case_num"].astype(str) + merged_data["feature_num"].astype(str)
merged_data["fold"] = -1
for fold, (_, valid_idx) in enumerate(skf.split(merged_data["id"], y = merged_data["stratify_on"])):
    merged_data.loc[valid_idx, "fold"] = fold

In [10]:
merged_data.head()

Unnamed: 0,id,case_num,pn_num,feature_num,annotation,location,pn_history,feature_text,pn_clean_history,stratify_on,fold
0,00016_000,0,16,0,[dad with recent heart attcak],[696 724],HPI: 17yo M presents with palpitations. Patien...,Family history of MI or Family history of myoc...,HPI: 17yo M presents with palpitations. Patien...,0,0
1,00041_000,0,41,0,[],[],17 Y/O M CAME TO THE CLINIC C/O HEART POUNDING...,Family history of MI or Family history of myoc...,17 Y/O M CAME TO THE CLINIC C/O HEART POUNDING...,0,0
2,00046_000,0,46,0,[father: heart attack],[824 844],Mr. Cleveland is a 17yo M who was consented by...,Family history of MI or Family history of myoc...,Mr. Cleveland is a 17yo M who was consented by...,0,0
3,00082_000,0,82,0,[Father MI],[622 631],17 yo M w/ no cardiac or arrhythmia PMH presen...,Family history of MI or Family history of myoc...,17 yo M w/ no cardiac or arrhythmia PMH presen...,0,0
4,00100_000,0,100,0,[Dad-MI],[735 741],HPI: Dillon Cleveland is an otherwise healthy ...,Family history of MI or Family history of myoc...,HPI: Dillon Cleveland is an otherwise healthy ...,0,0


In [11]:
merged_data = merged_data.loc[merged_data["annotation"] != "[]"].copy().reset_index(drop = True)

In [12]:
# incorrect annotation
merged_data.loc[338, 'annotation'] = ast.literal_eval('[["father heart attack"]]')
merged_data.loc[338, 'location'] = ast.literal_eval('[["764 783"]]')

merged_data.loc[621, 'annotation'] = ast.literal_eval('[["for the last 2-3 months"]]')
merged_data.loc[621, 'location'] = ast.literal_eval('[["77 100"]]')

merged_data.loc[655, 'annotation'] = ast.literal_eval('[["no heat intolerance"], ["no cold intolerance"]]')
merged_data.loc[655, 'location'] = ast.literal_eval('[["285 292;301 312"], ["285 287;296 312"]]')

merged_data.loc[1262, 'annotation'] = ast.literal_eval('[["mother thyroid problem"]]')
merged_data.loc[1262, 'location'] = ast.literal_eval('[["551 557;565 580"]]')

merged_data.loc[1265, 'annotation'] = ast.literal_eval('[[\'felt like he was going to "pass out"\']]')
merged_data.loc[1265, 'location'] = ast.literal_eval('[["131 135;181 212"]]')

merged_data.loc[1396, 'annotation'] = ast.literal_eval('[["stool , with no blood"]]')
merged_data.loc[1396, 'location'] = ast.literal_eval('[["259 280"]]')

merged_data.loc[1591, 'annotation'] = ast.literal_eval('[["diarrhoe non blooody"]]')
merged_data.loc[1591, 'location'] = ast.literal_eval('[["176 184;201 212"]]')

merged_data.loc[1615, 'annotation'] = ast.literal_eval('[["diarrhea for last 2-3 days"]]')
merged_data.loc[1615, 'location'] = ast.literal_eval('[["249 257;271 288"]]')

merged_data.loc[1664, 'annotation'] = ast.literal_eval('[["no vaginal discharge"]]')
merged_data.loc[1664, 'location'] = ast.literal_eval('[["822 824;907 924"]]')

merged_data.loc[1714, 'annotation'] = ast.literal_eval('[["started about 8-10 hours ago"]]')
merged_data.loc[1714, 'location'] = ast.literal_eval('[["101 129"]]')

merged_data.loc[1929, 'annotation'] = ast.literal_eval('[["no blood in the stool"]]')
merged_data.loc[1929, 'location'] = ast.literal_eval('[["531 539;549 561"]]')

merged_data.loc[2134, 'annotation'] = ast.literal_eval('[["last sexually active 9 months ago"]]')
merged_data.loc[2134, 'location'] = ast.literal_eval('[["540 560;581 593"]]')

merged_data.loc[2191, 'annotation'] = ast.literal_eval('[["right lower quadrant pain"]]')
merged_data.loc[2191, 'location'] = ast.literal_eval('[["32 57"]]')

merged_data.loc[2553, 'annotation'] = ast.literal_eval('[["diarrhoea no blood"]]')
merged_data.loc[2553, 'location'] = ast.literal_eval('[["308 317;376 384"]]')

merged_data.loc[3124, 'annotation'] = ast.literal_eval('[["sweating"]]')
merged_data.loc[3124, 'location'] = ast.literal_eval('[["549 557"]]')

merged_data.loc[3858, 'annotation'] = ast.literal_eval('[["previously as regular"], ["previously eveyr 28-29 days"], ["previously lasting 5 days"], ["previously regular flow"]]')
merged_data.loc[3858, 'location'] = ast.literal_eval('[["102 123"], ["102 112;125 141"], ["102 112;143 157"], ["102 112;159 171"]]')

merged_data.loc[4373, 'annotation'] = ast.literal_eval('[["for 2 months"]]')
merged_data.loc[4373, 'location'] = ast.literal_eval('[["33 45"]]')

merged_data.loc[4763, 'annotation'] = ast.literal_eval('[["35 year old"]]')
merged_data.loc[4763, 'location'] = ast.literal_eval('[["5 16"]]')

merged_data.loc[4782, 'annotation'] = ast.literal_eval('[["darker brown stools"]]')
merged_data.loc[4782, 'location'] = ast.literal_eval('[["175 194"]]')

merged_data.loc[4908, 'annotation'] = ast.literal_eval('[["uncle with peptic ulcer"]]')
merged_data.loc[4908, 'location'] = ast.literal_eval('[["700 723"]]')

merged_data.loc[6016, 'annotation'] = ast.literal_eval('[["difficulty falling asleep"]]')
merged_data.loc[6016, 'location'] = ast.literal_eval('[["225 250"]]')

merged_data.loc[6192, 'annotation'] = ast.literal_eval('[["helps to take care of aging mother and in-laws"]]')
merged_data.loc[6192, 'location'] = ast.literal_eval('[["197 218;236 260"]]')

merged_data.loc[6380, 'annotation'] = ast.literal_eval('[["No hair changes"], ["No skin changes"], ["No GI changes"], ["No palpitations"], ["No excessive sweating"]]')
merged_data.loc[6380, 'location'] = ast.literal_eval('[["480 482;507 519"], ["480 482;499 503;512 519"], ["480 482;521 531"], ["480 482;533 545"], ["480 482;564 582"]]')

merged_data.loc[6562, 'annotation'] = ast.literal_eval('[["stressed due to taking care of her mother"], ["stressed due to taking care of husbands parents"]]')
merged_data.loc[6562, 'location'] = ast.literal_eval('[["290 320;327 337"], ["290 320;342 358"]]')

merged_data.loc[6862, 'annotation'] = ast.literal_eval('[["stressor taking care of many sick family members"]]')
merged_data.loc[6862, 'location'] = ast.literal_eval('[["288 296;324 363"]]')

merged_data.loc[7022, 'annotation'] = ast.literal_eval('[["heart started racing and felt numbness for the 1st time in her finger tips"]]')
merged_data.loc[7022, 'location'] = ast.literal_eval('[["108 182"]]')

merged_data.loc[7422, 'annotation'] = ast.literal_eval('[["first started 5 yrs"]]')
merged_data.loc[7422, 'location'] = ast.literal_eval('[["102 121"]]')

merged_data.loc[8876, 'annotation'] = ast.literal_eval('[["No shortness of breath"]]')
merged_data.loc[8876, 'location'] = ast.literal_eval('[["481 483;533 552"]]')

merged_data.loc[9027, 'annotation'] = ast.literal_eval('[["recent URI"], ["nasal stuffines, rhinorrhea, for 3-4 days"]]')
merged_data.loc[9027, 'location'] = ast.literal_eval('[["92 102"], ["123 164"]]')

merged_data.loc[9938, 'annotation'] = ast.literal_eval('[["irregularity with her cycles"], ["heavier bleeding"], ["changes her pad every couple hours"]]')
merged_data.loc[9938, 'location'] = ast.literal_eval('[["89 117"], ["122 138"], ["368 402"]]')

merged_data.loc[9973, 'annotation'] = ast.literal_eval('[["gaining 10-15 lbs"]]')
merged_data.loc[9973, 'location'] = ast.literal_eval('[["344 361"]]')

merged_data.loc[10513, 'annotation'] = ast.literal_eval('[["weight gain"], ["gain of 10-16lbs"]]')
merged_data.loc[10513, 'location'] = ast.literal_eval('[["600 611"], ["607 623"]]')

merged_data.loc[11551, 'annotation'] = ast.literal_eval('[["seeing her son knows are not real"]]')
merged_data.loc[11551, 'location'] = ast.literal_eval('[["386 400;443 461"]]')

merged_data.loc[11677, 'annotation'] = ast.literal_eval('[["saw him once in the kitchen after he died"]]')
merged_data.loc[11677, 'location'] = ast.literal_eval('[["160 201"]]')

merged_data.loc[12124, 'annotation'] = ast.literal_eval('[["tried Ambien but it didnt work"]]')
merged_data.loc[12124, 'location'] = ast.literal_eval('[["325 337;349 366"]]')

merged_data.loc[12279, 'annotation'] = ast.literal_eval('[["heard what she described as a party later than evening these things did not actually happen"]]')
merged_data.loc[12279, 'location'] = ast.literal_eval('[["405 459;488 524"]]')

merged_data.loc[12289, 'annotation'] = ast.literal_eval('[["experienced seeing her son at the kitchen table these things did not actually happen"]]')
merged_data.loc[12289, 'location'] = ast.literal_eval('[["353 400;488 524"]]')

merged_data.loc[13238, 'annotation'] = ast.literal_eval('[["SCRACHY THROAT"], ["RUNNY NOSE"]]')
merged_data.loc[13238, 'location'] = ast.literal_eval('[["293 307"], ["321 331"]]')

merged_data.loc[13297, 'annotation'] = ast.literal_eval('[["without improvement when taking tylenol"], ["without improvement when taking ibuprofen"]]')
merged_data.loc[13297, 'location'] = ast.literal_eval('[["182 221"], ["182 213;225 234"]]')

merged_data.loc[13299, 'annotation'] = ast.literal_eval('[["yesterday"], ["yesterday"]]')
merged_data.loc[13299, 'location'] = ast.literal_eval('[["79 88"], ["409 418"]]')

merged_data.loc[13845, 'annotation'] = ast.literal_eval('[["headache global"], ["headache throughout her head"]]')
merged_data.loc[13845, 'location'] = ast.literal_eval('[["86 94;230 236"], ["86 94;237 256"]]')

merged_data.loc[14083, 'annotation'] = ast.literal_eval('[["headache generalized in her head"]]')
merged_data.loc[14083, 'location'] = ast.literal_eval('[["56 64;156 179"]]')

# Sample data

In [13]:
first = merged_data.loc[0]
example = {
    "feature_text": first.feature_text,
    "pn_clean_history": first.pn_clean_history,
    "location": first.location,
    "annotation": first.annotation
}
for key in example.keys():
    print(key)
    print(example[key])
    print("=" * 100)

feature_text
Family history of MI or Family history of myocardial infarction
pn_clean_history
 SHx: Freshmen in college. Endorses 3-4 drinks 3 nights / week (on weekends), denies tabacco, endorses trying marijuana. Sexually active with girlfriend x 1 year, uses condoms
location
['696 724']
annotation
['dad with recent heart attcak']


In [14]:
def loc_list_to_ints(loc_list):
    int_list=[]
    for i in loc_list:
        for j in i.split(';'):
            start,end=map(int,j.split(' '))
            int_list.append((start,end))
    return int_list
print(example['location'])
print(loc_list_to_ints(example['location']))
print(example['annotation'])
print(example['pn_clean_history'][loc_list_to_ints(example['location'])[0][0]:loc_list_to_ints(example['location'])[0][1]])

['696 724']
[(696, 724)]
['dad with recent heart attcak']
dad with recent heart attcak


# Roberta Test

In [15]:
MODEL_NAME = "../input/robertalarge/RoBERTa-large-PM-M3-Voc/RoBERTa-large-PM-M3-Voc-hf" # we cant connect internet for submission
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [16]:
def tokenize_and_add_labels(tokenizer, example):
    tokenized_inputs = tokenizer(
        example["feature_text"],
        example["pn_clean_history"],
        truncation = "only_second",
        max_length = 416, # max length is 406
        padding = "max_length",
        return_offsets_mapping = True
    )
    labels = [0.0] * len(tokenized_inputs["input_ids"])
    tokenized_inputs["location_int"] = loc_list_to_ints(example["location"])
    tokenized_inputs["sequence_ids"] = tokenized_inputs.sequence_ids()

    for idx, (seq_id, offsets) in enumerate(zip(tokenized_inputs["sequence_ids"], tokenized_inputs["offset_mapping"])):
        if seq_id is None or seq_id == 0:
            labels[idx] = -100
            continue
        exit = False
        token_start, token_end = offsets
        for feature_start, feature_end in tokenized_inputs["location_int"]:
            if exit:
                break
            if token_start >= feature_start and token_end <= feature_end:
                labels[idx] = 1.0
                exit = True
    tokenized_inputs["labels"] = labels
    
    return tokenized_inputs

In [17]:
tokenized_inputs = tokenize_and_add_labels(tokenizer, example)
for key in tokenized_inputs.keys():
    print(key)
    print(tokenized_inputs[key])
    print("=" * 100)

input_ids
[0, 29553, 2819, 281, 5777, 397, 18216, 2819, 281, 5071, 6835, 2, 2, 43, 3786, 29, 2232, 9346, 409, 6361, 324, 46668, 462, 17, 11536, 3318, 465, 16, 23, 1748, 281, 13220, 8093, 281, 2869, 28167, 28439, 18, 83, 616, 296, 764, 281, 1705, 6744, 15632, 369, 1500, 11153, 841, 261, 27354, 13143, 807, 280, 11755, 15, 720, 491, 797, 807, 6744, 2079, 290, 10536, 340, 1534, 470, 352, 14247, 302, 4440, 764, 301, 3829, 481, 17093, 732, 2877, 1204, 378, 4961, 5778, 1092, 23638, 279, 551, 9340, 450, 776, 594, 15, 5147, 302, 534, 301, 20, 16, 22, 2050, 445, 1572, 378, 14057, 2223, 27354, 13143, 15, 7203, 450, 776, 85, 594, 8895, 1568, 290, 12349, 281, 13143, 17, 12193, 442, 39742, 281, 11990, 15, 13956, 7188, 15, 46359, 15, 49052, 15, 11100, 15, 7648, 15, 1237, 276, 3702, 15, 1237, 276, 8551, 18, 48630, 15, 5395, 12742, 327, 15, 1237, 276, 7719, 397, 5621, 11989, 17, 224, 205, 5122, 43, 91, 29, 6330, 205, 473, 91, 29, 6671, 14875, 450, 776, 85, 594, 205, 18073, 91, 29, 7802, 324, 2869, 3306

In [18]:
class NBMEData(torch.utils.data.Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        example = self.data.loc[idx]
        tokenized = tokenize_and_add_labels(self.tokenizer, example)

        input_ids = np.array(tokenized["input_ids"]) # for input BERT
        attention_mask = np.array(tokenized["attention_mask"]) # for input BERT
        labels = np.array(tokenized["labels"]) # for calculate loss and cv score

        offset_mapping = np.array(tokenized["offset_mapping"]) # for calculate cv score
        sequence_ids = np.array(tokenized["sequence_ids"]).astype("float16") # for calculate cv score
        
        return input_ids, attention_mask, labels, offset_mapping, sequence_ids

In [19]:
class NBMEModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.backbone = AutoModel.from_pretrained(MODEL_NAME)
        self.config = AutoConfig.from_pretrained(MODEL_NAME)
#         self.backbone = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
#         self.config = AutoConfig.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
        self.dropout = torch.nn.Dropout(p=0.2)
        self.classifier = torch.nn.Linear(self.config.hidden_size,self.config.hidden_size)
        self.fc2 = nn.Linear(512, 512)
        self.fc3 = nn.Linear(self.config.hidden_size, 1)
        
    def forward(self, input_ids, attention_mask):
        pooler_outputs = self.backbone(input_ids=input_ids, 
                                       attention_mask=attention_mask)[0]
        #logits = self.classifier(self.dropout(pooler_outputs)).squeeze(-1)
        logits = self.classifier(self.dropout(pooler_outputs)).squeeze(-1)
        logits = torch.nn.ReLU()(logits)
        #logits = self.fc2(self.dropout(logits))
        logits = self.fc3(self.dropout(logits)).squeeze(-1)
        return logits

In [20]:
#print(NBMEModel())

# Training

In [21]:
fold = 0
BATCH_SIZE = config['train_batch_size']
EPOCHS = 4
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

model = NBMEModel().to(DEVICE)
criterion = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr = 1e-5)

train = merged_data
#train = merged_data.loc[merged_data["fold"] != fold].reset_index(drop = True)
valid = merged_data.loc[merged_data["fold"] == fold].reset_index(drop = True)
train_ds = NBMEData(train, tokenizer)
valid_ds = NBMEData(valid, tokenizer)
train_dl = torch.utils.data.DataLoader(train_ds, batch_size = BATCH_SIZE, pin_memory = True, shuffle = True, drop_last = True)
valid_dl = torch.utils.data.DataLoader(valid_ds, batch_size = BATCH_SIZE * 2, pin_memory = True, shuffle = False, drop_last = False)

Some weights of the model checkpoint at ../input/robertalarge/RoBERTa-large-PM-M3-Voc/RoBERTa-large-PM-M3-Voc-hf were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [22]:
class AverageMeter(object):
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n = 1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def get_location_predictions(preds, offset_mapping, sequence_ids, test = False):
    all_predictions = []
    for pred, offsets, seq_ids in zip(preds, offset_mapping, sequence_ids):
        pred = sigmoid(pred)
        start_idx = None
        current_preds = []
        for p, o, s_id in zip(pred, offsets, seq_ids):
            if s_id is None or s_id == 0:
                continue
            if p > 0.5:
                if start_idx is None:
                    start_idx = o[0]
                end_idx = o[1]
            elif start_idx is not None:
                if test:
                    current_preds.append(f"{start_idx} {end_idx}")
                else:
                    current_preds.append((start_idx, end_idx))
                start_idx = None
        if test:
            all_predictions.append("; ".join(current_preds))
        else:
            all_predictions.append(current_preds)
    return all_predictions

def calculate_char_CV(predictions, offset_mapping, sequence_ids, labels):
    all_labels = []
    all_preds = []
    for preds, offsets, seq_ids, labels in zip(predictions, offset_mapping, sequence_ids, labels):
        num_chars = max(list(chain(*offsets)))
        char_labels = np.zeros((num_chars))
        for o, s_id, label in zip(offsets, seq_ids, labels):
            if s_id is None or s_id == 0:
                continue
            if int(label) == 1:
                char_labels[o[0]:o[1]] = 1
        char_preds = np.zeros((num_chars))
        for start_idx, end_idx in preds:
            char_preds[start_idx:end_idx] = 1
        all_labels.extend(char_labels)
        all_preds.extend(char_preds)
    results = precision_recall_fscore_support(all_labels, all_preds, average = "binary")
    return {
        "precision": results[0],
        "recall": results[1],
        "f1": results[2]
    }

def compute_metrics(p):
    predictions, y_true = p
    y_true = y_true.astype(int)
    y_pred = [
        [int(p > 0.5) for (p, l) in zip(pred, label) if l != -100]
        for pred, label in zip(predictions, y_true)
    ]
    y_true = [
        [l for l in label if l != -100] for label in y_true
    ]
    results = precision_recall_fscore_support(list(chain(*y_true)), list(chain(*y_pred)), average = "binary")
    return {
        "token_precision": results[0],
        "token_recall": results[1],
        "token_f1": results[2]
    }


In [23]:
def get_scheduler(optimizer, num_warmup_steps, num_training_steps):
    if config["scheduler"]=='linear':
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps
        )
    elif config["scheduler"]=='cosine':
        scheduler = get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps, num_cycles=config["num_cycles"]
        )
    return scheduler

num_training_steps = (len(train_ds) // (config['train_batch_size'] * config['grad_acc_steps'])) * config['total_epochs']
num_warmup_steps = int(num_training_steps * 0.01)
scheduler = get_scheduler(optimizer, num_warmup_steps, num_training_steps)

In [24]:
#EPOCHS=1

In [25]:
history = {"train": [], "valid": []}
best_loss = np.inf

for epoch in range(EPOCHS):
    #training
    model.train()
    train_loss = AverageMeter()
    pbar = tqdm(train_dl)
    optimizer.zero_grad()
    for batch_idx,batch in enumerate(pbar):
        #optimizer.zero_grad()
        input_ids = batch[0].to(DEVICE)
        attention_mask = batch[1].to(DEVICE)
        labels = batch[2].to(DEVICE)
        offset_mapping = batch[3]
        sequence_ids = batch[4]
        logits = model(input_ids, attention_mask)
        loss_fct = torch.nn.BCEWithLogitsLoss(reduction = "none")
        loss = loss_fct(logits, labels)
        loss = torch.masked_select(loss, labels > -1).mean() # we should calculate at "pn_history"; labels at "feature_text" are -100 < -1
        loss.backward()
        #optimizer.step()
        train_loss.update(val = loss.item(), n = len(input_ids))
        pbar.set_postfix(Loss = train_loss.avg)
        if batch_idx % config['grad_acc_steps'] == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad() 
    print(epoch, train_loss.avg)
    history["train"].append(train_loss.avg)

    #evaluation
    model.eval()
    valid_loss = AverageMeter()
    with torch.no_grad():
        for batch in tqdm(valid_dl):
            input_ids = batch[0].to(DEVICE)
            attention_mask = batch[1].to(DEVICE)
            labels = batch[2].to(DEVICE)
            offset_mapping = batch[3]
            sequence_ids = batch[4]
            logits = model(input_ids, attention_mask)
            loss_fct = torch.nn.BCEWithLogitsLoss(reduction = "none")
            loss = loss_fct(logits, labels)
            loss = torch.masked_select(loss, labels > -1).mean()
            valid_loss.update(val = loss.item(), n = len(input_ids))
            pbar.set_postfix(Loss = valid_loss.avg)
    print(epoch, valid_loss.avg)
    history["valid"].append(valid_loss.avg)

    # save model
    if valid_loss.avg < best_loss:
        best_loss = valid_loss.avg
        torch.save(model.state_dict(), "nbme.pth")



  0%|          | 0/3794 [00:00<?, ?it/s]

0 0.06418552250393438


  0%|          | 0/380 [00:00<?, ?it/s]

0 0.013334528253765594


  0%|          | 0/3794 [00:00<?, ?it/s]

1 0.013242511316439618


  0%|          | 0/380 [00:00<?, ?it/s]

1 0.009096885861337565


  0%|          | 0/3794 [00:00<?, ?it/s]

2 0.009767354627538805


  0%|          | 0/380 [00:00<?, ?it/s]

2 0.007724853760961336


  0%|          | 0/3794 [00:00<?, ?it/s]

3 0.008114975370931648


  0%|          | 0/380 [00:00<?, ?it/s]

3 0.00712324957490791


In [26]:
model.load_state_dict(torch.load("nbme.pth", map_location = DEVICE))

model.eval()
preds = []
offsets = []
seq_ids = []
lbls = []
with torch.no_grad():
    for batch in tqdm(valid_dl):
        input_ids = batch[0].to(DEVICE)
        attention_mask = batch[1].to(DEVICE)
        labels = batch[2].to(DEVICE)
        offset_mapping = batch[3]
        sequence_ids = batch[4]
        logits = model(input_ids, attention_mask)
        preds.append(logits.cpu().numpy())
        offsets.append(offset_mapping.numpy())
        seq_ids.append(sequence_ids.numpy())
        lbls.append(labels.cpu().numpy())
preds = np.concatenate(preds, axis = 0)
offsets = np.concatenate(offsets, axis = 0)
seq_ids = np.concatenate(seq_ids, axis = 0)
lbls = np.concatenate(lbls, axis = 0)
location_preds = get_location_predictions(preds, offsets, seq_ids, test = False)
score = calculate_char_CV(location_preds, offsets, seq_ids, lbls)
print(score)


  0%|          | 0/380 [00:00<?, ?it/s]

{'precision': 0.8083626902442955, 'recall': 0.9548001087843351, 'f1': 0.8755003054824751}


# Inference

In [27]:
ROOT = "../input/nbme-score-clinical-patient-notes"
def create_test_df():
    feats = pd.read_csv(f"{ROOT}/features.csv")
    notes = pd.read_csv(f"{ROOT}/patient_notes.csv")
    test = pd.read_csv(f"{ROOT}/test.csv")

    merged = test.merge(notes, how = "left")
    merged = merged.merge(feats, how = "left")

    def process_feature_text(text):
        return text.replace('-OR-',' or ').replace('-',' ')
    def clean_backslash_chars(text):
        return text.replace('\n',' ').replace('\s',' ').replace('\t',' ')
    
    merged["feature_text"] = [process_feature_text(x) for x in merged["feature_text"]]
    merged['pn_history']=merged['pn_history'].apply(lambda x: x.strip()).apply(clean_backslash_chars)
    
    print(merged.shape)
    return merged

In [28]:
class NBMETestData(torch.utils.data.Dataset):
    def __init__(self, data, tokenizer):
        self.data = data
        self.tokenizer = tokenizer
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        example = self.data.loc[idx]
        tokenized = self.tokenizer(
            example["feature_text"],
            example["pn_history"],
            truncation = "only_second",
            max_length = 416, ##max-length
            padding = "max_length",
            return_offsets_mapping = True
        )
        tokenized["sequence_ids"] = tokenized.sequence_ids()

        input_ids = np.array(tokenized["input_ids"])
        attention_mask = np.array(tokenized["attention_mask"])
        offset_mapping = np.array(tokenized["offset_mapping"])
        sequence_ids = np.array(tokenized["sequence_ids"]).astype("float16")

        return input_ids, attention_mask, offset_mapping, sequence_ids

In [29]:
test = create_test_df()
test_ds = NBMETestData(test, tokenizer)
test_dl = torch.utils.data.DataLoader(test_ds, batch_size = BATCH_SIZE * 2, pin_memory = True, shuffle = False, drop_last = False)

model.eval()
preds = []
offsets = []
seq_ids = []
with torch.no_grad():
    for batch in tqdm(test_dl):
        input_ids = batch[0].to(DEVICE)
        attention_mask = batch[1].to(DEVICE)
        offset_mapping = batch[2]
        sequence_ids = batch[3]
        logits = model(input_ids, attention_mask)
        preds.append(logits.cpu().numpy())
        offsets.append(offset_mapping.numpy())
        seq_ids.append(sequence_ids.numpy())

preds = np.concatenate(preds, axis = 0)
offsets = np.concatenate(offsets, axis = 0)
seq_ids = np.concatenate(seq_ids, axis = 0)




(5, 6)


  0%|          | 0/1 [00:00<?, ?it/s]

In [30]:
def post_process_spaces(target, text):
    target = np.copy(target)

    if len(text) > len(target):
        padding = np.zeros(len(text) - len(target))
        target = np.concatenate([target, padding])
    else:
        target = target[:len(text)]

    if text[0] == " ":
        target[0] = 0
    if text[-1] == " ":
        target[-1] = 0

    for i in range(1, len(text) - 1):
        if text[i] == " ":
            if target[i] and not target[i - 1]:  # space before
                target[i] = 0

            if target[i] and not target[i + 1]:  # space after
                target[i] = 0

            if target[i - 1] and target[i + 1]:
                target[i] = 1

    return target

In [31]:
# test['taget']=preds
# preds_pp = test.apply(lambda x: post_process_spaces(preds, x['pn_history']), 1)

In [32]:
location_preds = get_location_predictions(preds, offsets, seq_ids, test = True)
test["location"] = location_preds
test[["id", "location"]].to_csv("submission.csv", index = False)
pd.read_csv("submission.csv").head()

Unnamed: 0,id,location
0,00016_000,696 724
1,00016_001,668 693
2,00016_002,203 217
3,00016_003,70 91
4,00016_004,222 258
