In [1]:
!pip install seqeval



In [2]:
import pandas as pd
import numpy as np

import os

import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:

# verify GPU
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print(device)

cuda


## Convert to DataFrame for EDA

In [26]:
def convert_data(filepath):

  # Read the data from the text file
  with open(filepath, "r") as file:
      lines = file.readlines()

  # Define an empty list to store the data
  data = []

  # Iterate over each line in the file
  for line in lines:
      # Split the line by spaces
      parts = line.strip().split()

      # Check if the line has the expected number of elements
      if len(parts) == 9:
          # Extract the values from the line
          text_file_name = parts[0]
          sentence_line_number = int(parts[1])
          sentence_word_index = int(parts[2])
          sentence_seq = parts[3]
          start_token = int(parts[4])
          end_token = int(parts[5])
          original_word = parts[6]
          word = parts[7]
          label = parts[8]

          # Append the values as a tuple to the data list
          data.append((text_file_name, sentence_line_number, sentence_word_index, sentence_seq,
                      start_token, end_token, original_word, word, label))

  # Create a DataFrame from the data list with appropriate column names
  df = pd.DataFrame(data, columns=['text_file_name', 'sentence_line_number', 'sentence_word_index',
                                  'sentence_seq', 'start_token', 'end_token', 'original_word',
                                  'word', 'label'])

  return df


In [27]:
train_data_path = "/content/drive/MyDrive/266_final/data/Original_text/dataset1_train.txt"
test_data_path = "/content/drive/MyDrive/266_final/data/Original_text/dataset1_test.txt"

train = convert_data(train_data_path)
test = convert_data(test_data_path)

print(f"Length of train: {len(train)}")
print(f"Length of test: {len(test)}")


Length of train: 895141
Length of test: 585761


In [28]:
df = train.copy()
df['sentence_line_number'].nunique()

1053

In [29]:
df['label'].value_counts().sort_values()

label
I-Route           397
B-Duration        592
I-ADE             776
B-ADE             956
I-Duration       1034
I-Reason         3125
B-Reason         3791
I-Form           4173
B-Dosage         4221
I-Drug           4298
B-Route          5475
B-Frequency      6279
I-Strength       6617
B-Form           6647
B-Strength       6691
I-Dosage         8779
I-Frequency     13023
B-Drug          16222
O              802045
Name: count, dtype: int64

In [30]:
df[(df['word'] == 'ORDINAL') & (df['label'] == 'B-Dosage')]


Unnamed: 0,text_file_name,sentence_line_number,sentence_word_index,sentence_seq,start_token,end_token,original_word,word,label
2580,data/training_20180910/110727.txt,277,9,T396,12227,12228,3,ORDINAL,B-Dosage
3038,data/training_20180910/110727.txt,331,2,T167,14315,14316,1,ORDINAL,B-Dosage
3070,data/training_20180910/110727.txt,335,2,T87,14463,14464,1,ORDINAL,B-Dosage
3143,data/training_20180910/110727.txt,347,12,T264,14825,14828,One,ORDINAL,B-Dosage
3166,data/training_20180910/110727.txt,349,8,T269,14897,14902,Three,ORDINAL,B-Dosage
...,...,...,...,...,...,...,...,...,...
894405,data/training_20180910/100883.txt,125,0,T38,5767,5770,one,ORDINAL,B-Dosage
894903,data/training_20180910/100883.txt,188,3,T72,8349,8350,1,ORDINAL,B-Dosage
894959,data/training_20180910/100883.txt,194,3,T90,8571,8572,2,ORDINAL,B-Dosage
894968,data/training_20180910/100883.txt,195,3,T94,8607,8608,2,ORDINAL,B-Dosage


### Label labels

In [31]:
# Split labels based on whitespace and turn them into a list
labels = [i.split() for i in df['label'].values.tolist()]

# Check how many labels are there in the dataset
unique_labels = set()

for lb in labels:
  [unique_labels.add(i) for i in lb if i not in unique_labels]

print(unique_labels)

{'B-Form', 'I-Dosage', 'B-Reason', 'O', 'B-Duration', 'B-Drug', 'B-Frequency', 'B-Route', 'I-ADE', 'I-Frequency', 'B-Strength', 'I-Strength', 'I-Drug', 'I-Form', 'I-Reason', 'B-ADE', 'B-Dosage', 'I-Duration', 'I-Route'}


In [32]:
# Map each label into its id representation and vice versa
labels_to_ids = {k: v for v, k in enumerate(sorted(unique_labels))}
ids_to_labels = {v: k for v, k in enumerate(sorted(unique_labels))}
print(labels_to_ids)

{'B-ADE': 0, 'B-Dosage': 1, 'B-Drug': 2, 'B-Duration': 3, 'B-Form': 4, 'B-Frequency': 5, 'B-Reason': 6, 'B-Route': 7, 'B-Strength': 8, 'I-ADE': 9, 'I-Dosage': 10, 'I-Drug': 11, 'I-Duration': 12, 'I-Form': 13, 'I-Frequency': 14, 'I-Reason': 15, 'I-Route': 16, 'I-Strength': 17, 'O': 18}


In [33]:
df[df['label'] == 'B-ADE']

Unnamed: 0,text_file_name,sentence_line_number,sentence_word_index,sentence_seq,start_token,end_token,original_word,word,label
637,data/training_20180910/110727.txt,68,3,T208,3011,3025,polyneuropathy,polyneuropathy,B-ADE
645,data/training_20180910/110727.txt,69,0,T377,3062,3070,infusion,infusion,B-ADE
700,data/training_20180910/110727.txt,74,1,T209,3237,3240,HTN,HTN,B-ADE
706,data/training_20180910/110727.txt,75,1,T210,3262,3265,DM2,DM0,B-ADE
732,data/training_20180910/110727.txt,78,1,T378,3365,3370,NAFLD,NAFLD,B-ADE
...,...,...,...,...,...,...,...,...,...
892895,data/training_20180910/118564.txt,156,6,T40,8092,8106,hallucinations,hallucinations,B-ADE
892897,data/training_20180910/118564.txt,157,1,T41,8111,8120,tachypnea,tachypnea,B-ADE
893013,data/training_20180910/118564.txt,168,10,T44,8731,8735,rash,rash,B-ADE
894424,data/training_20180910/100883.txt,128,0,T41,5871,5882,Hypotension,Hypotension,B-ADE


## Experimenting with formats for BERT

In [34]:
train_texts = []
train_labels = []

# Read the text file line by line
with open('/content/drive/MyDrive/266_final/data/Original_text/dataset1_train.txt', 'r', encoding='utf-8') as file:
    current_text = []  # To store tokens of the current text
    current_labels = []  # To store labels of the current text
    for line in file:
        if line.strip() == '':  # Empty line signifies end of text
            train_texts.append(current_text)
            train_labels.append(current_labels)
            current_text = []
            current_labels = []
        else:
            parts = line.strip().split()
            token = parts[-3]  # Token is second-to-last part
            label = parts[-1]   # Label is last part
            current_text.append(token)
            current_labels.append(label)

# Check the first few samples
print(train_texts[:5])
print(train_labels[:5])


[['Admission', 'Date', ':', '[', '*', '*', '2202', '-', '1', '-', '8', '*', '*', ']', 'Discharge', 'Date', ':', '[', '*', '*', '2202', '-', '2', '-', '1', '*', '*', ']'], ['Date', 'of', 'Birth', ':', '[', '*', '*', '2163', '-', '9', '-', '18', '*', '*', ']', 'Sex', ':', 'M'], ['Service', ':', 'MEDICINE'], ['Allergies', ':', 'Keflex', '/', 'Orencia', '/', 'Remicade'], ['Attending', ':', '[', '*', '*', 'First', 'Name3', '(', 'LF', ')', '2751', '*', '*', ']', 'Chief', 'Complaint', ':', 'L', 'leg', 'pain', 'and', 'erythema']]
[['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O'], ['O', 'O', 'B-Drug', 'O', 'B-Drug', 'O', 'B-Drug'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']]


### Not sure if this is the correct format. Let's look into another way to turn it into a sentence.

In [35]:
def formatted_df(df):
  df['sentence'] = df[[
      'text_file_name',
      'sentence_line_number',
      'original_word',
      'label']].groupby(
          ['text_file_name', 'sentence_line_number'])['original_word'].transform(lambda x: ' '.join(x))

  df['word_labels'] = df[[
      'text_file_name',
      'sentence_line_number',
      'original_word',
      'label']].groupby(
          ['text_file_name', 'sentence_line_number'])['label'].transform(lambda x: ','.join(x))

  return df

In [36]:
df = formatted_df(df)
df_test = formatted_df(test)

In [37]:
def sentence_level_data_fn(df):
  sentence_level_data = df[["text_file_name", "sentence_line_number", "sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
  return sentence_level_data

sentence_level_train = sentence_level_data_fn(df)
sentence_level_test = sentence_level_data_fn(df_test)

In [38]:
sentence_level_data = sentence_level_train[["text_file_name", "sentence_line_number", "sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)
sentence_level_test = sentence_level_test[["text_file_name","sentence_line_number",  "sentence", "word_labels"]].drop_duplicates().reset_index(drop=True)

sentence_level_data.head()

Unnamed: 0,text_file_name,sentence_line_number,sentence,word_labels
0,data/training_20180910/110727.txt,1,Admission Date : [ * * 2202 - 1 - 8 * * ] Disc...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
1,data/training_20180910/110727.txt,3,Date of Birth : [ * * 2163 - 9 - 18 * * ] Sex : M,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
2,data/training_20180910/110727.txt,5,Service : MEDICINE,"O,O,O"
3,data/training_20180910/110727.txt,7,Allergies :,"O,O"
4,data/training_20180910/110727.txt,8,Keflex / Orencia / Remicade,"B-Drug,O,B-Drug,O,B-Drug"


In [39]:
len(sentence_level_data)

83321

In [40]:
sentence_level_data.iloc[7020].sentence

'your chemistries .'

In [41]:
sentence_level_data.iloc[7020].word_labels

'O,O,O'

In [42]:
print(sentence_level_data.iloc[21].sentence)
print(sentence_level_data.iloc[21].word_labels)

fx on x - ray . This was thought to be a psoriatic arthritis flare ,
O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O


In [47]:
sentence_level_data

Unnamed: 0,text_file_name,sentence_line_number,sentence,word_labels
0,data/training_20180910/110727.txt,1,Admission Date : [ * * 2202 - 1 - 8 * * ] Disc...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
1,data/training_20180910/110727.txt,3,Date of Birth : [ * * 2163 - 9 - 18 * * ] Sex : M,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
2,data/training_20180910/110727.txt,5,Service : MEDICINE,"O,O,O"
3,data/training_20180910/110727.txt,7,Allergies :,"O,O"
4,data/training_20180910/110727.txt,8,Keflex / Orencia / Remicade,"B-Drug,O,B-Drug,O,B-Drug"
...,...,...,...,...
83316,data/training_20180910/100883.txt,219,Dictated By : [ * * Last Name ( NamePattern1 )...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
83317,data/training_20180910/100883.txt,220,MEDQUIST36,O
83318,data/training_20180910/100883.txt,221,D : [ * * 2102 - 6 - 3 * * ] 13 : 37 : 57,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"
83319,data/training_20180910/100883.txt,222,T : [ * * 2102 - 6 - 3 * * ] 15 : 11 : 44,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"


#### Some sentences are really short. *Try* concatenating sentence to get longer inputs

In [43]:
# df = sentence_level_data.sort_values(by=['text_file_name', 'sentence_line_number'])
# df_test = sentence_level_test.sort_values(by=['text_file_name', 'sentence_line_number'])

# # Function to concatenate up to 5 consecutive sentences and labels
# def concatenate_sentences(group):
#     """
#     This function takes a group of sentences and labels from a dataframe
#     and concatenates up to 5 consecutive sentences and t
#     heir corresponding labels into a single string.
#     Args:
#         group (Datafram): A group of sentences and labels from a
#         DataFrame grouped by a column (e.g., text_file_name).
#     Returns:
#         pandas.DataFrame: A DataFrame containing the
#         original text_file_name, a list of
#         concatenated sentences, and a
#         list of concatenated labels.
#     """
#     concatenated_sentences = []
#     concatenated_labels = []

#     for i in range(len(group)):
#         # Select up to 5 consecutive sentences starting from the current one
#         sentences_to_concat = group['sentence'].iloc[i:i+5].tolist()
#         labels_to_concat = group['word_labels'].iloc[i:i+5].tolist()

#         # Concatenate selected sentences and labels
#         concatenated_sentence = ' '.join(sentences_to_concat)
#         concatenated_label = ','.join(labels_to_concat)

#         concatenated_sentences.append(concatenated_sentence)
#         concatenated_labels.append(concatenated_label)

#     # Return a DataFrame for the concatenated sentences and labels
#     return pd.DataFrame({
#         'text_file_name': group['text_file_name'].iloc[0],
#         'concatenated_sentence': concatenated_sentences,
#         'concatenated_labels': concatenated_labels
#     })

# # Step 2: Apply the concatenation function to each group and combine the results
# concatenated_df = pd.concat([concatenate_sentences(group) for _, group in df.groupby('text_file_name')])
# concatenated_test = pd.concat([concatenate_sentences(group) for _, group in df_test.groupby('text_file_name')])

# # Reset index of the resulting DataFrame
# concatenated_df = concatenated_df.reset_index(drop=True)

# df = sentence_level_data.sort_values(by=['text_file_name', 'sentence_line_number'])
# df_test = sentence_level_test.sort_values(by=['text_file_name', 'sentence_line_number'])# concatenated_df.head()

In [49]:
df = sentence_level_data.sort_values(by=['text_file_name', 'sentence_line_number'])
df_test = sentence_level_test.sort_values(by=['text_file_name', 'sentence_line_number'])

def cs(dataframe):
    processed_data_list = []

    # Group by 'text_file_name'
    grouped = dataframe.groupby("text_file_name")

    for name, group in grouped:
        sentences = group['sentence'].tolist()
        # Labels are already strings, so we take them as is
        labels = group['word_labels'].tolist()

        for i in range(0, len(sentences), 5):
            end_index = min(i + 5, len(sentences))
            current_batch = sentences[i:end_index]
            current_labels = labels[i:end_index]

            concatenated_sentence = " ".join(current_batch)
            # Concatenate labels as they are, assuming they're correctly formatted strings
            concatenated_label = ",".join(current_labels)

            processed_data_list.append({
                "text_file_name": name,
                "sentences": concatenated_sentence,
                "labels": concatenated_label
            })

    processed_data = pd.DataFrame(processed_data_list)
    return processed_data


# Correcting the function to properly handle labels
concatenated_df = cs(df)
concatenated_test= cs(df_test)

concatenated_df



Unnamed: 0,text_file_name,sentences,labels
0,data/training_20180910/100035.txt,Admission Date : [ * * 2115 - 2 - 22 * * ] Dis...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
1,data/training_20180910/100035.txt,Attending : [ * * First Name3 ( LF ) 4891 * * ...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
2,data/training_20180910/100035.txt,Removal of chest tubes placed at an outside ho...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
3,data/training_20180910/100035.txt,to an OSH with dyspnea now admitted to the MIC...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
4,data/training_20180910/100035.txt,"and was intubated . He received epinephrine , ...","O,O,O,O,O,O,B-Drug,O,B-Drug,O,B-Drug,O,O,B-Dru..."
...,...,...,...
16783,data/training_20180910/198406.txt,# Contact : [ * * Name ( NI ) * * ] [ * * Tele...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
16784,data/training_20180910/198406.txt,"4 . Ergocalciferol 50 , 000 unit weekly 5 . Ri...","O,O,B-Drug,B-Dosage,I-Dosage,I-Dosage,I-Dosage..."
16785,data/training_20180910/198406.txt,9 . Acetaminophen 1000 mg PO Q6H 10 . Mirtazap...,"O,O,B-Drug,B-Strength,I-Strength,B-Route,B-Fre..."
16786,data/training_20180910/198406.txt,Expired Discharge Diagnosis : Chief cause of d...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O"


In [26]:
# concatenated_df['labels']= concatenated_df['labels'].map(lambda x: ','.join(map(str, x)))

In [35]:
# concatenated_df_with_labels.drop_duplicates(subset=['sentences'], inplace=True)
# concatenated_df_with_labels.drop_duplicates(subset=['sentences'], inplace=True)

In [50]:
# make sure test is formatted same way
concatenated_df.iloc[301].sentences

"could be due to patient's liver dysfunction/third spacing from CHF . If cholecystitis is of clinical concern , HIDA scan can be performed provided the total bilirubin is not elevated . 3 ) Hyperdense renal cortex in left lower quadrant transplanted kidney . Findings are most likely due to chronic rejection or"

In [51]:
concatenated_df.iloc[301].labels

'O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O'

In [57]:
concatenated_df.iloc[16765].sentences

'evidence of acute focal pneumonia . Brief Hospital Course : [ * * Age over 90 * * ] year old female with past medical history significant for CAD , critical AS , and radiation proctitis presenting after a fall with a humeral fracture and hypotension following opiates .'

In [58]:
concatenated_df.iloc[16765].labels

'O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-ADE,I-ADE,O,B-ADE,O,B-Drug,O'

In [54]:
print(concatenated_df.iloc[21].sentences)
print(concatenated_df.iloc[21].labels)

Recommend clinical correlation . . CTH : My read , no acute bleed .
O,O,O,O,O,O,O,O,O,O,O,O,O,O


In [56]:
def contains_ade(labels):
    return any('ADE' in label for label in labels.split(','))

ade_mask = concatenated_df['labels'].apply(contains_ade)

# Use the mask to filter the DataFrame
ade_sentences_df = concatenated_df[ade_mask]

# Display the filtered DataFrame
ade_sentences_df

Unnamed: 0,text_file_name,sentences,labels
60,data/training_20180910/100035.txt,being treated for infection . Since no new inf...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
84,data/training_20180910/100039.txt,Right heart catheterization IR guided paracent...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
92,data/training_20180910/100039.txt,OTHER MEDICAL HISTORY : - Embolic stroke in [ ...,"O,O,O,O,O,B-Reason,I-Reason,O,O,O,O,O,O,O,O,O,..."
145,data/training_20180910/100039.txt,anthracycline - induced cardiomyopathy ( EF 15...,"B-Drug,O,O,B-ADE,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O..."
149,data/training_20180910/100039.txt,She continued to have mild - moderate abdomina...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
...,...,...,...
16647,data/training_20180910/196798.txt,. 3 ) Anuric renal failure : ATN likely from T...,"O,O,O,O,O,O,O,B-ADE,O,O,B-Drug,O,B-Drug,O,O,O,..."
16667,data/training_20180910/196798.txt,. 11 ) Rash : Patient noted to have morbillifo...,"O,O,O,B-ADE,O,O,O,O,O,B-ADE,I-ADE,O,O,O,O,O,O,..."
16699,data/training_20180910/197869.txt,flagyl and zosyn ) his WBC continues to rise w...,"B-Drug,O,B-Drug,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
16765,data/training_20180910/198406.txt,evidence of acute focal pneumonia . Brief Hosp...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."


In [36]:
# concatenated_df.to_csv('/content/drive/MyDrive/266_final/data/concatenated_df.csv', index=False)
# concatenated_test.to_csv('/content/drive/MyDrive/266_final/data/concatenated_test.csv', index=False)


#### Comment this out if you want concatenated dataset.

In [37]:
# data = sentence_level_data.copy()
# data_test = sentence_level_test.copy()

#### Comment this out if you want sentence level dataset.

In [60]:
data = concatenated_df.copy()
data_test = concatenated_test.copy()

data.rename(columns={'concatenated_sentence': 'sentence', 'concatenated_labels': 'word_labels'}, inplace=True)
data_test.rename(columns={'concatenated_sentence': 'sentence', 'concatenated_labels': 'word_labels'}, inplace=True)

In [40]:
# from collections import Counter
# import pandas as pd

# # Assuming `df` is your DataFrame

# # Step 1: Concatenate all labels into a single list
# all_labels = []
# for labels_str in df['word_labels']:
#     labels_list = labels_str.split(',')
#     all_labels.extend(labels_list)  # Combine lists

# # Step 2: Count unique labels
# label_counts = Counter(all_labels)

# # Display the counts of each unique label
# label_counts


#### Double check the dataset you want.

In [41]:
# data = data.head(10000)
# data_test = data_test.head(10000)

In [62]:
data_test.head()

Unnamed: 0,text_file_name,sentences,labels
0,data/test_data_Task2/100130.txt,Admission Date : [ * * 2109 - 7 - 21 * * ] Dis...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
1,data/test_data_Task2/100130.txt,with features of an oligodendroglioma who was ...,"O,O,O,O,O,O,O,O,O,B-Drug,O,O,B-Reason,I-Reason..."
2,data/test_data_Task2/100130.txt,the night before admission which rapidly sprea...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
3,data/test_data_Task2/100130.txt,PAST MEDICAL HISTORY : 1 . Hypercholesterolemi...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
4,data/test_data_Task2/100130.txt,previously on Decadron q.i.d . tapered over on...,"O,O,B-Drug,B-Frequency,O,O,O,B-Duration,I-Dura..."


### It looks reasonable, let's proceed with training

In [63]:
### LOAD IF NECESSARY

# from transformers import AutoModelForTokenClassification, AutoTokenizer

# # Directory where the model and tokenizer are saved
# directory = "/content/drive/MyDrive/266_final/bert_base_cased_sentence_level"


# tokenizer = AutoTokenizer.from_pretrained(directory)

# model = AutoModelForTokenClassification.from_pretrained(directory)

# print('Model and tokenizer have been loaded.')

In [89]:
import os
from transformers import AutoTokenizer, AutoModelForTokenClassification

model = "emilyalsentzer/Bio_ClinicalBERT"  # Bio_ClinicalBERT model identifier
tokenizer = AutoTokenizer.from_pretrained(model)
model = AutoModelForTokenClassification.from_pretrained(model, num_labels=len(labels_to_ids))

Some weights of BertForTokenClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [90]:
# # Load model directly
# from transformers import AutoTokenizer, AutoModelForMaskedLM

# tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext")
# model = AutoModelForMaskedLM.from_pretrained("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext")

In [91]:
MAX_LEN = 512
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
TEST_BATCH_SIZE = 32
EPOCHS = 3
LEARNING_RATE = 1e-05
MAX_GRAD_NORM = 10
# tokenizer = BertTokenizerFast.from_pretrained('bert-base-cased')


In [92]:
class dataset(Dataset):
  def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len

  def __getitem__(self, index):
        # step 1: get the sentence and word labels
        sentence = self.data.sentences[index].strip().split()
        word_labels = self.data.labels[index].split(",")
        # sentence_id = self.data.iloc[index]['sentence_id']

        # step 2: use tokenizer to encode sentence (includes padding/truncation up to max length)
        # BertTokenizerFast provides a handy "return_offsets_mapping" functionality for individual tokens
        encoding = self.tokenizer(sentence,
                             return_offsets_mapping=True,
                             padding='max_length',
                             is_split_into_words=True,
                             truncation=True,
                             max_length=self.max_len)

        # step 3: create token labels only for first word pieces of each tokenized word
        labels = [labels_to_ids[label] for label in word_labels]
        # code based on https://huggingface.co/transformers/custom_datasets.html#tok-ner
        # create an empty array of -100 of length max_length
        encoded_labels = np.ones(len(encoding["offset_mapping"]), dtype=int) * -100

        # set only labels whose first offset position is 0 and the second is not 0
        i = 0
        for idx, mapping in enumerate(encoding["offset_mapping"]):
          if mapping[0] == 0 and mapping[1] != 0:
            # overwrite label
            encoded_labels[idx] = labels[i]
            i += 1

        # step 4: turn everything into PyTorch tensors
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        item['labels'] = torch.as_tensor(encoded_labels)
        # item['sentence_id'] = sentence_id

        return item

  def __len__(self):
        return self.len

In [93]:
# data[data['text_file_name']=='data/training_20180910/100035.txt']

In [94]:
# train_dataset.sort_values(by='sentence_id', inplace=True)

In [95]:
# from collections import Counter
# import pandas as pd

# # Assuming `df` is your DataFrame

# # Step 1: Concatenate all labels into a single list
# all_labels = []
# for labels_str in train_dataset['word_labels']:
#     labels_list = labels_str.split(',')
#     all_labels.extend(labels_list)  # Combine lists

# # Step 2: Count unique labels
# label_counts = Counter(all_labels)

# # Display the counts of each unique label
# label_counts


In [96]:
# train_dataset.iloc[0]['sentence']

In [97]:
# train_dataset.iloc[11244]['sentence']

In [98]:
# train_size = 0.8
# train_dataset = data.sample(frac=train_size,random_state=200, shuffle=True)
# val_dataset = data.drop(train_dataset.index).reset_index(drop=True)
# train_dataset = train_dataset.reset_index(drop=True)


# train_dataset.to_csv('/content/drive/MyDrive/266_final/data/train_dataset.csv', index=False)
# val_dataset.to_csv('/content/drive/MyDrive/266_final/data/val_dataset.csv', index=False)
# data_test.to_csv('/content/drive/MyDrive/266_final/data/test_dataset.csv', index=False)


In [99]:
# len(data)

In [100]:
from sklearn.model_selection import train_test_split

train_dataset, val_dataset = train_test_split(data, test_size=0.2, shuffle=False)


In [101]:
train_dataset.reset_index(drop=True, inplace=True)
val_dataset.reset_index(drop=True, inplace=True)

In [102]:
train_dataset.head()

Unnamed: 0,text_file_name,sentences,labels
0,data/training_20180910/100035.txt,Admission Date : [ * * 2115 - 2 - 22 * * ] Dis...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
1,data/training_20180910/100035.txt,Attending : [ * * First Name3 ( LF ) 4891 * * ...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
2,data/training_20180910/100035.txt,Removal of chest tubes placed at an outside ho...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
3,data/training_20180910/100035.txt,to an OSH with dyspnea now admitted to the MIC...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
4,data/training_20180910/100035.txt,"and was intubated . He received epinephrine , ...","O,O,O,O,O,O,B-Drug,O,B-Drug,O,B-Drug,O,O,B-Dru..."


In [104]:
print("FULL Dataet: {}".format(data.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("VAL Dataset: {}".format(val_dataset.shape))
print("TEST Dataset: {}".format(data_test.shape))

data_test.reset_index(drop=True, inplace=True)

training_set = dataset(train_dataset, tokenizer, MAX_LEN)
val_set = dataset(val_dataset, tokenizer, MAX_LEN)
test_set = dataset(data_test, tokenizer, MAX_LEN)

FULL Dataet: (16788, 3)
TRAIN Dataset: (13430, 3)
VAL Dataset: (3358, 3)
TEST Dataset: (11146, 3)


In [106]:
for token, label in zip(tokenizer.convert_ids_to_tokens(training_set[3]["input_ids"]), training_set[3]["labels"]):
  print('{0:10}  {1}'.format(token, label))

[CLS]       -100
to          18
an          18
o           18
##sh        -100
with        18
d           18
##ys        -100
##p         -100
##nea       -100
now         18
admitted    18
to          18
the         18
mi          18
##cu        -100
after       18
p           18
##ea        -100
arrest      18
x           18
##2         -100
.           18
the         18
patient     18
initially   18
presented   18
to          18
l           18
##gh        -100
ed          18
with        18
h           18
##y         -100
##pox       -100
##em        -100
##ic        -100
respiratory  18
distress    18
.           18
while       18
at          18
the         18
o           18
##sh        -100
,           18
he          18
received    18
c           2
##t         -100
##x         -100
,           18
a           2
##zi        -100
##th        -100
##rom       -100
##y         -100
##cin       -100
,           18
s           7
##c         -100
e           2
##pine      -100
##ph        

In [107]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

val_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': TEST_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }


training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(val_set, **val_params)
test_loader = DataLoader(test_set, **test_params)


In [108]:
# model = BertForTokenClassification.from_pretrained('bert-base-cased', num_labels=len(labels_to_ids))
model.to(device)




BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [109]:
inputs = training_set[2]
input_ids = inputs["input_ids"].unsqueeze(0)
attention_mask = inputs["attention_mask"].unsqueeze(0)
labels = inputs["labels"].unsqueeze(0)

input_ids = input_ids.to(device)
attention_mask = attention_mask.to(device)
labels = labels.to(device)

outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
initial_loss = outputs[0]
initial_loss

tensor(3.1042, device='cuda:0', grad_fn=<NllLossBackward0>)

In [110]:
tr_logits = outputs[1]
tr_logits.shape

torch.Size([1, 512, 19])

In [111]:
# !pip install transformers==3.5.1

import torch.nn.functional as F


def focal_loss(logits, labels, alpha=0.25, gamma=2.0, ignore_index=-100):
    """
    logits: [batch_size, seq_len, num_labels] - model predictions
    labels: [batch_size, seq_len] - ground truth labels
    """
    # Calculate Cross Entropy Loss without reduction
    ce_loss = F.cross_entropy(logits.view(-1, logits.size(-1)), labels.view(-1), reduction='none', ignore_index=ignore_index)

    # Get the predictions
    pred_probs = F.softmax(logits.view(-1, logits.size(-1)), dim=-1)
    pred_class = labels.view(-1)

    # focusing parameter
    gamma = gamma

    # Filter out 'ignore_index' labels
    filtered = labels.view(-1) != ignore_index

    # Calculate focal loss
    ce_loss_filtered = ce_loss[filtered]
    pred_probs_filtered = pred_probs[filtered]
    pred_class_filtered = pred_class[filtered]

    # Construct the loss
    pt = pred_probs_filtered.gather(1, pred_class_filtered.unsqueeze(-1)).squeeze()
    loss = ((1 - pt) ** gamma * ce_loss_filtered).mean()  # mean over the batch

    return loss



In [112]:
optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

In [113]:
def train(epoch):
    tr_loss, tr_accuracy = 0, 0
    nb_tr_examples, nb_tr_steps = 0, 0
    tr_preds, tr_labels = [], []

    model.train()

    for idx, batch in enumerate(training_loader):
        ids = batch['input_ids'].to(device, dtype=torch.long)
        mask = batch['attention_mask'].to(device, dtype=torch.long)
        labels = batch['labels'].to(device, dtype=torch.long)
    #     global_attention_mask = torch.zeros(
    # input_ids.shape, dtype=torch.long, device=input_ids.device

        # Perform a forward pass to get the logits
        outputs = model(input_ids=ids, attention_mask=mask)

        # Calculate the loss using focal_loss function directly
        loss = focal_loss(outputs.logits, labels)  # Use the focal_loss function here directly

        tr_loss += loss.item()
        nb_tr_steps += 1
        nb_tr_examples += labels.size(0)

        if idx % 100 == 0:
            loss_step = tr_loss / nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")

        # Decoding the logits to compute accuracy
        active_logits = outputs.logits.view(-1, model.num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1)

        # Ignoring the predictions of the padding tokens
        active_accuracy = labels.view(-1) != -100
        active_labels = torch.where(active_accuracy, labels.view(-1), torch.tensor(-100).type_as(labels))

        labels = torch.masked_select(active_labels, active_accuracy)
        predictions = torch.masked_select(flattened_predictions, active_accuracy)

        tr_labels.extend(labels.cpu().numpy())
        tr_preds.extend(predictions.cpu().numpy())

        tmp_tr_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
        tr_accuracy += tmp_tr_accuracy

        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=MAX_GRAD_NORM)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")


In [114]:
for epoch in range(EPOCHS):
    print(f"Training epoch: {epoch + 1}")
    train(epoch)

Training epoch: 1
Training loss per 100 training steps: 2.8425018787384033
Training loss per 100 training steps: 0.6463727346704443
Training loss per 100 training steps: 0.42277706415735333
Training loss per 100 training steps: 0.3163785307530462
Training loss per 100 training steps: 0.26028489972496094
Training loss per 100 training steps: 0.2249096211679991
Training loss per 100 training steps: 0.20096153629211144
Training loss per 100 training steps: 0.18109726229401574
Training loss per 100 training steps: 0.16469221742797996
Training loss per 100 training steps: 0.15245983330308413
Training loss per 100 training steps: 0.14233275261676726
Training loss per 100 training steps: 0.1341221071177297
Training loss per 100 training steps: 0.12790781445224514
Training loss per 100 training steps: 0.12164558449000695
Training loss per 100 training steps: 0.116221555431198
Training loss per 100 training steps: 0.11129661202454187
Training loss per 100 training steps: 0.10676416370509313
Tra

In [115]:
def valid(model, testing_loader):
    model.eval()
    eval_loss, eval_accuracy = 0, 0

    nb_eval_examples, nb_eval_steps = 0, 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(testing_loader):
            ids = batch['input_ids'].to(device, dtype=torch.long)
            mask = batch['attention_mask'].to(device, dtype=torch.long)
            labels = batch['labels'].to(device, dtype=torch.long)
            # sentence_ids = batch['sentence_id']

            outputs = model(input_ids=ids, attention_mask=mask)
            loss = focal_loss(outputs.logits, labels)  # Use the same focal_loss function

            eval_loss += loss.item()
            nb_eval_steps += 1
            nb_eval_examples += labels.size(0)

            if idx % 100 == 0:
                loss_step = eval_loss / nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # Decoding the logits to compute accuracy
            active_logits = outputs.logits.view(-1, model.num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1)

            # Ignoring the predictions of the padding tokens
            active_accuracy = labels.view(-1) != -100
            active_labels = torch.where(active_accuracy, labels.view(-1), torch.tensor(-100).type_as(labels))

            labels = torch.masked_select(active_labels, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)

            eval_labels.extend(labels.cpu().numpy())
            eval_preds.extend(predictions.cpu().numpy())

            tmp_eval_accuracy = accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy

            # sentence_ids_list.extend(sentence_ids)



    labels = [ids_to_labels[id.item()] for id in eval_labels]
    predictions = [ids_to_labels[id.item()] for id in eval_preds]

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps
    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")

    return labels, predictions




In [116]:
labels, predictions = valid(model, val_loader)

Validation loss per 100 evaluation steps: 0.005812683142721653
Validation loss per 100 evaluation steps: 0.029902314047198574
Validation loss per 100 evaluation steps: 0.029637827443586992
Validation loss per 100 evaluation steps: 0.030716734786596138
Validation loss per 100 evaluation steps: 0.03146203292717865
Validation loss per 100 evaluation steps: 0.02985226105792214
Validation loss per 100 evaluation steps: 0.02920594221620661
Validation loss per 100 evaluation steps: 0.028510840986541067
Validation loss per 100 evaluation steps: 0.029072611333780227
Validation loss per 100 evaluation steps: 0.030129624426010856
Validation loss per 100 evaluation steps: 0.030498732567249026
Validation loss per 100 evaluation steps: 0.031316674119287215
Validation loss per 100 evaluation steps: 0.031505375459210276
Validation loss per 100 evaluation steps: 0.0317758949351076
Validation loss per 100 evaluation steps: 0.03205072348063346
Validation loss per 100 evaluation steps: 0.03214698020615042

In [117]:
from seqeval.metrics import classification_report

print(classification_report([labels], [predictions]))

              precision    recall  f1-score   support

         ADE       0.32      0.45      0.37       165
      Dosage       0.85      0.91      0.88       782
        Drug       0.88      0.93      0.91      3216
    Duration       0.66      0.79      0.72       120
        Form       0.87      0.88      0.88      1269
   Frequency       0.79      0.83      0.81      1270
      Reason       0.56      0.68      0.61       698
       Route       0.90      0.93      0.92      1143
    Strength       0.93      0.93      0.93      1341

   micro avg       0.83      0.88      0.86     10004
   macro avg       0.75      0.81      0.78     10004
weighted avg       0.84      0.88      0.86     10004



## Evaluation

 - convert val set back to word level

In [154]:
# split the sentence by space and expand into separate columns
split_df_label = val_dataset['labels'].str.split(',', expand=True)

# Stack the split words into a single column, dropping NaN values automatically
val_labels = split_df_label.stack().reset_index(level=1, drop=True).reset_index()

# Rename the columns
val_labels.columns = ['original_row_index', 'label']

val_labels.head()


Unnamed: 0,original_row_index,label
0,0,O
1,0,O
2,0,O
3,0,O
4,0,O


In [161]:
len(val_dataset)

3358

In [157]:
# split the sentence by space and expand into separate columns
split_dfl = val_dataset['sentences'].str.split(',', expand=True)

# Stack the split words into a single column, dropping NaN values automatically
val_sent = split_df.stack().reset_index(level=1, drop=True).reset_index()

# Rename the columns
val_sent.columns = ['original_row_index', 'word']

val_sent.head()


Unnamed: 0,original_row_index,word
0,0,Admission
1,0,Date
2,0,:
3,0,[
4,0,*


In [160]:
val_sent['preds'] = predictions
val_sent['labels_pp'] = labels

ValueError: Length of values (176679) does not match length of index (585761)

In [141]:
# merged = pd.concat([val_words, val_labels], axis=1)


Unnamed: 0,original_row_index,word,original_row_index.1,label
0,0,Admission,0,O
1,0,Date,0,O
2,0,:,0,O
3,0,[,0,O
4,0,*,0,O


In [142]:
# merged['preds'] = test_predictions
# merged['labels_pp'] = test_labels

In [152]:
# merged[merged['preds'] == 'B-Drug']

In [153]:
# test['sentence'].iloc[0]

In [121]:
test_labels, test_predictions = valid(model, test_loader)

Validation loss per 100 evaluation steps: 0.010770902968943119
Validation loss per 100 evaluation steps: 0.02729300238505596
Validation loss per 100 evaluation steps: 0.02737924360336541
Validation loss per 100 evaluation steps: 0.026840517346119117
Validation Loss: 0.026870437326005803
Validation Accuracy: 0.9811775172745457


In [123]:
from seqeval.metrics import classification_report

print(classification_report([test_labels], [test_predictions]))

              precision    recall  f1-score   support

         ADE       0.42      0.50      0.46       634
      Dosage       0.89      0.93      0.91      2703
        Drug       0.91      0.94      0.93     10594
    Duration       0.64      0.73      0.69       385
        Form       0.91      0.92      0.91      4373
   Frequency       0.83      0.86      0.84      4155
      Reason       0.54      0.60      0.57      2561
       Route       0.91      0.95      0.93      3513
    Strength       0.95      0.95      0.95      4237

   micro avg       0.86      0.89      0.88     33155
   macro avg       0.78      0.82      0.80     33155
weighted avg       0.86      0.89      0.88     33155



In [71]:
sentence_predictions = []


start_idx = 0
for index, row in data_test.iterrows():
    # Number of labels in the sentence
    num_labels = len(row['word_labels'].split(','))
    # Segment predictions corresponding to the current sentence
    sentence_pred = predictions[start_idx:start_idx + num_labels]
    # Append to our list
    sentence_predictions.append(','.join(sentence_pred))
    # Update start index for next iteration
    start_idx += num_labels

# Assigning segmented predictions back to DataFrame
data_test['predicted_labels'] = sentence_predictions

data_test.head()

Unnamed: 0,sentence_id,text_file_name,sentence,word_labels,predicted_labels
0,sent_0,data/test_data_Task2/100130.txt,Admission Date : [ * * 2109 - 7 - 21 * * ] Dis...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
1,sent_1,data/test_data_Task2/100130.txt,with features of an oligodendroglioma who was ...,"O,O,O,O,O,O,O,O,O,B-Drug,O,O,B-Reason,I-Reason...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-Re..."
2,sent_2,data/test_data_Task2/100130.txt,the night before admission which rapidly sprea...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
3,sent_3,data/test_data_Task2/100130.txt,PAST MEDICAL HISTORY : 1 . Hypercholesterolemi...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,...","O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-Do..."
4,sent_4,data/test_data_Task2/100130.txt,previously on Decadron q.i.d . tapered over on...,"O,O,B-Drug,B-Frequency,O,O,O,B-Duration,I-Dura...","O,O,B-Drug,O,B-Drug,O,O,O,O,O,O,O,O,O,O,O,O,O,..."


In [83]:
from collections import defaultdict

# Adjust the dictionary to hold tuples of (word, sentence_id)
misclassified_words_with_ids = defaultdict(list)

for _, row in data_test.iterrows():
    true_labels = row['word_labels'].split(',')
    pred_labels = row['predicted_labels'].split(',')
    words = row['sentence'].split()  # Assuming words are space-separated
    sentence_id = row['sentence_id']  # Capture the sentence ID

    # Ensure the lists are of the same length; this might need adjusting depending on your data's preprocessing
    assert len(true_labels) == len(pred_labels) == len(words), "Length mismatch between words and labels"

    for true_label, pred_label, word in zip(true_labels, pred_labels, words):
        if true_label != pred_label:  # Word is misclassified
            key = (true_label, pred_label)
            # Now append a tuple of (word, sentence_id)
            misclassified_words_with_ids[key].append((word, sentence_id))

# At this point, `misclassified_words_with_ids` contains information about misclassified words along with their sentence IDs


In [84]:
# Example: Print misclassified words for each label pair, along with sentence IDs
for label_pair, word_id_pairs in misclassified_words_with_ids.items():
    print(f"True label: {label_pair[0]}, Predicted label: {label_pair[1]}")
    for word, sentence_id in word_id_pairs:
        print(f"  Word: '{word}' in Sentence ID: {sentence_id}")
    print("\n")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  Word: '20' in Sentence ID: sent_10853


True label: B-Reason, Predicted label: I-Frequency
  Word: 'SCC' in Sentence ID: sent_249
  Word: 'diuresing' in Sentence ID: sent_311
  Word: 'edematous' in Sentence ID: sent_376
  Word: 'solid' in Sentence ID: sent_379
  Word: 'Clostridium' in Sentence ID: sent_380
  Word: 'pain.' in Sentence ID: sent_385
  Word: 'bloody' in Sentence ID: sent_910
  Word: 'constipation.' in Sentence ID: sent_1291
  Word: 'line' in Sentence ID: sent_2664
  Word: 'hypertension' in Sentence ID: sent_3205
  Word: 'constipation' in Sentence ID: sent_3396
  Word: 'Steroid' in Sentence ID: sent_3536
  Word: 'anxiety' in Sentence ID: sent_3550
  Word: 'wheezing' in Sentence ID: sent_3669
  Word: 'COPD' in Sentence ID: sent_4056
  Word: 'Fibromyalgia' in Sentence ID: sent_4062
  Word: 'atrial' in Sentence ID: sent_4627
  Word: 'dyspnea' in Sentence ID: sent_4871
  Word: 'aspiration' in Sentence ID: sent_5

In [85]:
from collections import defaultdict

# Dictionary to hold tuples of (word, sentence_id) for correct classifications
correctly_classified_words_with_ids = defaultdict(list)

for _, row in data_test.iterrows():
    true_labels = row['word_labels'].split(',')
    pred_labels = row['predicted_labels'].split(',')
    words = row['sentence'].split()  # Assuming words are space-separated
    sentence_id = row['sentence_id']  # Capture the sentence ID

    # Ensure the lists are of the same length; this might need adjusting depending on your data's preprocessing
    assert len(true_labels) == len(pred_labels) == len(words), "Length mismatch between words and labels"

    for true_label, pred_label, word in zip(true_labels, pred_labels, words):
        if true_label == pred_label:  # Word is correctly classified
            key = true_label  # Use true label as the key
            # Append a tuple of (word, sentence_id) for correct classifications
            correctly_classified_words_with_ids[key].append((word, sentence_id))


In [87]:
correctly_classified_words_with_ids

defaultdict(list,
            {'O': [('Admission', 'sent_0'),
              ('Date', 'sent_0'),
              (':', 'sent_0'),
              ('[', 'sent_0'),
              ('*', 'sent_0'),
              ('*', 'sent_0'),
              ('2109', 'sent_0'),
              ('-', 'sent_0'),
              ('7', 'sent_0'),
              ('-', 'sent_0'),
              ('21', 'sent_0'),
              ('*', 'sent_0'),
              ('*', 'sent_0'),
              (']', 'sent_0'),
              ('Discharge', 'sent_0'),
              ('Date', 'sent_0'),
              (':', 'sent_0'),
              ('[', 'sent_0'),
              ('*', 'sent_0'),
              ('*', 'sent_0'),
              ('2109', 'sent_0'),
              ('-', 'sent_0'),
              ('8', 'sent_0'),
              ('-', 'sent_0'),
              ('13', 'sent_0'),
              ('*', 'sent_0'),
              ('*', 'sent_0'),
              ('].', 'sent_0'),
              ('Date', 'sent_0'),
              ('of', 'sent_0'),
            

In [88]:
# # Example: Print correctly classified words for each label
# for label, word_id_pairs in correctly_classified_words_with_ids.items():
#     print(f"Label: {label}")
#     word_counts = defaultdict(int)
#     for word, sentence_id in word_id_pairs:
#         word_counts[word] += 1  # Count occurrences of each word

#     # Sort words by their frequency
#     sorted_words = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)

#     for word, count in sorted_words[:5]:  # Adjust number as needed
#         print(f"  Word: '{word}' appeared {count} times correctly")
#     print("\n")


In [89]:
# Assuming correctly_classified_words_with_ids is already populated from the previous step

# Example: Print correctly classified words for each label, including sentence IDs
for label, word_id_pairs in correctly_classified_words_with_ids.items():
    print(f"Label: {label}")

    # Instead of just counting words, store sentence IDs where they appear
    word_sentence_ids = defaultdict(list)
    for word, sentence_id in word_id_pairs:
        word_sentence_ids[word].append(sentence_id)  # Append sentence ID for each word

    # Sort words by their frequency (number of sentence IDs)
    sorted_words = sorted(word_sentence_ids.items(), key=lambda x: len(x[1]), reverse=True)

    for word, sentence_ids in sorted_words[:5]:  # Adjust number as needed
        unique_sentence_ids = set(sentence_ids)  # Remove potential duplicates
        print(f"  Word: '{word}' appeared in sentences: {', '.join(unique_sentence_ids)}")
    print("\n")


Label: O
  Word: '*' appeared in sentences: sent_8068, sent_1136, sent_1767, sent_10433, sent_2741, sent_4879, sent_260, sent_8211, sent_2587, sent_5733, sent_8141, sent_3483, sent_440, sent_5325, sent_9961, sent_546, sent_8708, sent_9304, sent_8278, sent_8031, sent_9486, sent_10202, sent_4622, sent_1964, sent_7914, sent_3627, sent_6159, sent_8689, sent_3017, sent_2169, sent_2934, sent_7919, sent_10341, sent_5584, sent_5785, sent_8590, sent_5408, sent_10645, sent_7393, sent_4236, sent_3228, sent_2335, sent_10446, sent_7656, sent_3797, sent_795, sent_733, sent_7638, sent_2600, sent_2766, sent_787, sent_5747, sent_8858, sent_10535, sent_2964, sent_2507, sent_2295, sent_7516, sent_4922, sent_9218, sent_1955, sent_7538, sent_5378, sent_4577, sent_6883, sent_8403, sent_2051, sent_2034, sent_8526, sent_3783, sent_8282, sent_1433, sent_4663, sent_10164, sent_5075, sent_2364, sent_5958, sent_6300, sent_9463, sent_2445, sent_1574, sent_11018, sent_5315, sent_10455, sent_1583, sent_9434, sent_57

In [97]:
list(data_test[data_test['sentence_id'] == 'sent_1349'].sentence)

['During initiation of her brain radiation treatments , patient had. intractable nausea , vomiting , headache , and hypertension , but. repeat CT head did not show increased edema . She continued on. IV dexamethasone 6 mg q6 hours , IV keppra 1000mg q12 hours , and. IV hydralazine for blood pressure control ( see below ) ..']

In [106]:
data_test[data_test['sentence_id'] == 'sent_1349'].predicted_labels.str.strip('')

1349    O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,...
Name: predicted_labels, dtype: object

In [98]:
list(data_test[data_test['sentence_id'] == 'sent_1349'].word_labels)

['O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,B-Route,B-Drug,B-Strength,I-Strength,B-Frequency,I-Frequency,O,B-Route,B-Drug,B-Strength,B-Frequency,I-Frequency,O,O,B-Route,B-Drug,O,B-Reason,I-Reason,I-Reason,O,O,O,O,O']

### Save

In [124]:
import os

directory = "/content/drive/MyDrive/266_final/clinical_bert_concat7.pt"

if not os.path.exists(directory):
    os.makedirs(directory)

tokenizer.save_vocabulary(directory)
model.save_pretrained(directory)

### Load

In [None]:
model.to(device)

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

Resources

### https://github.com/lcampillos/Medical-NER/blob/master/bert_ner.ipynb
### https://medium.com/analytics-vidhya/bio-tagged-text-to-original-text-99b05da6664

In [None]:
# Initialize lists to hold the restructured predictions and labels
# sentence_labels = []
# sentence_predictions = []

# # Initialize counters
# start_idx = 0

# # Loop through each row in the DataFrame
# for _, row in val_dataset.iterrows():
#     # Find out how many tokens are in the current sentence
#     num_tokens = len(row['word_labels'].split(','))

#     # Slice the flat list of labels and predictions to get the current sentence's portion
#     sentence_labels.append(labels[start_idx:start_idx + num_tokens])
#     sentence_predictions.append(predictions[start_idx:start_idx + num_tokens])

#     # Update the start index for the next sentence
#     start_idx += num_tokens

# # Now `sentence_labels` and `sentence_predictions` contain labels and predictions grouped by sentence


In [None]:
# sentence_labels

### Error Analysis

In [113]:
val_dataset.head()

Unnamed: 0,sentence_id,text_file_name,sentence,word_labels
0,sent_2,data/training_20180910/100035.txt,Removal of chest tubes placed at an outside ho...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
1,sent_3,data/training_20180910/100035.txt,to an OSH with dyspnea now admitted to the MIC...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
2,sent_5,data/training_20180910/100035.txt,started on an epi gtt for asthma and a cooling...,"O,O,O,B-Drug,B-Route,O,B-Reason,O,O,O,O,O,O,O,..."
3,sent_17,data/training_20180910/100035.txt,[ * * 2115 - 3 - 19 * * ] 04 : 45AM BLOOD Gluc...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
4,sent_18,data/training_20180910/100035.txt,Baso - 0 Atyps - 0 Metas - 0 Myelos - 0. .. CX...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."


In [132]:
# test['predictions'] = test_predictions
test['labels'] = test_labels

In [134]:
test[['label', 'labels']].head(60)

Unnamed: 0,label,labels
0,O,O
1,O,O
2,O,O
3,O,O
4,O,O
5,O,O
6,O,O
7,O,O
8,O,O
9,O,O


In [131]:
test['predictions'].value_counts()

predictions
O              523212
B-Drug          10888
I-Frequency     10093
I-Dosage         5849
B-Form           4312
B-Strength       4220
I-Strength       4144
B-Frequency      4059
B-Route          3642
I-Form           3182
B-Dosage         2772
I-Drug           2647
B-Reason         2599
I-Reason         1700
B-ADE             689
I-Duration        681
B-Duration        393
I-ADE             367
I-Route           312
Name: count, dtype: int64

In [135]:
data_test.head()

Unnamed: 0,text_file_name,sentences,labels
0,data/test_data_Task2/100130.txt,Admission Date : [ * * 2109 - 7 - 21 * * ] Dis...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
1,data/test_data_Task2/100130.txt,with features of an oligodendroglioma who was ...,"O,O,O,O,O,O,O,O,O,B-Drug,O,O,B-Reason,I-Reason..."
2,data/test_data_Task2/100130.txt,the night before admission which rapidly sprea...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
3,data/test_data_Task2/100130.txt,PAST MEDICAL HISTORY : 1 . Hypercholesterolemi...,"O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,O,..."
4,data/test_data_Task2/100130.txt,previously on Decadron q.i.d . tapered over on...,"O,O,B-Drug,B-Frequency,O,O,O,B-Duration,I-Dura..."


In [126]:
test.to_csv('/content/drive/MyDrive/266_final/data/concat7_preds.csv', index=False)