# Installation & Setup

In [1]:
#this cell only needs to be run once for every new run time
!pip install transformers[torch,sentencepiece] accelerate torch datasets -U --quiet

In [2]:
#important relevant modeling libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

# Code copied from https://huggingface.co/learn/nlp-course/chapter0/1?fw=pt.
import transformers

from datasets import Dataset
# Code copied from Jennifer and https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt
from datasets import load_dataset

from transformers import AutoModel, AutoTokenizer, AddedToken
# Code copied from: https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt and https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding


In [3]:
#copy this block over for all successive model iterations
import pandas as pd
import os
import json
import numpy as np
import ast

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
#CharBERT model
charbert_name = "imvladikon/charbert-bert-wiki"
charbert_model = AutoModel.from_pretrained(charbert_name)
charbert_tokenizer = AutoTokenizer.from_pretrained(charbert_name)

# Data processing for model specifically

In [5]:
# Changing max length to 512 as 512 is the maximium for BERT (copied from: https://huggingface.co/learn/nlp-course/)
max_length = 512

In [6]:
# Function to pad or truncate the key_timing data
def pad_or_truncate(array):
    if len(array) < max_length:
        # If the array is shorter than the desired length, pad it with zeros.
        # You can use np.pad to add zeros at the end of the array.
        return np.pad(array, (0, max_length - len(array)), 'constant', constant_values=(0.0))
    else:
        # If the array is longer than the desired length, truncate it.
        # You can use array slicing to keep only the first 'desired_length' elements.
        return array[:max_length]

# Function to convert a string to a Python list
def string_to_list(input_string):
    return ast.literal_eval(input_string)

In [7]:
######################## READ IN DATA ###################################################################################################
data_path = '/content/drive/My Drive/266 Assignments/266 Final Project'
files = os.listdir(data_path)
files = [x for x in files if '.csv' in x]

filt_df = pd.read_csv(os.path.join(data_path, files[files.index('cleaned_data.csv')]))

#do some data cleanup to ensure timing sequence is in right data format
filt_df['timing_sequence'] = filt_df['timing_sequence'].apply(string_to_list)
filt_df['timing_sequence'] = filt_df['timing_sequence'].apply(pad_or_truncate)


In [8]:
######################## IMPORT SPECIAL TOKENS ############################################################################################
json_file = os.path.join(data_path, "token_map.json")
with open(json_file, 'r') as json_file:
    charbert_token_map = json.load(json_file)

added_tokens = [AddedToken(token) for token in charbert_token_map.values()]

In [9]:
#split the data
# Split the data into training, validation, and test sets
X = filt_df[['key_sequence', 'timing_sequence']]
y = filt_df['diagnosis']

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# #convert series of lists to list
# X_train = list(X_train)
# X_val = list(X_val)
# X_test = list(X_test)

# y_train = y_train.to_list()
# y_val = y_val.to_list()
# y_test = y_test.to_list()



In [10]:
# Put data into a pandas dataframe to then load into a hugging face dataset object
# Code and idea copied from https://huggingface.co/docs/datasets/v1.11.0/loading_datasets.html
train_df = X_train
train_df['labels']=y_train
train_df.columns = ['key_seq', 'time_seq','labels']

# Code copied from above
# Made a df of our validation data
val_df = X_val
val_df['labels']=y_val
val_df.columns = ['key_seq', 'time_seq','labels']

test_df = X_test
test_df['labels']=y_test
test_df.columns = ['key_seq', 'time_seq','labels']

In [11]:
# Code copied from https://huggingface.co/docs/datasets/v1.11.0/loading_datasets.html
# Make a hugging face dataset object from the pandas df we just made
train_dataset = Dataset.from_pandas(train_df)

# Code copied from above
# Make a validation Dataset
val_dataset = Dataset.from_pandas(val_df)

# Code copied from above
# Make a test Dataset
test_dataset = Dataset.from_pandas(test_df)

In [12]:
# Code copied from https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt and prior code above (likely from BERT lesson notebooks/assignment)
# Code also copied from https://huggingface.co/learn/nlp-course/chapter2/2?fw=pt
# Create a tokenize function we'll use to tokenize the key sequences in the dataset
def tokenize_func(a):
  return charbert_tokenizer(
    a['key_seq'],
    # Changing padding to max_length, copied from https://huggingface.co/learn/nlp-course/chapter2/6?fw=pt
    # Getting rid of padding here (copying from https://huggingface.co/learn/nlp-course/chapter3/2?fw=pt).
    # It's more efficient to do padding at the batch level later on (use "dynamic padding") according to the above source
    # padding='max_length',
    # Trunction true should truncate to max length: Copied from https://huggingface.co/docs/transformers/main_classes/tokenizer
    truncation=True,
    # Testing a shorter max length based on https://huggingface.co/learn/nlp-course/chapter2/5?fw=pt
    # Getting rid of max length from here to, copying from: https://huggingface.co/learn/nlp-course/chapter3/2?fw=pt
    # Reinstating max length since got an error
    max_length=max_length,
    # Getting rid of return tensors so that this code runs! Gave an erron when it was here
    # return_tensors='pt'
    )

In [13]:
# Code copied from https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt
# Tokenize the key sequences which will add the results to the dataset
tokenized_train_dataset = train_dataset.map(tokenize_func, batched=True)

# Code copied from above
# Tokenize validation dataset
tokenized_val_dataset = val_dataset.map(tokenize_func, batched=True)

# Code copied from above
# Tokenize test dataset
tokenized_test_dataset = test_dataset.map(tokenize_func, batched=True)


Map:   0%|          | 0/2372 [00:00<?, ? examples/s]

Map:   0%|          | 0/508 [00:00<?, ? examples/s]

Map:   0%|          | 0/509 [00:00<?, ? examples/s]

In [14]:
#make data loaders
batch_size = 16

train_dataloader = DataLoader(tokenized_train_dataset, batch_size=batch_size)
val_dataloader = DataLoader(tokenized_val_dataset, batch_size=batch_size)
test_dataloader = DataLoader(tokenized_test_dataset, batch_size=batch_size)

# CharBERT Model + LSTM
Note on warning below: that ['classifier.bias', 'classifier.weight'] are newly initialized. From Jennifer's OH: The CharBert model we're using likely wasn't built to work with Hugging Face's loader. We're probably losing pre-training. Could try to load it in from git directly, but that will be difficult (3 year old repo) and we'd have to match their python version, etc. Can proceed with this as is, just be aware we're likely losing some (or all?) pre-training. Also skimmed: https://discuss.huggingface.co/t/is-some-weights-of-the-model-were-not-used-warning-normal-when-pre-trained-bert-only-by-mlm/5672

In [15]:
# Add the special tokens from charbert_token_map to the tokenizer's vocabulary
charbert_tokenizer.add_tokens(added_tokens)

26

In [16]:
# Additional features input size
additional_features_size = max_length

In [17]:
# Code copied from: https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt
# Pull in a classification version of the model
charbert_model = AutoModelForSequenceClassification.from_pretrained(charbert_name, num_labels=2)

# #must also add special tokens to the model
charbert_model.resize_token_embeddings(len(charbert_tokenizer))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at imvladikon/charbert-bert-wiki and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(28996, 768, padding_idx=0)

In [38]:
class CustomModel(nn.Module):
    def __init__(self, charbert_model, additional_features_size, num_labels=2):
        super(CustomModel, self).__init__()
        self.charbert_model = charbert_model
        self.additional_features_size = additional_features_size

        # Add LSTM layers for additional features
        self.lstm1 = nn.LSTM(input_size=additional_features_size, hidden_size=64, batch_first=True)
        self.lstm2 = nn.LSTM(input_size=64, hidden_size=32, batch_first=True)
        self.lstm3 = nn.LSTM(input_size=32, hidden_size=16, batch_first=True)


        # Output layer for binary classification
        self.classifier = nn.Linear(768 + 16, num_labels)  # 768 is the size of the CharBERT embeddings

    def forward(self, input_ids, attention_mask, additional_features):
        # CharBERT forward pass
        charbert_outputs = self.charbert_model(input_ids=input_ids, attention_mask=attention_mask)
        charbert_embeddings = charbert_outputs.logits # CharBERT has no last_hidden_state

        # LSTM forward pass for additional features
        lstm_outputs1, _ = self.lstm1(additional_features)
        lstm_outputs2, _ = self.lstm2(lstm_outputs1)
        lstm_outputs3, _ = self.lstm3(lstm_outputs2)
        # last_lstm_output = lstm_outputs[:, -1, :].unsqueeze(1)

        print(charbert_embeddings.shape)
        print(lstm_outputs3.shape)

        # Concatenate CharBERT embeddings and LSTM outputs
        combined_features = torch.cat((charbert_embeddings, lstm_outputs3), dim=1)

        # Classification layer
        logits = self.classifier(combined_features)

        return logits

In [39]:
# Instantiate the custom model
model = CustomModel(charbert_model, additional_features_size)

In [40]:
model.classifier.weight.shape

torch.Size([2, 784])

## Train the model

In [41]:
learning_rate=2e-5
#Define Loss Function and Optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [42]:
# Code copied from https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt
  # Copied from https://huggingface.co/learn/nlp-course/chapter3/2?fw=pt:
  # "... apply the correct amount of padding to the items of the dataset we want to batch together.
  # ... such a function via DataCollatorWithPadding. It takes a tokenizer when you instantiate it
   #(to know which padding token to use, and whether the model expects padding to be on the left or on the right of the inputs)
# Setting max length, and padding - copied from https://huggingface.co/docs/transformers/main_classes/data_collator
# Errored out, so got rid of these
data_collator = DataCollatorWithPadding(tokenizer=charbert_tokenizer)

In [43]:
num_epochs = 5

In [44]:
# Code copied from: https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt
# Set up the arguments we'll use for training
# Additional code (paramters to use in the call) copied from HuggingFaceThreeWays_2_Trainer.ipynb walkthrough
# and from https://www.philschmid.de/getting-started-pytorch-2-0-transformers#3-fine-tune--evaluate-bert-model-with-the-hugging-face-trainer

args = TrainingArguments("test-trainer",
    evaluation_strategy = 'epoch',
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=num_epochs,
    weight_decay=0.01,
    #load_best_model_at_end=True,
    #metric_for_best_model='f1'
    )

In [45]:
# Copied from https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    data_collator=data_collator,
    tokenizer=charbert_tokenizer
)

In [46]:
# Lists to store training and validation losses and accuracies
train_losses = []
val_losses = []
val_accuracies = []

In [50]:
# Training loop
for epoch in range(num_epochs):
    trainer.train()
    train_loss = 0.0

    # Training loop
    for batch in train_dataloader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        key_timing = batch['time_seq']

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, additional_features=key_timing)

        loss = criterion(outputs.logits, inputs["labels"])  # Modify this to match your labels
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_dataloader)
    train_losses.append(avg_train_loss)

    # Evaluate the model on the validation set
    results = trainer.evaluate(eval_dataset=val_dataloader)
    val_loss = results["eval_loss"]
    val_accuracy = results["eval_accuracy"]
    val_losses.append(val_loss)
    val_accuracies.append(val_accuracy)

    print(f"Epoch {epoch + 1}/{num_epochs}: Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.2%}")



TypeError: ignored

In [None]:
# You can save the trained model at the end of training
trainer.save_model("./custom_model")

# Evaluation on the test set
results = trainer.evaluate(eval_dataset=test_dataloader)
test_loss = results["eval_loss"]
test_accuracy = results["eval_accuracy"]

print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2%}")


In [None]:
# Code copied from https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt
trainer.train()

In [None]:
# Get predictions from the model for the validation dataset
# Code copied from https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt
predictions = trainer.predict(tokenized_val_dataset)
print(predictions.predictions.shape,predictions.label_ids.shape)

In [None]:
import numpy as np
# Copied from https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt
# "As you can see, predictions is a two-dimensional array with shape 408 x 2 (408 being the number of elements in the dataset we used).
# Those are the logits for each element of the dataset we passed to predict() (as you saw in the previous chapter, all Transformer models return logits).
# To transform them into predictions that we can compare to our labels, we need to take the index with the maximum value on the second axis"
# Code copied from https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt
charBert_pred_labels = np.argmax(predictions.predictions, axis=-1)

In [None]:
# Now let's evaluate the model (code copied from above)

accuracy_charBERT = accuracy_score(np.array(y_val), charBert_pred_labels)
precision_charBERT = precision_score(np.array(y_val), charBert_pred_labels)
recall_charBERT = recall_score(np.array(y_val), charBert_pred_labels)
f1_charBERT = f1_score(np.array(y_val), charBert_pred_labels)

print("Accuracy: ", accuracy_charBERT)
print("Precision: ", precision_charBERT)
print("Recall: ", recall_charBERT)
print("F1-Score: ", f1_charBERT)