# Installation & Setup

In [4]:
#this cell only needs to be run once for every new run time
!pip install transformers[torch,sentencepiece] accelerate torch datasets -U --quiet

In [5]:
#important relevant modeling libraries
import torch

# Code copied from https://huggingface.co/learn/nlp-course/chapter0/1?fw=pt.
import transformers

from datasets import Dataset
# Code copied from Jennifer and https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt
from datasets import load_dataset

from transformers import AutoModel, AutoTokenizer, AddedToken
# Code copied from: https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt and https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding


In [6]:
#copy this block over for all successive model iterations
import pandas as pd
import os
import json

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from google.colab import drive
drive.mount('/content/drive')

######################## READ IN DATA ###################################################################################################
data_path = '/content/drive/My Drive/266 Assignments/266 Final Project'
files = os.listdir(data_path)
files = [x for x in files if '.csv' in x]

filt_df = pd.read_csv(os.path.join(data_path, files[files.index('cleaned_data.csv')]))

#split the data
# Split the data into training, validation, and test sets
X = filt_df['key_sequence']
y = filt_df['diagnosis']

X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

#convert series of lists to list
X_train = list(X_train)
X_val = list(X_val)
X_test = list(X_test)

y_train = y_train.to_list()
y_val = y_val.to_list()
y_test = y_test.to_list()

######################## IMPORT SPECIAL TOKENS ############################################################################################
json_file = os.path.join(data_path, "token_map.json")
with open(json_file, 'r') as json_file:
    charbert_token_map = json.load(json_file)

added_tokens = [AddedToken(token) for token in charbert_token_map.values()]

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
#CharBERT model
charbert_name = "imvladikon/charbert-bert-wiki"
charbert_model = AutoModel.from_pretrained(charbert_name)
charbert_tokenizer = AutoTokenizer.from_pretrained(charbert_name)

Downloading (…)lve/main/config.json:   0%|          | 0.00/682 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/552M [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

# Data processing for model specifically

In [8]:
# Changing max length to 512 as 512 is the maximium for BERT (copied from: https://huggingface.co/learn/nlp-course/)
max_length = 512

In [9]:
# Put data into a pandas dataframe to then load into a hugging face dataset object
# Code and idea copied from https://huggingface.co/docs/datasets/v1.11.0/loading_datasets.html
train_df = pd.DataFrame(X_train)
train_df['labels']=y_train
train_df.columns = ['key_seq','labels']

# Code copied from above
# Made a df of our validation data
val_df = pd.DataFrame(X_val)
val_df['labels']=y_val
val_df.columns = ['key_seq','labels']

test_df = pd.DataFrame(X_test)
test_df['labels']=y_test
test_df.columns = ['key_seq','labels']

In [10]:
# Code copied from https://huggingface.co/docs/datasets/v1.11.0/loading_datasets.html
# Make a hugging face dataset object from the pandas df we just made
train_dataset = Dataset.from_pandas(train_df)

# Code copied from above
# Make a validation Dataset
val_dataset = Dataset.from_pandas(val_df)

# Code copied from above
# Make a test Dataset
test_dataset = Dataset.from_pandas(test_df)

In [11]:
# Code copied from https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt and prior code above (likely from BERT lesson notebooks/assignment)
# Code also copied from https://huggingface.co/learn/nlp-course/chapter2/2?fw=pt
# Create a tokenize function we'll use to tokenize the key sequences in the dataset
def tokenize_func(a):
  return charbert_tokenizer(
    a['key_seq'],
    # Changing padding to max_length, copied from https://huggingface.co/learn/nlp-course/chapter2/6?fw=pt
    # Getting rid of padding here (copying from https://huggingface.co/learn/nlp-course/chapter3/2?fw=pt).
    # It's more efficient to do padding at the batch level later on (use "dynamic padding") according to the above source
    # padding='max_length',
    # Trunction true should truncate to max length: Copied from https://huggingface.co/docs/transformers/main_classes/tokenizer
    truncation=True,
    # Testing a shorter max length based on https://huggingface.co/learn/nlp-course/chapter2/5?fw=pt
    # Getting rid of max length from here to, copying from: https://huggingface.co/learn/nlp-course/chapter3/2?fw=pt
    # Reinstating max length since got an error
    max_length=max_length,
    # Getting rid of return tensors so that this code runs! Gave an erron when it was here
    # return_tensors='pt'
    )

In [12]:
# Code copied from https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt
# Tokenize the key sequences which will add the results to the dataset
tokenized_train_dataset = train_dataset.map(tokenize_func, batched=True)

# Code copied from above
# Tokenize validation dataset
tokenized_val_dataset = val_dataset.map(tokenize_func, batched=True)

# Code copied from above
# Tokenize test dataset
tokenized_test_dataset = test_dataset.map(tokenize_func, batched=True)


Map:   0%|          | 0/2372 [00:00<?, ? examples/s]

Map:   0%|          | 0/508 [00:00<?, ? examples/s]

Map:   0%|          | 0/509 [00:00<?, ? examples/s]

# CharBERT Model Alone
Note on warning below: that ['classifier.bias', 'classifier.weight'] are newly initialized. From Jennifer's OH: The CharBert model we're using likely wasn't built to work with Hugging Face's loader. We're probably losing pre-training. Could try to load it in from git directly, but that will be difficult (3 year old repo) and we'd have to match their python version, etc. Can proceed with this as is, just be aware we're likely losing some (or all?) pre-training. Also skimmed: https://discuss.huggingface.co/t/is-some-weights-of-the-model-were-not-used-warning-normal-when-pre-trained-bert-only-by-mlm/5672

In [13]:
# Add the special tokens from charbert_token_map to the tokenizer's vocabulary
charbert_tokenizer.add_tokens(added_tokens)

26

In [14]:
# Code copied from https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt
  # Copied from https://huggingface.co/learn/nlp-course/chapter3/2?fw=pt:
  # "... apply the correct amount of padding to the items of the dataset we want to batch together.
  # ... such a function via DataCollatorWithPadding. It takes a tokenizer when you instantiate it
   #(to know which padding token to use, and whether the model expects padding to be on the left or on the right of the inputs)
# Setting max length, and padding - copied from https://huggingface.co/docs/transformers/main_classes/data_collator
# Errored out, so got rid of these
data_collator = DataCollatorWithPadding(tokenizer=charbert_tokenizer)

In [15]:
# Code copied from: https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt
# Set up the arguments we'll use for training
# Additional code (paramters to use in the call) copied from HuggingFaceThreeWays_2_Trainer.ipynb walkthrough
# and from https://www.philschmid.de/getting-started-pytorch-2-0-transformers#3-fine-tune--evaluate-bert-model-with-the-hugging-face-trainer

args = TrainingArguments("test-trainer",
    evaluation_strategy = 'epoch',
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
    #load_best_model_at_end=True,
    #metric_for_best_model='f1'
    )

In [16]:
# Code copied from: https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt
# Pull in a classification version of the model
model = AutoModelForSequenceClassification.from_pretrained(charbert_name, num_labels=2)

#must also add special tokens to the model
model.resize_token_embeddings(len(charbert_tokenizer))

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at imvladikon/charbert-bert-wiki and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Embedding(28996, 768, padding_idx=0)

In [17]:
# Copied from https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt
trainer = Trainer(
    model,
    args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    data_collator=data_collator,
    tokenizer=charbert_tokenizer
)

In [18]:
# Code copied from https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,0.700572
2,No log,0.694511
3,No log,0.698004
4,0.688000,0.687013
5,0.688000,0.705173
6,0.688000,0.69281
7,0.687100,0.691397
8,0.687100,0.688404
9,0.687100,0.691179
10,0.687100,0.692572


TrainOutput(global_step=1490, training_loss=0.6862145340682676, metrics={'train_runtime': 1749.4875, 'train_samples_per_second': 13.558, 'train_steps_per_second': 0.852, 'total_flos': 4454938833812160.0, 'train_loss': 0.6862145340682676, 'epoch': 10.0})

In [19]:
# Get predictions from the model for the validation dataset
# Code copied from https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt
predictions = trainer.predict(tokenized_val_dataset)
print(predictions.predictions.shape,predictions.label_ids.shape)

(508, 2) (508,)


In [20]:
import numpy as np
# Copied from https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt
# "As you can see, predictions is a two-dimensional array with shape 408 x 2 (408 being the number of elements in the dataset we used).
# Those are the logits for each element of the dataset we passed to predict() (as you saw in the previous chapter, all Transformer models return logits).
# To transform them into predictions that we can compare to our labels, we need to take the index with the maximum value on the second axis"
# Code copied from https://huggingface.co/learn/nlp-course/chapter3/3?fw=pt
charBert_pred_labels = np.argmax(predictions.predictions, axis=-1)

In [21]:
# Now let's evaluate the model (code copied from above)

accuracy_charBERT = accuracy_score(np.array(y_val), charBert_pred_labels)
precision_charBERT = precision_score(np.array(y_val), charBert_pred_labels)
recall_charBERT = recall_score(np.array(y_val), charBert_pred_labels)
f1_charBERT = f1_score(np.array(y_val), charBert_pred_labels)

print("Accuracy: ", accuracy_charBERT)
print("Precision: ", precision_charBERT)
print("Recall: ", recall_charBERT)
print("F1-Score: ", f1_charBERT)

Accuracy:  0.5452755905511811
Precision:  0.0
Recall:  0.0
F1-Score:  0.0


  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
model.forward

<bound method BertForSequenceClassification.forward of BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
