In [1]:
# !pip install --upgrade tensorflow
!pip install tensorflow==2.15
!pip install -U tf_keras # Keras 2
import os
os.environ["TF_USE_LEGACY_KERAS"] = "1"

!pip install transformers datasets tensorflow

Collecting tf_keras
  Downloading tf_keras-2.16.0-py3-none-any.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tensorflow<2.17,>=2.16 (from tf_keras)
  Downloading tensorflow-2.16.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (589.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m589.8/589.8 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting h5py>=3.10.0 (from tensorflow<2.17,>=2.16->tf_keras)
  Downloading h5py-3.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m97.1 MB/s[0m eta [36m0:00:00[0m
Collecting ml-dtypes~=0.3.1 (from tensorflow<2.17,>=2.16->tf_keras)
  Downloading ml_dtypes-0.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [3

In [2]:
# !pip install --upgrade transformers

In [3]:
# !pip show tensorflow
# !pip show Keras

In [4]:
import os
import tensorflow as tf
from transformers import TFElectraForSequenceClassification, ElectraTokenizer
from datasets import load_dataset, load_metric
from google.colab import drive

In [5]:
# All artifacts for this experiment live under one Google Drive folder so they
# outlive the Colab VM.
experiment_root = '/content/drive/My Drive/W266_Final_Project/Finetune_BASE_GLUE/exp05'

# Per-epoch weight checkpoints written during training.
checkpoint_dir = os.path.join(experiment_root, 'electra-base-exp05-qnli')
# Destination of the exported (save_pretrained) model after training finishes.
final_model_path = os.path.join(experiment_root, 'electra-base-exp05-qnli-final')

# Make the Drive folder visible to this runtime.
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# https://colab.research.google.com/drive/1VoWyzlXZVR5sNygejktJ4Uguw3HIZckb?usp=sharing

# ---------------------------------------------------------------------------
# Experiment configuration (everything a reader might want to tune)
# ---------------------------------------------------------------------------
SEED = 42
NUM_FROZEN_LAYERS = 4   # Experiment #5: freeze the first 4 encoder layers, train the rest
BATCH_SIZE = 32
NUM_EPOCHS = 3
LEARNING_RATE = 5e-5
FEATURE_COLUMNS = ["attention_mask", "input_ids", "token_type_ids"]

# Seed Python, NumPy and TensorFlow before any weights are created, so that the
# randomly initialised classifier head and the data shuffling are comparable
# between runs (GPU kernels may still introduce small nondeterminism).
tf.keras.utils.set_random_seed(SEED)

# Load ELECTRA BASE model and tokenizer
model_name = "google/electra-base-discriminator"
tokenizer = ElectraTokenizer.from_pretrained(model_name)
model = TFElectraForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Experiment #5
#   Freeze the first NUM_FROZEN_LAYERS encoder layers and train all later ones.
#   A single pass assigns every layer explicitly, so the split stays correct
#   regardless of how many layers the encoder has.
for layer_index, layer in enumerate(model.electra.encoder.layer):
    layer.trainable = layer_index >= NUM_FROZEN_LAYERS

# Load and preprocess the GLUE QNLI dataset
# https://huggingface.co/docs/datasets/v1.1.2/loading_datasets.html
# https://huggingface.co/docs/datasets/en/process
# https://huggingface.co/datasets/nyu-mll/glue
dataset = load_dataset("glue", "qnli")
metric = load_metric("glue", "qnli")

# Every example is padded/truncated to this many tokens. It equals the length the
# tokenizer falls back to when `max_length` is omitted; lowering it reduces compute
# per step at the cost of truncating longer question/sentence pairs.
MAX_SEQ_LENGTH = tokenizer.model_max_length


def tokenize_pairs(examples):
    """Tokenize a batch of QNLI (question, sentence) pairs into fixed-length inputs.

    Args:
        examples: a batch from `Dataset.map(batched=True)` exposing the
            "question" and "sentence" columns.

    Returns:
        The tokenizer output, padded/truncated to MAX_SEQ_LENGTH. Plain Python
        lists are returned (no `return_tensors`) because `Dataset.map` stores the
        result in its own columnar format anyway; tensors are only materialised
        later by `to_tf_dataset`.
    """
    return tokenizer(examples["question"],
                     examples["sentence"],
                     truncation=True,
                     padding="max_length",
                     max_length=MAX_SEQ_LENGTH)


encoded_dataset = dataset.map(tokenize_pairs, batched=True)

# Convert to TensorFlow dataset
# https://huggingface.co/docs/datasets/v1.16.1/use_dataset.html
train_data = encoded_dataset["train"].to_tf_dataset(columns=FEATURE_COLUMNS,
                                                    label_cols=["label"],
                                                    shuffle=True,
                                                    batch_size=BATCH_SIZE)

# Validation order is kept fixed so per-epoch metrics are computed on identical batches.
validation_data = encoded_dataset["validation"].to_tf_dataset(columns=FEATURE_COLUMNS,
                                                              label_cols=["label"],
                                                              shuffle=False,
                                                              batch_size=BATCH_SIZE)

# Checkpoints
# https://www.tensorflow.org/api_docs/python/tf/keras/callbacks/ModelCheckpoint
# Only epochs that improve val_loss are written, so the highest-numbered file in
# checkpoint_dir holds the best weights seen during training.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=os.path.join(checkpoint_dir, 'checkpoint_epoch_{epoch}.weights.h5'),
    save_weights_only=True,
    save_best_only=True,
    monitor='val_loss',
    mode='min',
    verbose=1
)

# Hyperparameters from the ELECTRA paper https://arxiv.org/abs/2003.10555
# "Table 6: Fine-tune hyperparameters"
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE,
                                     epsilon=1e-6,
                                     beta_1=0.9,
                                     beta_2=0.999)

# Compile
# The model returns raw logits, hence from_logits=True.
model.compile(optimizer=optimizer,
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

# Finetune
history = model.fit(train_data,
                    validation_data=validation_data,
                    epochs=NUM_EPOCHS,
                    callbacks=[checkpoint_callback])

# Save the final model
# NOTE: this exports the weights as they are after the last epoch, which is not
# necessarily the best-val_loss epoch kept by checkpoint_callback. Load the best
# checkpoint from checkpoint_dir first if the best-validation model is wanted.
model.save_pretrained(final_model_path)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

tf_model.h5:   0%|          | 0.00/438M [00:00<?, ?B/s]

Some layers from the model checkpoint at google/electra-base-discriminator were not used when initializing TFElectraForSequenceClassification: ['discriminator_predictions']
- This IS expected if you are initializing TFElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFElectraForSequenceClassification were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/872k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/877k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/104743 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/5463 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/5463 [00:00<?, ? examples/s]

  metric = load_metric("glue", "qnli")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

Map:   0%|          | 0/104743 [00:00<?, ? examples/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

Map:   0%|          | 0/5463 [00:00<?, ? examples/s]

Map:   0%|          | 0/5463 [00:00<?, ? examples/s]

Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor)  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor)  
New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor})  
             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) 


Epoch 1/3


Cause: for/else statement not yet supported


Cause: for/else statement not yet supported
Epoch 1: val_loss improved from inf to 0.19938, saving model to /content/drive/My Drive/W266_Final_Project/Finetune_BASE_GLUE/exp05/electra-base-exp05-qnli/checkpoint_epoch_1.weights.h5




Epoch 2/3
Epoch 2: val_loss did not improve from 0.19938
Epoch 3/3
Epoch 3: val_loss did not improve from 0.19938


- ELECTRA BASE fine-tuned on GLUE QNLI (Question-answering NLI: classify whether the sentence answers the question)
  - Freeze the first 4 encoder layers, fine-tune the last 8 layers
- Ran in Google Colab Pro+ on A100 GPU
  - System RAM 5.1GB
  - GPU RAM 32.5GB
  - Ran in about 4165 seconds
  - Final model weights saved
  - loss: 0.1152 - accuracy: 0.9585 - val_loss: 0.2527 - val_accuracy: 0.9171
- Did not implement the learning-rate warmup and decay, and instead used a fixed learning rate of 5e-5 (a common fine-tuning default)
- Next step: Benchmark against GLUE QNLI