In [1]:
import logging
import os

import datasets
import torch
from prettytable import PrettyTable

from bert import *
from transformers import AutoTokenizer

logging.basicConfig(level=logging.INFO)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/fionamuntwyler/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/fionamuntwyler/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/fionamuntwyler/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# --- Data / caching switches -------------------------------------------
# FULL selects the "__full" suffix of the tokenization cache paths.
FULL = False
# FORCE is forwarded to tokenize(...) as its `force` argument.
FORCE = True

# --- Model selection ---------------------------------------------------
MODEL = 'distilbert-base-uncased'
TOKENIZER = 'bert-base-uncased'

# Mode of fine-tuning:
#   'CLASS_LNORM': train classifier and layer norm parameters
#   'PREC_CLASS':  pre-classifier and classifier
FT = 'CLASS_LNORM'

# --- Training schedule -------------------------------------------------
EPOCHS = 5
BATCH_SIZE = 32

# Run identifier of the form "<model>_ep_<epochs>_bs_<batch size>_<mode>";
# it names the output directory for this configuration.
IDENT = f"{MODEL}_ep_{EPOCHS}_bs_{BATCH_SIZE}_{FT}"
DIR = f"bert_data/{IDENT}"

In [3]:
# Create the run's output directory (including parents). An existing
# directory is fine; anything else at that path (e.g. a regular file) still
# raises instead of being silently accepted.
os.makedirs(DIR, exist_ok=True)

In [4]:
# Load the train/validation splits and tokenize both with the same tokenizer.
# NOTE(review): FULL only changes the cache file name here; load() is called
# without it - confirm the loaded split matches the cache suffix.
dataset_train, dataset_val = load()
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER)

train_tokenized = tokenize(
    dataset_train,
    tokenizer,
    force=FORCE,
    path='bert/cache/train_tokenized__' + TOKENIZER + ("__full" if FULL else ""),
)
val_tokenized = tokenize(
    dataset_val,
    tokenizer,
    force=FORCE,
    path='bert/cache/val_tokenized__' + TOKENIZER + ("__full" if FULL else ""),
)

Casting the dataset:   0%|          | 0/16 [00:00<?, ?ba/s]

Casting the dataset:   0%|          | 0/4 [00:00<?, ?ba/s]



  0%|          | 0/160 [00:00<?, ?ba/s]

  0%|          | 0/40 [00:00<?, ?ba/s]

In [5]:
model = get_BERT(MODEL, device)

In [6]:
print(model)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       

In [7]:
# Freeze every parameter of the model; the cells that follow switch
# requires_grad back on for the parts that should be fine-tuned.
for param in model.parameters():
    param.requires_grad_(False)

In [8]:
def count_requires_grad(module):
    """Count the parameter tensors of ``module`` by their ``requires_grad`` flag.

    One count per tensor yielded by ``module.parameters()`` - this counts
    tensors, not individual scalar weights.

    Args:
        module: object exposing ``parameters()`` whose items carry a
            ``requires_grad`` attribute.

    Returns:
        tuple[int, int]: ``(trainable, frozen)`` tensor counts.
    """
    trainable = 0
    frozen = 0
    for param in module.parameters():
        if param.requires_grad:
            trainable += 1
        else:
            frozen += 1
    return trainable, frozen


# Same report for each sub-module, in the original order: the number of
# trainable tensors on one line, then the number of frozen tensors.
for submodule in (model.distilbert, model.pre_classifier, model.classifier):
    req_grad, not_grad = count_requires_grad(submodule)
    print(req_grad)
    print(not_grad)

0
100
0
2
0
2


In [9]:
# Make the classification head trainable.
for param in model.classifier.parameters():
    param.requires_grad_(True)

In [10]:
# Make the pre-classifier trainable.
# NOTE(review): the cell right after this one sets the same parameters back
# to requires_grad = False, so a top-to-bottom run leaves them frozen.
for param in model.pre_classifier.parameters():
    param.requires_grad_(True)

In [11]:
# Freeze the pre-classifier again (reverts the preceding cell).
# NOTE(review): presumably this cell is for FT == 'CLASS_LNORM' and the
# preceding one for 'PREC_CLASS' - confirm which one should run.
for param in model.pre_classifier.parameters():
    param.requires_grad_(False)

In [12]:
def count_parameters(model):
    """Print a per-parameter table of trainable vs. total element counts.

    Every entry of ``model.named_parameters()`` becomes one table row holding
    its name, the number of elements that are trainable (``numel()`` when
    ``requires_grad`` is set, otherwise 0) and its total number of elements.
    The table, the trainable total and the trainable share are printed.

    Args:
        model: object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs; each parameter provides ``numel()``
            and ``requires_grad``.

    Returns:
        tuple: ``(total_params_train, pourcent)`` - the number of trainable
        elements and their share of all elements. The share is a fraction in
        [0, 1] (it is NOT multiplied by 100, despite the printed label) and is
        0.0 for a model without any parameters.
    """
    table = PrettyTable(["Modules", "Trainable Parameters", "Parameters"])
    total_params_train = 0
    total_params = 0
    for name, parameter in model.named_parameters():
        params = parameter.numel()
        # Frozen parameters contribute nothing to the trainable column.
        params_t = params if parameter.requires_grad else 0
        table.add_row([name, params_t, params])
        total_params_train += params_t
        total_params += params
    print(table)
    print(f"Total Trainable Params: {total_params_train}")
    # A model without parameters would otherwise raise ZeroDivisionError.
    pourcent = total_params_train / total_params if total_params else 0.0
    print(f"Percentage of trainable parameters: {pourcent}")

    return total_params_train, pourcent

In [13]:
def set_trainable(model):
    """Enable gradients on every layer-norm parameter of ``model``.

    A parameter is selected when its name contains ``"layer_norm"`` or
    ``"LayerNorm"``. The number of selected parameter tensors is printed, then
    ``count_parameters(model)`` is called (which prints its own report).

    Args:
        model: object exposing ``named_parameters()``.

    Returns:
        The first element of ``count_parameters(model)``.
    """
    layer_norm_tags = ("layer_norm", "LayerNorm")
    matched = []
    for name, parameter in model.named_parameters():
        if any(tag in str(name) for tag in layer_norm_tags):
            parameter.requires_grad = True
            matched.append(name)

    print(len(matched))
    summary = count_parameters(model)
    return summary[0]

In [14]:
set_trainable(model)

26
+---------------------------------------------------------+----------------------+------------+
|                         Modules                         | Trainable Parameters | Parameters |
+---------------------------------------------------------+----------------------+------------+
|       distilbert.embeddings.word_embeddings.weight      |          0           |  23440896  |
|     distilbert.embeddings.position_embeddings.weight    |          0           |   393216   |
|          distilbert.embeddings.LayerNorm.weight         |         768          |    768     |
|           distilbert.embeddings.LayerNorm.bias          |         768          |    768     |
|  distilbert.transformer.layer.0.attention.q_lin.weight  |          0           |   589824   |
|   distilbert.transformer.layer.0.attention.q_lin.bias   |          0           |    768     |
|  distilbert.transformer.layer.0.attention.k_lin.weight  |          0           |   589824   |
|   distilbert.transformer.layer.0.at

21506

In [15]:
# Training configuration: checkpoints go to DIR, the model is saved and
# evaluated once per epoch, and the best checkpoint by "accuracy" is
# reloaded when training ends.
training_args = TrainingArguments(
    output_dir=DIR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)
# Metric object used by compute_metrics.
metric = load_metric("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``metric``.

    Args:
        eval_pred: pair ``(predictions, labels)``; the predictions are
            reduced with ``argmax`` over axis 1 before scoring.

    Returns:
        Whatever ``metric.compute`` returns for the predicted vs. reference
        labels.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=1)
    return metric.compute(predictions=predicted, references=references)


# Wire model, arguments, tokenized splits and the metric callback together.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Run the fine-tuning loop.
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text, token_type_ids, __index_level_0__, index. If text, token_type_ids, __index_level_0__, index are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 160000
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 25000


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Predict on the validation split and take the arg-max class per row.
val_pred = trainer.predict(val_tokenized)
y_pred = np.argmax(val_pred.predictions, axis=1)

# Reference labels come from the 'label' column of the tokenized split.
y = val_tokenized.to_pandas()['label']

metrics = evaluate(y, y_pred)

In [None]:
count_parameters(model)

In [None]:
import datasets
datasets.list_metrics()