# Style Transfer Intensity (STI)

Style Transfer Strength is often evaluated by training a classifier on the labeled dataset and measuring the number of outputes classified as having the target style. 

[This paper](https://arxiv.org/pdf/1904.02295.pdf) proposes an alternative method.

Rather than count how many output texts achieve a target style, we can capture more nuanced differences between the style distributions of x and x', using Earth Mover’s Distance.EMD is the minimum “cost” to turn one distribution into the other, or how “intense” the transfer is. Distributions can have any number of values (styles), so EMD handles binary and non-binary datasets



## Prepare WNC for style classification

In [1]:
import os
import numpy as np
import pandas as pd
from collections import defaultdict
from datasets import (
    load_dataset,
    load_from_disk,
    load_metric,
    Dataset,
    Features,
    Value,
    ClassLabel,
    DatasetDict,
)=

%load_ext lab_black

In [16]:
def build_classification_dataset(path: str) -> DatasetDict:
    """
    Formats the translation-task version of WNC as a classification dataset.

    Dataset splits remain the same, but the number of records in each split are doubled
    as we create an individual record for both the "source_text" and "target_text" fields.
    In this way, "source_text" is assigned a label of "subjective" and "target_text" is assigned
    a label of "neutral". Records are randomly shuffled within each split.

    Args:
        path (str): path to HuggingFace dataset

    Returns:
        DatasetDict

    """
    datasets = load_from_disk(path)
    dataset_dict = defaultdict(dict)

    SPLITS = ["train", "test", "validation"]
    LABEL_MAPPING = {"source_text": "subjective", "target_text": "neutral"}
    FEATURES = Features(
        {
            "text": Value("string"),
            "label": ClassLabel(num_classes=2, names=["subjective", "neutral"]),
        }
    )

    for split in SPLITS:
        df = datasets[split].to_pandas()
        split_dict = defaultdict(list)

        for column, label in LABEL_MAPPING.items():
            split_dict["text"].extend(df[column].tolist())
            split_dict["label"].extend([label] * len(df))

        # reorder records so subjective/neutral pairs alternate in sequence
        temp_df = pd.DataFrame(split_dict)
        dfs = np.split(temp_df, indices_or_sections=2, axis=0)
        dfs = [df.reset_index(drop=True) for df in dfs]
        temp_df = pd.concat(dfs).sort_index(kind="merge").reset_index(drop=True)

        dataset_dict[split] = Dataset.from_dict(
            temp_df.to_dict(orient="list"), features=FEATURES
        )

    return DatasetDict(dataset_dict)

In [6]:
DATASETS_PATH = "/home/cdsw/data/processed/WNC_full"
wnc_classification = build_classification_dataset(DATASETS_PATH)

In [7]:
wnc_classification

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 308394
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 17154
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 17214
    })
})

In [8]:
wnc_classification["train"][2:12]

{'text': ["following the end of kenneth kaunda's repressive dictatorship , chiluba won the country's multi-party presidential elections.",
  "following the end of kenneth kaunda's presidency , chiluba won the country's multi-party presidential elections.",
  'a brilliant quarterback with the university of illinois, haller was signed by the giants as an amateur free agent in 1958. he made his debut on april 11, 1961 as a platoon catcher.',
  'a quarterback with the university of illinois, haller was signed by the giants as an amateur free agent in 1958. he made his debut on april 11, 1961 as a platoon catcher.',
  'traitor to his people adam yahiye gadahn (born september 1, 1978) is an american-born man who is suspected of being a member of the al qaeda organization.',
  'adam yahiye gadahn (born september 1, 1978) is an american-born man who is suspected of being a member of the al qaeda organization.',
  'a funny thing happened on the way to the moon is a 2001 documentary written, pro

In [14]:
wnc_classification["train"].features["label"].int2str(0)

'subjective'

In [15]:
# save dataset
CLS_DATASET_PATH = "/home/cdsw/data/processed/WNC_cls_full"
os.makedirs(CLS_DATASET_PATH)
wnc_classification.save_to_disk(CLS_DATASET_PATH)

### Testing dataset

In [15]:
test_wnc_classification = DatasetDict(
    {
        "train": wnc_classification["train"].select(range(1000)),
        "test": wnc_classification["test"].select(range(1000)),
        "validation": wnc_classification["validation"].select(range(1000)),
    }
)

TEST_CLS_DATASET_PATH = "/home/cdsw/data/processed/WNC_full_cls_TEST"
os.makedirs(TEST_CLS_DATASET_PATH)
test_wnc_classification.save_to_disk(TEST_CLS_DATASET_PATH)

Flattening the indices:   0%|          | 0/1 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/1 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/1 [00:00<?, ?ba/s]

In [16]:
test_wnc_classification

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 1000
    })
})

## Train a classifier

In [2]:
CLS_DATASET_PATH = "/home/cdsw/data/processed/WNC_cls_full"
wnc_full_cls = load_from_disk(CLS_DATASET_PATH)

In [3]:
wnc_full_cls

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 308394
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 17154
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 17214
    })
})

In [6]:
wnc_full_cls["train"].to_pandas()

Unnamed: 0,text,label
0,while for long nearly only women where shown a...,0
1,"increased tolerance, more tempered censorship,...",1
2,following the end of kenneth kaunda's repressi...,0
3,following the end of kenneth kaunda's presiden...,1
4,a brilliant quarterback with the university of...,0
...,...,...
308389,regardless of how a received message is format...,1
308390,"in peloponnesos, at any rate, the revolution h...",0
308391,"in peloponnesos, at any rate, the revolution h...",1
308392,communism disregarded and hated man : roy in h...,0


In [9]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)
from transformers.integrations import MLflowCallback
from datasets import load_metric

In [10]:
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    return tokenizer(example["text"], truncation=True)


tokenized_datasets = wnc_full_cls.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Loading cached processed dataset at /home/cdsw/data/processed/WNC_cls_full/train/cache-0a1520a3bb15d3a6.arrow
Loading cached processed dataset at /home/cdsw/data/processed/WNC_cls_full/test/cache-7534370130f2438d.arrow
Loading cached processed dataset at /home/cdsw/data/processed/WNC_cls_full/validation/cache-a3249b64e9d2c2a2.arrow


In [11]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 308394
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 17154
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 17214
    })
})

In [12]:
MODEL_NAME = "bert-cls-full-nbtest"
MODEL_DIR = "/home/cdsw/models"

training_args = TrainingArguments(
    output_dir=os.path.join(MODEL_DIR, MODEL_NAME),
    learning_rate=5e-05,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    logging_dir=os.path.join(MODEL_DIR, "logs", MODEL_NAME),
    logging_steps=50,
    overwrite_output_dir=True,
    evaluation_strategy="steps",
    eval_steps=50,
    save_total_limit=5,
    save_steps=50,
    # metric_for_best_model="f1",
    # greater_is_better=True,
)

In [8]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [9]:
def compute_metrics(eval_preds):
    metric = load_metric("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [10]:
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.remove_callback(MLflowCallback)

In [11]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 308394
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 96375


Step,Training Loss,Validation Loss,Accuracy,F1
50,0.7067,0.693016,0.501394,0.66633
100,0.6896,0.692646,0.50488,0.050574


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 17214
  Batch size = 16
Saving model checkpoint to /home/cdsw/models/bert-cls-full-nbtest/checkpoint-50
Configuration saved in /home/cdsw/models/bert-cls-full-nbtest/checkpoint-50/config.json
Model weights saved in /home/cdsw/models/bert-cls-full-nbtest/checkpoint-50/pytorch_model.bin
tokenizer config file saved in /home/cdsw/models/bert-cls-full-nbtest/checkpoint-50/tokenizer_config.json
Special tokens file saved in /home/cdsw/models/bert-cls-full-nbtest/checkpoint-50/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequen

KeyboardInterrupt: 

## Hyperparameter search

In [1]:
CLS_DATASET_PATH = "/home/cdsw/data/processed/WNC_cls_full"
wnc_full_cls = load_from_disk(CLS_DATASET_PATH)

NameError: name 'load_from_disk' is not defined

In [None]:
wnc_full_cls

In [2]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
)
from torch.utils.data import SequentialSampler, BatchSampler, DataLoader
from transformers.integrations import MLflowCallback
from datasets import load_metric

In [3]:
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    return tokenizer(example["text"], truncation=True)


tokenized_datasets = wnc_full_cls.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

NameError: name 'wnc_full_cls' is not defined

In [7]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 308394
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 17154
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 17214
    })
})

In [18]:
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)


model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /home/cdsw/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://huggingface.co/bert-base-

In [9]:
def compute_metrics(eval_preds):

    accuracy_metric = load_metric("accuracy")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    return accuracy_metric.compute(predictions=predictions, references=labels)

In [22]:
MODEL_NAME = "bert-cls-full-blah"
MODEL_DIR = "/home/cdsw/models"

training_args = TrainingArguments(
    output_dir=os.path.join(MODEL_DIR, MODEL_NAME),
    learning_rate=3e-05,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # num_train_epochs=2,
    logging_dir=os.path.join(MODEL_DIR, "logs", MODEL_NAME),
    logging_steps=500,
    overwrite_output_dir=True,
    evaluation_strategy="steps",
    eval_steps=1000,
    save_total_limit=1,
    save_steps=1000,
    load_best_model_at_end=True,
    # evaluation_strategy="epoch",
    metric_for_best_model="eval_accuracy",
    # metric_for_best_model=metric_name,
    # metric_for_best_model="f1",
    greater_is_better=True,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [10]:
class CustomTrainer(Trainer):
    """
    A custom Trainer that overwrites and subclasses the `get_train_dataloader()` method.

    This customization allows us to introduce a flag that disables shuffling on the DataLoader. When
    `shuffle_train` flag is True, a RandomSampler is used via `self._get_train_sampler`. When set to False,
    a SequentialSampler is utilized in the dataloader.

    """

    def __init__(self, shuffle_train, *args, **kwargs):
        self.shuffle_train = shuffle_train
        super().__init__(*args, **kwargs)

    def seed_worker(self, _):
        """
        Helper function to set worker seed during Dataloader initialization.
        """
        worker_seed = torch.initial_seed() % 2**32
        set_seed(worker_seed)

    def get_train_dataloader(self) -> DataLoader:
        if self.train_dataset is None:
            raise ValueError("Trainer: training requires a train_dataset.")

        train_dataset = self.train_dataset
        data_collator = self.data_collator

        if isinstance(train_dataset, Dataset):
            train_dataset = self._remove_unused_columns(
                train_dataset, description="training"
            )
            print(train_dataset)
        else:
            data_collator = self._get_collator_with_removed_columns(
                data_collator, description="training"
            )

        if self.shuffle_train:
            train_sampler = self._get_train_sampler()
        else:
            train_sampler = SequentialSampler(self.train_dataset)

        return DataLoader(
            train_dataset,
            batch_size=self.args.per_device_train_batch_size,
            sampler=train_sampler,
            collate_fn=self.data_collator,
            drop_last=self.args.dataloader_drop_last,
            num_workers=self.args.dataloader_num_workers,
            pin_memory=self.args.dataloader_pin_memory,
            worker_init_fn=self.seed_worker,
        )

In [23]:
train_dataset = tokenized_datasets["train"].shard(index=1, num_shards=10)

trainer = Trainer(
    # shuffle_train=True,
    # model_init=model_init,
    model,
    args=training_args,
    # train_dataset=train_dataset,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.remove_callback(MLflowCallback)

In [13]:
?Trainer()

Object `Trainer()` not found.


In [24]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 308394
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 57825


Step,Training Loss,Validation Loss,Accuracy
1000,0.7053,0.696181,0.5
2000,0.7032,0.69813,0.5


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 17214
  Batch size = 16
Saving model checkpoint to /home/cdsw/models/bert-cls-full-blah/checkpoint-1000
Configuration saved in /home/cdsw/models/bert-cls-full-blah/checkpoint-1000/config.json
Model weights saved in /home/cdsw/models/bert-cls-full-blah/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in /home/cdsw/models/bert-cls-full-blah/checkpoint-1000/tokenizer_config.json
Special tokens file saved in /home/cdsw/models/bert-cls-full-blah/checkpoint-1000/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequen

KeyboardInterrupt: 

In [14]:
trainer.train()

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /home/cdsw/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18.0.dev0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file https://huggingface.co/bert-base-

Epoch,Training Loss,Validation Loss,Accuracy
1,0.0043,5.994174,0.5
2,0.0,6.370475,0.5
3,0.0,6.49227,0.5


Saving model checkpoint to /home/cdsw/models/bert-cls-full-nbtest/checkpoint-500
Configuration saved in /home/cdsw/models/bert-cls-full-nbtest/checkpoint-500/config.json
Model weights saved in /home/cdsw/models/bert-cls-full-nbtest/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /home/cdsw/models/bert-cls-full-nbtest/checkpoint-500/tokenizer_config.json
Special tokens file saved in /home/cdsw/models/bert-cls-full-nbtest/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 17214
  Batch size = 16
Saving model checkpoint to /home/cdsw/models/bert-cls-full-nbtest/checkpoint-1000
Configuration saved in /home/cdsw/models/bert-cls-full-nbtest/checkpoint-1000/config.json
Model weights save

TrainOutput(global_step=2892, training_loss=0.0007404177269707067, metrics={'train_runtime': 633.3754, 'train_samples_per_second': 146.075, 'train_steps_per_second': 4.566, 'total_flos': 3896571952125600.0, 'train_loss': 0.0007404177269707067, 'epoch': 3.0})

In [12]:
def my_hp_space(trial):
    # return {
    #     "learning_rate": trial.suggest_float("learning_rate", 3e-06, 3e-04, log=True),
    #     "weight_decay": trial.suggest_float("weight_decay", 0.000001, 0.01, log=True),
    #     # "per_device_train_batch_size": trial.suggest_categorical(
    #     #     "per_device_train_batch_size", [8, 16]
    #     # ),
    #     # "shuffle_train": trial.suggest_categorical("shuffle_train", [True, False]),
    # }

    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
        "num_train_epochs": trial.suggest_int("num_train_epochs", 3, 8),
        "seed": trial.suggest_int("seed", 1, 40),
        "per_device_train_batch_size": trial.suggest_categorical(
            "per_device_train_batch_size", [8, 16, 32]
        ),
    }


def my_objective(metrics):
    return metrics["eval_accuracy"]

In [None]:
def my_hp_space_ray(trial):
    from ray import tune

    return {
        "learning_rate": tune.loguniform(1e-4, 1e-2),
        "num_train_epochs": tune.choice(range(1, 6)),
        "seed": tune.choice(range(1, 41)),
        "per_device_train_batch_size": tune.choice([4, 8, 16, 32, 64]),
    }

In [14]:
best_run = trainer.hyperparameter_search(
    hp_space=my_hp_space,
    compute_objective=my_objective,
    direction="maximize",
    backend="optuna",
    n_trials=4,
    # resources_per_trial={"cpu": 1, "gpu": 1}
)

[32m[I 2022-05-17 16:37:05,485][0m A new study created in memory with name: no-name-65ef4c6f-12a4-45c3-b09e-f75d6f6198be[0m
Trial:
loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /home/cdsw/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.18

Epoch,Training Loss,Validation Loss,Accuracy
1,0.015,5.1163,0.5
2,0.0,5.591533,0.5
3,0.0,5.830876,0.5
4,0.0,5.919676,0.5


Saving model checkpoint to /home/cdsw/models/bert-cls-full-nbtest/run-0/checkpoint-500
Configuration saved in /home/cdsw/models/bert-cls-full-nbtest/run-0/checkpoint-500/config.json
Model weights saved in /home/cdsw/models/bert-cls-full-nbtest/run-0/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /home/cdsw/models/bert-cls-full-nbtest/run-0/checkpoint-500/tokenizer_config.json
Special tokens file saved in /home/cdsw/models/bert-cls-full-nbtest/run-0/checkpoint-500/special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 17214
  Batch size = 16
Saving model checkpoint to /home/cdsw/models/bert-cls-full-nbtest/run-0/checkpoint-1000
Configuration saved in /home/cdsw/models/bert-cls-full-nbtest/run-0/chec

Epoch,Training Loss,Validation Loss,Accuracy
1,0.0,6.396152,0.5
2,0.0,7.22308,0.5
3,0.0,7.856434,0.5


Saving model checkpoint to /home/cdsw/models/bert-cls-full-nbtest/run-1/checkpoint-500
Configuration saved in /home/cdsw/models/bert-cls-full-nbtest/run-1/checkpoint-500/config.json
Model weights saved in /home/cdsw/models/bert-cls-full-nbtest/run-1/checkpoint-500/pytorch_model.bin
tokenizer config file saved in /home/cdsw/models/bert-cls-full-nbtest/run-1/checkpoint-500/tokenizer_config.json
Special tokens file saved in /home/cdsw/models/bert-cls-full-nbtest/run-1/checkpoint-500/special_tokens_map.json
Saving model checkpoint to /home/cdsw/models/bert-cls-full-nbtest/run-1/checkpoint-1000
Configuration saved in /home/cdsw/models/bert-cls-full-nbtest/run-1/checkpoint-1000/config.json
Model weights saved in /home/cdsw/models/bert-cls-full-nbtest/run-1/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in /home/cdsw/models/bert-cls-full-nbtest/run-1/checkpoint-1000/tokenizer_config.json
Special tokens file saved in /home/cdsw/models/bert-cls-full-nbtest/run-1/checkpoint-1000/s

KeyboardInterrupt: 

In [55]:
best_run

BestRun(run_id='1', objective=0.5005228302544441, hyperparameters={'learning_rate': 1.2819901548249134e-06, 'num_train_epochs': 2, 'seed': 38, 'per_device_train_batch_size': 64})

## Evaluation

In [13]:
predictions = trainer.predict(tokenized_datasets["validation"])

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 1000
  Batch size = 16


In [14]:
predictions.predictions

array([[-0.3496061, -0.6291089],
       [ 1.2544787, -2.2091932],
       [ 1.4202304, -2.272884 ],
       ...,
       [ 1.9655778, -2.7637262],
       [-2.3133612,  2.5391092],
       [ 1.3082958, -2.4287748]], dtype=float32)

In [15]:
predictions.predictions.shape

(1000, 2)

In [16]:
preds = np.argmax(predictions.predictions, axis=-1)

In [18]:
metric = load_metric("glue", "mrpc")

In [19]:
metric.compute(predictions=preds, references=predictions.label_ids)

{'accuracy': 0.626, 'f1': 0.625250501002004}

In [24]:
compute_metrics(predictions)

ValueError: too many values to unpack (expected 2)

#### manual

In [20]:
accuracy_metric = load_metric("accuracy")
f1_metric = load_metric("f1")

Downloading:   0%|          | 0.00/2.07k [00:00<?, ?B/s]

In [22]:
accuracy.compute(predictions=preds, references=predictions.label_ids)

{'accuracy': 0.626}

In [23]:
f1.compute(predictions=preds, references=predictions.label_ids)

{'f1': 0.625250501002004}

## Test custom DataLoader

In [6]:
import torch

In [12]:
from torch.utils.data import SequentialSampler, BatchSampler, DataLoader
from transformers import set_seed
from datasets import Dataset


class CustomTrainer(Trainer):
    """
    A custom Trainer that overwrites and subclasses the `get_train_dataloader()` method.

    This customization allows us to introduce a flag that disables shuffling on the DataLoader. When
    `shuffle_train` flag is True, a RandomSampler is used via `self._get_train_sampler`. When set to False,
    a BatchSampler(SequentialSampler()) is utilized in the dataloader.

    """

    def __init__(self, shuffle_train, *args, **kwargs):
        self.shuffle_train = shuffle_train
        super().__init__(*args, **kwargs)

    def seed_worker(self, _):
        """
        Helper function to set worker seed during Dataloader initialization.
        """
        worker_seed = torch.initial_seed() % 2**32
        set_seed(worker_seed)

    def get_train_dataloader(self) -> DataLoader:
        if self.train_dataset is None:
            raise ValueError("Trainer: training requires a train_dataset.")

        train_dataset = self.train_dataset
        data_collator = self.data_collator

        if isinstance(train_dataset, Dataset):
            print("Removed unused columns")
            train_dataset = self._remove_unused_columns(
                train_dataset, description="training"
            )
        else:
            print("New data collator")
            data_collator = self._get_collator_with_removed_columns(
                data_collator, description="training"
            )
            print(data_collator)

        if self.shuffle_train:
            train_sampler = self._get_train_sampler()
        else:
            train_sampler = SequentialSampler(self.train_dataset)

        return DataLoader(
            train_dataset,
            batch_size=self.args.per_device_train_batch_size,
            sampler=train_sampler,
            collate_fn=self.data_collator,
            drop_last=self.args.dataloader_drop_last,
            num_workers=self.args.dataloader_num_workers,
            pin_memory=self.args.dataloader_pin_memory,
            worker_init_fn=self.seed_worker,
        )

In [8]:
samples = tokenized_datasets["train"][:8]
samples = {k: v for k, v in samples.items() if k not in ["text", "label", "sentence2"]}
[len(x) for x in samples["input_ids"]]

NameError: name 'tokenized_datasets' is not defined

In [16]:
batch = data_collator(samples)
{k: v.shape for k, v in batch.items()}

{'input_ids': torch.Size([8, 62]),
 'token_type_ids': torch.Size([8, 62]),
 'attention_mask': torch.Size([8, 62])}

In [23]:
test = list(
    BatchSampler(
        SequentialSampler(wnc_full_cls["train"]), batch_size=3, drop_last=False
    )
)

In [25]:
wnc_full_cls["train"]

Dataset({
    features: ['text', 'label'],
    num_rows: 1000
})

In [9]:
training_args = TrainingArguments(
    output_dir="./models/TESTING123",
    per_device_train_batch_size=3,
)

In [10]:
# load base-model and tokenizer
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


def tokenize_function(example):
    return tokenizer(example["text"], truncation=True)


tokenized_datasets = wnc_full_cls.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Loading cached processed dataset at /home/cdsw/data/processed/WNC_full_cls_TEST/train/cache-11a8b3faaf6e3c0a.arrow
Loading cached processed dataset at /home/cdsw/data/processed/WNC_full_cls_TEST/test/cache-5c62f87956a8bec2.arrow
Loading cached processed dataset at /home/cdsw/data/processed/WNC_full_cls_TEST/validation/cache-8db993456d6202ea.arrow
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining

In [11]:
tokenized_datasets["train"]

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1000
})

In [12]:
def compute_metrics(eval_preds):

    accuracy_metric = load_metric("accuracy")
    f1_metric = load_metric("f1")

    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)

    return {
        "accuracy": accuracy_metric.compute(predictions=predictions, references=labels),
        "f1": f1_metric.compute(predictions=predictions, references=labels),
    }


trainer = CustomTrainer(
    shuffle_train=False,
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.remove_callback(MLflowCallback)

In [24]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1000
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 189


Removed unused columns


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=189, training_loss=0.5969503493536086, metrics={'train_runtime': 19.458, 'train_samples_per_second': 154.178, 'train_steps_per_second': 9.713, 'total_flos': 115834642122240.0, 'train_loss': 0.5969503493536086, 'epoch': 3.0})

In [13]:
tdl = trainer.get_train_dataloader()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.


Removed unused columns


In [54]:
tdl_iter = iter(tdl)

In [55]:
first = next(tdl_iter)

In [66]:
second = next(tdl_iter)

In [76]:
third = next(tdl_iter)

In [83]:
i = 2

In [84]:
tokenizer.decode(third["input_ids"].tolist()[i])

'[CLS] two spanish ships with reinforcements, however, under the notorious hernando de soto, soon arrived by sea ( with at least a hundred volunteers ), and on these ships the spaniards, bound for more fruitful conquests on the peruvian mainland, embarked without incident and sailed back towards tumbes, arriving there on may 16, 1532. [SEP]'

In [85]:
tokenized_datasets["train"]["text"][6 + i]

'two spanish ships with reinforcements, however, under the notorious hernando de soto, soon arrived by sea (with at least a hundred volunteers), and on these ships the spaniards, bound for more fruitful conquests on the peruvian mainland, embarked without incident and sailed back towards tumbes, arriving there on may 16, 1532.'

In [22]:
for i in range(3):
    print(tokenizer.decode(first["input_ids"].tolist()[i]))
    print(tokenizer.decode(tokenized_datasets["train"]["input_ids"][i]))
    print()
    print("----------------------------")

[CLS] against this background his murder became a political cause celebre and cpi ( m )'s political opponents turned the needle of suspicion on them. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
[CLS] against this background his murder became a political cause celebre and cpi ( m )'s political opponents turned the needle of suspicion on them. [SEP]

----------------------------
[CLS] ddd is licensed under the gnu general public license and is open source. [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD]
[CLS] ddd is licensed under the gnu general public license and is open source. [SEP]

----------------------------
[C

In [21]:
tokenized_datasets["train"][0]

{'text': "against this background his murder became a political cause celebre and cpi(m)'s political opponents turned the needle of suspicion on them .",
 'label': 0,
 'input_ids': [101,
  2114,
  2023,
  4281,
  2010,
  4028,
  2150,
  1037,
  2576,
  3426,
  8292,
  2571,
  13578,
  1998,
  28780,
  1006,
  1049,
  1007,
  1005,
  1055,
  2576,
  7892,
  2357,
  1996,
  12201,
  1997,
  10928,
  2006,
  2068,
  1012,
  102],
 'token_type_ids': [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1]}

In [51]:
second["input_ids"].tolist()[5]

[101,
 1999,
 3172,
 1010,
 1996,
 2316,
 2207,
 2046,
 1996,
 4242,
 1010,
 1037,
 9019,
 1011,
 5533,
 8827,
 17994,
 9648,
 10415,
 2600,
 2201,
 2008,
 2001,
 8216,
 2135,
 19657,
 2007,
 1996,
 2316,
 1005,
 1055,
 4563,
 5470,
 15058,
 1012,
 102,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0]

In [54]:
tokenized_datasets["train"]["input_ids"][16:32][5]

[101,
 2016,
 2003,
 3862,
 2005,
 2195,
 10106,
 1010,
 2164,
 1996,
 2088,
 1011,
 2898,
 19855,
 1004,
 14783,
 1058,
 4877,
 2072,
 2640,
 4329,
 1010,
 2029,
 2016,
 2318,
 2007,
 20163,
 19855,
 1011,
 1037,
 2088,
 1011,
 2898,
 4297,
 19761,
 4263,
 1997,
 1996,
 8361,
 3968,
 2050,
 3068,
 1012,
 102]

In [55]:
tokenizer.decode(tokenized_datasets["train"]["input_ids"][16:32][5])

'[CLS] she is notable for several achievements, including the world - wide mead & conway vlsi design revolution, which she started with carver mead - a world - wide incubator of the emerging eda industry. [SEP]'

In [5]:
list(BatchSampler(SequentialSampler(range(10)), batch_size=3, drop_last=False))

[[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]

## Load a trained model

In [7]:
from transformers import pipeline, set_seed

In [None]:
set_seed(

In [9]:
MODEL_PATH = "/home/cdsw/models/bert-cls-full3/checkpoint-96000/"
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

In [10]:
classifier = pipeline(task="text-classification", model=model, tokenizer=tokenizer)

In [35]:
classifier(
    "following the end of kenneth kaunda's repressive dictatorship , chiluba won the country's multi-party presidential elections."
)

[{'label': 'LABEL_0', 'score': 0.9783034920692444}]

In [36]:
classifier(
    "following the end of kenneth kaunda's repressive presidency , chiluba won the country's multi-party presidential elections."
)

[{'label': 'LABEL_0', 'score': 0.981116771697998}]

In [12]:
classifier.

'text-classification'

In [None]:
Trainer(

In [37]:
import datasets

In [38]:
datasets.list_metrics()

['accuracy',
 'bertscore',
 'bleu',
 'bleurt',
 'cer',
 'chrf',
 'code_eval',
 'comet',
 'competition_math',
 'coval',
 'cuad',
 'exact_match',
 'f1',
 'frugalscore',
 'glue',
 'google_bleu',
 'indic_glue',
 'mae',
 'mahalanobis',
 'matthews_correlation',
 'mauve',
 'mean_iou',
 'meteor',
 'mse',
 'pearsonr',
 'perplexity',
 'precision',
 'recall',
 'roc_auc',
 'rouge',
 'sacrebleu',
 'sari',
 'seqeval',
 'spearmanr',
 'squad',
 'squad_v2',
 'super_glue',
 'ter',
 'wer',
 'wiki_split',
 'xnli',
 'xtreme_s']

In [27]:
type(training_args)

transformers.training_args.TrainingArguments

In [23]:
import os

In [None]:
os.path.exists('

In [12]:
from dataclasses import dataclass, field
from typing import Optional
from transformers.trainer_utils import IntervalStrategy

@dataclass
class StiArguments:
    """
    TrainingArguments is the subset of the arguments we use in our example scripts **which relate to the training loop itself**.
    Using [`HfArgumentParser`] we can turn this class into [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the command line.
    """
    
    model_name_or_path: str = field(
        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
    )
    output_dir: str = field(
        metadata={"help": "The output directory where the model predictions and checkpoints will be written."},
    )
    overwrite_output_dir: bool = field(
        default=False,
        metadata={
            "help": (
                "Overwrite the content of the output directory. "
                "Use this to continue training if output_dir points to a checkpoint directory."
            )
        },
    )
    learning_rate: float = field(default=5e-5, metadata={"help": "The initial learning rate for AdamW."})
    per_device_train_batch_size: int = field(
        default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for training."}
    )
    per_device_eval_batch_size: int = field(
        default=8, metadata={"help": "Batch size per GPU/TPU core/CPU for evaluation."}
    )
    num_train_epochs: float = field(default=3.0, metadata={"help": "Total number of training epochs to perform."})
    logging_dir: Optional[str] = field(default=None, metadata={"help": "Tensorboard log dir."})
    logging_strategy: IntervalStrategy = field(
        default="steps",
        metadata={"help": "The logging strategy to use."},
    )
    logging_steps: int = field(default=500, metadata={"help": "Log every X updates steps."})
    eval_steps: int = field(default=None, metadata={"help": "Run an evaluation every X steps."})
    evaluation_strategy: IntervalStrategy = field(
        default="no",
        metadata={"help": "The evaluation strategy to use."},
    )
    save_steps: int = field(default=500, metadata={"help": "Save checkpoint every X updates steps."})
    save_total_limit: Optional[int] = field(
        default=None,
        metadata={
            "help": (
                "Limit the total amount of checkpoints. "
                "Deletes the older checkpoints in the output_dir. Default is unlimited checkpoints"
            )
        },
    )
    load_best_model_at_end: Optional[bool] = field(
        default=False,
        metadata={"help": "Whether or not to load the best model found during training at the end of training."},
    )
    metric_for_best_model: Optional[str] = field(
        default=None, metadata={"help": "The metric to use to compare two different models."}
    )
    greater_is_better: Optional[bool] = field(
        default=None, metadata={"help": "Whether the `metric_for_best_model` should be maximized or not."}
    )
    

In [16]:
parser = HfArgumentParser(StiArguments)

In [19]:
parser

HfArgumentParser(prog='ipykernel_launcher.py', usage=None, description=None, formatter_class=<class 'argparse.ArgumentDefaultsHelpFormatter'>, conflict_handler='error', add_help=True)

In [22]:
dir(parser)

['__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_action_groups',
 '_actions',
 '_add_action',
 '_add_container_actions',
 '_add_dataclass_arguments',
 '_check_conflict',
 '_check_value',
 '_defaults',
 '_get_args',
 '_get_formatter',
 '_get_handler',
 '_get_kwargs',
 '_get_nargs_pattern',
 '_get_option_tuples',
 '_get_optional_actions',
 '_get_optional_kwargs',
 '_get_positional_actions',
 '_get_positional_kwargs',
 '_get_value',
 '_get_values',
 '_handle_conflict_error',
 '_handle_conflict_resolve',
 '_has_negative_number_optionals',
 '_match_argument',
 '_match_arguments_partial',
 '_mutually_exclusive_groups',
 '_negative_number_matcher',
 '_opt

In [25]:
import argparse

In [26]:
parser = argparse.ArgumentParser(description="Script to run train job for seq2seq (TST) or classifier (STI) models.")

In [None]:
parser.add_argument(

In [27]:
parser.add_argument('task', type=str, help='Select which task to run: seq2seq or classifier.')

_StoreAction(option_strings=[], dest='task', nargs=None, const=None, default=None, type=<class 'str'>, choices=None, help='Select which task to run: seq2seq or classifier.', metavar=None)

In [28]:
parser

ArgumentParser(prog='ipykernel_launcher.py', usage=None, description='Script to run train job for seq2seq (TST) or classifier (STI) models.', formatter_class=<class 'argparse.HelpFormatter'>, conflict_handler='error', add_help=True)