In [1]:
import os

# Restrict this process to the GPU with index 1. This runs in the first cell,
# ahead of the later cell that imports torch.
os.environ.update({"CUDA_VISIBLE_DEVICES": "1"})

In [2]:
class DataArgument:
    """Dataset-related settings: task name, data directory and tokenization limits."""

    def __init__(self):
        # Assign every setting from one table so the defaults can be read at a glance.
        defaults = {
            "task_name": 'sst-2',
            "data_dir": './output_SanText_glove/SST-2/eps_3.00/',
            "max_seq_length": 128,
            "overwrite_cache": True,
        }
        for attr_name, attr_value in defaults.items():
            setattr(self, attr_name, attr_value)

class TrainingArgument:
    """Switches selecting which phases (train / eval / predict) are enabled."""

    def __init__(self):
        # Training and evaluation are on; prediction is off.
        self.do_train, self.do_eval, self.do_predict = True, True, False

class ModelArgument:
    """Model-related settings; currently only the (unset) cache directory."""

    def __init__(self) -> None:
        # No cache directory is configured.
        setattr(self, "cache_dir", None)
    
class Argument:
    """Hyper-parameters and paths consumed by the fine-tuning cells below.

    Args:
        data_dir: Directory that contains ``train.tsv`` and ``dev.tsv``.
            Defaults to ``'./output_SanText_glove/SST-2/eps_3.00/'``. A
            trailing slash is optional because the file paths are built with
            ``os.path.join``.
    """

    def __init__(self, data_dir='./output_SanText_glove/SST-2/eps_3.00/'):
        # --- Model / data -------------------------------------------------
        self.model_name_or_path  = 'bert-base-uncased'
        # None makes the data-loading cell read the local files below instead of GLUE.
        self.task_name = None
        self.data_dir  = data_dir
        self.train_file = os.path.join(self.data_dir, "train.tsv")
        self.validation_file = os.path.join(self.data_dir, "dev.tsv")
        # Passed to the tokenizer as `max_length` (with truncation enabled).
        self.max_length = 128

        # --- Batch sizes --------------------------------------------------
        self.per_device_train_batch_size = 64
        self.per_device_eval_batch_size = 64

        # --- Optimisation -------------------------------------------------
        self.weight_decay = 0.0
        self.learning_rate = 2e-5
        self.num_train_epochs = 3
        # None means "derive from num_train_epochs"; the optimizer cell fills it in.
        self.max_train_steps = None
        self.gradient_accumulation_steps = 1
        self.lr_scheduler_type = "linear"
        self.num_warmup_steps = 0
        # NOTE(review): no visible cell reads `seed`, so runs are not seeded.
        self.seed = None

        # --- Output / caching --------------------------------------------
        self.output_dir  = './tmp/sst2-sanitize/'
        self.overwrite_output_dir = True
        self.overwrite_cache = True
        self.save_steps = 2000
        self.trust_remote_code = False
        self.use_slow_tokenizer = False
        # True pads every example to `max_length` during tokenization.
        self.pad_to_max_length = True

        # --- Checkpointing / tracking ------------------------------------
        # An int (or digit string) saves every N optimizer steps; "epoch" saves per epoch.
        self.checkpointing_steps = None
        self.resume_from_checkpoint = None
        # Forwarded to `Accelerator(log_with=...)` when `with_tracking` is True.
        self.report_to = "all"
        self.ignore_mismatched_sizes = True

        self.with_tracking = False
        self.push_to_hub = False

# Build one instance of each settings container, in the same order as before.
args, data_args, training_args, model_args = (
    Argument(),
    DataArgument(),
    TrainingArgument(),
    ModelArgument(),
)

In [3]:
import argparse
import json
import logging
import math
import os
import random
from pathlib import Path
import wandb

import datasets
import evaluate
import torch

from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import set_seed

import transformers
from transformers import (
    AutoConfig,
    AutoModelForSequenceClassification,
    AutoTokenizer,
    DataCollatorWithPadding,
    PretrainedConfig,
    SchedulerType,
    default_data_collator,
    get_scheduler,
)

logger = get_logger(__name__)

# Only hand tracker settings to the Accelerator when tracking was requested;
# otherwise fall back to a default-constructed Accelerator.
if args.with_tracking:
    accelerator = Accelerator(log_with=args.report_to, project_dir=args.output_dir)
else:
    accelerator = Accelerator()

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
wandb.login()

# Hyperparameters and run metadata recorded with the wandb run.
run_config = {
    "learning_rate": args.learning_rate,
    "architecture": args.model_name_or_path,
    "dataset": "SST-2",
    "epochs": args.num_train_epochs,
}

# Start a new wandb run, logged under the "SanText-SST" project.
wandb.init(project="SanText-SST", config=run_config)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mguntsvzz[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [5]:
# Get the datasets: either load a GLUE benchmark task from the datasets Hub
# (when `args.task_name` is set) or read local training / validation files.
#
# For local files, the later cells use the column called 'label' as the label and
# 'sentence1' / 'sentence2' as the sentence pair if both exist; otherwise they use
# the first two non-label columns, or a single column for single-sentence
# classification.
if args.task_name is not None:
    # Downloading and loading a dataset from the hub.
    raw_datasets = load_dataset("glue", args.task_name)
else:
    # Loading the dataset from local csv / tsv / json files.
    data_files = {}
    if args.train_file is not None:
        data_files["train"] = args.train_file
    if args.validation_file is not None:
        data_files["validation"] = args.validation_file

    # The file extension (taken from the train file, else the validation file)
    # selects the loader.
    reference_file = args.train_file if args.train_file is not None else args.validation_file
    if reference_file is None:
        # Fail with a clear message instead of an AttributeError on `None.split`.
        raise ValueError("Need either a task name or a training/validation file.")
    extension = reference_file.split(".")[-1]

    if extension == 'tsv':
        # TSV files go through the csv loader with a tab delimiter.
        raw_datasets = load_dataset("csv", data_files=data_files, delimiter='\t')
    else:
        # Other formats keep their loader's defaults. The tab delimiter must not be
        # forced here: it would mis-parse comma-separated files and is not a JSON option.
        raw_datasets = load_dataset(extension, data_files=data_files)

raw_datasets

Found cached dataset csv (/home/todsavadt/.cache/huggingface/datasets/csv/default-d5e0641ac45cae53/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
100%|██████████| 2/2 [00:00<00:00, 1520.78it/s]


DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label'],
        num_rows: 872
    })
})

In [6]:
# Work out whether this is regression or classification, and how many labels there are.
if args.task_name is None:
    # Local files: a float label column is treated as a regression target.
    is_regression = raw_datasets["train"].features["label"].dtype in ["float32", "float64"]
    if not is_regression:
        # Collect the distinct label values; sorting keeps the order deterministic.
        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique
        label_list = sorted(raw_datasets["train"].unique("label"))
        num_labels = len(label_list)
    else:
        num_labels = 1
else:
    # GLUE task: only "stsb" is treated as regression.
    is_regression = args.task_name == "stsb"
    if is_regression:
        num_labels = 1
    else:
        label_list = raw_datasets["train"].features["label"].names
        num_labels = len(label_list)

num_labels

2

In [7]:
# Load the pretrained config, tokenizer and sequence-classification model.
#
# In distributed training, the .from_pretrained methods guarantee that only one local
# process can concurrently download model & vocab.
checkpoint_name = args.model_name_or_path
allow_remote_code = args.trust_remote_code

config = AutoConfig.from_pretrained(
    checkpoint_name,
    num_labels=num_labels,
    finetuning_task=args.task_name,
    trust_remote_code=allow_remote_code,
)

tokenizer = AutoTokenizer.from_pretrained(
    checkpoint_name,
    use_fast=not args.use_slow_tokenizer,
    trust_remote_code=allow_remote_code,
)

model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint_name,
    # Names containing ".ckpt" are flagged as TensorFlow checkpoints.
    from_tf=".ckpt" in checkpoint_name,
    config=config,
    ignore_mismatched_sizes=args.ignore_mismatched_sizes,
    trust_remote_code=allow_remote_code,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Preprocessing the datasets: decide which column(s) hold the input text.

# Text column names for each GLUE task (second entry is None for single-sentence
# tasks). Defined here because the lookup below would otherwise raise NameError
# whenever `args.task_name` is set.
task_to_keys = {
    "cola": ("sentence", None),
    "mnli": ("premise", "hypothesis"),
    "mrpc": ("sentence1", "sentence2"),
    "qnli": ("question", "sentence"),
    "qqp": ("question1", "question2"),
    "rte": ("sentence1", "sentence2"),
    "sst2": ("sentence", None),
    "stsb": ("sentence1", "sentence2"),
    "wnli": ("sentence1", "sentence2"),
}

if args.task_name is not None:
    sentence1_key, sentence2_key = task_to_keys[args.task_name]
else:
    # Again, we try to have some nice defaults but don't hesitate to tweak to your use case.
    non_label_column_names = [name for name in raw_datasets["train"].column_names if name != "label"]
    if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names:
        sentence1_key, sentence2_key = "sentence1", "sentence2"
    elif len(non_label_column_names) >= 2:
        # Fall back to the first two non-label columns as the sentence pair.
        sentence1_key, sentence2_key = non_label_column_names[:2]
    else:
        # Single-sentence classification on the only non-label column.
        sentence1_key, sentence2_key = non_label_column_names[0], None

In [9]:
# Some models have set the order of the labels to use, so let's make sure we do use it.
label_to_id = None
if (
    model.config.label2id != PretrainedConfig(num_labels=num_labels).label2id
    and args.task_name is not None
    and not is_regression
):
    # Some have all caps in their config, some don't.
    label_name_to_id = {k.lower(): v for k, v in model.config.label2id.items()}
    if sorted(label_name_to_id.keys()) == sorted(label_list):
        logger.info(
            f"The configuration of the model provided the following label correspondence: {label_name_to_id}. "
            "Using it!"
        )
        label_to_id = {i: label_name_to_id[label_list[i]] for i in range(num_labels)}
    else:
        # The message is ONE concatenated string. Separating the pieces with commas
        # would pass them as %-format arguments and trigger a logging error.
        logger.warning(
            "Your model seems to have been trained with labels, but they don't match the dataset: "
            f"model labels: {sorted(label_name_to_id.keys())}, dataset labels: {sorted(label_list)}."
            "\nIgnoring the model labels as a result."
        )
elif args.task_name is None and not is_regression:
    # Local files: map each distinct label value to its index in the sorted label list.
    label_to_id = {v: i for i, v in enumerate(label_list)}

# Store the mapping on the model config. `id2label` is inverted from the
# `model.config.label2id` that was just assigned, so the two mappings stay
# consistent even if `config` and `model.config` are not the same object.
if label_to_id is not None:
    model.config.label2id = label_to_id
    model.config.id2label = {idx: label for label, idx in model.config.label2id.items()}
elif args.task_name is not None and not is_regression:
    model.config.label2id = {l: i for i, l in enumerate(label_list)}
    model.config.id2label = {idx: label for label, idx in model.config.label2id.items()}

# Pad every example to `max_length` at tokenization time, or leave padding to the collator.
padding = "max_length" if args.pad_to_max_length else False

In [10]:
def preprocess_function(examples):
    """Tokenize a batch of examples and attach a ``labels`` entry when labels exist.

    Reads the notebook-level ``sentence1_key``, ``sentence2_key``, ``tokenizer``,
    ``padding``, ``args.max_length`` and ``label_to_id``.

    Args:
        examples: Batch mapping column names to lists of values.

    Returns:
        The tokenizer output, plus ``"labels"`` if ``examples`` has a ``"label"``
        column (mapped through ``label_to_id`` when that mapping is set).
    """
    # Single sentence or sentence pair, depending on whether a second key was found.
    if sentence2_key is None:
        texts = (examples[sentence1_key],)
    else:
        texts = (examples[sentence1_key], examples[sentence2_key])
    result = tokenizer(*texts, padding=padding, max_length=args.max_length, truncation=True)

    if "label" not in examples:
        return result

    if label_to_id is None:
        # No mapping: pass the labels through under the name the model expects.
        result["labels"] = examples["label"]
    else:
        # Map labels to IDs (not necessary for GLUE tasks).
        result["labels"] = [label_to_id[l] for l in examples["label"]]
    return result

# Tokenize every split, dropping the raw columns so only the tokenizer output remains.
source_columns = raw_datasets["train"].column_names
with accelerator.main_process_first():
    processed_datasets = raw_datasets.map(
        preprocess_function,
        batched=True,
        remove_columns=source_columns,
        desc="Running tokenizer on dataset",
    )

train_dataset = processed_datasets["train"]
# MNLI names its validation split "validation_matched"; every other case uses "validation".
eval_split = "validation_matched" if args.task_name == "mnli" else "validation"
eval_dataset = processed_datasets[eval_split]

Loading cached processed dataset at /home/todsavadt/.cache/huggingface/datasets/csv/default-d5e0641ac45cae53/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-c118854d8b6f229a.arrow
Loading cached processed dataset at /home/todsavadt/.cache/huggingface/datasets/csv/default-d5e0641ac45cae53/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1/cache-cd5bb8b0202a7f4d.arrow


In [11]:
# Log a few random samples from the training set (fewer if the set has under 3 rows).
for index in random.sample(range(len(train_dataset)), min(3, len(train_dataset))):
    logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")

# DataLoaders creation:
if args.pad_to_max_length:
    # If padding was already done to max length, we use the default data collator that
    # will just convert everything to tensors.
    data_collator = default_data_collator
else:
    # Otherwise, `DataCollatorWithPadding` will apply dynamic padding for us (by padding
    # to the maximum length of the samples passed). Under fp16 mixed precision we pad to
    # a multiple of 8 so NVIDIA Tensor Cores can be used.
    # `Accelerator.mixed_precision` is used instead of the deprecated `use_fp16` property.
    pad_to_multiple_of = 8 if accelerator.mixed_precision == "fp16" else None
    data_collator = DataCollatorWithPadding(tokenizer, pad_to_multiple_of=pad_to_multiple_of)

train_dataloader = DataLoader(
    train_dataset, shuffle=True, collate_fn=data_collator, batch_size=args.per_device_train_batch_size
)
eval_dataloader = DataLoader(eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size)

In [12]:
# Optimizer
# Parameters whose name contains one of these markers are exempt from weight decay.
no_decay = ["bias", "LayerNorm.weight"]

# Walk the parameters once and sort each into the decayed / non-decayed group.
decay_params, no_decay_params = [], []
for param_name, param in model.named_parameters():
    if any(marker in param_name for marker in no_decay):
        no_decay_params.append(param)
    else:
        decay_params.append(param)

optimizer_grouped_parameters = [
    {"params": decay_params, "weight_decay": args.weight_decay},
    {"params": no_decay_params, "weight_decay": 0.0},
]
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate)

# Scheduler and math around the number of training steps.
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
# Remember whether the step budget was derived from the epoch count, so it can be
# recomputed after the dataloader has been prepared.
overrode_max_train_steps = args.max_train_steps is None
if overrode_max_train_steps:
    args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch

lr_scheduler = get_scheduler(
    name=args.lr_scheduler_type,
    optimizer=optimizer,
    num_warmup_steps=args.num_warmup_steps,
    num_training_steps=args.max_train_steps,
)

In [13]:
# Prepare everything with our `accelerator`.
model, optimizer, train_dataloader, eval_dataloader, lr_scheduler = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader, lr_scheduler
)

# We need to recalculate our total training steps as the size of the training dataloader may have changed
num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
if overrode_max_train_steps:
    args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
# Afterwards we recalculate our number of training epochs
args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

# Figure out how many steps we should save the Accelerator states.
# A digit string becomes an int. An int is kept as-is (calling `.isdigit()` on it
# would raise). Any other value, such as "epoch" or None, passes through unchanged.
checkpointing_steps = args.checkpointing_steps
if isinstance(checkpointing_steps, str) and checkpointing_steps.isdigit():
    checkpointing_steps = int(checkpointing_steps)

# We need to initialize the trackers we use, and also store our configuration.
# The trackers initializes automatically on the main process.
if args.with_tracking:
    # Copy so that normalising values for the trackers does not mutate `args`.
    experiment_config = dict(vars(args))
    # TensorBoard cannot log Enums, need the raw value. In this notebook the scheduler
    # type is a plain string, which has no `.value`, so it is used unchanged.
    scheduler_type = experiment_config["lr_scheduler_type"]
    experiment_config["lr_scheduler_type"] = getattr(scheduler_type, "value", scheduler_type)
    accelerator.init_trackers("glue_no_trainer", experiment_config)

# Get the metric function
if args.task_name is not None:
    metric = evaluate.load("glue", args.task_name)
else:
    metric = evaluate.load("accuracy")

# Train!
total_batch_size = args.per_device_train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

In [14]:
# Summary of the training configuration, printed one line at a time.
training_summary = (
    "***** Running training *****",
    f"  Num examples = {len(train_dataset)}",
    f"  Num Epochs = {args.num_train_epochs}",
    f"  Instantaneous batch size per device = {args.per_device_train_batch_size}",
    f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}",
    f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}",
    f"  Total optimization steps = {args.max_train_steps}",
)
for summary_line in training_summary:
    print(summary_line)

***** Running training *****
  Num examples = 67349
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 3159


In [15]:
# Only show the progress bar once on each machine.
progress_bar = tqdm(range(args.max_train_steps), disable=not accelerator.is_local_main_process)
completed_steps = 0
starting_epoch = 0
# Potentially load in the weights and states from a previous save.
# The former "most recent checkpoint" fallback is dropped: it could never run,
# because this guard already requires a non-empty `resume_from_checkpoint`.
if args.resume_from_checkpoint:
    checkpoint_path = args.resume_from_checkpoint
    # `normpath` strips a trailing slash so the basename is never empty.
    path = os.path.basename(os.path.normpath(checkpoint_path))

    accelerator.print(f"Resumed from checkpoint: {checkpoint_path}")
    # Load from the full checkpoint path. The bare folder name would only resolve
    # if the checkpoint happened to sit in the current working directory.
    accelerator.load_state(checkpoint_path)
    # Extract `epoch_{i}` or `step_{i}`
    training_difference = os.path.splitext(path)[0]

    if "epoch" in training_difference:
        # Checkpoint written at the end of epoch i: continue with epoch i + 1.
        starting_epoch = int(training_difference.replace("epoch_", "")) + 1
        resume_step = None
        completed_steps = starting_epoch * num_update_steps_per_epoch
    else:
        # need to multiply `gradient_accumulation_steps` to reflect real steps
        resume_step = int(training_difference.replace("step_", "")) * args.gradient_accumulation_steps
        starting_epoch = resume_step // len(train_dataloader)
        completed_steps = resume_step // args.gradient_accumulation_steps
        # Keep only the number of batches to skip inside the starting epoch.
        resume_step -= starting_epoch * len(train_dataloader)

  0%|          | 0/3159 [00:00<?, ?it/s]

In [None]:
# Advance the progress bar by the steps already completed when resuming from a checkpoint.
progress_bar.update(completed_steps)

for epoch in range(starting_epoch, args.num_train_epochs):
    # ------------------------------ training ------------------------------
    model.train()
    if args.with_tracking:
        total_loss = 0
    if args.resume_from_checkpoint and epoch == starting_epoch and resume_step is not None:
        # We skip the first `n` batches in the dataloader when resuming from a checkpoint
        active_dataloader = accelerator.skip_first_batches(train_dataloader, resume_step)
    else:
        active_dataloader = train_dataloader
    for step, batch in enumerate(active_dataloader):
        outputs = model(**batch)
        loss = outputs.loss
        # We keep track of the loss at each epoch
        if args.with_tracking:
            total_loss += loss.detach().float()
        # Scale so the accumulated gradient is the mean over the accumulation window.
        loss = loss / args.gradient_accumulation_steps
        accelerator.backward(loss)

        # Update after every `gradient_accumulation_steps` micro-batches, and on the
        # last batch of the epoch. `(step + 1)` is required: testing `step` alone
        # fires on step 0 and yields more optimizer steps per epoch than
        # `num_update_steps_per_epoch`, which the scheduler and `max_train_steps`
        # are sized from.
        is_update_step = (
            (step + 1) % args.gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1
        )
        if is_update_step:
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            progress_bar.update(1)
            completed_steps += 1

            # Checkpoint only right after an optimizer step, so the same step is not
            # saved repeatedly while gradients are still being accumulated.
            if isinstance(checkpointing_steps, int) and completed_steps % checkpointing_steps == 0:
                output_dir = f"step_{completed_steps}"
                if args.output_dir is not None:
                    output_dir = os.path.join(args.output_dir, output_dir)
                accelerator.save_state(output_dir)

        if completed_steps >= args.max_train_steps:
            break

    # ----------------------------- evaluation -----------------------------
    model.eval()
    samples_seen = 0
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(**batch)
        # Classification takes the arg-max class; regression uses the squeezed logits.
        predictions = outputs.logits.argmax(dim=-1) if not is_regression else outputs.logits.squeeze()
        predictions, references = accelerator.gather((predictions, batch["labels"]))
        # If we are in a multiprocess environment, the last batch has duplicates
        if accelerator.num_processes > 1:
            if step == len(eval_dataloader) - 1:
                predictions = predictions[: len(eval_dataloader.dataset) - samples_seen]
                references = references[: len(eval_dataloader.dataset) - samples_seen]
            else:
                samples_seen += references.shape[0]
        metric.add_batch(
            predictions=predictions,
            references=references,
        )

    eval_metric = metric.compute()
    logger.info(f"epoch {epoch}: {eval_metric}")
    wandb.log({"Epoch" : epoch, "eval_metric": eval_metric})

    # if args.with_tracking:
    #     accelerator.log(
    #         {
    #             "accuracy" if args.task_name is not None else "glue": eval_metric,
    #             "train_loss": total_loss.item() / len(train_dataloader),
    #             "epoch": epoch,
    #             "step": completed_steps,
    #         },
    #         step=completed_steps,
    #     )

    # Intermediate save for every epoch but the last, only when pushing to the hub.
    if args.push_to_hub and epoch < args.num_train_epochs - 1:
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        unwrapped_model.save_pretrained(
            args.output_dir, is_main_process=accelerator.is_main_process, save_function=accelerator.save
        )
        if accelerator.is_main_process:
            tokenizer.save_pretrained(args.output_dir)
            # repo.push_to_hub(
            #     commit_message=f"Training in progress epoch {epoch}", blocking=False, auto_lfs_prune=True
            # )

    # Per-epoch checkpoint of the full Accelerator state.
    if args.checkpointing_steps == "epoch":
        output_dir = f"epoch_{epoch}"
        if args.output_dir is not None:
            output_dir = os.path.join(args.output_dir, output_dir)
        accelerator.save_state(output_dir)

# if args.with_tracking:
#     accelerator.end_training()

# Save the final model and tokenizer once every process has finished training.
final_dir = args.output_dir
if final_dir is not None:
    accelerator.wait_for_everyone()
    unwrapped_model = accelerator.unwrap_model(model)
    unwrapped_model.save_pretrained(
        final_dir,
        is_main_process=accelerator.is_main_process,
        save_function=accelerator.save,
    )
    # Only the main process writes the tokenizer.
    if accelerator.is_main_process:
        tokenizer.save_pretrained(final_dir)
        # NOTE(review): `repo` is not defined in any visible cell, so this call looks
        # like it would raise NameError when `args.push_to_hub` is True — confirm.
        if args.push_to_hub:
            repo.push_to_hub(commit_message="End of training", auto_lfs_prune=True)

# if args.task_name == "mnli":
#     # Final evaluation on mismatched validation set
#     eval_dataset = processed_datasets["validation_mismatched"]
#     eval_dataloader = DataLoader(
#         eval_dataset, collate_fn=data_collator, batch_size=args.per_device_eval_batch_size
#     )
#     eval_dataloader = accelerator.prepare(eval_dataloader)

#     model.eval()
#     for step, batch in enumerate(eval_dataloader):
#         outputs = model(**batch)
#         predictions = outputs.logits.argmax(dim=-1)
#         metric.add_batch(
#             predictions=accelerator.gather(predictions),
#             references=accelerator.gather(batch["labels"]),
#         )

#     eval_metric = metric.compute()
#     logger.info(f"mnli-mm: {eval_metric}")

# if args.output_dir is not None:
#     all_results = {f"eval_{k}": v for k, v in eval_metric.items()}
#     with open(os.path.join(args.output_dir, "all_results.json"), "w") as f:
#         json.dump(all_results, f)

# Close the wandb run started earlier in the notebook.
wandb.finish()

100%|█████████▉| 3156/3159 [15:50<00:00,  3.31it/s]