In [3]:
!pip install transformers evaluate datasets

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.4 MB/s[0m eta [3

In [4]:
# utils
import json
import inspect
import importlib

# Name -> object lookup tables; entries are added through register_classes /
# the register_to decorator, keyed by the registered object's __name__.
MODEL_REGISTRY = {}
TASK_REGISTRY = {}
# NOTE(review): nothing in the visible code registers into TRAINER_REGISTRY.
TRAINER_REGISTRY = {}

def register_classes(class_obj, registry: dict):
    """Add ``class_obj`` to ``registry`` under its ``__name__``.

    Args:
        class_obj: Any object exposing ``__name__`` (a class or a function).
        registry: Mapping from name to registered object; mutated in place.

    Returns:
        The same ``registry`` dict that was passed in.

    Raises:
        AssertionError: If an object with the same name is already registered.
    """
    name = class_obj.__name__
    assert name not in registry, "{} has duplicate class object names, this is not permitted!".format(name)
    registry[name] = class_obj

    return registry

def register_to(registry):
    """Build a decorator that records the decorated object in ``registry``.

    Args:
        registry: Dict that receives ``{obj.__name__: obj}`` entries.

    Returns:
        A decorator. It registers its argument and returns that same object,
        so the decorated name stays bound to the class/function.
    """
    def register_to_inner(class_obj):
        register_classes(class_obj, registry)
        # A decorator's return value replaces the decorated name; returning
        # nothing would rebind every registered class/function to None.
        return class_obj
    return register_to_inner


def read_config(path):
    """Import the module named ``path`` and snapshot the classes it defines.

    Only classes whose ``__module__`` equals the imported module's name are
    considered, so classes merely imported into that module are ignored.

    Args:
        path: Dotted module name handed to ``importlib.import_module``.

    Returns:
        dict mapping each class name to an ``Args`` instance carrying that
        class's own non-dunder attributes.
    """
    class Args():
        built_in = "__"

        def __init__(self, config):
            for key, value in config.__dict__.items():
                # Names wrapped in double underscores are interpreter
                # bookkeeping, not configuration values.
                wrapped = key[:2] == self.built_in and key[-2:] == self.built_in
                if not wrapped:
                    setattr(self, key, value)

    module = importlib.import_module(path)
    own_classes = (
        (name, member)
        for name, member in inspect.getmembers(module, inspect.isclass)
        if member.__module__ == module.__name__
    )
    return {name: Args(member) for name, member in own_classes}


In [5]:
# task

import torch
from torch.utils.data.dataloader import DataLoader

from evaluate import load
from datasets import load_dataset
from transformers import DataCollatorWithPadding
from transformers import (
    AutoModelForQuestionAnswering,
    AutoModelForSequenceClassification,
    AutoTokenizer
)

class TaskClass:
    """Base class bundling a tokenizer, a padding collator and a model for one task.

    Subclasses are expected to override ``init_model`` (which must set
    ``self.model``), ``prepare`` and the remaining hooks; every hook here
    raises ``NotImplementedError``.
    """

    def __init__(self, task_args, train_args, model_fn):
        """Build tokenizer and collator, then delegate model creation.

        Args:
            task_args: Object with at least ``model_name`` (checkpoint id
                passed to ``AutoTokenizer.from_pretrained``).
            train_args: Training options; stored as ``self.train_args``.
            model_fn: Model factory forwarded to ``init_model``.
        """
        # Store the arguments first so an init_model override may read them.
        self.task_args = task_args
        self.train_args = train_args
        self.tokenizer = AutoTokenizer.from_pretrained(task_args.model_name)
        self.data_collator = DataCollatorWithPadding(tokenizer=self.tokenizer)
        self.init_model(model_fn, task_args)

    def init_model(self, model_fn=None, task_args=None):
        """Create ``self.model``; must be overridden.

        The signature mirrors the call made in ``__init__`` so a subclass
        that forgets to override it fails with ``NotImplementedError``
        rather than an argument-count ``TypeError``.
        """
        raise NotImplementedError

    @staticmethod
    def process_function(examples, tokenizer, input_fields):
        """Tokenize a batch of raw examples; must be overridden."""
        raise NotImplementedError

    def loss_function(self, hypo, targ):
        """Return the training loss for predictions ``hypo`` and targets ``targ``."""
        raise NotImplementedError

    def prepare(self):
        """Return the task's data loaders; must be overridden."""
        raise NotImplementedError

    def evaluate(self, inp=None, label=None):
        """Return predictions and references for one batch; must be overridden."""
        raise NotImplementedError


@register_to(TASK_REGISTRY)
class SQuADv2(TaskClass):
    """Extractive question answering on the Hugging Face ``squad`` dataset.

    Each example is tokenized as (question, context) and labelled with the
    token indices where the first listed answer starts and ends. Examples
    whose answer is missing or not fully inside the (possibly truncated)
    context are labelled ``(0, 0)``.
    """

    def __init__(self, task_args, data_args, model_fn=None):
        """Set up tokenizer, collator and model.

        Args:
            task_args: Object with at least ``model_name``.
            data_args: Training options; must expose ``train_batch`` and
                ``val_batch`` for ``prepare``.
            model_fn: Optional factory called as ``model_fn(model_name)``.
                When omitted, a question-answering head is loaded directly.
        """
        # The base constructor requires the model factory as third argument.
        super().__init__(task_args, data_args, model_fn)
        self.criterion = torch.nn.functional.cross_entropy

    @staticmethod
    def process_function(examples, tokenizer):
        """Tokenize a batch and attach ``start_positions`` / ``end_positions``.

        Args:
            examples: Batch with ``question``, ``context`` and ``answers``
                columns; each answer holds ``answer_start`` and ``text`` lists.
            tokenizer: Tokenizer supporting ``return_offsets_mapping`` and
                ``sequence_ids``.

        Returns:
            The tokenizer output, padded to 384 tokens, with the two label
            lists added and ``offset_mapping`` removed.
        """
        questions = [q.strip() for q in examples["question"]]
        inputs = tokenizer(
            questions,
            examples["context"],
            max_length=384,
            truncation="only_second",
            return_offsets_mapping=True,
            padding="max_length",
        )

        offset_mapping = inputs.pop("offset_mapping")
        answers = examples["answers"]
        start_positions = []
        end_positions = []

        for i, offset in enumerate(offset_mapping):
            answer = answers[i]
            sequence_ids = inputs.sequence_ids(i)
            num_tokens = len(sequence_ids)

            # Find the first and last token that belong to the context
            # (sequence id 1); the bounds checks keep the scan inside the list.
            idx = 0
            while idx < num_tokens and sequence_ids[idx] != 1:
                idx += 1
            context_start = idx
            while idx < num_tokens and sequence_ids[idx] == 1:
                idx += 1
            context_end = idx - 1

            has_answer = len(answer["answer_start"]) > 0 and len(answer["text"]) > 0
            has_context = context_start <= context_end
            if not (has_answer and has_context):
                # Unanswerable example (or no context tokens): label (0, 0).
                start_positions.append(0)
                end_positions.append(0)
                continue

            start_char = answer["answer_start"][0]
            end_char = start_char + len(answer["text"][0])

            # If the answer is not fully inside the context, label it (0, 0).
            # The context must begin at or before the answer start AND end at
            # or after the answer end.
            if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
                start_positions.append(0)
                end_positions.append(0)
                continue

            # Last context token starting at or before the answer start.
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            # First context token ending at or after the answer end.
            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

        inputs["start_positions"] = start_positions
        inputs["end_positions"] = end_positions
        return inputs

    def init_model(self, model_fn, task_args=None):
        """Create ``self.model``.

        Accepts the ``(model_fn, task_args)`` form used by the base
        constructor as well as the older single-argument ``(task_args)`` form.
        Without a factory, a question-answering head is loaded, since span
        prediction needs start/end logits rather than class logits.
        """
        if task_args is None:
            # Legacy call: init_model(task_args)
            model_fn, task_args = None, model_fn
        if model_fn is None:
            self.model = AutoModelForQuestionAnswering.from_pretrained(task_args.model_name)
        else:
            self.model = model_fn(task_args.model_name)

    def prepare(self):
        """Tokenize the dataset and build the data loaders.

        Returns:
            ``(train_dataloader, validation_dataloader, None)``; no test
            loader is built, so the third slot is ``None``.
        """
        squad = load_dataset("squad")
        tokenized_squad = squad.map(
            lambda x: self.process_function(x, self.tokenizer),
            batched=True,
            remove_columns=squad["train"].column_names,
        )
        train_dataloader = DataLoader(
            tokenized_squad['train'],
            shuffle=True,
            collate_fn=self.data_collator,
            batch_size=self.train_args.train_batch,
        )
        validation_dataloader = DataLoader(
            tokenized_squad['validation'],
            shuffle=False,
            collate_fn=self.data_collator,
            batch_size=self.train_args.val_batch,
        )
        return (
            train_dataloader,
            validation_dataloader,
            None,
        )

class SequenceClassification(TaskClass):
    """Shared behaviour for the GLUE-style classification tasks below.

    Adds a cross-entropy criterion, the matching ``glue`` metric and helpers
    that turn model outputs into predicted class indices.
    """

    def __init__(self, task_args, train_args, model_fn):
        """Run the base setup, then attach the loss and the GLUE metric.

        The metric configuration is the lower-cased ``task_args.task_name``.
        """
        super().__init__(task_args, train_args, model_fn)
        self.criterion = torch.nn.functional.cross_entropy
        glue_config = self.task_args.task_name.lower()
        self.metric = load("glue", glue_config)

    @staticmethod
    def process_function(examples, tokenizer, input_fields, max_seq_len=384):
        """Tokenize one or two text columns and carry the labels along.

        Args:
            examples: Batch mapping column names to lists; needs ``label``.
            tokenizer: Callable tokenizer.
            input_fields: Names of the text columns; a single name encodes
                one sequence, otherwise the first two names form a pair.
            max_seq_len: Truncation length passed as ``max_length``.

        Returns:
            The tokenizer output with a ``label`` entry added.
        """
        def stripped(field):
            return [text.strip() for text in examples[field]]

        if len(input_fields) == 1:
            text_columns = (stripped(input_fields[0]),)
        else:
            text_columns = (stripped(input_fields[0]), stripped(input_fields[1]))

        encoded = tokenizer(*text_columns, max_length=max_seq_len, truncation=True)
        encoded["label"] = examples["label"]
        return encoded

    def loss_function(self, hypo, targ):
        """Apply the criterion to logits ``hypo`` (bsz, num_classes) and targets ``targ`` (bsz)."""
        loss = self.criterion(hypo, targ)
        return loss

    def extract_answer_from_output(self, outp):
        """Return the arg-max class index per row of ``outp.logits`` as a list."""
        predicted = outp.logits.argmax(dim=1)
        return predicted.detach().tolist()

    def inference(self, inp):
        """Run the model on the keyword batch ``inp`` and return predicted classes."""
        return self.extract_answer_from_output(self.model(**inp))

    def evaluate(self, inp, label):
        """Return ``(predictions, references)`` for one batch, both as lists."""
        predictions = self.inference(inp)
        references = label.detach().tolist()
        return predictions, references

@register_to(TASK_REGISTRY)
class MNLI(SequenceClassification):
    """GLUE MNLI: three-way classification (labels 0, 1, 2) of premise/hypothesis pairs.

    Uses the "train", "validation_matched" and "test_matched" splits; the raw
    columns are ['premise', 'hypothesis', 'label', 'idx'].
    """

    def init_model(self, model_fn, task_args):
        """Create the model through ``model_fn`` with three output labels."""
        three_way_model = model_fn(task_args.model_name, num_labels=3)
        self.model = three_way_model

    def prepare_eval(self):
        """Return a DataLoader over the tokenized ``test_matched`` split only."""
        glue_config = self.task_args.task_name.lower()
        raw_split = load_dataset("nyu-mll/glue", glue_config, split="test_matched")
        encoded_split = raw_split.map(
            lambda batch: self.process_function(batch, self.tokenizer, ["premise", "hypothesis"]),
            batched=True,
            remove_columns=raw_split.column_names,
        )
        return DataLoader(
            encoded_split,
            shuffle=False,
            collate_fn=self.data_collator,
            batch_size=self.train_args.test_batch,
        )

    def prepare(self):
        """Tokenize every split and return (train, validation_matched, test_matched) loaders."""
        raw = load_dataset("nyu-mll/glue", self.task_args.task_name.lower())
        encoded = raw.map(
            lambda batch: self.process_function(batch, self.tokenizer, ["premise", "hypothesis"]),
            batched=True,
            remove_columns=raw["train"].column_names,
        )
        # (split name, shuffle?, batch-size attribute on train_args)
        loader_specs = (
            ("train", True, "train_batch"),
            ("validation_matched", False, "val_batch"),
            ("test_matched", False, "test_batch"),
        )
        return tuple(
            DataLoader(
                encoded[split],
                shuffle=shuffle,
                collate_fn=self.data_collator,
                batch_size=getattr(self.train_args, size_attr),
            )
            for split, shuffle, size_attr in loader_specs
        )

@register_to(TASK_REGISTRY)
class SST2(SequenceClassification):
    """GLUE SST-2 (Stanford Sentiment Treebank): positive/negative sentiment of a sentence.

    Uses the "train", "validation" and "test" splits; the raw columns are
    ['sentence', 'label', 'idx'] and labels are 0 or 1.
    """

    def init_model(self, model_fn, task_args):
        """Create the model through ``model_fn`` with two output labels."""
        binary_model = model_fn(task_args.model_name, num_labels=2)
        self.model = binary_model

    def prepare(self):
        """Tokenize every split and return (train, validation, test) loaders."""
        raw = load_dataset("nyu-mll/glue", self.task_args.task_name.lower())
        encoded = raw.map(
            lambda batch: self.process_function(batch, self.tokenizer, ["sentence"]),
            batched=True,
            remove_columns=raw["train"].column_names,
        )

        def build_loader(split, shuffle, batch_size):
            return DataLoader(
                encoded[split],
                shuffle=shuffle,
                collate_fn=self.data_collator,
                batch_size=batch_size,
            )

        # Only the training split is shuffled.
        train_loader = build_loader("train", True, self.train_args.train_batch)
        val_loader = build_loader("validation", False, self.train_args.val_batch)
        test_loader = build_loader("test", False, self.train_args.test_batch)
        return train_loader, val_loader, test_loader

@register_to(TASK_REGISTRY)
class MRPC(SequenceClassification):
    """GLUE MRPC (Microsoft Research Paraphrase Corpus): semantic equivalence of a sentence pair.

    Uses the "train", "validation" and "test" splits; the raw columns are
    ['sentence1', 'sentence2', 'label', 'idx'] and labels are 0 or 1.
    """

    def init_model(self, model_fn, task_args):
        """Create the model through ``model_fn`` with two output labels."""
        checkpoint = task_args.model_name
        self.model = model_fn(checkpoint, num_labels=2)

    def prepare(self):
        """Tokenize every split and return (train, validation, test) loaders."""
        raw = load_dataset("nyu-mll/glue", self.task_args.task_name.lower())
        encoded = raw.map(
            lambda batch: self.process_function(batch, self.tokenizer, ['sentence1', 'sentence2']),
            batched=True,
            remove_columns=raw["train"].column_names,
        )
        # (split name, shuffle?, batch-size attribute on train_args)
        loader_specs = (
            ("train", True, "train_batch"),
            ("validation", False, "val_batch"),
            ("test", False, "test_batch"),
        )
        loaders = []
        for split, shuffle, size_attr in loader_specs:
            loaders.append(
                DataLoader(
                    encoded[split],
                    shuffle=shuffle,
                    collate_fn=self.data_collator,
                    batch_size=getattr(self.train_args, size_attr),
                )
            )
        return tuple(loaders)

@register_to(TASK_REGISTRY)
class CoLA(SequenceClassification):
    """GLUE CoLA: is the given sentence grammatically correct English?

    Uses the "train", "validation" and "test" splits; the raw columns are
    ['sentence', 'label', 'idx'] and labels are 0 or 1.
    """

    def init_model(self, model_fn, task_args):
        """Create the model through ``model_fn`` with two output labels."""
        binary_model = model_fn(task_args.model_name, num_labels=2)
        self.model = binary_model

    def prepare(self):
        """Tokenize every split and return (train, validation, test) loaders."""
        raw = load_dataset("nyu-mll/glue", self.task_args.task_name.lower())
        encoded = raw.map(
            lambda batch: self.process_function(batch, self.tokenizer, ['sentence']),
            batched=True,
            remove_columns=raw["train"].column_names,
        )
        # (split name, shuffle?, batch-size attribute on train_args)
        loader_specs = (
            ("train", True, "train_batch"),
            ("validation", False, "val_batch"),
            ("test", False, "test_batch"),
        )
        return tuple(
            DataLoader(
                encoded[split],
                shuffle=shuffle,
                collate_fn=self.data_collator,
                batch_size=getattr(self.train_args, size_attr),
            )
            for split, shuffle, size_attr in loader_specs
        )

@register_to(TASK_REGISTRY)
class QNLI(SequenceClassification):
    """GLUE QNLI: can the answer to the question be found in the sentence?

    Uses the "train", "validation" and "test" splits; the raw columns are
    ['question', 'sentence', 'label', 'idx'] and labels are 0 or 1.
    """

    def init_model(self, model_fn, task_args):
        """Create the model through ``model_fn`` with two output labels."""
        checkpoint = task_args.model_name
        self.model = model_fn(checkpoint, num_labels=2)

    def prepare(self):
        """Tokenize every split and return (train, validation, test) loaders."""
        raw = load_dataset("nyu-mll/glue", self.task_args.task_name.lower())
        encoded = raw.map(
            lambda batch: self.process_function(batch, self.tokenizer, ['question', 'sentence']),
            batched=True,
            remove_columns=raw["train"].column_names,
        )

        def build_loader(split, shuffle, batch_size):
            return DataLoader(
                encoded[split],
                shuffle=shuffle,
                collate_fn=self.data_collator,
                batch_size=batch_size,
            )

        # Only the training split is shuffled.
        train_loader = build_loader("train", True, self.train_args.train_batch)
        val_loader = build_loader("validation", False, self.train_args.val_batch)
        test_loader = build_loader("test", False, self.train_args.test_batch)
        return train_loader, val_loader, test_loader

@register_to(TASK_REGISTRY)
class QQP(SequenceClassification):
    """GLUE QQP (Quora Question Pairs): semantic equivalence of two questions.

    Uses the "train", "validation" and "test" splits; the raw columns are
    ['question1', 'question2', 'label', 'idx'] and labels are 0 or 1.
    """

    def init_model(self, model_fn, task_args):
        """Create the model through ``model_fn`` with two output labels."""
        binary_model = model_fn(task_args.model_name, num_labels=2)
        self.model = binary_model

    def prepare(self):
        """Tokenize every split and return (train, validation, test) loaders."""
        raw = load_dataset("nyu-mll/glue", self.task_args.task_name.lower())
        encoded = raw.map(
            lambda batch: self.process_function(batch, self.tokenizer, ['question1', 'question2']),
            batched=True,
            remove_columns=raw["train"].column_names,
        )
        # (split name, shuffle?, batch-size attribute on train_args)
        loader_specs = (
            ("train", True, "train_batch"),
            ("validation", False, "val_batch"),
            ("test", False, "test_batch"),
        )
        loaders = []
        for split, shuffle, size_attr in loader_specs:
            loaders.append(
                DataLoader(
                    encoded[split],
                    shuffle=shuffle,
                    collate_fn=self.data_collator,
                    batch_size=getattr(self.train_args, size_attr),
                )
            )
        return tuple(loaders)

@register_to(TASK_REGISTRY)
class RTE(SequenceClassification):
    """GLUE RTE (Recognizing Textual Entailment).

    Neutral and contradiction are collapsed into "not entailment", so labels
    are 0 or 1. Uses the "train", "validation" and "test" splits; the raw
    columns are ['sentence1', 'sentence2', 'label', 'idx'].
    """

    def init_model(self, model_fn, task_args):
        """Create the model through ``model_fn`` with two output labels."""
        checkpoint = task_args.model_name
        self.model = model_fn(checkpoint, num_labels=2)

    def prepare(self):
        """Tokenize every split and return (train, validation, test) loaders."""
        raw = load_dataset("nyu-mll/glue", self.task_args.task_name.lower())
        encoded = raw.map(
            lambda batch: self.process_function(batch, self.tokenizer, ['sentence1', 'sentence2']),
            batched=True,
            remove_columns=raw["train"].column_names,
        )
        # (split name, shuffle?, batch-size attribute on train_args)
        loader_specs = (
            ("train", True, "train_batch"),
            ("validation", False, "val_batch"),
            ("test", False, "test_batch"),
        )
        return tuple(
            DataLoader(
                encoded[split],
                shuffle=shuffle,
                collate_fn=self.data_collator,
                batch_size=getattr(self.train_args, size_attr),
            )
            for split, shuffle, size_attr in loader_specs
        )

@register_to(TASK_REGISTRY)
class STSB(SequenceClassification):
    """GLUE STS-B: predict a similarity score for a sentence pair.

    The label is a floating point score from 0 to 5, so this is a regression
    task: the model gets a single output unit, the loss is mean squared
    error, and predictions are the raw scores (not an arg-max class index).
    Uses the "train", "validation" and "test" splits; the raw columns are
    ['sentence1', 'sentence2', 'label', 'idx'].
    """

    def init_model(self, model_fn, task_args):
        """Create the model through ``model_fn`` with a single regression output."""
        self.model = model_fn(task_args.model_name, num_labels=1)

    def loss_function(self, hypo, targ):
        """Mean squared error between predicted and reference scores.

        Args:
            hypo: Model logits; the trailing size-1 dimension is squeezed away.
            targ: Reference scores; cast to the logits' dtype before comparing.
        """
        return torch.nn.functional.mse_loss(hypo.squeeze(-1), targ.to(hypo.dtype))

    def extract_answer_from_output(self, outp):
        """Return the predicted score per example as a list of floats."""
        return outp.logits.squeeze(-1).detach().tolist()

    def prepare(self):
        """Tokenize every split and return (train, validation, test) loaders."""
        ds = load_dataset("nyu-mll/glue", self.task_args.task_name.lower())
        tokenized_ds = ds.map(
            lambda x: self.process_function(x, self.tokenizer, ['sentence1', 'sentence2']),
            batched=True,
            remove_columns=ds["train"].column_names,
        )
        train_dataloader = DataLoader(
            tokenized_ds['train'],
            shuffle=True,
            collate_fn=self.data_collator,
            batch_size=self.train_args.train_batch,
        )
        validation_dataloader = DataLoader(
            tokenized_ds['validation'],
            shuffle=False,
            collate_fn=self.data_collator,
            batch_size=self.train_args.val_batch,
        )
        test_dataloader = DataLoader(
            tokenized_ds['test'],
            shuffle=False,
            collate_fn=self.data_collator,
            batch_size=self.train_args.test_batch,
        )
        return (
            train_dataloader,
            validation_dataloader,
            test_dataloader,
        )


In [6]:
# model

from datasets import load_dataset
from transformers import (
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    AutoModelForSequenceClassification,
)

@register_to(MODEL_REGISTRY)
def SequenceClassificationModel(model_name, **kwargs):
    """Load a pretrained sequence-classification model.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.
        **kwargs: Forwarded unchanged (the tasks pass ``num_labels``).

    Returns:
        The model returned by ``AutoModelForSequenceClassification.from_pretrained``.
    """
    classifier = AutoModelForSequenceClassification.from_pretrained(model_name, **kwargs)
    return classifier


@register_to(MODEL_REGISTRY)
def SequenceClassificationLoRA(model_name, **kwargs):
    """Load a pretrained sequence-classification model.

    NOTE(review): despite the name, no LoRA adapter is attached here; the
    body is the same plain ``from_pretrained`` call as
    ``SequenceClassificationModel``. Presumably a placeholder - confirm.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained``.
        **kwargs: Forwarded unchanged to ``from_pretrained``.
    """
    base_model = AutoModelForSequenceClassification.from_pretrained(model_name, **kwargs)
    return base_model


In [7]:
# custom_trainer.py

import wandb
import torch
from accelerate import Accelerator
from tqdm import tqdm

class FakeWandB:
    """In-memory stand-in exposing the same ``log`` call the trainer makes on wandb."""

    def __init__(self):
        # Every payload passed to log(), in call order.
        self.logs = list()

    def log(self, logs):
        """Record one logging payload instead of sending it anywhere."""
        self.logs += [logs]

class CustomTrainer:
    """Minimal training / evaluation loop around a task object.

    The task must provide ``model``, ``prepare()``, ``loss_function``,
    ``extract_answer_from_output``, ``metric``, ``evaluate`` and
    ``inference``, plus ``task_args`` / ``train_args`` when wandb logging is
    enabled.
    """

    # Batches and the model are moved to this device.
    device = "cuda" if torch.cuda.is_available() else "cpu"

    def __init__(self, task, wandb_config):
        """Store the task and set up logging.

        Args:
            task: Task object as described on the class.
            wandb_config: Object with ``api_key``, ``project_name`` and
                ``experiment_name``; ``None`` logs to an in-memory
                ``FakeWandB`` instead.
        """
        self.task = task
        if wandb_config is not None:
            wandb.login(key=wandb_config.api_key)
            wandb.init(
                # Set the project where this run will be logged
                project=wandb_config.project_name,
                name=wandb_config.experiment_name,
                # Track hyperparameters and run metadata
                config={
                    "task":self.task.task_args.__dict__,
                    "train_args":self.task.train_args.__dict__,
                }
            )
            self.wandb = wandb
        else:
            self.wandb = FakeWandB()

    def _to_device(self, batch):
        """Return a copy of the batch dict with every value moved to ``self.device``."""
        return {key: value.to(self.device) for key, value in batch.items()}

    def prepare_train(self, args):
        """Build data loaders, optimizer and warmup scheduler; move the model to the device.

        Args:
            args: Object with ``epochs``, ``learning_rate``, ``weight_decay``
                and ``warmup_ratio``.

        Returns:
            ``(train_dl, val_dl, test_dl)`` as produced by ``task.prepare()``.
        """
        train_dl, val_dl, test_dl = self.task.prepare()
        total_training_steps = len(train_dl) * args.epochs
        # LinearLR counts whole scheduler steps; use at least one so the
        # learning rate actually leaves its tiny start value.
        warmup_steps = max(1, int(total_training_steps * args.warmup_ratio))

        self.optim = torch.optim.AdamW(
            self.task.model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay)
        self.scheduler = torch.optim.lr_scheduler.LinearLR(
            self.optim,
            start_factor=1e-10,
            total_iters=warmup_steps,
        )

        self.task.model = self.task.model.to(self.device)

        return train_dl, val_dl, test_dl

    def train(self, args):
        """Train for ``args.epochs`` epochs, validating after each one.

        Per epoch: a training pass (gradient accumulation handled by
        accelerate), then a validation pass in eval mode without gradient
        tracking, whose loss and task metric are printed and logged.

        Args:
            args: Training options; in addition to the fields read by
                ``prepare_train`` it must expose ``grad_accum``.
        """
        train_dl, val_dl, test_dl = self.prepare_train(args)
        accelerator = Accelerator(gradient_accumulation_steps=args.grad_accum)
        model, self.optim, train_dl, self.scheduler = accelerator.prepare(
            self.task.model, self.optim, train_dl, self.scheduler
        )

        steps_per_epoch = len(train_dl)
        val_steps_per_epoch = len(val_dl)
        for epoch in range(args.epochs):

            # ========== training ==========
            # Validation below switches to eval mode, so re-enable train
            # mode at the start of every epoch.
            model.train()
            losses = []
            num_datapoints = 0

            for step, batch in enumerate(train_dl):
                with accelerator.accumulate(model):
                    # ========== forward pass ==========
                    batch = self._to_device(batch)
                    outputs = model(**batch)
                    loss = self.task.loss_function(outputs.logits, batch['labels'])

                    # ========== backpropagation ==========
                    accelerator.backward(loss)
                    self.optim.step()
                    self.scheduler.step()
                    self.optim.zero_grad()

                    # ========== logging ==========
                    loss_for_logging = loss.detach().tolist()
                    # Weight by batch size so the epoch average is per example.
                    losses.append(loss_for_logging*len(batch['labels']))
                    num_datapoints += len(batch['labels'])
                    self.wandb.log({
                        "train/loss": loss_for_logging,
                        "train/learning_rate": self.scheduler.get_last_lr()[0]
                    })
                    # Fractional epoch = finished epochs + progress in this one.
                    print("Epoch {} training loss: {}".format(
                        epoch + step/steps_per_epoch, loss_for_logging), end="\r")

            print("\nEpoch {} avg training loss: {}".format(
                epoch, sum(losses)/max(num_datapoints, 1)))

            # ========== validation ==========
            val_losses = []
            num_datapoints = 0
            preds = []
            labels = []
            model.eval()
            # No gradients are needed for validation; tracking them only
            # costs memory.
            with torch.no_grad():
                for step, batch in enumerate(val_dl):
                    # ========== forward pass ==========
                    batch = self._to_device(batch)
                    outputs = model(**batch)
                    loss = self.task.loss_function(outputs.logits, batch['labels'])

                    # ========== compute metric ==========
                    preds.extend(
                        self.task.extract_answer_from_output(outputs)
                    )
                    labels.extend(
                        batch['labels'].detach().tolist()
                    )

                    # ========== logging ==========
                    val_loss_for_logging = loss.detach().tolist()
                    val_losses.append(val_loss_for_logging*len(batch['labels']))
                    num_datapoints += len(batch['labels'])
                    print("Epoch {} validation loss: {}".format(
                        epoch + step/val_steps_per_epoch, val_loss_for_logging), end="\r")

            avg_val_loss = sum(val_losses)/max(num_datapoints, 1)
            self.wandb.log({"val/loss": avg_val_loss})
            print("\nEpoch {} avg validation loss: {}".format(
                    epoch, avg_val_loss))
            val_result = self.task.metric.compute(
                predictions=preds,
                references=labels,
            )
            # Printed on its own line so later output does not overwrite it.
            print("Epoch {} validation acc: {}".format(
                epoch, val_result))
            self.wandb.log({"val/{}".format(i):j for i,j in val_result.items()})

    def evaluate(self, dl):
        """Compute the task metric over a data loader.

        Args:
            dl: Iterable of batch dicts holding the references under
                ``labels`` (the key the training loop reads) or ``label``.

        Returns:
            Whatever ``task.metric.compute`` returns.
        """
        self.task.model.eval()
        pred_list = []
        label_list = []
        with torch.no_grad():
            for inp in dl:
                inp = self._to_device(inp)
                label_key = "labels" if "labels" in inp else "label"
                preds, labels = self.task.evaluate(inp, inp[label_key])
                pred_list.extend(preds)
                label_list.extend(labels)

        result = self.task.metric.compute(
            predictions=pred_list,
            references=label_list,
        )

        return result

    def inference(self, dl):
        """Return the task's predictions for every batch in ``dl`` as one flat list."""
        self.task.model.eval()
        infer_list = []
        with torch.no_grad():
            for inp in dl:
                preds = self.task.inference(self._to_device(inp))
                infer_list.extend(preds)
        return infer_list


In [9]:
# main

import sys
import importlib

# from custom_trainer import CustomTrainer
# from utils import (
#     MODEL_REGISTRY,
#     TASK_REGISTRY,
#     TRAINER_REGISTRY,
#     read_config,
# )

# importlib.import_module("model")
# importlib.import_module("task")

def main_train(config):
    """Build the configured task and trainer, then run training.

    Args:
        config: Mapping with ``'task'`` (needs ``task_name`` and ``model``)
            and ``'train'`` entries, plus an optional ``'wandb_config'``.

    Raises:
        KeyError: If the task or model name is not present in its registry.
    """
    args = config
    task_name = args['task'].task_name
    model_name = args['task'].model

    task_class = TASK_REGISTRY.get(task_name)
    if task_class is None:
        # Fail here with the valid choices instead of a later
        # "'NoneType' object is not callable".
        raise KeyError("Unknown task {!r}; registered tasks: {}".format(
            task_name, sorted(TASK_REGISTRY)))

    model_fn = MODEL_REGISTRY.get(model_name)
    if model_fn is None:
        raise KeyError("Unknown model {!r}; registered models: {}".format(
            model_name, sorted(MODEL_REGISTRY)))

    task = task_class(args['task'], args['train'], model_fn)
    trainer = CustomTrainer(task, args.get("wandb_config", None))
    trainer.train(args['train'])

def main_eval(config):
    """Placeholder for an evaluation entry point; currently does nothing."""
    pass

def main_infer(config):
    """Placeholder for an inference entry point; currently does nothing."""
    pass

# if __name__=="__main__":
#     assert len(sys.argv) == 3, "define mode (train | eval) and config"
#     print("Executing python3", sys.argv)
#     mode = sys.argv[1]
#     config = sys.argv[2]
#     if mode == "train":
#         main_train(config)
#     elif mode == "eval":
#         main_eval(config)
#     elif mode == "infer":
#         main_infer(config)


In [10]:
# config

import os
from datetime import datetime

# Task selection; consumed as args['task'] by main_train.
class task:
    # Key looked up in MODEL_REGISTRY to obtain the model factory.
    model = "SequenceClassificationModel"
    # Checkpoint name handed to the tokenizer and the model factory.
    model_name = "distilbert-base-uncased"
    # Key looked up in TASK_REGISTRY; its lower-cased form selects the GLUE config.
    task_name = "MNLI"

# Training hyperparameters; consumed as args['train'] by the trainer and the tasks.
class train:
    # Passed to AdamW as lr.
    learning_rate = 2e-04
    # Number of passes over the training loader.
    epochs = 30
    # Passed to AdamW as weight_decay.
    weight_decay = 0.01
    # NOTE(review): not read by any code visible in this notebook.
    report_to = "wandb"
    # Batch sizes for the validation, test and training loaders.
    val_batch = 64
    test_batch = 64
    train_batch = 128
    # Fraction of total training steps used for the linear LR warmup.
    warmup_ratio = 0.03
    # Gradient accumulation steps given to Accelerator.
    grad_accum = 1

# Weights & Biases settings; consumed as args['wandb_config'] by CustomTrainer.
class wandb_config:
    # Project the run is logged under.
    project_name = "jjh"
    # Run name, made unique with a timestamp taken when this cell executes.
    experiment_name = "debug_{}".format(datetime.now().strftime("%H_%M_%S_%m%d"))
    # Read the key from the environment so the secret never has to be typed
    # into (and saved with) the notebook; the placeholder remains the fallback.
    api_key = os.environ.get("WANDB_API_KEY", "YOUR_API_KEY_HERE")


In [None]:

##### COLAB SPECIAL #####
class Args():
    """Plain attribute bag copied from a config class.

    Every entry of ``config.__dict__`` becomes an instance attribute, except
    names that both start and end with a double underscore.
    """
    built_in = "__"

    def __init__(self, config):
        for key, value in config.__dict__.items():
            # Dunder entries are interpreter bookkeeping, not settings.
            wrapped = key[:2] == self.built_in and key[-2:] == self.built_in
            if not wrapped:
                setattr(self, key, value)

def get_config():
    """Bundle the notebook's config classes into the dict ``main_train`` expects.

    Returns:
        dict with ``"task"``, ``"train"`` and ``"wandb_config"`` keys, each
        holding an ``Args`` snapshot of the class of the same name.
    """
    sections = {}
    sections["task"] = Args(task)
    sections["train"] = Args(train)
    sections["wandb_config"] = Args(wandb_config)
    return sections

#########################

# Start training with the configuration assembled by get_config().
main_train(get_config())

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


README.md:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/52.2M [00:00<?, ?B/s]

(…)alidation_matched-00000-of-00001.parquet:   0%|          | 0.00/1.21M [00:00<?, ?B/s]

(…)dation_mismatched-00000-of-00001.parquet:   0%|          | 0.00/1.25M [00:00<?, ?B/s]

test_matched-00000-of-00001.parquet:   0%|          | 0.00/1.22M [00:00<?, ?B/s]

test_mismatched-00000-of-00001.parquet:   0%|          | 0.00/1.26M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/392702 [00:00<?, ? examples/s]

Generating validation_matched split:   0%|          | 0/9815 [00:00<?, ? examples/s]

Generating validation_mismatched split:   0%|          | 0/9832 [00:00<?, ? examples/s]

Generating test_matched split:   0%|          | 0/9796 [00:00<?, ? examples/s]

Generating test_mismatched split:   0%|          | 0/9847 [00:00<?, ? examples/s]

Map:   0%|          | 0/392702 [00:00<?, ? examples/s]

Map:   0%|          | 0/9815 [00:00<?, ? examples/s]

Map:   0%|          | 0/9832 [00:00<?, ? examples/s]

Map:   0%|          | 0/9796 [00:00<?, ? examples/s]

Map:   0%|          | 0/9847 [00:00<?, ? examples/s]

