In [None]:
# |default_exp text.utils
# |default_cls_lvl 3

In [None]:
# |hide
%reload_ext autoreload
%autoreload 2

# utils

> `text.utils` contains various text specific utility classes/functions

In [None]:
# |export
from __future__ import annotations

import importlib, inspect, os, random, sys, warnings

import numpy as np
import pandas as pd
import torch

from enum import Enum
from fastcore.foundation import L
from transformers import (
    AutoConfig,
    AutoTokenizer,
    PretrainedConfig,
    PreTrainedTokenizerBase,
    PreTrainedModel,
)
from transformers.utils import logging as hf_logging

from blurr.utils import Singleton

In [None]:
# |export
# Quiet the console at import time.
# NOTE: `warnings.simplefilter("ignore")` suppresses *every* Python warning raised in this process, not only the
# Hugging Face ones; `hf_logging.set_verbosity_error()` is the call that targets the `transformers` logger.
warnings.simplefilter("ignore")
hf_logging.set_verbosity_error()

In [None]:
# |hide
import pdb

from IPython.display import display
from fastcore.test import *
from nbdev import nbdev_export
from nbdev.showdoc import show_doc

from blurr.utils import print_versions

In [None]:
# | echo: false
print("What we're running with at the time this documentation was generated:")
print_versions("torch fastai transformers")

What we're running with at the time this documentation was generated:
torch: 1.9.0+cu102
fastai: 2.7.9
transformers: 4.21.2


In [None]:
# |hide
# |cuda
# GPU #1 is preferred (the setup this notebook was authored on), but a hard-coded index raises on machines
# with a single GPU, so fall back to GPU #0 there and say so explicitly when CUDA is missing altogether.
if torch.cuda.is_available():
    torch.cuda.set_device(1 if torch.cuda.device_count() > 1 else 0)
    print(f"Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}")
else:
    print("CUDA is not available; running on CPU")

Using GPU #1: GeForce GTX 1080 Ti


In [None]:
# |export
def get_hf_objects(
    pretrained_model_name_or_path: str | os.PathLike,
    model_cls: PreTrainedModel,
    config: PretrainedConfig | str | os.PathLike = None,
    tokenizer_cls: PreTrainedTokenizerBase = None,
    config_kwargs: dict = {},
    tokenizer_kwargs: dict = {},
    model_kwargs: dict = {},
    cache_dir: str | os.PathLike = None,
) -> tuple[str, PretrainedConfig, PreTrainedTokenizerBase, PreTrainedModel]:
    """
    Given at minimum a `pretrained_model_name_or_path` and a `model_cls` (such as
    `AutoModelForSequenceClassification`), this method returns all the Hugging Face objects you need to train
    a model using Blurr

    Args:
        pretrained_model_name_or_path: Checkpoint name or local path handed to every `from_pretrained` call.
        model_cls: Class whose `from_pretrained` builds the model.
        config: Passed to the model as-is; when `None` it is loaded with `AutoConfig.from_pretrained`.
        tokenizer_cls: Class whose `from_pretrained` builds the tokenizer; `AutoTokenizer` is used when `None`.
        config_kwargs: Extra keyword arguments for `AutoConfig.from_pretrained`.
        tokenizer_kwargs: Extra keyword arguments for the tokenizer's `from_pretrained`.
        model_kwargs: Extra keyword arguments for `model_cls.from_pretrained`.
        cache_dir: Cache directory forwarded to every `from_pretrained` call.

    Returns:
        A `(arch, config, tokenizer, model)` tuple. `arch` is the third dotted component of the model's
        module path (e.g. "bert" for `transformers.models.bert.modeling_bert`) or "unknown" when the module
        path has fewer than three components.
    """
    # work on copies (and tolerate `None`) so neither the shared `{}` defaults nor a caller's dict can be altered
    config_kwargs = dict(config_kwargs or {})
    tokenizer_kwargs = dict(tokenizer_kwargs or {})
    model_kwargs = dict(model_kwargs or {})

    # config
    if config is None:
        config = AutoConfig.from_pretrained(
            pretrained_model_name_or_path, cache_dir=cache_dir, **config_kwargs
        )

    # tokenizer (gpt2, roberta, bart (and maybe others) tokenizers require a prefix space)
    # `os.fsdecode` turns `os.PathLike` (and bytes) inputs into `str` so the substring test below is valid
    name_or_path = os.fsdecode(pretrained_model_name_or_path)
    if any(s in name_or_path for s in ["gpt2", "roberta", "bart", "longformer"]):
        # an explicit `add_prefix_space` supplied by the caller still wins
        tokenizer_kwargs = {"add_prefix_space": True, **tokenizer_kwargs}

    tokenizer_factory = AutoTokenizer if tokenizer_cls is None else tokenizer_cls
    tokenizer = tokenizer_factory.from_pretrained(
        pretrained_model_name_or_path, cache_dir=cache_dir, **tokenizer_kwargs
    )

    # model
    model = model_cls.from_pretrained(
        pretrained_model_name_or_path,
        config=config,
        cache_dir=cache_dir,
        **model_kwargs,
    )

    # arch: third component of the model's module path, "unknown" if the path is too short or missing
    module_parts = (getattr(model, "__module__", None) or "").split(".")
    arch = module_parts[2] if len(module_parts) > 2 else "unknown"

    return (arch, config, tokenizer, model)

In [None]:
show_doc(get_hf_objects, title_level=2)

---

[source](https://github.com/ohmeow/blurr/tree/master/blob/master/blurr/text/utils.py#L30){target="_blank" style="float:right; font-size:smaller"}

## get_hf_objects

>      get_hf_objects (pretrained_model_name_or_path:Union[str,os.PathLike],
>                      model_cls:transformers.modeling_utils.PreTrainedModel, co
>                      nfig:Union[transformers.configuration_utils.PretrainedCon
>                      fig,str,os.PathLike]=None, tokenizer_cls:transformers.tok
>                      enization_utils_base.PreTrainedTokenizerBase=None,
>                      config_kwargs:dict={}, tokenizer_kwargs:dict={},
>                      model_kwargs:dict={},
>                      cache_dir:Union[str,os.PathLike]=None)

Given at minimum a `pretrained_model_name_or_path` and `model_cls (such as
`AutoModelForSequenceClassification"), this method returns all the Hugging Face objects you need to train
a model using Blurr

## BlurrText -

In [None]:
# |export
@Singleton
class BlurrText:
    """A general utility class for getting your Hugging Face objects

    On construction the installed `transformers` package is introspected and a DataFrame (`self._df`) is
    built with one row per exported class. Each row records the class name, its module, the functional area
    ("modeling", "configuration" or "tokenization"), the architecture and, for modeling classes, the task.
    """

    def __init__(self):
        # get hf classes (tokenizers, configs, models, etc...) as (name, object) pairs
        members = inspect.getmembers(importlib.import_module("transformers"))

        # build a df that we can query against to get various transformers objects/info
        df = pd.DataFrame(members, columns=["class_name", "class_location"])

        # keep classes only; `.copy()` yields an independent frame so the column assignments below are
        # never applied to a slice of another DataFrame
        df = df[df.class_location.apply(lambda v: isinstance(v, type))].copy()

        # add the module each class is included in, then drop the class object (not needed anymore)
        df["module"] = df.class_location.apply(lambda v: v.__module__)
        df = df.drop(columns=["class_location"])

        # break up the module into separate cols
        module_parts_df = df.module.str.split(".", n=-1, expand=True)
        for i in range(len(module_parts_df.columns)):
            df[f"module_part_{i}"] = module_parts_df[i]

        # using module part 3 (e.g. "modeling_bert"), break up the functional area and arch into separate cols
        df[["functional_area", "arch"]] = df.module_part_3.str.split("_", n=1, expand=True)

        # fast tokenizers live in "tokenization_<arch>_fast"; fold them into the same arch
        df["arch"] = df["arch"].str.replace("_fast", "", regex=False)

        # transformers >=4.5.x does "auto" differently; so remove it and "utils" from "arch" column
        df = df[~df["arch"].isin(["auto", "utils"])].copy()

        # if functional area = modeling, pull out the task it is built for: the text after the last "With"
        # or, failing that, after the last "For". The marker must be followed by an uppercase letter or a
        # digit so that names such as "MaskFormerModel" no longer produce bogus tasks like "merModel".
        modeling_names = df.loc[df.functional_area == "modeling", "class_name"]
        for_task = modeling_names.str.extract(r"^.*For([A-Z0-9]\w*)$", expand=False)
        with_task = modeling_names.str.extract(r"^.*With([A-Z0-9]\w*)$", expand=False)

        # index-aligned assignment: rows outside the modeling area are left without a task
        df["model_task"] = with_task.fillna(for_task)

        # only need these 3 functional areas for our querying purposes
        self._df = df[
            df["functional_area"].isin(["modeling", "configuration", "tokenization"])
        ]

    def get_tasks(self, arch: str = None):
        """This method can be used to get a list of all tasks supported by your transformers install, or
        just those available to a specific architecture
        """
        # boolean masks instead of an interpolated query string, so any `arch` value is handled safely
        mask = self._df.model_task.notna()
        if arch:
            mask &= self._df.arch == arch

        return sorted(self._df.loc[mask, "model_task"].unique().tolist())

    def get_architectures(self):
        """The sorted list of distinct architectures found in your transformers install"""
        return sorted(self._df.arch.dropna().unique().tolist())

    def get_models(self, arch: str = None, task: str = None):
        """The transformer models available for use (optional: by architecture | task)"""
        mask = self._df.functional_area == "modeling"
        if arch:
            mask &= self._df.arch == arch
        if task:
            mask &= self._df.model_task == task

        return sorted(self._df.loc[mask, "class_name"].tolist())

    def get_model_architecture(self, model_name_or_enum):
        """Get the architecture for a given model name / enum

        Anything that is not a `str` is expected to expose the class name via its `.name` attribute.
        Raises `IndexError` when no class with that name is known.
        """
        model_name = (
            model_name_or_enum
            if isinstance(model_name_or_enum, str)
            else model_name_or_enum.name
        )
        matches = self._df.loc[self._df.class_name == model_name, "arch"]
        if matches.empty:
            # same exception type the bare `.values[0]` lookup raised, with an actionable message
            raise IndexError(
                f"'{model_name}' was not found among the known transformers classes"
            )
        return matches.values[0]

    def get_hf_objects(
        self,
        pretrained_model_name_or_path: str | os.PathLike,
        model_cls: PreTrainedModel,
        config: PretrainedConfig | str | os.PathLike = None,
        tokenizer_cls: PreTrainedTokenizerBase = None,
        config_kwargs: dict = {},
        tokenizer_kwargs: dict = {},
        model_kwargs: dict = {},
        cache_dir: str | os.PathLike = None,
    ) -> tuple[str, PretrainedConfig, PreTrainedTokenizerBase, PreTrainedModel]:
        """Same as the module-level `get_hf_objects`, with one addition: when that function reports the
        architecture as "unknown", it is looked up from the model's class name instead.

        Returns a `(arch, config, tokenizer, model)` tuple.
        """
        # inside a method the bare name resolves to the module-level function, not to this method
        arch, config, tokenizer, model = get_hf_objects(
            pretrained_model_name_or_path,
            model_cls,
            config=config,
            tokenizer_cls=tokenizer_cls,
            config_kwargs=config_kwargs,
            tokenizer_kwargs=tokenizer_kwargs,
            model_kwargs=model_kwargs,
            cache_dir=cache_dir,
        )

        if arch == "unknown":
            arch = self.get_model_architecture(type(model).__name__)

        return (arch, config, tokenizer, model)

In [None]:
show_doc(BlurrText, title_level=2)

---

## Singleton object at 0x7fadbbff55e0>

>      Singleton object at 0x7fadbbff55e0> (*args, **kwargs)

`BlurrText` is a `Singleton` (there exists only one instance, and the same instance is returned upon subsequent instantiation requests).  You can get at it via the `NLP` constant below.

In [None]:
# Two instantiation requests are compared to demonstrate the singleton behaviour
NLP = BlurrText()
NLP2 = BlurrText()
# NOTE(review): `test_eq` asserts equality; presumably identity is what is meant here — confirm
test_eq(NLP, NLP2)

In [None]:
# |hide
# peek at the lookup table, then list the distinct values of each column we query against
display(NLP._df.head())

for position, column in enumerate(
    ["model_task", "functional_area", "arch", "module_part_3"]
):
    # a blank line separates consecutive listings (none before the first one)
    if position > 0:
        print("")
    print(list(NLP._df[column].unique()))

Unnamed: 0,class_name,module,module_part_0,module_part_1,module_part_2,module_part_3,functional_area,arch,model_task
6,AdaptiveEmbedding,transformers.models.transfo_xl.modeling_transfo_xl,transformers,models,transfo_xl,modeling_transfo_xl,modeling,transfo_xl,
8,AlbertConfig,transformers.models.albert.configuration_albert,transformers,models,albert,configuration_albert,configuration,albert,
9,AlbertForMaskedLM,transformers.models.albert.modeling_albert,transformers,models,albert,modeling_albert,modeling,albert,MaskedLM
10,AlbertForMultipleChoice,transformers.models.albert.modeling_albert,transformers,models,albert,modeling_albert,modeling,albert,MultipleChoice
11,AlbertForPreTraining,transformers.models.albert.modeling_albert,transformers,models,albert,modeling_albert,modeling,albert,PreTraining


[None, nan, 'MaskedLM', 'MultipleChoice', 'PreTraining', 'QuestionAnswering', 'SequenceClassification', 'TokenClassification', 'CausalLM', 'ConditionalGeneration', 'ImageClassification', 'MaskedImageModeling', 'SemanticSegmentation', 'NextSentencePrediction', 'DepthEstimation', 'AudioFrameClassification', 'CTC', 'XVector', 'Teacher', 'QuestionAnsweringSimple', 'LMHeadModel', 'CausalImageModeling', 'EntityClassification', 'EntityPairClassification', 'EntitySpanClassification', 'Classification', 'InstanceSegmentation', 'merModel', 'merPreTrainedModel', 'ObjectDetection', 'ImageClassificationConvProcessing', 'ImageClassificationFourier', 'ImageClassificationLearned', 'MultimodalAutoencoding', 'OpticalFlow', 'Generation', 'OpenQA', 'LMHead', 'merLayer', 'ImageAndTextRetrieval', 'ImagesAndTextClassification', 'RegionToPhraseAlignment', 'VisualReasoning']

['modeling', 'configuration', 'tokenization']

['transfo_xl', 'albert', 'bart', 'barthez', 'bartpho', 'bert', 'beit', 'bert_generation', 

... the ***task***

In [None]:
# show_doc(BlurrText(BlurrText).get_tasks)

In [None]:
print(NLP.get_tasks())
print("")
print(NLP.get_tasks("bart"))

['AudioFrameClassification', 'CTC', 'CausalImageModeling', 'CausalLM', 'Classification', 'ConditionalGeneration', 'DepthEstimation', 'EntityClassification', 'EntityPairClassification', 'EntitySpanClassification', 'Generation', 'ImageAndTextRetrieval', 'ImageClassification', 'ImageClassificationConvProcessing', 'ImageClassificationFourier', 'ImageClassificationLearned', 'ImagesAndTextClassification', 'InstanceSegmentation', 'LMHead', 'LMHeadModel', 'MaskedImageModeling', 'MaskedLM', 'MultimodalAutoencoding', 'MultipleChoice', 'NextSentencePrediction', 'ObjectDetection', 'OpenQA', 'OpticalFlow', 'PreTraining', 'QuestionAnswering', 'QuestionAnsweringSimple', 'RegionToPhraseAlignment', 'SemanticSegmentation', 'SequenceClassification', 'Teacher', 'TokenClassification', 'VisualReasoning', 'XVector', 'merLayer', 'merModel', 'merPreTrainedModel']

['CausalLM', 'ConditionalGeneration', 'QuestionAnswering', 'SequenceClassification']


... the ***architecture***

In [None]:
# show_doc(BlurrText(BlurrText).get_architectures)

In [None]:
print(NLP.get_architectures())

['albert', 'bart', 'barthez', 'bartpho', 'beit', 'bert', 'bert_generation', 'bert_japanese', 'bertweet', 'big_bird', 'bigbird_pegasus', 'blenderbot', 'blenderbot_small', 'bloom', 'byt5', 'camembert', 'canine', 'clip', 'codegen', 'convbert', 'convnext', 'cpm', 'ctrl', 'cvt', 'data2vec_audio', 'data2vec_text', 'data2vec_vision', 'deberta', 'deberta_v2', 'decision_transformer', 'deit', 'detr', 'distilbert', 'dpr', 'dpt', 'electra', 'encoder_decoder', 'flaubert', 'flava', 'fnet', 'fsmt', 'funnel', 'glpn', 'gpt2', 'gpt_neo', 'gpt_neox', 'gptj', 'groupvit', 'herbert', 'hubert', 'ibert', 'imagegpt', 'layoutlm', 'layoutlmv2', 'layoutlmv3', 'layoutxlm', 'led', 'levit', 'longformer', 'longt5', 'luke', 'lxmert', 'm2m_100', 'marian', 'maskformer', 'mbart', 'mbart50', 'mctct', 'megatron_bert', 'mluke', 'mmbt', 'mobilebert', 'mobilevit', 'mpnet', 'mt5', 'mvp', 'nezha', 'nllb', 'nystromformer', 'openai', 'opt', 'owlvit', 'pegasus', 'perceiver', 'phobert', 'plbart', 'poolformer', 'prophetnet', 'qdqber

In [None]:
# show_doc(BlurrText(BlurrText).get_model_architecture)

In [None]:
print(NLP.get_model_architecture("RobertaForSequenceClassification"))

roberta


... and lastly the ***models*** (optionally for a given task and/or architecture)

In [None]:
# show_doc(BlurrText(BlurrText).get_models)

In [None]:
print(L(NLP.get_models())[:5])

['AdaptiveEmbedding', 'AlbertForMaskedLM', 'AlbertForMultipleChoice', 'AlbertForPreTraining', 'AlbertForQuestionAnswering']


In [None]:
print(NLP.get_models(arch="bert")[:5])

['BertForMaskedLM', 'BertForMultipleChoice', 'BertForNextSentencePrediction', 'BertForPreTraining', 'BertForQuestionAnswering']


In [None]:
print(NLP.get_models(task="TokenClassification")[:5])

['AlbertForTokenClassification', 'BertForTokenClassification', 'BigBirdForTokenClassification', 'BloomForTokenClassification', 'CamembertForTokenClassification']


In [None]:
print(NLP.get_models(arch="bert", task="TokenClassification"))

['BertForTokenClassification']


## To get all your Hugging Face objects (arch, config, tokenizer, and model)

How to use:

In [None]:
from transformers import AutoModelForMaskedLM

hf_logging.set_verbosity_error()

# only the checkpoint name and the model class are required; config and tokenizer are resolved for us
arch, config, tokenizer, model = get_hf_objects(
    "bert-base-cased-finetuned-mrpc", model_cls=AutoModelForMaskedLM
)

# architecture name, then the concrete class of each returned object (one per line)
print(arch, type(config), type(tokenizer), type(model), sep="\n")

bert
<class 'transformers.models.bert.configuration_bert.BertConfig'>
<class 'transformers.models.bert.tokenization_bert_fast.BertTokenizerFast'>
<class 'transformers.models.bert.modeling_bert.BertForMaskedLM'>


In [None]:
from transformers import AutoModelForQuestionAnswering

hf_logging.set_verbosity_error()

# same call as before, this time for a question answering checkpoint hosted under a user namespace
hf_objects = get_hf_objects(
    "fmikaelian/flaubert-base-uncased-squad", model_cls=AutoModelForQuestionAnswering
)
arch, config, tokenizer, model = hf_objects

# architecture name, then the concrete class of each returned object (one per line)
print(arch)
for hf_object in (config, tokenizer, model):
    print(type(hf_object))

flaubert
<class 'transformers.models.flaubert.configuration_flaubert.FlaubertConfig'>
<class 'transformers.models.flaubert.tokenization_flaubert.FlaubertTokenizer'>
<class 'transformers.models.flaubert.modeling_flaubert.FlaubertForQuestionAnsweringSimple'>


In [None]:
from transformers import BertTokenizer, BertForNextSentencePrediction

hf_logging.set_verbosity_error()

# concrete tokenizer/model classes can be requested instead of the `Auto*` ones;
# `config=None` still lets the configuration be loaded from the checkpoint
requested = dict(
    config=None,
    tokenizer_cls=BertTokenizer,
    model_cls=BertForNextSentencePrediction,
)
arch, config, tokenizer, model = get_hf_objects(
    "bert-base-cased-finetuned-mrpc", **requested
)

# architecture name, then the concrete class of each returned object (one per line)
print("\n".join(str(item) for item in (arch, type(config), type(tokenizer), type(model))))

bert
<class 'transformers.models.bert.configuration_bert.BertConfig'>
<class 'transformers.models.bert.tokenization_bert.BertTokenizer'>
<class 'transformers.models.bert.modeling_bert.BertForNextSentencePrediction'>


## Export -

In [None]:
# |hide
nbdev_export()