In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import sys

sys.path.append("..")

In [None]:
import pdb, sys, inspect
from enum import Enum

import pandas as pd
import torch

from transformers import *
from fastai2.text.all import *

In [None]:
# Pin the notebook to a specific GPU. The original setup used device #1; the index
# is clamped to the devices actually present, and we fall back to the CPU when CUDA
# is unavailable, so "Restart & Run All" also works on machines with fewer GPUs.
PREFERRED_GPU = 1
if torch.cuda.is_available():
    torch.cuda.set_device(min(PREFERRED_GPU, torch.cuda.device_count() - 1))
    print(f"Using GPU #{torch.cuda.current_device()}: {torch.cuda.get_device_name()}")
else:
    print("CUDA is not available; running on CPU")

Using GPU #1: GeForce GTX 1080 Ti


In [None]:
# Bare references: evaluating these names shows nothing (they are not the last
# expression of the cell) but raises NameError early if the star-imports above
# did not provide the mappings.
MODEL_FOR_QUESTION_ANSWERING_MAPPING
MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING
MODEL_MAPPING

# Keys of the question-answering mapping; the lines below read `.model_type` and
# `.pretrained_config_archive_map` from each of them.
MODEL_CONFIG_CLASSES = [*MODEL_FOR_QUESTION_ANSWERING_MAPPING.keys()]
MODEL_TYPES = tuple([conf.model_type for conf in MODEL_CONFIG_CLASSES])
# Flatten every config's archive-map keys into a single tuple.
ALL_MODELS = tuple(
    name
    for conf in MODEL_CONFIG_CLASSES
    for name in conf.pretrained_config_archive_map.keys()
)
(MODEL_CONFIG_CLASSES, MODEL_TYPES)  # ALL_MODELS

## Utility Methods

In [None]:
def str_to_class(classname):
    """Resolve a name to the object bound to it in this module's namespace.

    Args:
        classname: Attribute name to look up, e.g. ``"BertModel"``.

    Returns:
        The object bound to `classname` in the current module (this includes
        names brought in by the notebook's imports).

    Raises:
        AttributeError: If the current module has no attribute with that name.
    """
    this_module = sys.modules[__name__]
    return getattr(this_module, classname)

## Class Inspection & Querying

**Notes**:

[1] There are "three standard classes required to use each model: **configuration, models and tokenizer**."

[2] All three standard classes can be initialized [via] `from_pretrained()`.  This method will download (as needed), cache, and load the pre-trained instance from the library or via the filesystem.

**Models**: All derive from `nn.Module` (e.g., `BertModel`)

**Configuration**: Stores configuration required to **build a model** (e.g., `BertConfig`). "*If you are using a pretrained model* without any modification, *creating the model will automatically take care of instantiating the configuration* (which is part of the model)."

**Tokenizer**: Stores the vocab for each model and provides methods to encode/decode strings and provide the various embeddings required to be fed into a model.

**`from_pretrained()`**: To instantiate any of the above classes using a friendly name included in the library (`bert-base-uncased`) or from a path.

**`save_pretrained()`**: To save any of the classes locally so it can be re-loaded using `from_pretrained()`

In [None]:
def _is_transformers_class(member):
    """Return True for classes whose `__module__` starts with "transformers."."""
    if not inspect.isclass(member):
        return False
    return member.__module__.startswith("transformers.")


# (name, object) pairs for every matching class visible in this module's namespace
transformer_classes = inspect.getmembers(sys.modules[__name__], _is_transformers_class)

transformer_classes[:5]

In [None]:
df = pd.DataFrame(transformer_classes, columns=["class_name", "class_location"])
df.head()

In [None]:
df["module"] = df.class_location.apply(lambda v: v.__module__)
df.head()

In [None]:
# Remove the column holding the raw class objects. Reassigning (rather than
# `inplace=True`) together with `errors="ignore"` keeps this cell safe to re-run
# once the column is already gone.
df = df.drop(columns=["class_location"], errors="ignore")
df.head()

In [None]:
module_parts_df = df.module.str.split(".", n=-1, expand=True)

In [None]:
for i in range(len(module_parts_df.columns)):
    df[f"module_part_{i}"] = module_parts_df[i]

df.head()

In [None]:
module_part_1_df = df.module_part_1.str.split("_", n=1, expand=True)
module_part_1_df.head()

In [None]:
df[["functional_area", "arch"]] = module_part_1_df
df.head()

Look for custom, task-based implementations of models (indicated by `<model>For<task>`)

In [None]:
model_type_df = df[(df.functional_area == "modeling")].class_name.str.split(
    "For", n=1, expand=True
)
model_type_df.head()

In [None]:
model_type_df[1] = np.where(
    model_type_df[1].notnull(), "For" + model_type_df[1].astype(str), model_type_df[1]
)
df["model_task"] = model_type_df[1]

Look for custom, task-based implementations of models (indicated by `<model>With<task>`)

In [None]:
model_type_df = df[(df.functional_area == "modeling")].class_name.str.split(
    "With", n=1, expand=True
)
model_type_df.head()

In [None]:
model_type_df[1] = np.where(
    model_type_df[1].notnull(),
    "With" + model_type_df[1].astype(str),
    df[(df.functional_area == "modeling")].model_task,
)

df["model_task"] = model_type_df[1]

In [None]:
df.head()

In [None]:
print(list(df.model_task.unique()))
print(list(df.functional_area.unique()))
print(list(df.module_part_2.unique()))
print(list(df.module_part_3.unique()))

In [None]:
# look at what we're going to remove (use to verify we're only getting rid of stuff we want to)
# df[~df['functional_area'].isin(['modeling', 'configuration', 'tokenization'])]

In [None]:
df = df[df["functional_area"].isin(["modeling", "configuration", "tokenization"])]

### Get included architectures

In [None]:
def get_architectures():
    """Return the distinct architecture names present in the class inventory.

    Reads the module-level `df` and collects the unique, non-null values of its
    `arch` column.

    Returns:
        list: Unique `arch` values in order of first appearance.
    """
    # `dropna()` already excludes None/NaN; the former extra `df.arch != None`
    # comparison is elementwise always-True in pandas and therefore filtered nothing.
    return df.arch.dropna().unique().tolist()

In [None]:
print(get_architectures())

In [None]:
TRANSFORMER_ARCHITECTURES = Enum("TRANSFORMER_ARCHITECTURES", get_architectures())

In [None]:
print(L(TRANSFORMER_ARCHITECTURES))

### Get an architecture's config

In [None]:
def get_config(arch):
    """Return the configuration class name recorded for an architecture.

    Args:
        arch: Value to match against the `arch` column of the module-level `df`
            (e.g. ``"bert"``).

    Returns:
        The `class_name` of the first row whose `functional_area` is
        ``"configuration"`` and whose `arch` equals `arch`.

    Raises:
        IndexError: If no row matches.
    """
    is_config = df.functional_area == "configuration"
    is_arch = df.arch == arch
    matching_names = df.loc[is_config & is_arch, "class_name"]
    return matching_names.values[0]

In [None]:
print(get_config("bert"))

### Get an architecture's tokenizers

There may be multiple, so this returns an array of class names

In [None]:
def get_tokenizers(arch):
    """Return the tokenizer class names recorded for an architecture.

    Args:
        arch: Value to match against the `arch` column of the module-level `df`
            (e.g. ``"electra"``).

    Returns:
        The `class_name` values (as returned by ``Series.values``) of all rows
        whose `functional_area` is ``"tokenization"`` and whose `arch` equals
        `arch`; empty when nothing matches.
    """
    is_tokenizer = df.functional_area == "tokenization"
    is_arch = df.arch == arch
    matching_names = df.loc[is_tokenizer & is_arch, "class_name"]
    return matching_names.values

In [None]:
print(get_tokenizers("electra"))

### Get included custom model tasks

Get the types of tasks for which there is a custom model (*optional: by architecture*). There are a number of customized models built for specific tasks like token classification, question answering, LM, etc.

In [None]:
def get_tasks(arch=None):
    """Return the distinct model tasks in the class inventory.

    Args:
        arch: Optional architecture name; when truthy, only rows of the
            module-level `df` whose `arch` equals it are considered.

    Returns:
        list: Unique non-null `model_task` values, in order of first appearance.
    """
    # Boolean masks instead of an f-string-built `df.query(...)` expression: the
    # latter breaks as soon as `arch` contains a quote character and relies on the
    # query engine supporting method calls such as `.notna()`.
    selected = df.model_task.notna()
    if arch:
        selected &= df.arch == arch

    return df.loc[selected, "model_task"].unique().tolist()

In [None]:
print(get_tasks())
print(get_tasks("bart"))

In [None]:
TRANSFORMER_TASKS_ALL = Enum("TRANSFORMER_TASKS_ALL", get_tasks())
TRANSFORMER_TASKS_AUTO = Enum("TRANSFORMER_TASKS_AUTO", get_tasks("auto"))

In [None]:
print("--- all tasks ---")
print(L(TRANSFORMER_TASKS_ALL))
print("\n--- auto only ---")
print(L(TRANSFORMER_TASKS_AUTO))

### Get included models

The transformer models available for use (*optional: by architecture | task*)

In [None]:
def get_models(arch=None, task=None):
    """Return the model class names in the class inventory.

    Args:
        arch: Optional architecture name; when truthy, keep only rows of the
            module-level `df` whose `arch` equals it.
        task: Optional task string (e.g. ``"ForTokenClassification"``); when
            truthy, keep only rows whose `model_task` equals it.

    Returns:
        list: `class_name` values of rows whose `functional_area` is
        ``"modeling"`` and that satisfy the optional filters.
    """
    # Boolean masks instead of an f-string-built `df.query(...)` expression, which
    # fails when `arch` or `task` contains a quote character.
    selected = df.functional_area == "modeling"
    if arch:
        selected &= df.arch == arch
    if task:
        selected &= df.model_task == task

    return df.loc[selected, "class_name"].tolist()

In [None]:
print(L(get_models()))

In [None]:
print(get_models(arch="bert"))

In [None]:
print(get_models(task="ForTokenClassification"))

In [None]:
print(get_models(arch="bert", task="ForTokenClassification"))

In [None]:
TRANSFORMER_MODELS = Enum("TRANSFORMER_MODELS", get_models())

In [None]:
print(L(TRANSFORMER_MODELS))

### Get tokenizers, config, and model for a given model name / enum

In [None]:
def get_classes_for_model(model_name_or_enum):
    """Resolve the tokenizer, config and model classes that belong to a model.

    Args:
        model_name_or_enum: Either the model class name as a string, or an object
            exposing the class name through `.name` (e.g. a `TRANSFORMER_MODELS`
            member).

    Returns:
        tuple: ``(tokenizer_classes, config_class, model_class)`` where
        `tokenizer_classes` is a list, all resolved through `str_to_class`.

    Raises:
        IndexError: If the model name has no row in the module-level `df`.
    """
    if isinstance(model_name_or_enum, str):
        model_name = model_name_or_enum
    else:
        model_name = model_name_or_enum.name

    # The architecture of the model row drives both the tokenizer and config lookup.
    arch = df[df.class_name == model_name].arch.values[0]
    tokenizer_names = get_tokenizers(arch)
    config_name = get_config(arch)

    tokenizer_classes = [str_to_class(name) for name in tokenizer_names]
    config_class = str_to_class(config_name)
    model_class = str_to_class(model_name)
    return (tokenizer_classes, config_class, model_class)

In [None]:
tokenizers, config, model = get_classes_for_model("RobertaForSequenceClassification")

print(tokenizers[0])
print(config)
print(model)

In [None]:
tokenizers, config, model = get_classes_for_model(TRANSFORMER_MODELS.DistilBertModel)

print(tokenizers[0])
print(config)
print(model)

In [None]:
def get_model_architecture(model_name_or_enum):
    """Return the architecture recorded for a model class.

    Args:
        model_name_or_enum: Either the model class name as a string, or an object
            exposing the class name through `.name` (e.g. a `TRANSFORMER_MODELS`
            member).

    Returns:
        The `arch` value of the first row of the module-level `df` whose
        `class_name` equals the model name.

    Raises:
        IndexError: If no row matches the model name.
    """
    if isinstance(model_name_or_enum, str):
        model_name = model_name_or_enum
    else:
        model_name = model_name_or_enum.name

    matching_archs = df.loc[df.class_name == model_name, "arch"]
    return matching_archs.values[0]

In [None]:
get_model_architecture("RobertaForSequenceClassification")

## Loading Pre-Trained (configs, tokenizer, model)

In [None]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-cased-finetuned-mrpc"
)

In [None]:
def get_auto_hf_objects(
    pretrained_model_name_or_path,
    task=TRANSFORMER_TASKS_AUTO.ForSequenceClassification,
    config=None,
):
    """Load tokenizer, config and task-specific model through the `Auto*` classes.

    Args:
        pretrained_model_name_or_path: Identifier or path forwarded to every
            `from_pretrained()` call.
        task: Object whose `.name` is appended to ``"AutoModel"`` to pick the
            auto-model class (e.g. a `TRANSFORMER_TASKS_AUTO` member).
        config: Optional ready-made config; when None, one is loaded with
            `AutoConfig.from_pretrained`.

    Returns:
        tuple: ``(arch, tokenizer, config, model)``, where `arch` is looked up
        from the class name of the model instance that was actually created.
    """
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)
    if config is None:
        config = AutoConfig.from_pretrained(pretrained_model_name_or_path)

    auto_model_cls = str_to_class(f"AutoModel{task.name}")
    model = auto_model_cls.from_pretrained(pretrained_model_name_or_path, config=config)

    # The auto class returns a concrete model; use its class name for the lookup.
    arch = get_model_architecture(type(model).__name__)
    return (arch, tokenizer, config, model)

In [None]:
arch, tokenizer, config, model = get_auto_hf_objects(
    "bert-base-cased-finetuned-mrpc", task=TRANSFORMER_TASKS_AUTO.WithLMHead
)

print(arch)
print(type(tokenizer))
print(type(config))
print(type(model))

In [None]:
arch, tokenizer, config, model = get_auto_hf_objects(
    "fmikaelian/flaubert-base-uncased-squad",
    task=TRANSFORMER_TASKS_AUTO.ForQuestionAnswering,
)

print(arch)
print(type(tokenizer))
print(type(config))
print(type(model))

In [None]:
def get_transformer_objects(
    pretrained_model_name_or_path,
    tokenizer_cls=BertTokenizer,
    model_cls=TRANSFORMER_MODELS.BertModel,
    config_cls=BertConfig,
):
    """Load tokenizer, config and model using explicitly chosen classes.

    Args:
        pretrained_model_name_or_path: Identifier or path forwarded to every
            `from_pretrained()` call.
        tokenizer_cls: Tokenizer class to instantiate.
        model_cls: Object whose `.name` is the model class name (e.g. a
            `TRANSFORMER_MODELS` member); resolved through `str_to_class`.
        config_cls: Config class to instantiate, or None to let the model's
            `from_pretrained()` run without an explicit config.

    Returns:
        tuple: ``(arch, tokenizer, config, model)``; `config` is None when
        `config_cls` is None.
    """
    tokenizer = tokenizer_cls.from_pretrained(pretrained_model_name_or_path)

    # Only pass `config=` when a config class was requested.
    config = None
    model_kwargs = {}
    if config_cls is not None:
        config = config_cls.from_pretrained(pretrained_model_name_or_path)
        model_kwargs["config"] = config

    model_class = str_to_class(model_cls.name)
    model = model_class.from_pretrained(pretrained_model_name_or_path, **model_kwargs)

    arch = get_model_architecture(type(model).__name__)
    return (arch, tokenizer, config, model)

In [None]:
arch, tokenizer, config, model = get_transformer_objects(
    "bert-base-cased-finetuned-mrpc",
    tokenizer_cls=BertTokenizer,
    config_cls=None,
    model_cls=TRANSFORMER_MODELS.BertForNextSentencePrediction,
)
print(arch)
print(type(tokenizer))
print(type(config))
print(type(model))

## Tokenizers

Terms:

**Input IDs**: \
"The input ids are often the only required parameters to be passed to the model as input. They are *token indices, numerical representations of tokens* building the sequences that will be used as input by the model."

`tokenizer.tokenize(sequence)` => Splits the sequence into tokens based on vocab

`tokenizer.encode(sequence)` => Converts tokens to their numerical IDs (add `add_special_tokens=False` to exclude special tokens)

`tokenizer.encode_plus(sequence)` => Returns a dictionary of "input_ids", "token_type_ids", and "attention_mask"

**Attention Mask**: \
"This argument indicates to the model which tokens should be attended to, and which should not ... a binary tensor indicating the position of the padded indices so that the model does not attend to them. For the BertTokenizer, 1 indicate a value that should be attended to while 0 indicate a padded value." (optional)

`tokenizer.encode(sequence, max_length=20, pad_to_max_length=True)`

**Token Type IDs**: \
"Some models’ purpose is to do sequence classification or question answering. These require two different sequences to be encoded in the same input IDs. They are usually separated by special tokens, such as the classifier and separator tokens.... The Token Type IDs are a binary mask identifying the different sequences (segments) in the model."

`tokenizer.encode(sequence_a, sequence_b)`

"The first sequence, the “context” used for the question, has all its tokens represented by 0, whereas the question has all its tokens represented by 1. Some models, like `XLNetModel` use an additional token represented by a 2."

**Position IDs**: \
"The position IDs are used by the model to identify which token is at which position. Contrary to RNNs that have the position of each token embedded within them, transformers are unaware of the position of each token.... If no position IDs are passed to the model, they are automatically created as absolute positional embeddings." (optional)

"Absolute positional embeddings are selected in the range `[0, config.max_position_embeddings - 1]`. Some models use other types of positional embeddings, such as sinusoidal position embeddings or relative position embeddings."

In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
print(tokenizer.tokenize("Hi! You like the Bert Tokenizer?"))
print(tokenizer.encode("Hi! You like the Bert Tokenizer?"))
print(tokenizer.encode("Hi! You like the Bert Tokenizer?", add_special_tokens=False))
print(tokenizer.encode_plus("Hi! You like the Bert Tokenizer?"))
print(tokenizer.encode_plus("Hi!", "You like the Bert Tokenizer?"))

In [None]:
print(tokenizer.encode("Hi! You like the Bert Tokenizer?", add_special_tokens=False))

In [None]:
# ALBERT
tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
tok_results = tokenizer.encode_plus("Hi!", "You like the Bert Tokenizer?")
print(tok_results)
print(tokenizer.decode(tok_results["input_ids"]))
print(tokenizer.pad_token_id, tokenizer.pad_token_type_id)

In [None]:
# BART
tokenizer = BartTokenizer.from_pretrained("bart-large-cnn")
tok_results = tokenizer.encode_plus("Hi!", "You like the Bert Tokenizer?")
print(tok_results)
print(tokenizer.decode(tok_results["input_ids"]))
print(tokenizer.pad_token_id, tokenizer.pad_token_type_id)

In [None]:
# BERT
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
tok_results = tokenizer.encode_plus("Hi!", "You like the Bert Tokenizer?")
print(tok_results)
print(tokenizer.decode(tok_results["input_ids"]))
print(tokenizer.pad_token_id, tokenizer.pad_token_type_id)
print(
    tokenizer.prepare_for_model(
        [101, 8790, 106, 102, 1192, 1176, 1103, 15035, 1706, 6378, 17260, 136, 102],
        None,
    )
)

In [None]:
# CTRL
tokenizer = CTRLTokenizer.from_pretrained("ctrl")
tok_results = tokenizer.encode_plus("Hi!", "You like the Bert Tokenizer?")
print(tok_results)
print(tokenizer.decode(tok_results["input_ids"]))
print(tokenizer.pad_token_id, tokenizer.pad_token_type_id)

In [None]:
# CamemBERT
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
tok_results = tokenizer.encode_plus("Hi!", "You like the Bert Tokenizer?")
print(tok_results)
print(tokenizer.decode(tok_results["input_ids"]))
print(tokenizer.pad_token_id, tokenizer.pad_token_type_id)

In [None]:
# ELECTRA
tokenizer = ElectraTokenizer.from_pretrained("google/electra-small-discriminator")
tok_results = tokenizer.encode_plus("Hi!", "You like the Bert Tokenizer?")
print(tok_results)
print(tokenizer.decode(tok_results["input_ids"]))
print(tokenizer.pad_token_id, tokenizer.pad_token_type_id)

In [None]:
# GPT-2
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tok_results = tokenizer.encode_plus("Hi!", "You like the Bert Tokenizer?")
print(tok_results)
print(tokenizer.decode(tok_results["input_ids"]))
print(tokenizer.pad_token_id, tokenizer.pad_token_type_id)

In [None]:
# GPT
tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
tok_results = tokenizer.encode_plus("Hi!", "You like the Bert Tokenizer?")
print(tok_results)
print(tokenizer.decode(tok_results["input_ids"]))
print(tokenizer.pad_token_id, tokenizer.pad_token_type_id)

In [None]:
# RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
tok_results = tokenizer.encode_plus("Hi!", "You like the Bert Tokenizer?")
print(tok_results)
print(tokenizer.decode(tok_results["input_ids"]))
print(tokenizer.pad_token_id, tokenizer.pad_token_type_id)

In [None]:
# T5
tokenizer = T5Tokenizer.from_pretrained("t5-small")
tok_results = tokenizer.encode_plus("Hi!", "You like the Bert Tokenizer?")
print(tok_results)
print(tokenizer.decode(tok_results["input_ids"]))
print(tokenizer.pad_token_id, tokenizer.pad_token_type_id)

In [None]:
# TransfoXLTokenizer
tokenizer = TransfoXLTokenizer.from_pretrained("transfo-xl-wt103")
tok_results = tokenizer.encode_plus(
    "Hi!", "You like the Bert Tokenizer?", add_space_before_punct_symbol=True
)
print(tok_results)
print(tokenizer.decode(tok_results["input_ids"]))
print(tokenizer.pad_token_id, tokenizer.pad_token_type_id)

In [None]:
# XLMRobertaTokenizer

In [None]:
# XLM
tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
tok_results = tokenizer.encode_plus("Hi!", None)
print(tok_results)
print(tokenizer.decode(tok_results["input_ids"]))
print(tokenizer.pad_token_id, tokenizer.pad_token_type_id)

In [None]:
# XLNet
tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
tok_results = tokenizer.encode_plus("Hi! what do you thing of this thing we are doing")

print(tok_results)
print(tokenizer.pad_token_id, tokenizer.pad_token_type_id)

tokenizer.batch_encode_plus(
    ["Hi! what do you thing of this thing we are doing"],
    max_length=10,
    stride=5,
    pad_to_max_length=True,
    return_overflowing_tokens=True,
    return_special_tokens_masks=True,
    return_input_lengths=True,
)

In [None]:
encoded_ids = tokenizer.encode("Hi!", "You like the Bert Tokenizer?")
print(encoded_ids)
toks = tokenizer.convert_ids_to_tokens(encoded_ids)
print(toks)
sep_idxs = [idx for idx, tok in enumerate(toks) if tok == tokenizer.sep_token]
print(len(sep_idxs), sep_idxs)
toks_modified = (
    toks if len(sep_idxs) == 1 else [toks[: sep_idxs[0] + 1], toks[sep_idxs[0] + 1 :]]
)
print(toks_modified)
tokenizer.get_special_tokens_mask(*toks_modified)

In [None]:
tokenizer.encode(".", add_special_tokens=False)
tokenizer.get_vocab()["."]

In [None]:
tok_a = tokenizer.tokenize("Hi!")
tok_b = tokenizer.tokenize("You like the Bert Tokenizer?")
tok_a, tok_b

In [None]:
a = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("Hi!"))
b = tokenizer.convert_tokens_to_ids(tokenizer.tokenize("You like the Bert Tokenizer?"))

print(tokenizer.build_inputs_with_special_tokens(a, b))
print(tokenizer.create_token_type_ids_from_sequences(a, b))

# ddd = tokenizer.build_inputs_with_special_tokens(a,b)
# [0 if idx == tokenizer.pad_token_id else 1 for idx in ddd]

tokenizer.pad_token_id, tokenizer.pad_token_type_id

In [None]:
d = tokenizer.prepare_for_model(
    a, b, max_length=25, pad_to_max_length=True, return_tensors="pt"
)
e = tokenizer.prepare_for_model(
    a, b, max_length=25, pad_to_max_length=True, return_tensors="pt"
)
f = tokenizer.prepare_for_model(
    a, b, max_length=25, pad_to_max_length=True, return_tensors="pt"
)

x = [d["input_ids"], e["input_ids"], f["input_ids"]]
d["input_ids"].shape, torch.cat(x).shape

## Models

"See the models docstrings for the detail of the inputs" ... `outputs = model(tokens_tensor, token_type_ids=segments_tensors)`

"Transformers models always output tuples. See the models docstrings for the detail of all the outputs. In our case, the first element is the hidden state of the last layer of the Bert model" ... `encoded_layers = outputs[0]`

`GPT-2`, `GPT`, `XLNet`, `Transfo-XL`, `CTRL` (and some others) "make use of a `past` or `mems` attribute which can be used to prevent re-computing the key/value pairs when using sequential decoding. It is useful when generating sequences as a big part of the attention mechanism benefits from previous computations."

"If you want to fine-tune a model on a specific task, you can leverage one of the `run_$TASK.py` script in the examples directory."

**AutoModel**:
"These examples leverage auto-models, which are classes that will instantiate a model according to a given checkpoint, automatically selecting the correct model architecture. Please check the `AutoModel` documentation for more information"
- AutoConfig
- AutoTokenizer
- AutoModel
- AutoModelForPreTraining
- AutoModelWithLMHead
- AutoModelForQuestionAnswering
- AutoModelForSequenceClassification
- AutoModelForTokenClassification

**Inference**:

Option 1: Use `Pipelines`

Option 2: Use the model directly with the tokenizer

## Question-Answer

In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForQuestionAnswering.from_pretrained(
    "bert-large-uncased-whole-word-masking-finetuned-squad"
)

In [None]:
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_ids = tokenizer.encode(question, text)
# Segment ids: 0 up to and including the first separator, 1 afterwards. The
# separator id is read from the tokenizer instead of hard-coding 102, and its
# position is computed once rather than once per token.
first_sep_idx = input_ids.index(tokenizer.sep_token_id)
token_type_ids = [0 if i <= first_sep_idx else 1 for i in range(len(input_ids))]
start_scores, end_scores = model(
    torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids])
)

In [None]:
input_ids, start_scores.shape

([101,
  2040,
  2001,
  3958,
  27227,
  1029,
  102,
  3958,
  27227,
  2001,
  1037,
  3835,
  13997,
  102],
 torch.Size([1, 14]))

In [None]:
all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
answer = " ".join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores) + 1])
answer

'a nice puppet'

In [None]:
tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
model = XLMForQuestionAnsweringSimple.from_pretrained("xlm-mlm-en-2048")

In [None]:
question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
input_ids = tokenizer.encode(question, text, add_special_tokens=True)

In [None]:
outputs = model(torch.tensor([input_ids]))

In [None]:
len(outputs)

In [None]:
all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
# Take the scores from the XLM forward pass above. Without this, `start_scores` /
# `end_scores` still hold the results of the earlier BERT cell and the answer span
# is computed from the wrong model.
# NOTE(review): assumes that, with no labels passed, the first two outputs are the
# start and end scores — confirm against the installed transformers version.
start_scores, end_scores = outputs[0], outputs[1]
answer = " ".join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores) + 1])
answer, all_tokens

In [None]:
# Re-encode the question/context pair and add a leading batch dimension.
input_ids = torch.tensor(
    tokenizer.encode(
        "Who was Jim Henson?", "Jim Henson was a nice puppet", add_special_tokens=True
    )
).unsqueeze(
    0
)  # Batch size 1
# NOTE(review): start_positions / end_positions are defined here but never passed
# to the model call below, so presumably no loss is returned — confirm before
# re-enabling the commented-out `loss = outputs[0]` line.
start_positions = torch.tensor([1])
end_positions = torch.tensor([3])
outputs = model(input_ids)
# loss = outputs[0]
# Display: argmax over the second output, the sequence length, and the input ids.
torch.argmax(outputs[1]), len(input_ids[0]), input_ids

## Navigating nn hierarchy

In [None]:
# layer_groups = hft_splitter(temp_arch, tmp_model)
# print(len(layer_groups))

# for g in layer_groups:
#     print(len(g))

# layer_groups[3][3].shape

# tmp_model

# for g in layer_groups:
#     print(len(g))

# x = list(hft_model.named_children())[0]

# len(list(x[1].named_children()))
# for m in x[1].named_children():
#     print(m[0])

# for m in tmp_model.named_children():
#     print(m[0])