# HuggingFace Transformers Quick Tour

https://huggingface.co/docs/transformers/main/en/quicktour 

In [1]:
#pip install transformers datasets librosa torch

# Pipelines

In [2]:
from transformers import pipeline

classifier = pipeline("sentiment-analysis")

classifier("We are very happy to show you the 🤗 Transformers library.")
results = classifier(["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."])

for result in results:
    print(f"label: {result['label']}, with score: {round(result['score'], 4)}")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


label: POSITIVE, with score: 0.9998
label: NEGATIVE, with score: 0.5309


In [3]:
import torch
from datasets import load_dataset, Audio
from transformers import pipeline

# info on hf pipelines: https://huggingface.co/docs/transformers/main/en/main_classes/pipelines#transformers.pipeline
speech_recognizer = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h")

# Dataset: https://huggingface.co/datasets/PolyAI/minds14
dataset = load_dataset("PolyAI/minds14", name="en-US", split="train")

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the latest cached version of the module from /Users/gen/.cache/huggingface/modules/datasets_modules/datasets/PolyAI--minds14/aa40414f15e0f919231d617440192034af844835dc1e6a697f4b552e0551fd26 (last modified on Fri Jan 13 13:20:56 2023) since it couldn't be found locally at PolyAI/minds14., or remotely on the Hugging Face Hub.
Found cached dataset minds14 (/Users/gen/.cache/huggingface/datasets/PolyAI___minds14/en-US/1.0.0/aa40414f15e0f919231d617440192034af844835dc1e6a697f4b552e0551fd26)


In [4]:
# Get the sampling rate
dataset = dataset.cast_column("audio", Audio(sampling_rate=speech_recognizer.feature_extractor.sampling_rate))
result = speech_recognizer(dataset[:4]["audio"])
print([d["text"] for d in result])
# if data big then use generator instead of a list
# eg: speech or vision
# loads inputs to mem

use_pytorch = True
use_tensorflow = False

# choose a suitable model from the hub: https://huggingface.co/models
# For a custom use case, fine tune a model on your custom data
# here: mBERT for French sentiment analysis
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"


# Docs auto seq class: https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoModelForSequenceClassification
# Docs auto tokenizer: https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoTokenizer

if use_pytorch:
    from transformers import AutoTokenizer, AutoModelForSequenceClassification
    model = AutoModelForSequenceClassification.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
elif use_tensorflow:
    from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
    model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name) 

classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)
classifier("Nous sommes très heureux de vous présenter la bibliothèque 🤗 Transformers.")

['I WOULD LIKE TO SET UP A JOINT ACCOUNT WITH MY PARTNER HOW DO I PROCEED WITH DOING THAT', "FODING HOW I'D SET UP A JOIN TO HET WITH MY WIFE AND WHERE THE AP MIGHT BE", "I I'D LIKE TOY SET UP A JOINT ACCOUNT WITH MY PARTNER I'M NOT SEEING THE OPTION TO DO IT ON THE AP SO I CALLED IN TO GET SOME HELP CAN I JUST DO IT OVER THE PHONE WITH YOU AND GIVE YOU THE INFORMATION OR SHOULD I DO IT IN THE AP AND I'M MISSING SOMETHING UQUETTE HAD PREFERRED TO JUST DO IT OVER THE PHONE OF POSSIBLE THINGS", 'HOW DO I THURN A JOIN A COUNT']


[{'label': '5 stars', 'score': 0.7272654175758362}]

# The Auto Class

In [5]:
# pad the inputs to ensure equal length
if use_pytorch:
    pt_batch = tokenizer(
        ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."],
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="pt",
    )
elif use_tensorflow:
    tf_batch = tokenizer(
        ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."],
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors="tf",
    )

In [6]:
# A note about auto classes and auto models

from transformers import AutoTokenizer

# Instantiate a tokenizer on the same model you'll use.
model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)

encoding = tokenizer("We are very happy to show you the 🤗 Transformers library.")
print(encoding)

# This model is multilingual. Can use it to tokenize French too.
french_encoding = tokenizer("Nous sommes très heureux de vous présenter la bibliothèque 🤗 Transformers.")
print(french_encoding)


# Auto model
# docs: https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoModel

if use_pytorch:
    from transformers import AutoModelForSequenceClassification

    model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
    pt_model = AutoModelForSequenceClassification.from_pretrained(model_name)

    pt_outputs = pt_model(**pt_batch)

    from torch import nn
    
    # normalization delayed so it can be combined with evaluation/loss function
    # model outputs can behave like a tuple or dictionary (if keyed)
    pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1)
    print(pt_predictions)
elif use_tensorflow:
    from transformers import TFAutoModelForSequenceClassification

    model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
    tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name)

    tf_outputs = tf_model(tf_batch)

    import tensorflow as tf

    tf_predictions = tf.nn.softmax(tf_outputs.logits, axis=-1)
    tf_predictions


{'input_ids': [101, 11312, 10320, 12495, 19308, 10114, 11391, 10855, 10103, 100, 58263, 13299, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'input_ids': [101, 16902, 24419, 10107, 10860, 41838, 12204, 10102, 22812, 40452, 10106, 26759, 100, 58263, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
tensor([[0.0021, 0.0018, 0.0115, 0.2121, 0.7725],
        [0.2084, 0.1826, 0.1969, 0.1755, 0.2365]], grad_fn=<SoftmaxBackward0>)


In [14]:
tf_save_directory = "./tf_save_pretrained"
pt_save_directory = "./pt_save_pretrained"

if use_pytorch:
    # Save a fine tuned model and tokenizer
    tokenizer.save_pretrained(pt_save_directory)
    pt_model.save_pretrained(pt_save_directory)
    
    # Load it back in later
    pt_model = AutoModelForSequenceClassification.from_pretrained("./pt_save_pretrained")
    
"""
elif use_tensorflow:
    # Save model and tokenizer    
    tf_save_directory = "./tf_save_pretrained"
    tokenizer.save_pretrained(tf_save_directory)
    tf_model.save_pretrained(tf_save_directory)
    
    # Load in model and tokenizer
    tf_model = TFAutoModelForSequenceClassification.from_pretrained("./tf_save_pretrained")
"""

# Convert between frameworks from_pt or from_tf

use_pytorch = False
use_tensorflow = True

from transformers import TFAutoModelForSequenceClassification
import tensorflow as tf

if use_pytorch:
    from transformers import AutoModel
    tokenizer = AutoTokenizer.from_pretrained(tf_save_directory)
    pt_model = AutoModelForSequenceClassification.from_pretrained(tf_save_directory, from_tf=True)
elif use_tensorflow:
    from transformers import TFAutoModel
    tokenizer = AutoTokenizer.from_pretrained(pt_save_directory)
    tf_model = TFAutoModelForSequenceClassification.from_pretrained(pt_save_directory, from_pt=True)
    
use_pytorch = True
use_tensorflow = False

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertForSequenceClassification: ['bert.embeddings.position_ids']
- This IS expected if you are initializing TFBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


# Custom Models

In [15]:
# Any model created with a custom config, weights need to be learned

# Using a pretrained model, pull its config, 
# changing one attribute: number of attention heads
from transformers import AutoConfig
my_config = AutoConfig.from_pretrained("distilbert-base-uncased", n_heads=12)

In [16]:
if use_pytorch:
    from transformers import AutoModel
    my_model = AutoModel.from_config(my_config)
elif use_tensorflow:
    from transformers import TFAutoModel
    my_model = TFAutoModel.from_config(my_config)
    
# and then train it

# Trainer (pytorch optimized)

In [17]:
# Trainer Params

# Model
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")


# Args
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir="trainer/out",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
)


# Preprocessing class like a tokenizer, image processor, feature extractor, or processor
from transformers import AutoTokenizer
# tokenizer from that model
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


# Dataset
from datasets import load_dataset
dataset = load_dataset("rotten_tomatoes")  # doctest: +IGNORE_RESULT


# Tokenizer for dataset
def tokenize_dataset(dataset):
    return tokenizer(dataset["text"])

dataset = dataset.map(tokenize_dataset, batched=True)

# Data Collator (with padding)
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)



Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier

Downloading builder script:   0%|          | 0.00/5.03k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.02k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.25k [00:00<?, ?B/s]

Downloading and preparing dataset rotten_tomatoes/default to /Users/gen/.cache/huggingface/datasets/rotten_tomatoes/default/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46...


Downloading data:   0%|          | 0.00/488k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/8530 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1066 [00:00<?, ? examples/s]

Dataset rotten_tomatoes downloaded and prepared to /Users/gen/.cache/huggingface/datasets/rotten_tomatoes/default/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/9 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [18]:
from transformers import Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)  # doctest: +SKIP


In [None]:
# Always have training in its own cell.
# Don't want to have to retrain just cause the next thing didn't work.

In [21]:
trainer.train()


The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 8530
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2134
  Number of trainable parameters = 66955010


Step,Training Loss
500,0.162
1000,0.1674
1500,0.1725
2000,0.168


Saving model checkpoint to trainer/out/checkpoint-500
Configuration saved in trainer/out/checkpoint-500/config.json
Model weights saved in trainer/out/checkpoint-500/pytorch_model.bin
tokenizer config file saved in trainer/out/checkpoint-500/tokenizer_config.json
Special tokens file saved in trainer/out/checkpoint-500/special_tokens_map.json
Saving model checkpoint to trainer/out/checkpoint-1000
Configuration saved in trainer/out/checkpoint-1000/config.json
Model weights saved in trainer/out/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in trainer/out/checkpoint-1000/tokenizer_config.json
Special tokens file saved in trainer/out/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to trainer/out/checkpoint-1500
Configuration saved in trainer/out/checkpoint-1500/config.json
Model weights saved in trainer/out/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in trainer/out/checkpoint-1500/tokenizer_config.json
Special tokens file saved in trainer/ou

TrainOutput(global_step=2134, training_loss=0.16697968784178543, metrics={'train_runtime': 560.2621, 'train_samples_per_second': 30.45, 'train_steps_per_second': 3.809, 'total_flos': 195799234032192.0, 'train_loss': 0.16697968784178543, 'epoch': 2.0})

In [20]:
# trainer.push_to_hub() 
# This requires that it be a git repo

"""
OSError: Tried to clone a repository in a non-empty folder that isn't a git repository. If you really want to do this, do it manually:
git init && git remote add origin && git pull origin main
 or clone repo to a new folder and move your existing files there afterwards.
"""

"\nOSError: Tried to clone a repository in a non-empty folder that isn't a git repository. If you really want to do this, do it manually:\ngit init && git remote add origin && git pull origin main\n or clone repo to a new folder and move your existing files there afterwards.\n"

# TensorFlow Training

In [None]:
# Model
from transformers import TFAutoModelForSequenceClassification
model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased")

# Preprocessing class like a tokenizer, image processor, feature extractor, or processor
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

# Function to optimize dataset
def tokenize_dataset(dataset):
    return tokenizer(dataset["text"])  # doctest: +SKIP

# Dataset
dataset = dataset.map(tokenize_dataset)  # doctest: +SKIP
tf_dataset = model.prepare_tf_dataset(
    dataset, batch_size=16, shuffle=True, tokenizer=tokenizer
)  # doctest: +SKIP

In [None]:
from tensorflow.keras.optimizers import Adam

model.compile(optimizer=Adam(3e-5))
model.fit(dataset)  # doctest: +SKIP