# Pretrained Models with Tensorflow from Hugging Faces

## 1. Loading the dataset

In [1]:
from datasets import list_datasets, load_dataset

dataset = load_dataset('tweet_eval', 'emoji')
dataset["train"][20]

Reusing dataset tweet_eval (C:\Users\Administrator\.cache\huggingface\datasets\tweet_eval\emoji\1.1.0\12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343)


  0%|          | 0/3 [00:00<?, ?it/s]

{'text': 'Thank you @user for an incredible night last night @ Shrine Auditorium &amp; Expo Hall',
 'label': 7}

## 2. Tokenize the texts

In [2]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets

Loading cached processed dataset at C:\Users\Administrator\.cache\huggingface\datasets\tweet_eval\emoji\1.1.0\12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343\cache-f7f960e9034d4442.arrow
Loading cached processed dataset at C:\Users\Administrator\.cache\huggingface\datasets\tweet_eval\emoji\1.1.0\12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343\cache-ead8d431705ecafd.arrow
Loading cached processed dataset at C:\Users\Administrator\.cache\huggingface\datasets\tweet_eval\emoji\1.1.0\12aee5282b8784f3e95459466db4cdf45c6bf49719c25cdb0743d71ed0410343\cache-fd807d38501ddf88.arrow


DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 45000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 50000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5000
    })
})

## 3. Training

In [4]:
# Convert dataset to Tensorflow format
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator(return_tensors="tf")

In [6]:
tf_datasets = dict()

for set_types in ["train", "test", "validation"]:
    tf_datasets[set_types] = tokenized_datasets[set_types].to_tf_dataset(
        columns=["attention_mask", "input_ids", "token_type_ids"],
        label_cols=["labels"],
        shuffle=True,
        collate_fn=data_collator,
        batch_size=8
    )

tf_datasets

{'train': <PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 512), dtype=tf.int64, name=None), 'token_type_ids': TensorSpec(shape=(None, 512), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(None, 512), dtype=tf.int64, name=None)}, TensorSpec(shape=(None,), dtype=tf.int64, name=None))>,
 'test': <PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 512), dtype=tf.int64, name=None), 'token_type_ids': TensorSpec(shape=(None, 512), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(None, 512), dtype=tf.int64, name=None)}, TensorSpec(shape=(None,), dtype=tf.int64, name=None))>,
 'validation': <PrefetchDataset element_spec=({'input_ids': TensorSpec(shape=(None, 512), dtype=tf.int64, name=None), 'token_type_ids': TensorSpec(shape=(None, 512), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(None, 512), dtype=tf.int64, name=None)}, TensorSpec(shape=(None,), dtype=tf.int64, name=None))>}

In [8]:
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification

model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=20)
model.summary()

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  108310272 
                                                                 
 dropout_75 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  15380     
                                                                 
Total params: 108,325,652
Trainable params: 108,325,652
Non-trainable params: 0
_________________________________________________________________


In [12]:
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['sparse_categorical_accuracy']
)

In [13]:
model.fit(tf_datasets["train"], validation_data=tf_datasets["validation"], epochs=1)
# KeyboardInterrupt is occurred since we cannot spend 64 hours for this tutorial only

   3/5625 [..............................] - ETA: 63:50:15 - loss: 6.3454 - sparse_categorical_accuracy: 0.0417

KeyboardInterrupt: 

## 4. Inference

In [14]:
from transformers import pipeline

classifier = pipeline(task="text-classification", model=model, tokenizer=tokenizer)

In [15]:
classifier("I am happy at HCMC")

[{'label': 'LABEL_14', 'score': 0.33317020535469055}]

In [16]:
classifier([
    "wonderful, magnificent, outstanding, significant",
    "I love you",
    "I like your house"
])

[{'label': 'LABEL_14', 'score': 0.3331185579299927},
 {'label': 'LABEL_14', 'score': 0.333149254322052},
 {'label': 'LABEL_14', 'score': 0.33318403363227844}]