# Text Classification with Hugging Face

Sentiment Analysis:
Determine the positive/negative tone of movie reviews using labelled data from the IMDb dataset.

link: https://huggingface.co/docs/transformers/tasks/sequence_classification

In [1]:
# Load the "imdb" dataset through the `datasets` library.
# NOTE(review): the saved output of this cell is a ModuleNotFoundError for
# `datasets`, so the package must be installed in the kernel environment.
from datasets import load_dataset
imdb = load_dataset("imdb")

ModuleNotFoundError: No module named 'datasets'

In [2]:
# Display the first record of the "test" split to inspect its layout.
imdb["test"][0]

{'text': 'I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn\'t match the background, and painfully one-dimensional characters cannot be overcome with a \'sci-fi\' setting. (I\'m sure there are those of you out there who think Babylon 5 is good sci-fi TV. It\'s not. It\'s clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It\'s really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it\'s rubbish as 

In [3]:
from transformers import AutoTokenizer

# Checkpoint name; later cells reuse `model_name` to load the model itself.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def preprocess_function(examples):
    """Tokenize the "text" field of `examples` with truncation enabled."""
    return tokenizer(examples["text"], truncation=True)


# batched=True is passed so the mapping function receives batches of examples.
tokenized_imdb = imdb.map(preprocess_function, batched=True)

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/25 [00:00<?, ?ba/s]

  0%|          | 0/50 [00:00<?, ?ba/s]

In [4]:
from transformers import DataCollatorWithPadding

# Framework switch: choose which tensor type the collator should emit.
use_pytorch = True
use_tensorflow = False

# Both branches use the same class; only `return_tensors` differs.
if use_pytorch:
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
elif use_tensorflow:
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")
else:
    # Fail here rather than with a NameError in a later cell.
    raise ValueError("Set use_pytorch or use_tensorflow to True to build a data collator.")

In [5]:
import evaluate

# Metric card for the metric loaded below:
# link: https://huggingface.co/spaces/evaluate-metric/accuracy
# Alternative metrics that could be loaded instead:
# alt: https://huggingface.co/spaces/evaluate-metric/precision
# alt: https://huggingface.co/spaces/evaluate-metric/recall

# Metric object consumed by `compute_metrics` during evaluation.
accuracy = evaluate.load("accuracy")

In [6]:
import numpy as np

def compute_metrics(eval_pred):
    """Score a batch of model outputs with the module-level `accuracy` metric.

    Args:
        eval_pred: a 2-item iterable ``(predictions, labels)``. The
            predictions are reduced with ``argmax`` along axis 1 before
            scoring.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted class ids
        against the reference labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [6]:
# Binary sentiment task: map each class index to its readable label.
id2label = {0: "NEGATIVE", 1: "POSITIVE"}

# Reverse lookup, built from id2label so the two mappings cannot drift apart.
label2id = {label_name: class_id for class_id, label_name in id2label.items()}

In [14]:
# PyTorch Training Version

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the checkpoint named by `model_name` with num_labels=2 and the
# label mappings defined in the earlier cell.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2, id2label=id2label, label2id=label2id
)

# This automatically initializes the output_dir as a git dir,
# which enables automatically pushing the model to the HF Hub.
# Creating the repo explicitly beforehand is safer though:
# create_repo(..., exist_ok=True)

training_args = TrainingArguments(
    output_dir="binary_text_classification_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    optim="adamw_torch",
    load_best_model_at_end=True,
    push_to_hub=True,
    report_to="all",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_imdb["train"],
    # The "test" split is used as the evaluation set here.
    eval_dataset=tokenized_imdb["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

loading configuration file config.json from cache at /Users/gen/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "NEGATIVE",
    "1": "POSITIVE"
  },
  "initializer_range": 0.02,
  "label2id": {
    "NEGATIVE": 0,
    "POSITIVE": 1
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.24.0",
  "vocab_size": 30522
}

loading weights file pytorch_model.bin from cache at /Users/gen/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda

In [None]:
# Run fine-tuning with the Trainer configured in the previous cell.
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 25000
  Num Epochs = 2
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3126
  Number of trainable parameters = 66955010
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2336,0.187559,0.9274


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 25000
  Batch size = 16
Saving model checkpoint to binary_text_classification_model/checkpoint-1563
Configuration saved in binary_text_classification_model/checkpoint-1563/config.json
Model weights saved in binary_text_classification_model/checkpoint-1563/pytorch_model.bin
tokenizer config file saved in binary_text_classification_model/checkpoint-1563/tokenizer_config.json
Special tokens file saved in binary_text_classification_model/checkpoint-1563/special_tokens_map.json
tokenizer config file saved in binary_text_classification_model/tokenizer_config.json
Special tokens file saved in binary_text_classification_model/special_tokens_map.json
The following columns in the eva

In [None]:
# trainer.push_to_hub()  # not needed: already pushed automatically because push_to_hub=True was set above

# TensorFlow Trainer Version

In [None]:
# link for TF Trainer: 
# https://huggingface.co/docs/transformers/training#train-a-tensorflow-model-with-keras

In [None]:
from transformers import create_optimizer
import tensorflow as tf

# Checkpoint name and training hyperparameters for the TensorFlow run.
model_name = "distilbert-base-uncased"
batch_size = 16
num_epochs = 5

# Total optimizer steps = full batches per epoch x number of epochs.
batches_per_epoch = len(tokenized_imdb["train"]) // batch_size
total_train_steps = batches_per_epoch * num_epochs

optimizer, schedule = create_optimizer(
    init_lr=2e-5,
    num_warmup_steps=0,
    num_train_steps=total_train_steps,
)

In [None]:
from transformers import DataCollatorWithPadding, TFAutoModelForSequenceClassification

# Load the model with the label count and mappings
model = TFAutoModelForSequenceClassification.from_pretrained(
    model_name, num_labels=2, id2label=id2label, label2id=label2id
)

# Build a TensorFlow collator locally instead of reusing `data_collator`:
# the framework flags earlier in the notebook select the PyTorch variant,
# so the shared collator is not guaranteed to emit TensorFlow tensors.
tf_data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

# Convert to the TF dataset. `batch_size` comes from the optimizer cell so the
# dataset batching matches the step count the LR schedule was computed from.
tf_train_set = model.prepare_tf_dataset(
    tokenized_imdb["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=tf_data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_imdb["test"],
    shuffle=False,
    batch_size=batch_size,
    collate_fn=tf_data_collator,
)

In [None]:
# Set the configuration for the model by compiling
import tensorflow as tf

model.compile(optimizer=optimizer)


In [None]:
# Keras callbacks: accuracy evaluation and pushing the model to the Hub

from transformers.keras_callbacks import KerasMetricCallback, PushToHubCallback

# Evaluates `compute_metrics` on the validation set during training.
metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)

# Must be a plain string: a trailing comma after the literal would turn this
# into a 1-tuple and hand a tuple to `output_dir` below.
tf_model_output_dir = "text-classification-to-label-tensorflow"

push_to_hub_callback = PushToHubCallback(
    output_dir=tf_model_output_dir,
    tokenizer=tokenizer,
)

callbacks = [metric_callback, push_to_hub_callback]

In [None]:
# Fine tune the model by fitting
# Train for `num_epochs` epochs: the learning-rate schedule was sized from
# `num_epochs`, so a different literal here would leave the two out of sync.
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=num_epochs, callbacks=callbacks)

In [None]:
# more info:
# PyTorch: https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification.ipynb
# TensorFlow: https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/text_classification-tf.ipynb

# Use model for inference

In [7]:
from transformers import pipeline

# Sample review to classify.
text = "This was a masterpiece. Not completely faithful to the books, but enthralling from beginning to end. Might be my favorite of the three."

# Model to serve. The active choice is a remote model on the Hub.
# NOTE(review): `model=` should also accept a local directory such as a
# training output_dir — confirm before switching to one of these:
# model_in_dir = "binary_text_classification_model"
# model_in_dir = tf_model_output_dir
model_in_dir = "stevhliu/my_awesome_model"

classifier = pipeline("sentiment-analysis", model=model_in_dir)
classifier(text)

[{'label': 'LABEL_1', 'score': 0.9994940757751465}]

In [None]:
# ALT: Manual Pipeline in PyTorch

# `torch` must be imported explicitly: it is used below for no_grad().
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Tokenize the sample text into PyTorch tensors.
tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_model")
inputs = tokenizer(text, return_tensors="pt")

model = AutoModelForSequenceClassification.from_pretrained("stevhliu/my_awesome_model")
# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    logits = model(**inputs).logits

# Index of the highest logit, mapped to its label via the model config.
predicted_class_id = logits.argmax().item()
model.config.id2label[predicted_class_id]

In [None]:
# ALT: Manual Pipeline in TensorFlow

# Import TensorFlow here so this cell does not depend on the training cells
# having been executed first; `tf` is used below for argmax.
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

# Tokenize the sample text into TensorFlow tensors.
tokenizer = AutoTokenizer.from_pretrained("stevhliu/my_awesome_model")
inputs = tokenizer(text, return_tensors="tf")

model = TFAutoModelForSequenceClassification.from_pretrained("stevhliu/my_awesome_model")
logits = model(**inputs).logits

# Index of the highest logit for the first input, mapped to its label.
predicted_class_id = int(tf.math.argmax(logits, axis=-1)[0])
model.config.id2label[predicted_class_id]

# Task 2: Emotional Categories

A DistilBERT model from the HF Hub, fine-tuned on an emotion dataset with six labels: sadness, joy, love, anger, fear, surprise.

In [None]:
from transformers import pipeline

# Emotion classifier from the Hub.
# NOTE(review): top_k=None presumably requests a score for every label — confirm.
classifier = pipeline(
    "text-classification",
    model="bhadresh-savani/distilbert-base-uncased-emotion",
    top_k=None,
)

prediction = classifier("I love using transformers. The best part is wide range of support and its easy to use")
print(prediction)

# Option 3: provide potential labels

In [None]:
# Multiclass, dependent classification 
# Probabilities for the classes must sum to 1

In [None]:
from transformers import pipeline

# No model is named here, so the pipeline falls back to its default.
classifier = pipeline("zero-shot-classification")
# Default model: facebook/bart-large-mnli
# https://huggingface.co/facebook/bart-large-mnli
# It is trained on an NLI (natural language inference) dataset.
# Labels are scored by predicting inference/entailment between the source sentence and a hypothesis like:
# "This sentence is about <label>"

In [None]:
# Score one sentence against three candidate topic labels.
classifier(
    "This is a course about the Transformers library",
    candidate_labels=["education", "politics", "business"],
)

In [None]:
# Dependent labels: the same sentence scored against a different candidate set.
classifier(
    "This is a course about the Transformers library",
    candidate_labels=["obscenity", "political", "violent"],
)

In [None]:
# Add up three label scores to check that they total (approximately) 1.
# NOTE(review): presumably copied from a zero-shot result above — confirm.
x = (
    0.4018270671367645
    + 0.3437800407409668
    + 0.2543928921222687
)
x

In [None]:
# Another example from:
# https://towardsdatascience.com/zero-shot-text-classification-with-hugging-face-7f533ba83cd6

import GetOldTweets3 as got
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

from transformers import pipeline

classifier = pipeline("zero-shot-classification")

# Search phrase and the record limit passed to the query helper below.
txt = 'climate fight'
max_recs = 500

# NOTE(review): `text_query_to_df` is neither defined nor imported in the
# visible notebook, so this line raises NameError as written; presumably it is
# a helper from the linked article — confirm and add its definition.
tweets_df = text_query_to_df(txt, max_recs)

# NOTE(review): "advertisment" looks misspelled; it is a runtime label string,
# so it is left unchanged here — confirm the intended spelling.
candidate_labels = ["renewable", "politics", "emission", "temperature", "emergency", "advertisment"]
# NOTE(review): `sent` is never assigned in the visible notebook; presumably it
# is the text of a single tweet taken from `tweets_df` — confirm.
res = classifier(sent, candidate_labels)

In [None]:
# Increment the first counter when the first returned label is 'renewable'
# and its score is above 0.5.
# NOTE(review): `candidate_results` is not initialised in the visible notebook;
# presumably a list of per-label counters — confirm and define it first.
if res['labels'][0] == 'renewable' and res['scores'][0] > 0.5:
    candidate_results[0] = candidate_results[0] + 1

In [None]:
# multi-class classification
