# Text Classification with Hugging Face

Sentiment Analysis:
Determine positive/negative tone of movie review using labelled data from the imdb.

In [1]:
from datasets import load_dataset
imdb = load_dataset("imdb")

Found cached dataset imdb (/Users/gen/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/3 [00:00<?, ?it/s]

In [2]:
imdb["test"][0]

{'text': 'I love sci-fi and am willing to put up with a lot. Sci-fi movies/TV are usually underfunded, under-appreciated and misunderstood. I tried to like this, I really did, but it is to good TV sci-fi as Babylon 5 is to Star Trek (the original). Silly prosthetics, cheap cardboard sets, stilted dialogues, CG that doesn\'t match the background, and painfully one-dimensional characters cannot be overcome with a \'sci-fi\' setting. (I\'m sure there are those of you out there who think Babylon 5 is good sci-fi TV. It\'s not. It\'s clichéd and uninspiring.) While US viewers might like emotion and character development, sci-fi is a genre that does not take itself seriously (cf. Star Trek). It may treat important issues, yet not as a serious philosophy. It\'s really difficult to care about the characters here as they are not simply foolish, just missing a spark of life. Their actions and reactions are wooden and predictable, often painful to watch. The makers of Earth KNOW it\'s rubbish as 

In [3]:
from transformers import AutoTokenizer

def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

tokenized_imdb = imdb.map(preprocess_function, batched=True)

Loading cached processed dataset at /Users/gen/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1/cache-282ea8cadface71d.arrow


  0%|          | 0/25 [00:00<?, ?ba/s]

Loading cached processed dataset at /Users/gen/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1/cache-6fd244913e166afc.arrow


In [4]:
# Tokenization with Pytorch
from transformers import DataCollatorWithPadding as dcwp_torch

data_collator = dcwp_torch(tokenizer=tokenizer)

In [None]:
# Tokenization with TensorFlow
from transformers import DataCollatorWithPadding as dcwp_tensor

data_collator = dcwp_tensor(tokenizer=tokenizer, return_tensors="tf")

In [5]:
import evaluate

accuracy = evaluate.load("accuracy")

In [6]:
import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [7]:
# Directional labels

id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}

In [None]:
# PyTorch Training Version

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id
)

training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_imdb["train"],
    eval_dataset=tokenized_imdb["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

# Task 2: Emotional Categories

HF DistilBERT dataset fine tuned on emotions: sadness, joy, love, anger, fear, surprise.

In [9]:
from transformers import pipeline
classifier = pipeline("text-classification", model='bhadresh-savani/distilbert-base-uncased-emotion', top_k=None)
prediction = classifier("I love using transformers. The best part is wide range of support and its easy to use", )
print(prediction)

[[{'label': 'joy', 'score': 0.9959298968315125}, {'label': 'anger', 'score': 0.0018055237596854568}, {'label': 'love', 'score': 0.0009452480589970946}, {'label': 'sadness', 'score': 0.0006792713538743556}, {'label': 'fear', 'score': 0.0004111042362637818}, {'label': 'surprise', 'score': 0.0002288569521624595}]]


# Option 3: provide potential labels

In [21]:
from transformers import pipeline

classifier = pipeline("zero-shot-classification")
# default model: facebook/bart-large-mnli
# https://huggingface.co/facebook/bart-large-mnli
# NLI dataset.
# categorization by predicting inference/entailment between source sentence and an example like:
# This sentence is about <label>

No model was supplied, defaulted to facebook/bart-large-mnli and revision c626438 (https://huggingface.co/facebook/bart-large-mnli).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [12]:
classifier(
    "This is a course about the Transformers library",
    candidate_labels=["education", "politics", "business"],
)

{'sequence': 'This is a course about the Transformers library',
 'labels': ['education', 'business', 'politics'],
 'scores': [0.8445950150489807, 0.11197733134031296, 0.043427735567092896]}

In [15]:
# Dependent labels
classifier(
    "This is a course about the Transformers library",
    candidate_labels=["obscenity", "political", "violent"],
)

{'sequence': 'This is a course about the Transformers library',
 'labels': ['violent', 'political', 'obscenity'],
 'scores': [0.4018270671367645, 0.3437800407409668, 0.2543928921222687]}

In [19]:
x = 0.4018270671367645 + 0.3437800407409668 + 0.2543928921222687
x

1.0

In [22]:
# Another example from:
# https://towardsdatascience.com/zero-shot-text-classification-with-hugging-face-7f533ba83cd6

import GetOldTweets3 as got
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

from transformers import pipeline

classifier = pipeline("zero-shot-classification")

txt = 'climate fight'
max_recs = 500

tweets_df = text_query_to_df(txt, max_recs)

candidate_labels = ["renewable", "politics", "emission", "temperature", "emergency", "advertisment"]
res = classifier(sent, candidate_labels)

ModuleNotFoundError: No module named 'GetOldTweets3'

In [None]:
if res['labels'][0] == 'renewable' and res['scores'][0] > 0.5:
    candidate_results[0] = candidate_results[0] + 1

In [None]:
# multi-class classification
