## Text Classification with Hugging Face BERT Models

This notebook demonstrates fine-tuning a Hugging Face BERT model for text classification using the TensorFlow framework. It also provides examples of running predictions using the Hugging Face `pipeline` API and a custom prediction method.

In [18]:
# !pip install -q transformers

In [19]:
import pandas as pd
from tqdm.notebook import tqdm
from transformers import BertTokenizerFast,TFBertForSequenceClassification
import numpy as np
import tensorflow as tf 
from tqdm.notebook import tqdm 

## Data 

We create sample data to illustrate training.

In [20]:
# Toy corpus: political sentences carry label 1, everything else label 0.
# The four examples are repeated 100 times to give the model something to fit.
data = pd.DataFrame(
    [
        ('The president provided his stance on the wind mill policy today', 1),
        ('The sun is shining bright and the day looks nice', 0),
        ('The senators gathers to deliberate on climate change laws', 1),
        ('Oh to be young and feel loves keen sting - Albus Dumbledore', 0),
    ] * 100,
    columns=['text', 'label'],
)

## Initialize Model and Tokenizer

Download a pretrained model and tokenizer.

In [None]:
# Directory the fine-tuned tokenizer and model are written to (see train_on_slice).
model_output_path = "models/text_classification"
# Identifier of the pretrained checkpoint handed to `from_pretrained` below.
model_path = "bert-base-uncased"


# Load the tokenizer and the sequence-classification model from the same checkpoint.
tokenizer = BertTokenizerFast.from_pretrained(model_path)
model = TFBertForSequenceClassification.from_pretrained(model_path)

# Learning rate starts at 5e-5 and is scaled by 0.9 per 10,000 optimizer steps.
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=5e-5,
    decay_steps=10000,
    decay_rate=0.9)
optimizer = tf.keras.optimizers.Adam(learning_rate=lr_schedule)
# Labels are integer class ids and the loss is told to expect raw logits
# (from_logits=True), so no softmax is applied before the loss.
model.compile(optimizer=optimizer, 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=tf.metrics.SparseCategoricalAccuracy()
              ) # can also use any keras loss fn

In [None]:
# Convert the text in the dataframe to a `tf.data` pipeline.

def get_train_ds(data, batch_size=32):
  """Turn a dataframe with `text` and `label` columns into a batched `tf.data.Dataset`.

  The texts are tokenized with the notebook-level `tokenizer` (truncated and
  padded), paired with their labels, and grouped into batches.

  Args:
    data: Dataframe exposing `text` and `label` columns.
    batch_size: Number of examples per batch.

  Returns:
    A batched dataset yielding `(encodings_dict, labels)` pairs.
  """
  texts = list(data["text"])
  labels = list(data["label"])
  encodings = tokenizer(texts, truncation=True, padding=True)
  features = dict(encodings)
  dataset = tf.data.Dataset.from_tensor_slices((features, labels))
  return dataset.batch(batch_size)

def train_on_slice(train_ds, num_epochs=1):
  """Fit the notebook-level `model` on `train_ds`, then save tokenizer and model.

  Args:
    train_ds: Dataset passed directly to `model.fit`.
    num_epochs: Number of epochs to train for.

  After training, the tokenizer and then the model are written to
  `model_output_path` with `save_pretrained`.
  """
  model.fit(train_ds, epochs=num_epochs)
  save_dir = f"{model_output_path}"
  # Tokenizer first, then model — same order the artifacts are reloaded in later.
  for artifact in (tokenizer, model):
    artifact.save_pretrained(save_dir)

In [None]:
# Build the batched training dataset from the sample dataframe, then
# fine-tune on it and save the resulting tokenizer and model.
train_ds = get_train_ds(data)
train_on_slice(train_ds) 



## Predictions 

We can make predictions with the Hugging Face `pipeline` API or write a custom prediction method.

In [None]:
from transformers import TextClassificationPipeline

# Reload the tokenizer and model from the directory written during training.
# Note: this rebinds the notebook-level `tokenizer` and `model` names.
tokenizer = BertTokenizerFast.from_pretrained(model_output_path)
# id2label gives the class ids used in the sample data (1, 0) readable names.
model = TFBertForSequenceClassification.from_pretrained(model_output_path, id2label={1: 'political', 0: 'general'} ) # modify labels as needed. 
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer)

In [44]:
# Classify the first five sample texts with the pipeline.
texts = data["text"].head(5).tolist()
preds = pipe(texts)
preds

[{'label': 'political', 'score': 0.979645311832428},
 {'label': 'general', 'score': 0.9694413542747498},
 {'label': 'political', 'score': 0.9788018465042114},
 {'label': 'general', 'score': 0.9729897975921631},
 {'label': 'political', 'score': 0.979645311832428}]

In [45]:
# Tabulate the pipeline predictions alongside the texts they belong to.
preds_df = pd.DataFrame(preds).assign(text=texts)
preds_df

Unnamed: 0,label,score,text
0,political,0.979645,The president provided his stance on the wind ...
1,general,0.969441,The sun is shining bright and the day looks nice
2,political,0.978802,The senators gathers to deliberate on climate ...
3,general,0.97299,Oh to be young and feel loves keen sting - Alb...
4,political,0.979645,The president provided his stance on the wind ...


## Custom Prediction Method

In [48]:
def extract_predictions(text, model, tokenizer, batch_size=256, id2label=None):
  """Classify `text` in chunks and return the predicted label and score per item.

  Args:
    text: Sequence of strings to classify (a list, tuple, pandas Series, ...).
    model: Object whose `predict(ds)` result exposes a "logits" entry holding
      one row of class logits per input.
    tokenizer: Callable invoked as `tokenizer(texts, truncation=True, padding=True)`
      whose result can be converted with `dict(...)`.
    batch_size: Number of texts tokenized and predicted per chunk.
    id2label: Optional mapping from predicted class index to label name. When
      omitted, class 0 is reported as "general" and any other class as
      "political".

  Returns:
    A `pd.DataFrame` with one row per input, in input order, and the columns
    "label" and "score" (softmax probability of the predicted class). Empty
    input yields an empty frame that still carries both columns.
  """
  columns = ["label", "score"]
  # Materialize once so slices are plain lists regardless of the input container.
  texts = list(text)
  if not texts:
    # Nothing to predict; keep the schema stable for downstream column access.
    return pd.DataFrame(columns=columns)

  a_preds = []
  a_scores = []
  chunks = range(0, len(texts), batch_size)
  for start in tqdm(chunks, desc="batching and predicting", total=len(chunks)):
    stext = texts[start:start + batch_size]
    data_encodings = tokenizer(stext, truncation=True, padding=True)
    ds = tf.data.Dataset.from_tensor_slices(dict(data_encodings)).batch(batch_size)

    logits = model.predict(ds)["logits"]
    probs = tf.nn.softmax(logits)  # get probabilities from logits
    a_preds.extend(np.argmax(logits, axis=1).tolist())
    a_scores.extend(np.amax(probs, axis=1).tolist())

  def label_for(class_id):
    # Explicit mapping wins; otherwise fall back to the notebook's two-class naming.
    if id2label is not None:
      return id2label[class_id]
    return "general" if class_id == 0 else "political"

  labels = [label_for(class_id) for class_id in a_preds]
  return pd.DataFrame({"label": labels, "score": a_scores}, columns=columns)

In [49]:
# Run the same texts through the custom helper and attach them to the result.
preds_df = extract_predictions(texts, model, tokenizer).assign(text=texts)
preds_df

batching and predicting:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,label,score,text
0,political,0.979645,The president provided his stance on the wind ...
1,general,0.969441,The sun is shining bright and the day looks nice
2,political,0.978802,The senators gathers to deliberate on climate ...
3,general,0.97299,Oh to be young and feel loves keen sting - Alb...
4,political,0.979645,The president provided his stance on the wind ...
