# MultiClass Classification in 10 Minutes with BERT-TensorFlow and SoftMax
- Based on Article  
  https://towardsdatascience.com/sentiment-analysis-in-10-minutes-with-bert-and-hugging-face-294e8a04b671

- Data Source:
  - Unzip files (only one time after downloading tar.gz file)  
  http://qwone.com/~jason/20Newsgroups/

  - Download Link:  
    http://qwone.com/~jason/20Newsgroups/20news-bydate.tar.gz

In [None]:
from platform import python_version

print(python_version())

## Install Transformers Python Library to run it in CoLab

In [None]:
#!pip install transformers
import pandas as pd
import numpy as np
import tensorflow as tf
import torch
from torch.nn import BCEWithLogitsLoss, BCELoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, f1_score, accuracy_score
import pickle
from transformers import *
from tqdm import tqdm, trange
from ast import literal_eval

## Mount Google Drive to Read Data & Model from Local Storage

In [None]:
# Ask TensorFlow for the name of a visible GPU device
# (presumably an empty string when none is found — see the TF docs).
device_name = tf.test.gpu_device_name()
# The hard failure on a missing GPU is deliberately disabled.
#if device_name != '/device:GPU:0':
#  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

In [None]:
# Pick the PyTorch device: CUDA when available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Number of CUDA devices PyTorch can see (0 on a CPU-only machine).
n_gpu = torch.cuda.device_count()
# Show the GPU name as the cell output. get_device_name(0) raises when no
# CUDA device exists, so fall back to a plain marker on CPU-only hosts.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "cpu"

In [None]:
import gc
gc.collect()

In [None]:
# Identifier of the dataset this run trains on; the paths below derive from it.
dataset_name = 'S327Q02'
#sub_dataset = 'gender2'
# Training CSV to read, and the directory the fine-tuned model is saved to.
train_data_file = f'../dataset/extracted_files/{dataset_name}_train.csv'
output_model_name = f'../models/G-SciEdBERT_model_{dataset_name}'

In [None]:
# Load the training split from the CSV path configured above and preview it.
train_df = pd.read_csv(train_data_file)
train_df.head()

In [None]:
# Report whether every sentence is distinct (as many unique values as rows).
all_sentences_unique = train_df.sentence.nunique() == train_df.shape[0]
print('Unique comments in training: ', all_sentences_unique)

# Report missing values, discard incomplete rows, then confirm none remain.
nulls_before_drop = train_df.isnull().values.any()
print('Null values in training: ', nulls_before_drop)
train_df = train_df.dropna()
nulls_after_drop = train_df.isnull().values.any()
print('Null values after drop in training: ', nulls_after_drop)

In [None]:
# Encode each distinct score as an integer class id (the codes of an
# unordered Categorical built from the score column).
train_df['label'] = pd.Categorical(train_df.score, ordered=False).codes
# Display the distinct class ids that were produced.
train_df['label'].unique()

In [None]:
# Group by (score, label) so each distinct pair becomes one index entry of
# the resulting counts frame.
mapLabels = pd.DataFrame(train_df.groupby(['score', 'label']).count())

# Drop the 'sentence' count column; only the (score, label) index is used below.
mapLabels.drop(['sentence'], axis = 1, inplace = True)
# orient='index' keys the dict by index entry, i.e. by (score, label) tuple.
label2Index = mapLabels.to_dict(orient='index')

print (f"label2Index :{label2Index}")
print (type(label2Index))
#print (f"index2Label :{index2Label}")

In [None]:
# Build the class-id -> score lookup. Each key of label2Index is a
# (score, class id) pair, so unpack it directly while iterating.
index2label = {}

for score_value, class_id in label2Index:
  print (f"{class_id} -> {score_value}")
  index2label[class_id] = score_value

In [None]:
# Reverse index2label so that a score can be looked up to get its class id.
label2Index = dict((score_value, class_id) for class_id, score_value in index2label.items())

print (f'label2Index: {label2Index}')
print (f'index2label: {index2label}')

In [None]:
train_df.head()

In [None]:
# Switch to the generic column names used by the rest of the notebook:
# 'label' -> 'LABEL_COLUMN' and 'sentence' -> 'DATA_COLUMN'.
train_df.rename(columns = {'label' : 'LABEL_COLUMN', 'sentence' : 'DATA_COLUMN'}, inplace = True)

In [None]:
# Remove e-mail addresses to avoid additional noise.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the attribute-accessed Series: that chained
# in-place call works on an intermediate object and is not guaranteed to
# update train_df (pandas warns about it; it is a no-op under copy-on-write).
train_df['DATA_COLUMN'] = train_df['DATA_COLUMN'].replace(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+', '', regex=True)

In [None]:
# Keep only the class-id and text columns, in that order.
train_df = train_df[['LABEL_COLUMN','DATA_COLUMN']]

In [None]:
train_df.head()

In [None]:
train_df.count()

In [None]:
#splitSize = df.count() * .8
#splitSize

In [None]:
#people_copy = people.copy()
# Shuffle all rows (frac=1) with a fixed seed so the order is reproducible.
train = train_df.sample(frac=1, random_state=5)
#new_data = train.sample(frac=0.8, random_state=0)

#test = train_df.drop(train.index)

In [None]:
# Row counts per column of the shuffled training frame.
print (train.count())
# Distinct class ids, and how many training rows carry each of them.
unique_labels = np.unique(train["LABEL_COLUMN"].tolist())
label_counts = train["LABEL_COLUMN"].value_counts()
print(label_counts)
print(unique_labels)


In [None]:
import pandas as pd
import numpy as np

# Function to calculate the Gini Coefficient
def gini_coefficient(array):
    """Calculate the Gini coefficient of a collection of non-negative values.

    Args:
        array: Array-like of non-negative numbers (e.g. per-label counts).
            Plain lists and tuples are accepted as well as numpy arrays.

    Returns:
        The Gini coefficient as a float: 0.0 for a perfectly even
        distribution, growing towards 1 as the total concentrates in a
        single entry. Empty input and input that sums to zero yield 0.0
        instead of NaN.
    """
    values = np.asarray(array, dtype=float).ravel()
    n = values.shape[0]
    total = values.sum()
    # Nothing to distribute: define the coefficient as 0 rather than dividing by zero.
    if n == 0 or total == 0:
        return 0.0
    # Normalise so the values sum to 1, then sort ascending for the rank formula.
    shares = np.sort(values / total)
    rank = np.arange(1, n + 1)
    return float(np.sum((2 * rank - n - 1) * shares) / n)

# Calculate the Gini Coefficient for the label counts computed in the
# previous cell (here: the training split) as a measure of class imbalance.
gini = gini_coefficient(label_counts.values)
print(f"Gini Coefficient for the label distribution: {gini}")

In [None]:
# Load the held-out split that is used as validation data during training.
validation_data_file = '../dataset/extracted_files/'+dataset_name+'_test.csv'
test_df = pd.read_csv(validation_data_file)
print('Unique comments in testing: ', test_df.sentence.nunique() == test_df.shape[0])
print('Null values in testing: ', test_df.isnull().values.any())
test_df = test_df.dropna()
print('Null values after drop in testing: ', test_df.isnull().values.any())

# Encode the scores with the SAME score -> class-id mapping that was built
# from the training data (label2Index). Building a fresh Categorical on the
# test split alone would shift the ids whenever a class is missing from it.
unseen_scores = sorted(set(test_df['score']) - set(label2Index))
if unseen_scores:
    raise ValueError(f"Test scores not present in the training data: {unseen_scores}")
test_df['score'] = test_df['score'].map(label2Index).astype('int64')

test_df.rename(columns = {'score' : 'LABEL_COLUMN', 'sentence' : 'DATA_COLUMN'}, inplace = True)
# Remove e-mail addresses; assign back rather than using a chained inplace
# replace, which is not guaranteed to update the frame.
test_df['DATA_COLUMN'] = test_df['DATA_COLUMN'].replace(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+', '', regex=True)
test_df = test_df[['LABEL_COLUMN','DATA_COLUMN']]
# Shuffle all rows with a fixed seed so the order is reproducible.
test = test_df.sample(frac=1, random_state=5)
print (test.count())
#unique_labels = np.unique(test_data["LABEL_COLUMN"].tolist())
#label_counts = test_data["LABEL_COLUMN"].value_counts()
#print(label_counts)
#print(unique_labels)

In [None]:
# Distinct class ids in the training data; their count is used as num_labels
# when the model is created.
uniqueLabels = train_df['LABEL_COLUMN'].unique()
print (f'Number of Labels: {len(uniqueLabels)},\nLabels:{uniqueLabels}')
# All training texts as a plain list, for the tokenizer check below.
sentences = list(train_df.DATA_COLUMN.values)

## Load the Model
See the Load and Save notebooks in this repository to understand how Transformers models can be:
1. Downloaded
2. Stored Locally and
3. be used from Local Storage.

This should be interesting if you work in a cloud environment without Internet connection.

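Here we tell the model that we wish to train on one label value per distinct class in the training data (`num_labels=len(uniqueLabels)`; **20 label values** for the 20 Newsgroups data) instead of the original 1 label (with 1 or 0 values) for which the original model was designed. This is why the message printed below tells us that we should train this model before using it. So, training it we will :-)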

In [None]:
max_length = 512
# Pre-trained German BERT with a fresh classification head sized to the
# number of distinct training labels.
model = TFBertForSequenceClassification.from_pretrained('bert-base-german-cased', num_labels=len(uniqueLabels))
#model = TFBertForSequenceClassification.from_pretrained('../models/G-SciEdBert', from_pt=True, num_labels=len(uniqueLabels))
# The checkpoint is *cased*, so the text must not be lower-cased: its
# vocabulary distinguishes case, and BertTokenizer's lower-casing also strips
# accents by default, which would alter German umlauts.
tokenizer = BertTokenizer.from_pretrained('bert-base-german-cased', do_lower_case=False) # tokenizer
# Sanity check of the tokenizer output. padding='max_length' replaces the
# deprecated pad_to_max_length=True; truncation=True keeps long texts within max_length.
encodings = tokenizer.batch_encode_plus(sentences, max_length=max_length, padding='max_length', truncation=True) # tokenizer's encoding method
print('tokenizer outputs: ', encodings.keys())

In [None]:
model.summary()

## Creating Input Sequences
We have two pandas Dataframe objects waiting for us to convert them into suitable objects for the BERT model. We will take advantage of the InputExample function that helps us to create sequences from our dataset. The InputExample function can be called as follows:

In [None]:
# transformers.InputExample
# Illustration only: builds one example object; the result is not stored.
InputExample(guid=None,
             text_a = "Hello, world",
             text_b = None,
             label = 1)

Now we will create two main functions:

1 — `convert_data_to_examples`: This will accept our train and test datasets and convert each row into an InputExample object.

2 — `convert_examples_to_tf_dataset`: This function will tokenize the InputExample objects, then create the required input format with the tokenized objects, finally, create an input dataset that we can feed to the model.

In [None]:
def convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN):
  """Wrap every row of two DataFrames in an ``InputExample``.

  Args:
    train: DataFrame holding the training rows.
    test: DataFrame holding the validation rows.
    DATA_COLUMN: Name of the column that contains the text.
    LABEL_COLUMN: Name of the column that contains the label.

  Returns:
    A pair ``(train_InputExamples, validation_InputExamples)``, each the
    result of applying the row conversion to the corresponding frame.
  """
  def row_to_example(row):
    # guid is a globally unique ID for bookkeeping; it is unused in this case.
    return InputExample(guid=None,
                        text_a=row[DATA_COLUMN],
                        text_b=None,
                        label=row[LABEL_COLUMN])

  train_InputExamples = train.apply(row_to_example, axis=1)
  validation_InputExamples = test.apply(row_to_example, axis=1)
  return train_InputExamples, validation_InputExamples

In [None]:
# Convert the shuffled train/test frames into InputExample objects, passing
# the column names as string literals.
train_InputExamples, validation_InputExamples = convert_data_to_examples(train,
                                                                           test,
                                                                           'DATA_COLUMN',
                                                                           'LABEL_COLUMN')

In [None]:
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    """Tokenize ``InputExample`` objects and expose them as a ``tf.data.Dataset``.

    Args:
        examples: Iterable of objects providing ``text_a`` and ``label``.
        tokenizer: Tokenizer providing ``encode_plus``.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        A ``tf.data.Dataset`` yielding ``(inputs, label)`` pairs. ``inputs``
        is a dict with ``input_ids``, ``attention_mask`` and
        ``token_type_ids`` (int32 vectors); ``label`` is an int64 scalar.
    """
    features = [] # -> will hold InputFeatures to be converted later

    for e in examples:
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length, # truncates if len(s) > max_length
            return_token_type_ids=True,
            return_attention_mask=True,
            # Pads every sequence to max_length; replaces the deprecated
            # pad_to_max_length=True flag.
            padding="max_length",
            truncation=True,
        )

        features.append(
            InputFeatures(
                input_ids=input_dict["input_ids"],
                attention_mask=input_dict["attention_mask"],
                token_type_ids=input_dict["token_type_ids"],
                label=e.label,
            )
        )

    def gen():
        # Yield one (model inputs, label) pair per tokenized example.
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )


In [None]:
# Column-name constants; they match the names the frames were renamed to.
DATA_COLUMN = 'DATA_COLUMN'
LABEL_COLUMN = 'LABEL_COLUMN'

In [None]:
# Show the type and value of both column-name constants.
print (f"{type(DATA_COLUMN)} {DATA_COLUMN}")
print (f"{type(LABEL_COLUMN)} {LABEL_COLUMN}")

In [None]:
train.head(5)

In [None]:
%%time

# Wrap each row of the shuffled train/test frames in an InputExample.
train_InputExamples, validation_InputExamples = convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN)

# NOTE(review): max_length is not passed here, so the function default of 128
# applies rather than the notebook-level max_length = 512 — confirm intended.
train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
# Shuffle with a 100-element buffer, batch by 32 and repeat the data twice.
# NOTE(review): together with epochs=2 in the fit call this appears to amount
# to four passes over the training data — confirm intended.
train_data = train_data.shuffle(100).batch(32).repeat(2)

# Validation data is only batched (no shuffling, no repetition).
validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
validation_data = validation_data.batch(32)

In [None]:
# Row counts per column of the shuffled test frame.
print (test.count())
# Distinct class ids and their frequencies in the test split; this rebinds
# unique_labels and label_counts, which previously described the training split.
unique_labels = np.unique(test["LABEL_COLUMN"].tolist())
label_counts = test["LABEL_COLUMN"].value_counts()
print(label_counts)
print(unique_labels)


In [None]:
import pandas as pd
import numpy as np

# Function to calculate the Gini Coefficient
def gini_coefficient(array):
    """Calculate the Gini coefficient of a collection of non-negative values.

    Args:
        array: Array-like of non-negative numbers (e.g. per-label counts).
            Plain lists and tuples are accepted as well as numpy arrays.

    Returns:
        The Gini coefficient as a float: 0.0 for a perfectly even
        distribution, growing towards 1 as the total concentrates in a
        single entry. Empty input and input that sums to zero yield 0.0
        instead of NaN.
    """
    values = np.asarray(array, dtype=float).ravel()
    n = values.shape[0]
    total = values.sum()
    # Nothing to distribute: define the coefficient as 0 rather than dividing by zero.
    if n == 0 or total == 0:
        return 0.0
    # Normalise so the values sum to 1, then sort ascending for the rank formula.
    shares = np.sort(values / total)
    rank = np.arange(1, n + 1)
    return float(np.sum((2 * rank - n - 1) * shares) / n)

# Calculate the Gini Coefficient for the label counts computed in the
# previous cell (here: the test split) as a measure of class imbalance.
gini = gini_coefficient(label_counts.values)
print(f"Gini Coefficient for the label distribution: {gini}")

## Configuring the BERT model and Fine-tuning
We will use Adam as our optimizer, SparseCategoricalCrossentropy as our loss function (the labels are integer class ids, not one-hot vectors), and SparseCategoricalAccuracy as our accuracy metric. Fine-tuning the model for 2 epochs will give us good accuracy, which is great.

In [None]:
%%time

# Adam with a small learning rate and gradient-norm clipping. The loss takes
# integer class ids and raw logits (from_logits=True), so no softmax is
# applied before the loss.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])

# Fine-tune for two epochs, evaluating on the validation dataset after each.
model.fit(train_data, epochs=2, validation_data=validation_data)

In [None]:
#torch.save(model,output_model_name)
# Persist the fine-tuned model to the configured output directory.
# NOTE(review): only the model is saved here, not the tokenizer.
model.save_pretrained(output_model_name)

In [None]:
# Reload the held-out split for the final evaluation of the trained model.
validation_data_file = '../dataset/extracted_files/'+dataset_name+'_test.csv'
test_df = pd.read_csv(validation_data_file)
print('Unique comments in testing: ', test_df.sentence.nunique() == test_df.shape[0])
print('Null values in testing: ', test_df.isnull().values.any())
test_df = test_df.dropna()
print('Null values after drop in testing: ', test_df.isnull().values.any())

# Encode the scores with the SAME score -> class-id mapping that was built
# from the training data (label2Index). Building a fresh Categorical on the
# test split alone would shift the ids whenever a class is missing from it,
# silently corrupting the accuracy computed afterwards.
unseen_scores = sorted(set(test_df['score']) - set(label2Index))
if unseen_scores:
    raise ValueError(f"Test scores not present in the training data: {unseen_scores}")
test_df['score'] = test_df['score'].map(label2Index).astype('int64')

test_df.rename(columns = {'score' : 'LABEL_COLUMN', 'sentence' : 'DATA_COLUMN'}, inplace = True)
# Remove e-mail addresses; assign back rather than using a chained inplace
# replace, which is not guaranteed to update the frame.
test_df['DATA_COLUMN'] = test_df['DATA_COLUMN'].replace(r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+', '', regex=True)
test_df = test_df[['LABEL_COLUMN','DATA_COLUMN']]
# Shuffle all rows with a fixed seed so the order is reproducible.
test_data = test_df.sample(frac=1, random_state=5)
print (test_data.count())
# Distinct class ids and their frequencies in the evaluation data.
unique_labels = np.unique(test_data["LABEL_COLUMN"].tolist())
label_counts = test_data["LABEL_COLUMN"].value_counts()
print(label_counts)
print(unique_labels)

In [None]:
pred_sentences = test_data["DATA_COLUMN"].tolist()
# Keep the true labels as an array so element-wise comparisons are explicit.
validation_labels = np.asarray(test_data["LABEL_COLUMN"].tolist())

# Run inference in mini-batches: pushing the whole test set through the model
# in one forward pass can exhaust memory on larger splits.
eval_batch_size = 32
probability_chunks = []
for start in range(0, len(pred_sentences), eval_batch_size):
    tf_batch = tokenizer(pred_sentences[start:start + eval_batch_size], max_length=512, padding=True, truncation=True, return_tensors='tf')
    tf_outputs = model(tf_batch)
    probability_chunks.append(tf.nn.softmax(tf_outputs[0], axis=-1))
tf_predictions = tf.concat(probability_chunks, axis=0)

# Get index of predicted label for each sentence
predicted_labels = tf.argmax(tf_predictions, axis=1).numpy()

# Number of sentences whose predicted class equals the true class
# (kept under its historical name).
true_positives = int(np.sum(predicted_labels == validation_labels))
accuracy = true_positives/len(pred_sentences)
print("Overall testing Accuracy:",accuracy )

#for i in range(len(pred_sentences)):
    #print(pred_sentences[i], ": \n", str(predicted_labels[i]) +" with score: "+ str(tf_predictions[i][predicted_labels[i]].numpy()))
    #print ("Actual Label:",str(validation_labels[i]) )

# Compute accuracy for each label: among the sentences whose true class is
# `label`, the share that was predicted as `label`.
unique_labels = np.unique(validation_labels)
label_accuracies = {}

for label in unique_labels:
    has_label = validation_labels == label
    label_accuracies[label] = np.sum(predicted_labels[has_label] == label) / np.sum(has_label)

print("Validation accuracy for each label:", label_accuracies)

In [None]:
#model = torch.load(output_model_name)
# Point at a different, previously saved model directory and load it for
# inference. This rebinds output_model_name and tokenizer.
# NOTE(review): the tokenizer loaded here is the English 'bert-base-uncased',
# not the 'bert-base-german-cased' one used for training above — confirm it
# matches the model stored under output_model_name.
model_name = 'gelatin_gender2'
output_model_name = '../models/bert_model_'+model_name
#output_model_name = '../models/bert_model_ETS_CH_gelatin'
new_model = TFBertForSequenceClassification.from_pretrained(output_model_name)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) # tokenizer

Training the model might take a while, so ensure you enabled the GPU acceleration from the Notebook Settings. After our training is completed, we can move onto making sentiment predictions.

## Making Predictions
I created a list of two reviews I created. The first one is a positive review, while the second one is clearly negative.

In [None]:
pred_sentences = ["The water is only stirring while the weight is falling. When the weight falls, the paddle will stop stirring."]

In [None]:
# Select the dataset / sub-dataset whose test split is scored below.
dataset_name = 'gelatin'
sub_dataset = 'gender1'
data_file = '../datasets/'+dataset_name+'/'+sub_dataset+'_test.csv'
df = pd.read_csv(data_file)
# The file read here is the *_test.csv split, so the messages say "testing".
print('Unique comments in testing: ', df.sentence.nunique() == df.shape[0])
print('Null values in testing: ', df.isnull().values.any())
df = df.dropna()
print('Null values after drop in testing: ', df.isnull().values.any())
# Sentences to classify and their true scores, in matching order.
pred_sentences = list(df['sentence'])
actual_labels = list(df['score'])
print(len(pred_sentences))

We need to tokenize our reviews with our pre-trained BERT tokenizer. We will then feed these tokenized sequences to our model and run a final softmax layer to get the predictions. We can then use the argmax function to determine whether our sentiment prediction for the review is positive or negative. Finally, we will print out the results with a simple for loop. The following lines do all of these said operations:

In [None]:
import csv

# Tokenize all sentences and run them through the reloaded model; softmax
# turns the logits into per-class probabilities.
tf_batch = tokenizer(pred_sentences, max_length=128, padding=True, truncation=True, return_tensors='tf')
tf_outputs = new_model(tf_batch)
tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)

# Get index of predicted label for each sentence
pred_label = tf.argmax(tf_predictions, axis=1).numpy()
num_classes = tf_predictions.shape[1]

# Convert to numpy once instead of indexing the tensor element by element
# inside the loops below.
class_probs = tf_predictions.numpy()

# output human readable label predictions
for sentence, predicted, probs in zip(pred_sentences, pred_label, class_probs):
    print(sentence, ": \n", str(predicted) +" with score: "+ str(probs[predicted]))
    print ()

# Export one row per sentence: text, true score, predicted class, its
# probability, followed by the probability of every class.
with open('../outputfiles/'+model_name+'Model_'+sub_dataset+'_w_all_probs.csv', 'w',encoding="utf-8", newline='') as csvfile:
    csvwriter = csv.writer(csvfile)
    # Writing headers
    headers = ['Sentence', 'Actual Score', 'Predicted Score', 'Predicted Score Probability']
    headers += [f'Probability_Score_{i}' for i in range(num_classes)]
    csvwriter.writerow(headers)

    # Write data
    for i, sentence in enumerate(pred_sentences):
        bert_score = pred_label[i]
        csvwriter.writerow(
            [sentence, actual_labels[i], bert_score, class_probs[i][bert_score]]
            + class_probs[i].tolist()
        )


In [None]:
# Re-run tokenization and inference with the reloaded model.
tf_batch = tokenizer(pred_sentences, max_length=128, padding=True, truncation=True, return_tensors='tf')
tf_outputs = new_model(tf_batch)
tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
# Bare expressions for inspection; in a notebook presumably only the last
# one (index2label) is displayed as the cell output.
tf_predictions
tf.argmax(tf_predictions, axis=1).numpy()
index2label

## Debugging the Final Tensor Shape

In [None]:
tf_predictions.shape

In [None]:
# Print the per-class probability tensor of every sentence.
for sentence_probs in tf_predictions:
  print (sentence_probs)

In [None]:
# Print the probabilities of class 0 and class 1 for every sentence.
for sentence_probs in tf_predictions:
  print (str(sentence_probs[0]) + ' - ' + str(sentence_probs[1]))

In [None]:
# Print the probability of the predicted class for every sentence.
# The class index is taken from the argmax of each row; the previous
# `label[i]` referred to a leftover scalar loop variable from the evaluation
# cell and could not be indexed.
predicted_index = tf.argmax(tf_predictions, axis=1).numpy()
for sentence_probs, predicted in zip(tf_predictions.numpy(), predicted_index):
  print(sentence_probs[predicted])