In [None]:
# Mount Google Drive at /content/gdrive; the save/load paths used later in
# this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/gdrive')

In [2]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datetime import datetime
from google.colab import files

In [3]:
# Every artefact this notebook persists goes into one Drive folder and uses
# the model name as its file-name prefix.
model_save_name = 'BERTModel-Emotions'
_saved_models_dir = "/content/gdrive/MyDrive/Colab Notebooks/ml-blockchain/savedModels"

path = "{}/{}".format(_saved_models_dir, model_save_name)
testSetPath = path + "-test.csv"
validationSetPath = path + "-validation.csv"

# Display names for the classes; the list position is the numeric class id.
labels = [
    'Sadness',
    'Anger',
    'Love',
    'Surprise',
    'Fear',
    'Happiness',
]

## Install Transformers library

In [None]:
# Install the Hugging Face transformers package (no version is pinned here).
!pip install transformers

## Load the BERT Classifier and Tokenizer along with Input modules

In [None]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
from transformers import BertConfig, BertModel

# Pretrained bert-base-uncased with a sequence-classification head that has
# one output per entry in `labels`.
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=len(labels))
# Tokenizer loaded from the same checkpoint name as the model.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [None]:
# Print the summary of the freshly loaded (not yet fine-tuned) model.
model.summary()

## Download Kaggle dataset

In [None]:
# Install the Kaggle CLI and copy the API token stored on Drive into ~/.kaggle/.
! pip install kaggle
# -p makes the cell re-runnable: plain mkdir errors out once ~/.kaggle exists.
! mkdir -p ~/.kaggle
! cp /content/gdrive/MyDrive/Colab\ Notebooks/ml-blockchain/kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only.
! chmod 600 ~/.kaggle/kaggle.json

In [None]:
# Download the ishantjuyal/emotions-in-text dataset archive into the working directory.
! kaggle datasets download -d ishantjuyal/emotions-in-text
# -o overwrites already-extracted files without asking, so re-running the cell
# does not stall on unzip's interactive "replace?" prompt.
! unzip -o emotions-in-text.zip

In [9]:
# Read the emotions CSV into a DataFrame.
# NOTE(review): presumably this file is what the Kaggle archive extracts to in
# /content — confirm against the zip contents.
csv_path = '/content/Emotion_final.csv'
dataset_file = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the raw data.
dataset_file.head()

In [None]:
# Show the distinct values of the Emotion column (the raw label strings).
print(dataset_file.Emotion.unique())

In [None]:
# Rename the two columns, then turn each emotion string into its integer class
# id. The ids line up with the positions in the `labels` list:
#   0 Sadness, 1 Anger, 2 Love, 3 Surprise, 4 Fear, 5 Happiness
EMOTION_TO_ID = {
    'sadness': 0,
    'anger': 1,
    'love': 2,
    'surprise': 3,
    'fear': 4,
    'happy': 5,
}

dataset_file.columns = ['DATA_COLUMN', 'LABEL_COLUMN']

# Single pass over the column. Values that are not keys of the mapping are
# passed through unchanged, so re-running the cell on already-encoded labels
# is a no-op.
dataset_file['LABEL_COLUMN'] = dataset_file['LABEL_COLUMN'].map(
    lambda value: EMOTION_TO_ID.get(value, value)
)

# Fail here, naming the offending values, instead of much later inside the
# tf.data generator when a string label cannot be converted to int64.
unexpected_labels = set(dataset_file['LABEL_COLUMN'].unique()) - set(EMOTION_TO_ID.values())
if unexpected_labels:
    raise ValueError(f"Unmapped emotion labels: {sorted(unexpected_labels, key=str)}")

# Make the integer dtype explicit rather than relying on implicit downcasting.
dataset_file['LABEL_COLUMN'] = dataset_file['LABEL_COLUMN'].astype('int64')

# Show a preview only, not the whole frame.
dataset_file.head()

## Split dataset

In [None]:
# 80/10/10 split: hold out 20% of the rows, then cut that hold-out in half to
# get the test and validation sets. The fixed random_state makes both splits
# reproducible.
train, test_and_validatition = train_test_split(dataset_file, test_size=0.2, random_state=77)
test, validation = train_test_split(test_and_validatition, test_size=0.5, random_state=77)
# Sanity check of the three partition sizes.
print(len(train),len(test),len(validation))

## Save test and validation datasets

In [14]:
# Write the held-out splits to Drive as CSV. The 'utf-8-sig' encoding puts a
# UTF-8 byte-order mark at the start of each file.
for split_frame, csv_target in ((test, testSetPath), (validation, validationSetPath)):
    with open(csv_target, 'w', encoding='utf-8-sig') as csv_handle:
        split_frame.to_csv(csv_handle)

## Create input sequences

In [15]:
def convert_data_to_examples_single(inputDataset, DATA_COLUMN, LABEL_COLUMN):
    """Turn every row of a DataFrame into an ``InputExample``.

    Args:
        inputDataset: DataFrame holding one sample per row.
        DATA_COLUMN: Name of the column whose value becomes ``text_a``.
        LABEL_COLUMN: Name of the column whose value becomes ``label``.

    Returns:
        The result of the row-wise ``apply``: one ``InputExample`` per row,
        with ``guid`` and ``text_b`` left as ``None``.
    """

    def _row_to_example(row):
        # Single-sentence task: only text_a is populated.
        return InputExample(
            guid=None,
            text_a=row[DATA_COLUMN],
            text_b=None,
            label=row[LABEL_COLUMN],
        )

    return inputDataset.apply(_row_to_example, axis=1)


def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    """Tokenize examples and expose them as a ``tf.data.Dataset``.

    Args:
        examples: Iterable of objects with a ``text_a`` attribute (the text to
            encode) and a ``label`` attribute.
        tokenizer: Tokenizer object providing ``encode_plus``.
        max_length: Every sequence is truncated and padded to this many tokens.

    Returns:
        An unbatched ``tf.data.Dataset`` whose elements are ``(inputs, label)``
        pairs. ``inputs`` is a dict with the int32 entries ``input_ids``,
        ``attention_mask`` and ``token_type_ids``; ``label`` is an int64 scalar.
    """
    features = []

    for e in examples:
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length,
            return_token_type_ids=True,
            return_attention_mask=True,
            # Replaces the deprecated `pad_to_max_length=True`; pads every
            # sequence up to `max_length`.
            padding="max_length",
            truncation=True,
        )

        input_ids, token_type_ids, attention_mask = (
            input_dict["input_ids"],
            input_dict["token_type_ids"],
            input_dict["attention_mask"],
        )

        features.append(
            InputFeatures(
                input_ids=input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                label=e.label,
            )
        )

    def gen():
        # Tokenization already happened above; this only replays the cached
        # features, so iterating the dataset again does not re-tokenize.
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        # Element dtypes: dict of int32 token features, int64 label.
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        # Element shapes: 1-D features of unspecified length, scalar label.
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )

In [16]:
# Names of the text column and the label column in the prepared DataFrame.
DATA_COLUMN, LABEL_COLUMN = 'DATA_COLUMN', 'LABEL_COLUMN'

In [None]:
# Held-out test split: rows -> InputExamples -> tokenized dataset in batches of 32.
test_inputExamples = convert_data_to_examples_single(test, DATA_COLUMN, LABEL_COLUMN)
test_data = convert_examples_to_tf_dataset(
    list(test_inputExamples),
    tokenizer,
).batch(32)

# Validation split, prepared the same way.
validation_InputExamples = convert_data_to_examples_single(validation, DATA_COLUMN, LABEL_COLUMN)
validation_data = convert_examples_to_tf_dataset(
    list(validation_InputExamples),
    tokenizer,
).batch(32)

## Configure the Loaded BERT model and Train for Fine-tuning

In [None]:
# Training split: rows -> InputExamples -> tokenized dataset.
train_InputExamples  = convert_data_to_examples_single(train, DATA_COLUMN, LABEL_COLUMN)
train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
# Shuffle with a 100-element buffer, batch by 32, then iterate the batched
# data twice per pass over the dataset.
# NOTE(review): a pass over this dataset covers the training rows twice, so
# fitting for N epochs visits each row 2*N times — confirm that is intended.
# NOTE(review): the shuffle buffer (100) looks small relative to the training
# set, so reshuffling between passes is only local — confirm.
train_data = train_data.shuffle(100).batch(32).repeat(2)

In [None]:
# Compile for fine-tuning: RMSprop with a small learning rate, a loss that
# takes raw logits (from_logits=True) and integer class ids (sparse), and
# accuracy as the reported metric.
# Alternative optimizer configuration (Adam with gradient-norm clipping), left disabled:
# model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0), 
model.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=3e-5, epsilon=1e-08), 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])

# Train for 2 epochs, evaluating on the validation set after each epoch.
model.fit(train_data, epochs=2, validation_data=validation_data)

In [None]:
# Loss and accuracy on the held-out test set.
model.evaluate(test_data)

## Save model

In [21]:
# Persist the fine-tuned model to the Drive folder given by `path`.
model.save_pretrained(path)

## Load model

In [None]:
# Reload the saved model from `path`; local_files_only=True restricts loading
# to files already on disk.
loaded_model = TFBertForSequenceClassification.from_pretrained(path, local_files_only=True)
loaded_model.summary()

## Make Predictions with the Fine-tuned Model

In [23]:
# Hand-written sentences used to spot-check the reloaded model.
pred_sentences = [
    "I am scared of the dark",
    "I want to spend the rest of my life with you",
    "He was filled with joy when he opened his present",
    "She was devastated after the death of her husband",
]

In [None]:
# Tokenize all sentences together as one padded batch of TensorFlow tensors.
tf_batch = tokenizer(
    pred_sentences,
    max_length=128,
    padding=True,
    truncation=True,
    return_tensors='tf',
)
tf_outputs = loaded_model(tf_batch)

# The first element of the model output is normalised with softmax over the
# last axis; the arg-max per row is the predicted class id.
tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
label = tf.argmax(tf_predictions, axis=1).numpy()

# Print every sentence followed by the display name of its predicted class.
for sentence, class_id in zip(pred_sentences, label):
    print(sentence, ": \n", labels[class_id])