In [1]:
from google.colab import drive
# Mount Google Drive at /content/gdrive so files under it are reachable from this runtime.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datetime import datetime
from google.colab import files

## Install Transformers library

In [3]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.0-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 35.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 54.9 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 54.8 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 6.4 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyY

## Load the BERT Classifier and Tokenizer along with Input modules

In [4]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
from transformers import BertConfig, BertModel

# The classifier and its tokenizer are loaded from the same pretrained checkpoint.
PRETRAINED_CHECKPOINT = "bert-base-uncased"

# num_labels=6 sizes the classification head for six output classes.
model = TFBertForSequenceClassification.from_pretrained(
    PRETRAINED_CHECKPOINT,
    num_labels=6,
)
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/511M [00:00<?, ?B/s]

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [5]:
# Print the layer-by-layer parameter summary of the freshly loaded classifier.
model.summary()

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  4614      
                                                                 
Total params: 109,486,854
Trainable params: 109,486,854
Non-trainable params: 0
_________________________________________________________________


## Download Kaggle dataset

In [6]:
! pip install kaggle
! mkdir ~/.kaggle
! cp /content/gdrive/MyDrive/Colab\ Notebooks/ml-blockchain/kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [7]:
! kaggle datasets download -d ishantjuyal/emotions-in-text
! unzip emotions-in-text.zip

Downloading emotions-in-text.zip to /content
  0% 0.00/781k [00:00<?, ?B/s]
100% 781k/781k [00:00<00:00, 130MB/s]
Archive:  emotions-in-text.zip
  inflating: Emotion_final.csv       


In [8]:
# Location of the CSV extracted from the Kaggle archive. A dedicated name is
# used (instead of the generic `path`) because `path` is assigned again further
# down for the model save directory; sharing one name would let a re-run of
# this cell silently redirect where the model is saved.
dataset_path = '/content/Emotion_final.csv'
dataset_file = pd.read_csv(dataset_path)

In [9]:
# Preview the first rows of the raw dataset.
dataset_file.head()

Unnamed: 0,Text,Emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [10]:
# List the distinct values of the Emotion column (the classes to predict).
print(dataset_file.Emotion.unique())

['sadness' 'anger' 'love' 'surprise' 'fear' 'happy']


In [11]:
# Rename the two columns to the generic names used for text and label.
dataset_file.columns = ['DATA_COLUMN', 'LABEL_COLUMN']

# Integer class id assigned to each emotion string.
emotion_to_id = {
    'sadness': 0,
    'anger': 1,
    'love': 2,
    'surprise': 3,
    'fear': 4,
    'happy': 5,
}
dataset_file['LABEL_COLUMN'] = dataset_file['LABEL_COLUMN'].replace(emotion_to_id)
dataset_file

Unnamed: 0,DATA_COLUMN,LABEL_COLUMN
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,1
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,1
...,...,...
21454,Melissa stared at her friend in dism,4
21455,Successive state elections have seen the gover...,4
21456,Vincent was irritated but not dismay,4
21457,Kendall-Hume turned back to face the dismayed ...,4


## Split into train, test, and validation sets

In [12]:
# Two-stage split with one fixed seed: 20% of the rows are set aside first,
# then that hold-out is halved into the test and validation sets.
SPLIT_SEED = 77
train, test_and_validatition = train_test_split(
    dataset_file,
    test_size=0.2,
    random_state=SPLIT_SEED,
)
test, validation = train_test_split(
    test_and_validatition,
    test_size=0.5,
    random_state=SPLIT_SEED,
)
# Row counts of the three splits, space-separated.
print(*map(len, (train, test, validation)))

17167 2146 2146


## Save test and validation datasets

In [13]:
# All artefacts live side by side in one Drive folder and share the model name
# as their filename prefix.
model_save_name = 'BERTModel-Emotions'
save_root = "/content/gdrive/MyDrive/Colab Notebooks/ml-blockchain/savedModels"
path = "{}/{}".format(save_root, model_save_name)
testSetPath = path + "-test.csv"
validationSetPath = path + "-validation.csv"

In [14]:
# Write the test split, then the validation split, to their CSV files.
# The 'utf-8-sig' codec puts a UTF-8 byte-order mark at the start of each file.
for split_frame, csv_path in ((test, testSetPath), (validation, validationSetPath)):
  with open(csv_path, 'w', encoding = 'utf-8-sig') as f:
    split_frame.to_csv(f)

## Create input sequences

In [15]:
def convert_data_to_examples_single(inputDataset, DATA_COLUMN, LABEL_COLUMN):
  """Wrap every row of ``inputDataset`` in a transformers ``InputExample``.

  Args:
    inputDataset: object offering a row-wise ``apply(func, axis=1)``; the
      notebook passes pandas DataFrames.
    DATA_COLUMN: key of the row field holding the text (becomes ``text_a``).
    LABEL_COLUMN: key of the row field holding the label.

  Returns:
    The result of ``inputDataset.apply(..., axis=1)``: one ``InputExample``
    per row, with ``guid`` and ``text_b`` left as ``None``.
  """
  def row_to_example(row):
    # guid is only a bookkeeping id and is not used by this notebook.
    return InputExample(
        guid=None,
        text_a=row[DATA_COLUMN],
        text_b=None,
        label=row[LABEL_COLUMN],
    )

  return inputDataset.apply(row_to_example, axis=1)


def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    """Tokenize ``examples`` and expose them as an unbatched ``tf.data.Dataset``.

    Args:
        examples: iterable of objects exposing ``text_a`` and ``label``.
        tokenizer: tokenizer providing ``encode_plus``.
        max_length: length every encoded sequence is truncated or padded to.

    Returns:
        A dataset built with ``tf.data.Dataset.from_generator`` whose elements
        are ``(inputs, label)`` pairs: ``inputs`` is a dict with int32
        ``input_ids``, ``attention_mask`` and ``token_type_ids`` vectors, and
        ``label`` is an int64 scalar.
    """
    features = []  # InputFeatures, one per example, consumed by the generator below

    for e in examples:
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length,  # upper bound used by both truncation and padding
            return_token_type_ids=True,
            return_attention_mask=True,
            # `padding="max_length"` is the supported spelling of the deprecated
            # `pad_to_max_length=True`: every sequence is padded up to max_length.
            padding="max_length",
            truncation=True,
        )

        input_ids, token_type_ids, attention_mask = (input_dict["input_ids"],
            input_dict["token_type_ids"], input_dict['attention_mask'])

        features.append(
            InputFeatures(
                input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, label=e.label
            )
        )

    def gen():
        # Yield each stored feature in the (inputs dict, label) layout declared below.
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        # Element dtypes, then element shapes, mirroring the generator's structure.
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )

In [16]:
# Column keys for the text and the label; each constant's value equals its own name.
DATA_COLUMN, LABEL_COLUMN = 'DATA_COLUMN', 'LABEL_COLUMN'

In [17]:
# Test split: rows -> InputExamples -> tokenized dataset, served in batches of 32.
test_inputExamples = convert_data_to_examples_single(test, DATA_COLUMN, LABEL_COLUMN)
test_data = convert_examples_to_tf_dataset(list(test_inputExamples), tokenizer).batch(32)

# Validation split goes through the same pipeline with the same batch size.
validation_InputExamples = convert_data_to_examples_single(validation, DATA_COLUMN, LABEL_COLUMN)
validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer).batch(32)



## Configure the loaded BERT model and fine-tune it

In [18]:
train_InputExamples = convert_data_to_examples_single(train, DATA_COLUMN, LABEL_COLUMN)
train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
# Shuffle with a 100-element buffer, group into batches of 32, then repeat the
# whole batched stream twice - so one full pass over `train_data` visits the
# training examples two times.
train_data = (
    train_data
    .shuffle(100)
    .batch(32)
    .repeat(2)
)



In [19]:
# Alternative optimizer left commented out in the original cell:
#   tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
fine_tune_optimizer = tf.keras.optimizers.RMSprop(learning_rate=3e-5, epsilon=1e-08)
# from_logits=True: the loss is computed from unnormalised scores.
fine_tune_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
# Reported under the name 'accuracy' in the training logs.
fine_tune_metrics = [tf.keras.metrics.SparseCategoricalAccuracy('accuracy')]

model.compile(
    optimizer=fine_tune_optimizer,
    loss=fine_tune_loss,
    metrics=fine_tune_metrics,
)

model.fit(train_data, epochs=2, validation_data=validation_data)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f1047d53e10>

In [20]:
# Score the fine-tuned model on the held-out test batches; the result lists the
# compiled loss followed by the compiled metric.
model.evaluate(test_data)



[0.19020000100135803, 0.928704559803009]

In [21]:
# Persist the fine-tuned model under `path` via transformers' save_pretrained.
model.save_pretrained(path)

## Load model

In [22]:
# Reload the classifier from the directory it was saved to; local_files_only=True
# restricts the lookup to files already on disk.
loaded_model = TFBertForSequenceClassification.from_pretrained(path, local_files_only=True)
loaded_model.summary()

Some layers from the model checkpoint at /content/gdrive/MyDrive/Colab Notebooks/ml-blockchain/savedModels/BERTModel-Emotions were not used when initializing TFBertForSequenceClassification: ['dropout_37']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at /content/gdrive/MyDrive/Colab Notebooks/ml-blockchain/savedModels/BERTModel-Emotions.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequen

Model: "tf_bert_for_sequence_classification_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_75 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  4614      
                                                                 
Total params: 109,486,854
Trainable params: 109,486,854
Non-trainable params: 0
_________________________________________________________________


## Make Predictions with the Fine-tuned Model

In [27]:
# Hand-written sentences used to spot-check the reloaded classifier.
pred_sentences = [
    'I am scared of the dark',
    'I want to spend the rest of my life with you',
    'He was filled with joy when he opened his present',
    'She was devastated after the death of her husband',
]

In [28]:
# Encode all sentences as one padded batch of TensorFlow tensors.
tf_batch = tokenizer(
    pred_sentences,
    max_length=128,
    padding=True,
    truncation=True,
    return_tensors='tf',
)
tf_outputs = loaded_model(tf_batch)
# Softmax over the last axis of the first model output gives per-class scores.
tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
# Display name for each class id, in id order (0..5).
labels = ['Sadness', 'Anger', 'Love', 'Surprise', 'Fear', 'Happiness']
# Highest-scoring class id per sentence, as a NumPy array.
label = tf.argmax(tf_predictions, axis=1).numpy()
for sentence, class_id in zip(pred_sentences, label):
  print(sentence, ": \n", labels[class_id])

I am scared of the dark : 
 Fear
I want to spend the rest of my life with you : 
 Love
He was filled with joy when he opened his present : 
 Happiness
She was devastated after the death of her husband : 
 Sadness
