In [29]:
# Mount Google Drive under /content/gdrive; later cells read the Kaggle token from it
# and write the saved model and the test/validation CSVs to it.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [30]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from datetime import datetime
from google.colab import files

## Install Transformers library

In [31]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


## Load the BERT Classifier and Tokenizer along with Input modules

In [32]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
from transformers import BertConfig, BertModel

# The classifier and the tokenizer are loaded from the same pretrained checkpoint.
PRETRAINED_CHECKPOINT = "bert-base-uncased"
NUM_LABELS = 3  # one output per sentiment class: positive / negative / neutral

tokenizer = BertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = TFBertForSequenceClassification.from_pretrained(PRETRAINED_CHECKPOINT, num_labels=NUM_LABELS)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
# Print the layers and parameter counts of the freshly loaded classifier.
model.summary()

Model: "tf_bert_for_sequence_classification_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_113 (Dropout)       multiple                  0         
                                                                 
 classifier (Dense)          multiple                  2307      
                                                                 
Total params: 109,484,547
Trainable params: 109,484,547
Non-trainable params: 0
_________________________________________________________________


## Download Kaggle dataset

In [34]:
! pip install kaggle
! mkdir ~/.kaggle
! cp /content/gdrive/MyDrive/Colab\ Notebooks/ml-blockchain/kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [35]:
! kaggle datasets download -d yasserh/twitter-tweets-sentiment-dataset
! unzip twitter-tweets-sentiment-dataset.zip

twitter-tweets-sentiment-dataset.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  twitter-tweets-sentiment-dataset.zip
replace Tweets.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: Tweets.csv              


In [36]:
# Use a dedicated name for the CSV location. The generic name `path` is assigned further down
# to the model save directory, so sharing it would let a re-run of this cell silently redirect
# `model.save_pretrained(path)` to the CSV file path.
dataset_csv_path = '/content/Tweets.csv'
dataset_file = pd.read_csv(dataset_csv_path)

In [37]:
# Preview the first rows of the raw dataset.
dataset_file.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [38]:
# Integer class id for each sentiment string; the prediction cell's label list follows this order.
SENTIMENT_TO_ID = {'positive': 0, 'negative': 1, 'neutral': 2}

# Build the two-column training frame in one chain:
#   * drop the columns the classifier does not use,
#   * replace missing tweets with the placeholder string 'Empty',
#   * map sentiment strings to integer ids with an explicit dict (yields an integer column
#     directly instead of relying on `replace` to downcast an object column),
#   * rename to the generic column names used by the conversion helpers.
dataset_file = (
    dataset_file
    .drop(columns=['textID', 'selected_text'])
    .assign(
        text=lambda df: df['text'].fillna('Empty'),
        sentiment=lambda df: df['sentiment'].map(SENTIMENT_TO_ID),
    )
    .rename(columns={'text': 'DATA_COLUMN', 'sentiment': 'LABEL_COLUMN'})
)

# `.map` turns any sentiment outside SENTIMENT_TO_ID into NaN; fail loudly here rather than
# feeding an invalid label into training.
assert dataset_file['LABEL_COLUMN'].notna().all(), "dataset contains an unmapped sentiment value"

dataset_file

Unnamed: 0,DATA_COLUMN,LABEL_COLUMN
0,"I`d have responded, if I were going",2
1,Sooo SAD I will miss you here in San Diego!!!,1
2,my boss is bullying me...,1
3,what interview! leave me alone,1
4,"Sons of ****, why couldn`t they put them on t...",1
...,...,...
27476,wish we could come see u on Denver husband l...,1
27477,I`ve wondered about rake to. The client has ...,1
27478,Yay good for both of you. Enjoy the break - y...,0
27479,But it was worth it ****.,0


## Split into train, test and validation sets

In [39]:
# One seed for both splits so the partition is reproducible.
SPLIT_SEED = 77

# 80% train; the remaining 20% is halved into test and validation.
train, holdout = train_test_split(dataset_file, test_size=0.2, random_state=SPLIT_SEED)
test, validation = train_test_split(holdout, test_size=0.5, random_state=SPLIT_SEED)

# The three splits must partition the dataset exactly.
assert len(train) + len(test) + len(validation) == len(dataset_file)
print(len(train),len(test),len(validation))

21984 2748 2749


## Save test and validation datasets

In [40]:
# All artifacts of this run live side by side in one Drive folder and share the model name as prefix.
model_save_name = 'BERTModel-Twitter'
saved_models_dir = "/content/gdrive/MyDrive/Colab Notebooks/ml-blockchain/savedModels"

path = f"{saved_models_dir}/{model_save_name}"        # directory for the fine-tuned model
testSetPath = f"{path}-test.csv"                      # held-out test split
validationSetPath = f"{path}-validation.csv"          # validation split

In [41]:
# Persist the held-out splits next to the model so evaluation can be reproduced later.
# The path and encoding are passed straight to `to_csv`, which opens and closes the file itself;
# pandas advises against handing it a text handle opened without newline=''.
for split_frame, split_path in ((test, testSetPath), (validation, validationSetPath)):
  split_frame.to_csv(split_path, encoding='utf-8-sig')

## Create input sequences

In [42]:
def convert_data_to_examples_single(inputDataset, DATA_COLUMN, LABEL_COLUMN):
  """Wrap every row of a DataFrame in a transformers ``InputExample``.

  Args:
    inputDataset: DataFrame holding one text sample per row.
    DATA_COLUMN: Name of the column whose value becomes ``text_a``.
    LABEL_COLUMN: Name of the column whose value becomes ``label``.

  Returns:
    The result of ``inputDataset.apply(..., axis=1)``: one ``InputExample`` per row,
    carrying the DataFrame's index.
  """
  def _row_to_example(row):
    # Single-sentence task: no second segment, and the guid is not used for bookkeeping here.
    return InputExample(guid=None,
                        text_a=row[DATA_COLUMN],
                        text_b=None,
                        label=row[LABEL_COLUMN])

  return inputDataset.apply(_row_to_example, axis=1)


def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    """Tokenize examples and expose them as an unbatched ``tf.data.Dataset``.

    Each example's ``text_a`` is encoded on its own with special tokens added, truncated to
    ``max_length`` and padded up to ``max_length``. The encodings are computed eagerly and kept
    in memory; the returned dataset replays them through a generator.

    Args:
        examples: Iterable of objects exposing ``text_a`` and ``label`` attributes
            (e.g. transformers ``InputExample``).
        tokenizer: Hugging Face tokenizer; it is called directly, as in the prediction cell.
        max_length: Maximum (and padded) sequence length in tokens.

    Returns:
        A ``tf.data.Dataset`` yielding ``(inputs, label)`` pairs, where ``inputs`` is a dict with
        int32 vectors ``input_ids``, ``attention_mask`` and ``token_type_ids``, and ``label`` is
        an int64 scalar.
    """
    features = []  # InputFeatures, one per example, replayed by the generator below

    for example in examples:
        # `padding="max_length"` replaces the deprecated `pad_to_max_length=True`, and calling the
        # tokenizer directly replaces the deprecated `encode_plus`; the produced encoding has the
        # same keys.
        encoded = tokenizer(
            example.text_a,
            add_special_tokens=True,
            max_length=max_length,
            return_token_type_ids=True,
            return_attention_mask=True,
            padding="max_length",
            truncation=True,  # cut sequences longer than max_length
        )

        features.append(
            InputFeatures(
                input_ids=encoded["input_ids"],
                attention_mask=encoded["attention_mask"],
                token_type_ids=encoded["token_type_ids"],
                label=example.label,
            )
        )

    def gen():
        # Emit (model inputs, label) in the structure declared to from_generator below.
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )

In [43]:
# Column names of the preprocessed frame; passed to convert_data_to_examples_single below.
DATA_COLUMN = 'DATA_COLUMN'
LABEL_COLUMN = 'LABEL_COLUMN'

In [44]:
# Evaluation splits: rows -> InputExamples -> tokenized tf.data.Dataset in batches of 32.
# No shuffling is applied to these splits.
test_inputExamples = convert_data_to_examples_single(test, DATA_COLUMN, LABEL_COLUMN)
test_data = convert_examples_to_tf_dataset(list(test_inputExamples), tokenizer).batch(32)

validation_InputExamples = convert_data_to_examples_single(validation, DATA_COLUMN, LABEL_COLUMN)
validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer).batch(32)



## Configure the Loaded BERT model and Train for Fine-tuning

In [45]:
train_InputExamples  = convert_data_to_examples_single(train, DATA_COLUMN, LABEL_COLUMN)
train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)

# Shuffle with a buffer as large as the training set: a buffer of 100 only reorders examples
# within a small sliding window, so every pass would see nearly the same batch composition.
# NOTE(review): .repeat(2) doubles the data seen per Keras epoch; together with epochs=2 in
# model.fit the model makes four passes over the training set -- confirm this is intended.
train_data = train_data.shuffle(len(train_InputExamples)).batch(32).repeat(2)



In [47]:
# Compile for 3-class fine-tuning: integer labels with sparse categorical cross-entropy, and the
# loss is told that the model emits logits (from_logits=True).
model.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=3e-5, epsilon=1e-08),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])

# Keep the History object so per-epoch training/validation metrics stay available for
# inspection instead of only showing its repr as the cell output.
history = model.fit(train_data, epochs=2, validation_data=validation_data)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fb0955b6b50>

In [48]:
# Evaluate on the held-out test split; the result lists the loss followed by the
# 'accuracy' metric configured in model.compile.
model.evaluate(test_data)



[0.9680880904197693, 0.7601892352104187]

In [49]:
# Persist the fine-tuned model to the Drive directory held in `path`.
model.save_pretrained(path)

## Load model

In [50]:
# Reload the model from the saved directory, restricted to local files (local_files_only=True),
# and print its summary.
loaded_model = TFBertForSequenceClassification.from_pretrained(path, local_files_only=True)
loaded_model.summary()

Some layers from the model checkpoint at /content/gdrive/MyDrive/Colab Notebooks/ml-blockchain/savedModels/BERTModel-Twitter were not used when initializing TFBertForSequenceClassification: ['dropout_113']
- This IS expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertForSequenceClassification were initialized from the model checkpoint at /content/gdrive/MyDrive/Colab Notebooks/ml-blockchain/savedModels/BERTModel-Twitter.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenc

Model: "tf_bert_for_sequence_classification_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_151 (Dropout)       multiple                  0         
                                                                 
 classifier (Dense)          multiple                  2307      
                                                                 
Total params: 109,484,547
Trainable params: 109,484,547
Non-trainable params: 0
_________________________________________________________________


## Make Predictions with the Fine-tuned Model

In [55]:
# Hand-written sentences used for a quick qualitative check of the reloaded model.
pred_sentences = ['Today was the best day ever!',
                  'Do not talk to me, I am in a bad mood',
                  'I do not live in Nicosia',
                  'I am very optimistic for this effort']

In [56]:
# Tokenize all sentences as one padded batch and run them through the reloaded classifier.
tf_batch = tokenizer(pred_sentences, max_length=128, padding=True, truncation=True, return_tensors='tf')
tf_outputs = loaded_model(tf_batch)

# The first output element is turned into per-class probabilities; the arg-max is the class id.
tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
labels = ['Positive','Negative','Neutral']  # index = class id assigned during preprocessing
label = tf.argmax(tf_predictions, axis=1).numpy()

for sentence, class_id in zip(pred_sentences, label):
  print(sentence, ": \n", labels[class_id])

Today was the best day ever! : 
 Positive
Do not talk to me, I am in a bad mood : 
 Negative
I do not live in Nicosia : 
 Neutral
I am very optimistic for this effort : 
 Positive
