In [1]:
# Importing relevant libraries
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score

!pip install -q transformers
from transformers import BertTokenizer
from transformers import TFBertModel

import pickle
from tensorflow import keras
from keras.layers import Dense, Dropout, BatchNormalization
from keras.optimizers import Adam
from keras.losses import SparseCategoricalCrossentropy
from keras.metrics import SparseCategoricalAccuracy

import os

# Global Variables
data_size = 50000
model_name = "bert-base-cased"

In [2]:
print("Downloading dataset")
!gdown --id 1bwE8nqeNx2-jNGq-DVkeoClrIWhFOJkm

Downloading dataset
Downloading...
From: https://drive.google.com/uc?id=1bwE8nqeNx2-jNGq-DVkeoClrIWhFOJkm
To: /content/treebank7.csv
8.43MB [00:00, 31.9MB/s]


In [4]:
df = pd.read_csv('/content/treebank7.csv')
df = df.iloc[:data_size]

phrases = np.array(df["phrases"])
sentiment_labels = np.array(df["labels"])

In [5]:
tokenizer = BertTokenizer.from_pretrained(model_name)
max_token_len = np.max(np.array([len(tokenizer.encode(phrase)) for phrase in phrases]))
print(max_token_len)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=29.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435797.0, style=ProgressStyle(descripti…


48


In [7]:
# Encode dataset
token_ids = np.zeros(shape=(len(phrases), max_token_len), dtype=np.int32)
for index, phrase in enumerate(phrases):
  encoded = tokenizer.encode(phrase)
  token_ids[index][0 : len(encoded)] = encoded

In [8]:
sentiment_labels = np.array([round(value) for value in sentiment_labels])

In [9]:
# Split into training, testing and validation data
X_train, X_test, y_train, y_test = train_test_split(token_ids, sentiment_labels, test_size=0.2)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25)
print(len(X_train), len(X_test), len(X_val))
print(len(y_train), len(y_test), len(y_val))

training_data_X = {"input_ids": X_train, "attention_masks": (X_train != 0).astype(np.int32)}
testing_data_X = {"input_ids": X_test, "attention_masks": (X_test != 0).astype(np.int32)}
validation_data_X = {"input_ids": X_val, "attention_masks": (X_val != 0).astype(np.int32)}

30000 10000 10000
30000 10000 10000


In [10]:
base_bert_model = TFBertModel.from_pretrained(model_name)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=526681800.0, style=ProgressStyle(descri…




Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [12]:
class BertModel(keras.Model):
  def __init__(self, dropout_prob = 0.1, **kwargs):
    super(BertModel, self).__init__()
    self.bert = base_bert_model
    self.dropout = Dropout(rate = dropout_prob)
    self.dense = Dense(1, activation="sigmoid")

  def call(self ,inputs, **kwargs):
    sequence_tokens, pooled_output = self.bert(inputs, **kwargs).values()
    pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False))
    sentiment_label = self.dense(pooled_output)
    return sentiment_label

bert_complete_model = BertModel()
bert_complete_model.compile(optimizer=Adam(learning_rate=3e-5, epsilon=1e-08), loss="binary_crossentropy", metrics=["accuracy"])

In [None]:
history = bert_complete_model.fit(training_data_X, y_train, epochs=1, batch_size=25, validation_data=(validation_data_X, y_val))



In [None]:
#    loss  , accu  , val_loss, val_accu
# 1: 0.5751, 0.7011,         , 0.7511