In [66]:
# BERT - CONTEXTUALIZED!

In [None]:
# Shell command (run through IPython's `!`) that reports the NVIDIA GPU
# attached to this runtime, if any.
!nvidia-smi

In [None]:
# Mount Google Drive into the Colab filesystem; the dataset and checkpoint
# paths used by this notebook all live under /content/drive.
from google.colab import drive

drive.mount('/content/drive')

In [None]:
!pip install tensorflow-text
!pip install tensorflow-addons

In [70]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from sklearn.preprocessing import OneHotEncoder
import tensorflow.keras.layers as KL
import numpy as np
from tensorflow.keras.utils import to_categorical
import tensorflow_addons as tfa
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Test if using gpu: an empty device name means TensorFlow sees no GPU.
gpu_name = tf.test.gpu_device_name()
if not gpu_name:
    print("Please install GPU version of TF")
else:
    print('Default GPU Device:{}'.format(gpu_name))

<h2>Examine training data</h2>

In [72]:
# Number of target classes: used as the one-hot width, the size of the softmax
# output layer and the class count of the F1 metric.
N_CLASSES = 5
# CSV splits stored on the mounted Google Drive.
TRAIN_DATA_PATH = "/content/drive/My Drive/data/train.csv"
VAL_DATA_PATH = "/content/drive/My Drive/data/valid.csv"
TEST_DATA_PATH = "/content/drive/My Drive/data/test.csv"
# File the ModelCheckpoint callback writes to during training.
CHECKPOINT_PATH = "/content/drive/My Drive/checkpoint_v4.hdf5"

In [73]:
# Load the raw training split and preview its first five rows
# (five is the default for DataFrame.head).
df_train_raw = pd.read_csv(TRAIN_DATA_PATH)
df_train_raw.head()

Unnamed: 0,business_id,cool,date,funny,review_id,stars,text,useful,user_id
0,JCZEK7wiazoM6xiq8YeZyw,1,2018-01-16 20:13:13,1,oxj0_2jKOqQFIWEYRjWi6g,5,I've been here a handful of times now and I've...,1,1fq-gL1i_8xKhc9VgOZDGw
1,ALn_0f-Usn3n0a9WBcjhhg,0,2018-04-10,0,gZITaUSvzBUijZvNGXO_Cg,1,The service was terrible. The food was just ok...,0,wqG3PCf8ufXId2RG0oBufA
2,3tBRBsiTi6JJz3CJ7DcS_w,0,2014-07-11 19:08:48,0,ov2ohuP2bPJI35sscGGJpw,4,Alil pricey for the location but completly get...,0,xgXVmyRpUZUwbgo519IqJw
3,eD6MH0tD1R3C1Qs1sH0wBg,0,2018-04-28 22:03:23,0,LFJGPIrbR7U_g3oavotkXg,1,Don't get your car washed here. Paid 11 and my...,1,KjhzP6W-6T7cZrPczcnKOg
4,T-TES2u1IA2THb8uBhNdCA,0,2015-07-15 17:21:15,0,hUoRKiGTnMV51R6pQSYovQ,5,Cute but tight. Not expensive and creative. I ...,0,CN5OQxL6FVT3nr7L2Ohm2w


<h4>Training data class distribution</h4>

In [None]:

# Need to balance the class distribution

# Label each count with the actual star value (the group key) rather than a
# positional index, so the printout stays correct even if a rating is absent
# from the data.
for stars, size in df_train_raw.groupby('stars').size().items():
    print("{} stars: {}".format(stars, size))

In [76]:
#  helper function to transform raw dataframe to X and y sets
def dataframe_extract(df_raw, test = False):
    """Split a raw review dataframe into model inputs and labels.

    Args:
        df_raw: DataFrame with a 'text' column and, for labelled data,
            a 'stars' column holding 1-based ratings.
        test: When True the frame is treated as unlabelled; the 'stars'
            column is never read, so it does not need to exist.

    Returns:
        A tuple ``(X, y_onehot, y)``:
          * X        - the 'text' column (pandas Series),
          * y_onehot - one-hot labels with N_CLASSES columns, or None if ``test``,
          * y        - 0-based integer labels (pandas Series), or None if ``test``.
    """
    X = df_raw['text']

    # Return before touching 'stars' so unlabelled test data does not raise
    # a KeyError while building labels that would be discarded anyway.
    if test:
        return X, None, None

    # Shift the 1-based star ratings to 0-based class indices.
    y = df_raw['stars'] - 1

    # one-hot representation of label; passing the Series as-is (no squeeze)
    # keeps one row per sample even when the frame has a single row.
    y_onehot = tf.keras.utils.to_categorical(y, num_classes = N_CLASSES)

    return X, y_onehot, y
        

In [77]:

# Read the validation and test splits (the training split is loaded earlier).
df_val_raw = pd.read_csv(VAL_DATA_PATH)
df_test_raw = pd.read_csv(TEST_DATA_PATH)

# Labelled splits: text, one-hot labels and integer labels.
X_train, y_train_onehot, y_train = dataframe_extract(df_train_raw)
X_val, y_val_onehot, y_val = dataframe_extract(df_val_raw)

# Test split: only the text is kept; both label slots come back as None.
X_test, _, _ = dataframe_extract(df_test_raw, test=True)

<h4>Import BERT</h4>

1. Preprocess raw text (stemming, removing stopwords and punctuation, etc.)
2. Feature transformation (English words -> numerical vectors)

In [78]:
# TF Hub handles for the two BERT components used by the model.
BERT_PREPROCESS_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
BERT_ENCODER_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"

# Layer that converts raw strings into the inputs the encoder consumes.
bert_preprocess = hub.KerasLayer(BERT_PREPROCESS_URL)
# The BERT encoder itself; trainable=True so its weights are fine-tuned too.
bert_encoder = hub.KerasLayer(BERT_ENCODER_URL, trainable=True)

<h4>Build Model</h4>

In [80]:
# Weight initialization scheme - He normal, seeded so runs are repeatable
kernel_init = tf.keras.initializers.he_normal(seed=0)

# Bert layers: raw string -> preprocessing layer -> encoder
text_input = KL.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_preprocess(text_input)
outputs = bert_encoder(preprocessed_text)

# Classification head on the encoder's pooled output (Other NN models?):
# dropout -> L2-regularised dense -> layer norm -> LeakyReLU
drop1 = KL.Dropout(0.4)(outputs['pooled_output'])
dense1 = KL.Dense(256,
                  kernel_initializer=kernel_init,
                  bias_initializer="zeros",
                  kernel_regularizer=tf.keras.regularizers.l2(0.01))(drop1)
norm1 = KL.LayerNormalization()(dense1)
acti1 = KL.LeakyReLU()(norm1)

# NOTE: a second Dropout used to be created on 'pooled_output' here, but its
# result was never consumed, so it was not part of the model graph. It has been
# removed; the softmax layer reads acti1 directly, exactly as before.
pred = KL.Dense(N_CLASSES,
                activation='softmax',
                kernel_initializer=kernel_init,
                bias_initializer="zeros")(acti1)

model = tf.keras.Model(inputs=[text_input], outputs=[pred])

In [None]:
# Report how many reviews are in each labelled split.
print(f"Total training data: {len(X_train)}")
print(f"Total validation data: {len(X_val)}")

In [82]:
# CLASS WEIGHT

from sklearn.utils.class_weight import compute_class_weight

# "balanced" weights are inversely proportional to class frequency, so rare
# ratings contribute more to the loss.
class_labels = np.unique(y_train)
balanced_weights = compute_class_weight(class_weight="balanced",
                                        classes=class_labels,
                                        y=y_train)

# Key each weight by its actual class label instead of its position, so the
# mapping stays correct even if some label never occurs in y_train.
class_weights = {int(label): float(weight)
                 for label, weight in zip(class_labels, balanced_weights)}

In [None]:
# Metrics tracked for both the training and validation data each epoch.
METRICS = [
    tf.keras.metrics.CategoricalAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall'),
    tfa.metrics.F1Score(num_classes=N_CLASSES, name='f1', average='macro'),
]

# Stop once val_loss has failed to improve for 2 epochs in a row and roll the
# model back to the weights of its best epoch.
model_earlystopping_callback = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=2,
    restore_best_weights=True,
)


def scheduler(epoch, lr, hold_epochs=5, decay_rate=0.1):
    """Learning-rate schedule: hold, then decay exponentially.

    Intended for ``tf.keras.callbacks.LearningRateScheduler``, which calls it
    as ``scheduler(epoch, lr)`` at the start of every epoch.

    Args:
        epoch: Zero-based index of the epoch about to start.
        lr: Learning rate currently in use.
        hold_epochs: Number of initial epochs during which ``lr`` is left
            unchanged (default 5, the original hard-coded value).
        decay_rate: From ``hold_epochs`` onwards the rate is multiplied by
            ``exp(-decay_rate)`` each epoch (default 0.1, the original value).

    Returns:
        The learning rate for this epoch as a plain Python float. The callback
        expects a float, so no tensor is returned.
    """
    if epoch < hold_epochs:
        return float(lr)
    return float(lr * np.exp(-decay_rate))
# Apply `scheduler` at the start of every epoch.
lrschedule_callback = tf.keras.callbacks.LearningRateScheduler(schedule=scheduler)

# Save the model to CHECKPOINT_PATH during training, logging each save.
# NOTE(review): save_best_only is not set, so the file presumably ends up
# holding the most recent epoch rather than the best one; confirm that is intended.
checkpoint_filepath = CHECKPOINT_PATH
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    verbose=True,
)

# Optimizer -> Adam with a learning rate of 5e-5 (different learning rates maybe?)
Adam = tf.keras.optimizers.Adam(learning_rate=5e-5)

# Labels are one-hot encoded, hence categorical cross-entropy.
model.compile(
    optimizer=Adam,
    loss='categorical_crossentropy',
    metrics=METRICS,
)
model.summary()

In [None]:
# Optional: uncomment to resume from the saved checkpoint before training.
# model.load_weights(checkpoint_filepath)
# print("Weights loaded successfully")

N_EPOCHS = 10

# Callbacks are passed in the same order as before: checkpoint, early stopping, LR schedule.
training_callbacks = [
    model_checkpoint_callback,
    model_earlystopping_callback,
    lrschedule_callback,
]

history = model.fit(
    x=X_train,
    y=y_train_onehot,
    batch_size=16,
    epochs=N_EPOCHS,
    validation_data=(X_val, y_val_onehot),
    class_weight=class_weights,
    callbacks=training_callbacks,
)

In [None]:
from sklearn.metrics import classification_report

# Per-class probabilities for every validation review ...
y_val_pred = model.predict(X_val, verbose=1)
# ... reduced to the index of the most probable class.
y_val_pred_index = y_val_pred.argmax(axis=1)

# Per-class precision / recall / F1 against the 0-based integer labels.
val_report = classification_report(y_true=y_val, y_pred=y_val_pred_index, digits=4)
print(val_report)

In [86]:
# Validation Performance	Macro-F1	Precision	Recall	Accuracy
# Weak baseline	          0.4270	   0.5420    0.4325	 0.6135

  # Strong                0.5673 	   0.5707	   0.5725	 0.6665

In [87]:
# x = list(range(1, N_EPOCHS+1))
# metric_list = list(history.history.keys())
# num_metrics = int(len(metric_list)/2)

# fig, ax = plt.subplots(nrows=1, ncols=num_metrics, figsize=(30, 5))

# for i in range(0, num_metrics):
#   ax[i].plot(x, history.history[metric_list[i]], marker="o", label=metric_list[i].replace("_", " "))
#   ax[i].plot(x, history.history[metric_list[i+num_metrics]], marker="o", label=metric_list[i+num_metrics].replace("_", " "))
#   ax[i].set_xlabel("epochs",fontsize=14)
#   ax[i].set_title(metric_list[i].replace("_", " "),fontsize=20)
#   ax[i].legend(loc="lower left")