<a href="https://colab.research.google.com/github/gkv856/KaggleData/blob/main/Natural_Language_Processing_with_Disaster_Tweets_using_fine_tune_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [44]:
import tensorflow as tf
import tensorflow_hub as hub
import re
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import random
import math
!pip install bert-for-tf2
!pip install tensorflow_text
!pip install -q tf-models-official
import tensorflow_text as text 
from official.nlp import optimization  # to create AdamW optimizer

!wget https://raw.githubusercontent.com/gkv856/KaggleData/main/helper_functions.py
from helper_functions import *
import tensorflow.keras.layers as layers

--2021-09-17 08:57:48--  https://raw.githubusercontent.com/gkv856/KaggleData/main/helper_functions.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10269 (10K) [text/plain]
Saving to: ‘helper_functions.py’


2021-09-17 08:57:48 (95.1 MB/s) - ‘helper_functions.py’ saved [10269/10269]



In [13]:
# Raw-GitHub locations of the Kaggle "Disaster Tweets" CSV files:
# training set, test set and the sample submission template.
TRAIN_URL, TEST_URL, SUBMISSION_URL = (
    "https://raw.githubusercontent.com/gkv856/KaggleData/main/train.csv",
    "https://raw.githubusercontent.com/gkv856/KaggleData/main/test.csv",
    "https://raw.githubusercontent.com/gkv856/KaggleData/main/sample_submission.csv",
)


In [14]:
# Download the raw training CSV into a DataFrame. Later cells work on a copy
# of this frame, so it stays available as the unmodified source of truth.
df_train_original = pd.read_csv(filepath_or_buffer=TRAIN_URL)


In [15]:
# Lower-case contractions (plus a few common apostrophe-less spellings) mapped
# to their expanded form. Every key appears exactly once: in a dict literal a
# repeated key silently overrides the earlier entry, which previously made
# "i'd" expand to "I had" while every other "'d" entry expands to "... would".
mispell_dict = {
    "aren't": "are not",
    "ain't": "are not",
    "can't": "cannot",
    "couldn't": "could not",
    "couldnt": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "doesnt": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "havent": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "i'd": "I would",
    "i'll": "I will",
    "i'm": "I am",
    "isn't": "is not",
    "it's": "it is",
    "it'll": "it will",
    "i've": "I have",
    "let's": "let us",
    "mightn't": "might not",
    "mustn't": "must not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "shouldn't": "should not",
    "shouldnt": "should not",
    "that's": "that is",
    "thats": "that is",
    "there's": "there is",
    "theres": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "theyre": "they are",
    "they've": "they have",
    "we'd": "we would",
    "we're": "we are",
    "weren't": "were not",
    "we've": "we have",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where's": "where is",
    "who'd": "who would",
    "who'll": "who will",
    "who're": "who are",
    "who's": "who is",
    "who've": "who have",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have",
    # Generic suffix rule: expands "<word>'re" for words not listed above.
    "'re": " are",
    "wasn't": "was not",
    # Was " will", which dropped the subject ("we'll go" -> " will go").
    "we'll": "we will",
}


def replace_typical_misspell(text):
    """Expand every contraction from ``mispell_dict`` that occurs in ``text``.

    Matching is case-sensitive and the keys are lower-case, so ``text`` is
    expected to be lower-cased already.

    Args:
        text: The string to process.

    Returns:
        ``text`` with each whole-word occurrence of a ``mispell_dict`` key
        replaced by its mapped expansion. Returned unchanged when the
        dictionary is empty.
    """
    if not mispell_dict:
        # An empty alternation would match the empty string everywhere and
        # then fail the dictionary lookup.
        return text

    # Longest keys first so a longer contraction is always preferred over a
    # shorter one starting at the same position; keys are escaped so they are
    # always treated as literal text.
    keys = sorted(mispell_dict, key=len, reverse=True)
    # Word boundaries stop keys from firing inside longer words
    # (e.g. "theres" inside "theresa").
    mispellings_re = re.compile(r"\b(?:%s)\b" % "|".join(map(re.escape, keys)))

    def replace(match):
        return mispell_dict[match.group(0)]

    return mispellings_re.sub(replace, text)


def clean_text(text_str):
  """Normalise one raw tweet into lower-case text for the model.

  Steps, in order: strip HTML markup, remove @mentions, remove http(s) URLs,
  replace every character other than ASCII letters and ``. ! ? '`` with a
  space, collapse runs of spaces, and lower-case the result.

  Args:
    text_str: The raw tweet text.

  Returns:
    The cleaned, lower-cased string.
  """
  # Keep only the text content of any HTML markup.
  text_str = BeautifulSoup(text_str, "lxml").get_text()
  # Drop @mentions. "_" is included so handles such as "@foo_bar" are removed
  # whole instead of leaving "bar" behind after the letter filter below.
  text_str = re.sub(r"@[A-Za-z0-9_]+", " ", text_str)
  # Drop URLs. The character class previously began with a stray "[", which
  # made "[" part of the URL and triggers a nested-set FutureWarning in `re`.
  text_str = re.sub(r"https?://[A-Za-z0-9./]+", " ", text_str)
  # Anything that is not a letter or basic sentence punctuation becomes a space.
  text_str = re.sub(r"[^a-zA-Z.!?']", " ", text_str)
  # Collapse the runs of spaces left by the substitutions above.
  text_str = re.sub(r" +", " ", text_str)
  text_str = text_str.lower()
  # Contraction expansion is currently disabled:
  # text_str = replace_typical_misspell(text_str)

  return text_str

In [16]:
# Derive the working frame from the raw one, replacing `text` with its cleaned
# version; `assign` returns a new frame, so `df_train_original` is not modified.
df_train = df_train_original.assign(
    text=df_train_original["text"].apply(clean_text)
)

In [17]:
# Character count of each cleaned tweet, stored alongside the text.
df_train["text_len"] = df_train["text"].apply(len)

In [18]:
from sklearn.model_selection import train_test_split

# Model inputs are the cleaned tweet strings; labels come from `target`.
labels = df_train["target"].values
data = df_train["text"].values

# Hold out 30% of the rows; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.3,
    random_state=42,
)

In [19]:
# `bert` is provided by the `bert-for-tf2` package installed in the first cell.
import bert

# Alias for the BERT word-piece tokenizer class (converts a sentence to tokens).
# NOTE(review): not referenced by any cell visible here — confirm it is still needed.
FullTokenizer = bert.bert_tokenization.FullTokenizer

# TF Hub handles: the text preprocessing model and the matching BERT encoder
# (uncased, L-12 / H-768 / A-12 per the handle name).
BERT_PP_MODEL_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3"
BERT_MODEL_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/4"
# Alternative, larger encoder (currently disabled):
# BERT_MODEL_URL = "https://tfhub.dev/tensorflow/bert_en_wwm_uncased_L-24_H-1024_A-16/4"
# Alternative way to obtain a stand-alone tokenize layer (currently disabled):
# preprocessor = hub.load(BERT_PP_MODEL_URL)
# tokenize = hub.KerasLayer(preprocessor.tokenize)

In [20]:
# Wrap each (texts, labels) split as a tf.data.Dataset of per-example pairs.
X_train_ds, X_test_ds = (
    tf.data.Dataset.from_tensor_slices(split)
    for split in ((X_train, y_train), (X_test, y_test))
)

In [21]:
# Number of examples per batch for both the training and the held-out pipeline.
BATCH_SIZE = 32

# Batch each dataset, then prefetch with an auto-tuned buffer size.
X_train_ds_bp = (
    X_train_ds
    .batch(batch_size=BATCH_SIZE)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)
X_test_ds_bp = (
    X_test_ds
    .batch(batch_size=BATCH_SIZE)
    .prefetch(buffer_size=tf.data.AUTOTUNE)
)


In [22]:
# Wrap the TF Hub models as Keras layers: one preprocessing layer, and the
# encoder twice — first frozen (NT = not trainable), then trainable (T).
bert_preprocessor = hub.KerasLayer(handle=BERT_PP_MODEL_URL)
bert_encoder_NT, bert_encoder_T = (
    hub.KerasLayer(handle=BERT_MODEL_URL, trainable=is_trainable)
    for is_trainable in (False, True)
)

In [23]:
# defining loss and metrics
# The classifier head has no activation, so the model outputs raw logits and
# the loss applies the sigmoid itself (from_logits=True).
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
# BinaryAccuracy thresholds the predictions directly (default 0.5, meant for
# probabilities). On logits the equivalent decision boundary is 0.0, because
# sigmoid(0) == 0.5; with the default, logits in (0, 0.5] were counted as class 0.
metrics = tf.metrics.BinaryAccuracy(threshold=0.0)

In [40]:
# AdamW optimizer; the first 10% of all training steps are used as warm-up.
EPOCHS = 3
init_lr = 3e-5

# Number of batches in one pass over the training pipeline.
steps_per_epoch = tf.data.experimental.cardinality(X_train_ds_bp).numpy()
num_train_steps = EPOCHS * steps_per_epoch
num_warmup_steps = int(num_train_steps * 0.1)

_optimizer_kwargs = dict(
    init_lr=init_lr,
    num_train_steps=num_train_steps,
    num_warmup_steps=num_warmup_steps,
    optimizer_type='adamw',
)
optimizer = optimization.create_optimizer(**_optimizer_kwargs)

In [41]:
# prepare few callbacks
import tensorflow.keras.callbacks as cb
from datetime import datetime

# Stop once the monitored quantity (Keras default: val_loss) has not improved
# for 2 epochs, and roll the model back to its best weights.
cb_early_stop = cb.EarlyStopping(patience=2, restore_best_weights=True)

# Timestamp used to give every run its own checkpoint location. The format
# previously lacked the day ("%Y%m_%H%M%S"), so runs started at the same
# time of day in the same month would share a name.
now = datetime.now()
curr_time = now.strftime("%Y%m%d_%H%M%S")
f_name = f"BERT_Fint_tune_{curr_time}"
# print("Current Time =", curr_time)
# NOTE(review): assumes Google Drive is already mounted at /content/drive.
GD_MODEL_PATH = f"/content/drive/MyDrive/AI_ML_DL/Kaggle/01. Disaster tweet/{f_name}"
# print(GD_MODEL_PATH)
# The path has no per-epoch placeholder, so each save overwrites the previous
# one. Keeping only the best epoch by validation loss (the same quantity
# EarlyStopping watches) ensures the model on disk matches the weights that
# EarlyStopping restores; previously `monitor` had no effect because
# save_best_only was False, and the file held whichever epoch ran last.
cb_ckp = cb.ModelCheckpoint(GD_MODEL_PATH,
                            monitor='val_loss',
                            verbose=0,
                            save_best_only=True,
                            save_weights_only=False,
                            mode='auto',
                            save_freq='epoch')


In [42]:
# Raw strings in -> BERT preprocessing -> trainable BERT encoder.
input_layer = layers.Input(shape=(), dtype=tf.string, name="batched_text_inputs_layer")
encoder_inputs = bert_preprocessor(input_layer)
bert_outputs = bert_encoder_T(encoder_inputs)

# The encoder returns a dict of tensors, so a Dense layer cannot be applied to
# it directly. Use `pooled_output` (per the TF Hub encoder documentation, one
# fixed-size vector per input text) as the sentence representation.
x = bert_outputs["pooled_output"]

# Classification head. Each layer consumes the previous layer's output; the
# second Dropout previously read an undefined name (`tf_model_output`), which
# raises NameError on a fresh kernel and bypassed the two Dense layers above it.
x = layers.Dense(512, activation='relu')(x)
x = layers.Dropout(0.2)(x)
x = layers.Dense(256, activation='relu')(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(64, activation='relu')(x)
# No pooling layer here: the head operates on rank-2 (batch, features)
# tensors, and AveragePooling1D requires rank-3 input.

# we are not using any activation because we are using loss with logits
outputs = tf.keras.layers.Dense(1, activation=None, name='classifier')(x)

model = tf.keras.Model(input_layer, outputs, name="Basic_BERT_Fine_Tune")

model.compile(optimizer=optimizer,
              loss=loss,
              metrics=metrics)
model.summary()

Model: "Basic_BERT_Fine_Tune"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
batched_text_inputs_layer (Inpu [(None,)]            0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        {'input_mask': (None 0           batched_text_inputs_layer[0][0]  
__________________________________________________________________________________________________
keras_layer_2 (KerasLayer)      {'sequence_output':  109482241   keras_layer[3][0]                
                                                                 keras_layer[3][1]                
                                                                 keras_layer[3][2]                
_______________________________________________________________________________

In [43]:
# Fine-tune for up to EPOCHS epochs, validating on the held-out split after
# each one, with the checkpoint and early-stopping callbacks attached.
_fit_callbacks = [cb_ckp, cb_early_stop]
history_model = model.fit(
    x=X_train_ds_bp,
    epochs=EPOCHS,
    validation_data=X_test_ds_bp,
    callbacks=_fit_callbacks,
)

# Plot the curves recorded in the returned History object.
plot_loss_curves(history_model)

Epoch 1/3
  2/167 [..............................] - ETA: 2:14:01 - loss: 0.7005 - binary_accuracy: 0.4844

KeyboardInterrupt: ignored