In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import re
import unicodedata
import nltk
from nltk.corpus import stopwords
import keras
from tqdm import tqdm
import pickle
from keras.models import Model
import keras.backend as K
from sklearn.metrics import confusion_matrix,f1_score,classification_report
import matplotlib.pyplot as plt
from keras.callbacks import ModelCheckpoint
import itertools
from keras.models import load_model
from sklearn.utils import shuffle
from transformers import BertTokenizer, TFBertModel, BertConfig, TFBertForSequenceClassification


def unicode_to_ascii(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s) if unicodedata.category(c) != 'Mn')

def clean_stopwords_shortwords(w):
    stopwords_list=stopwords.words('english')
    words = w.split() 
    clean_words = [word for word in words if (word not in stopwords_list) and len(word) > 2]
    return " ".join(clean_words) 

def preprocess_sentence(w):
    w = unicode_to_ascii(w.lower().strip())
    w = re.sub(r"([?.!,¿])", r" ", w)
    w = re.sub(r'[" "]+', " ", w)
    w = re.sub(r"[^a-zA-Z?.!,¿]+", " ", w)
    w=clean_stopwords_shortwords(w)
    w=re.sub(r'@\w+', '',w)
    return w

In [2]:
train = pd.read_csv("data/train.csv")
test = pd.read_csv("data/test.csv")

In [3]:
num_classes=len(train.target.unique())

bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=num_classes)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
sent= 'how to train the model, lets look at how a trained model calculates its prediction.'
tokens=bert_tokenizer.tokenize(sent)
print(tokens)

['how', 'to', 'train', 'the', 'model', ',', 'lets', 'look', 'at', 'how', 'a', 'trained', 'model', 'calculate', '##s', 'its', 'prediction', '.']


In [5]:
sentences = train['text']
labels = train['target']
len(sentences),len(labels)

(7613, 7613)

In [6]:
input_ids=[]
attention_masks=[]

for sent in sentences:
    bert_inp = bert_tokenizer.encode_plus(
        sent,
        add_special_tokens = True,
        max_length = 90,
        pad_to_max_length = True,
        return_attention_mask = True
    )
    
    input_ids.append(bert_inp['input_ids'])
    attention_masks.append(bert_inp['attention_mask'])

input_ids = np.asarray(input_ids)
attention_masks = np.array(attention_masks)
labels = np.array(labels)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [7]:
len(input_ids),len(attention_masks),len(labels)

(7613, 7613, 7613)

In [8]:
train_inp , val_inp, train_label, val_label, train_mask, val_mask = train_test_split(
    input_ids,
    labels,
    attention_masks,
    test_size = 0.2,
    shuffle = True
)

print('Train inp shape {} Val input shape {}\nTrain label shape {} Val label shape {}\nTrain attention mask shape {} Val attention mask shape {}'.format(
    train_inp.shape,
    val_inp.shape,
    train_label.shape,
    val_label.shape,
    train_mask.shape,
    val_mask.shape
))

Train inp shape (6090, 90) Val input shape (1523, 90)
Train label shape (6090,) Val label shape (1523,)
Train attention mask shape (6090, 90) Val attention mask shape (1523, 90)


In [9]:
log_dir = 'tensorboard_data/tb_bert'
model_save_path = './models/bert_model.h5'

callbacks = [tf.keras.callbacks.ModelCheckpoint(
    filepath=model_save_path,
    save_weights_only=True,
    monitor='val_loss',
    mode='min',
    save_best_only=True
),keras.callbacks.TensorBoard(log_dir = log_dir)]

print('\nBert Model',bert_model.summary())

loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5,epsilon=1e-08)

bert_model.compile(loss=loss,optimizer=optimizer,metrics=[metric])

Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  1538      
                                                                 
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________

Bert Model None


In [10]:
history=bert_model.fit(
    [train_inp, train_mask],
    train_label,
    batch_size=16,
    epochs=10,
    validation_data=([val_inp,val_mask],val_label),
    callbacks=callbacks
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [11]:
# %load_ext tensorboard

In [12]:
# log_dir='tensorboard_data/bert_model'
# %tensorboard --logdir {log_dir}

In [13]:
trained_model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased',num_labels=2)
trained_model.compile(loss=loss,optimizer=optimizer, metrics=[metric])
trained_model.load_weights(model_save_path)

preds = trained_model.predict([val_inp,val_mask],batch_size=32)
preds
# pred_labels = preds.argmax(axis=1)
# f1 = f1_score(val_label,pred_labels)
# print('F1 score',f1)
# print('Classification Report')
# print(classification_report(val_label,pred_labels,target_names=target_names))

# print('Training and saving built model.....')   

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


TFSequenceClassifierOutput(loss=None, logits=array([[ 0.93973464, -0.6765221 ],
       [ 1.2597959 , -0.986002  ],
       [ 1.3038394 , -0.93121046],
       ...,
       [ 1.2953413 , -0.94678944],
       [ 0.6063849 , -0.3072171 ],
       [ 1.1619651 , -0.8398766 ]], dtype=float32), hidden_states=None, attentions=None)