# Pretrained Universal Sentence Encoder from TensorFlow Hub

In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.append('../')
from src.preprocessing.text import *

import matplotlib.pyplot as plt
plt.style.use('ggplot')
import datetime
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential, layers, regularizers, constraints
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

%load_ext tensorboard

## Preprocessing

In [2]:
def clean_wrapper(text):
    """Run a raw text string through the cleaning helpers, in a fixed order.

    The helpers are applied one after another: URL removal, HTML removal,
    emoji removal, then punctuation removal. Each one receives the output of
    the previous step.

    Args:
        text: The raw text to clean.

    Returns:
        The text after all four cleaning steps have been applied.
    """
    cleaning_steps = (remove_url, remove_html, remove_emoji, remove_punctuation)
    for step in cleaning_steps:
        text = step(text)
    return text

In [3]:
# Load the training set, the test set and the sample submission file from
# the data directory that sits next to this notebook's folder.
train_data = pd.read_csv('../data/train.csv')
test_data  = pd.read_csv('../data/test.csv')
sample_submission = pd.read_csv('../data/sample_submission.csv')

In [4]:
# Clean the `text` column of both frames with the same pipeline.
train_data['text'] = train_data['text'].apply(clean_wrapper)
test_data['text'] = test_data['text'].apply(clean_wrapper)

# Plain NumPy arrays: cleaned sentences and labels for training,
# cleaned sentences of the test set for the final submission.
sent_data = train_data['text'].values
labels_data = train_data['target'].values
sent_submission = test_data['text'].values

## Preparation for modelling

In [5]:
# NOTE(review): the model defined further down takes raw strings as input and
# is fitted on `sent_data`, so the integer sequences built here appear to be
# unused in the visible cells - confirm before relying on them.
maxlen = 100

# The word index is learned from the training sentences only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sent_data)
vocab_size = len(tokenizer.word_index) + 1  # +1 because index 0 is reserved

y_train = labels_data

# Encode each sentence as integer ids, then zero-pad at the end up to `maxlen`.
X_train = pad_sequences(tokenizer.texts_to_sequences(sent_data),
                        padding='post', maxlen=maxlen)
X_submission = pad_sequences(tokenizer.texts_to_sequences(sent_submission),
                             padding='post', maxlen=maxlen)

print(X_train[0])

[ 109 4493   20    1  826    5   18  241  123 1569 4494   69   38    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]


In [6]:
# Function To plot model accuracy and model loss
def plot_history(history):
    """Draw accuracy and loss curves for training vs. validation side by side.

    Args:
        history: Object whose ``history`` attribute is a mapping containing
            the per-epoch lists ``'accuracy'``, ``'val_accuracy'``,
            ``'loss'`` and ``'val_loss'`` (as returned by ``model.fit``).
    """
    metrics = history.history
    train_acc, valid_acc = metrics['accuracy'], metrics['val_accuracy']
    train_loss, valid_loss = metrics['loss'], metrics['val_loss']
    epochs = range(1, len(train_acc) + 1)  # 1-based epoch numbers for the x-axis

    # One entry per panel: (training curve, validation curve, labels, title).
    panels = (
        (train_acc, valid_acc, 'Training acc', 'Validation acc',
         'Training and validation accuracy'),
        (train_loss, valid_loss, 'Training loss', 'Validation loss',
         'Training and validation loss'),
    )

    plt.figure(figsize=(12, 5))
    for position, (train_curve, valid_curve, train_label, valid_label, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(epochs, train_curve, 'b', label=train_label)
        plt.plot(epochs, valid_curve, 'r', label=valid_label)
        plt.title(title)
        plt.legend()

Download the pretrained Universal Sentence Encoder (large, v4) from TensorFlow Hub

In [11]:
%%time
# TF-Hub handle of the module to load (Universal Sentence Encoder, large, v4).
module_url = 'https://tfhub.dev/google/universal-sentence-encoder-large/4'
# Two separate layers are created from the same module: `embed` keeps the
# encoder weights frozen, `embed_trainable` allows them to be fine-tuned.
embed = hub.KerasLayer(module_url, trainable=False, name='USE_embedding')
embed_trainable = hub.KerasLayer(module_url, trainable=True, name='USE_embedding_2')

CPU times: user 41.3 s, sys: 10.7 s, total: 52 s
Wall time: 54.9 s


In [8]:
def build_model(embed, dropout_rate=0.5, lr=.0005, l1=0.01, l2=0.01, max_norm=2.):
    """Build and compile a binary text classifier on top of an embedding layer.

    The network takes raw strings, passes them through ``embed``, then through
    one regularised 32-unit ReLU layer followed by batch normalisation and
    dropout, and ends in a single sigmoid unit.

    Args:
        embed: Layer that maps a batch of strings to embeddings (the notebook
            passes a ``hub.KerasLayer``).
        dropout_rate: Dropout rate applied after batch normalisation.
        lr: Learning rate of the Adam optimizer.
        l1: L1 factor of the activity regulariser on the hidden layer.
        l2: L2 factor of the activity regulariser on the hidden layer.
        max_norm: Max-norm constraint on the hidden layer's kernel.

    Returns:
        The compiled ``Sequential`` model (binary cross-entropy loss,
        accuracy metric).
    """
    model = Sequential([
        layers.Input(shape=[], dtype=tf.string),
        embed,
        layers.Dense(32, activation='relu',
                     kernel_constraint=constraints.max_norm(max_norm),
                     activity_regularizer=regularizers.l1_l2(l1=l1, l2=l2)),
        layers.BatchNormalization(),
        layers.Dropout(dropout_rate),
        layers.Dense(1, activation='sigmoid')
    ])

    # The learning rate has to be set on the optimizer itself: `compile` has no
    # `lr` argument, so with optimizer="adam" the requested rate was never
    # applied (and newer Keras versions reject the unknown keyword outright).
    optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

    return model

In [12]:
# Build the classifier on the fine-tunable encoder; l1=0 leaves only the L2
# part of the activity regulariser active.
model = build_model(embed_trainable, dropout_rate=0.5, l2=0.05, l1=0)
model.summary()

# Each run logs to its own timestamped directory under logs/fit/.
log_dir="logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
# NOTE(review): histogram_freq=1 requests histogram summaries every epoch; the
# recorded run below failed in WriteHistogramSummary with a NaN - confirm
# whether histograms are needed here.
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
USE_embedding_2 (KerasLayer) {'outputs': (None, 512)}  147354880 
_________________________________________________________________
dense_2 (Dense)              (None, 32)                16416     
_________________________________________________________________
batch_normalization_1 (Batch (None, 32)                128       
_________________________________________________________________
dropout_1 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 33        
Total params: 147,371,457
Trainable params: 147,371,393
Non-trainable params: 64
_________________________________________________________________


In [13]:
# Keep only the best checkpoint seen so far (lowest validation loss).
checkpoint = ModelCheckpoint('../models/model.h5', monitor='val_loss', save_best_only=True)

# Stop once validation loss has not improved for `patience` epochs instead of
# always running the full 100 epochs, and roll back to the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

history = model.fit(
    sent_data, labels_data,
    validation_split=0.1,  # last 10% of the samples are held out for validation
    epochs=100,            # upper bound; early stopping usually ends sooner
    callbacks=[checkpoint, early_stopping, tensorboard_callback],
    batch_size=32
)

plot_history(history)

Train on 6851 samples, validate on 762 samples
Epoch 1/100








Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100

InvalidArgumentError: Nan in summary histogram for: Embeddings/sharded_0_0 [Op:WriteHistogramSummary] name: Embeddings/sharded_0_0/

In [None]:
# Restore the checkpoint from the same path the training cell's
# ModelCheckpoint writes to, then score the cleaned test sentences.
model.load_weights('../models/model.h5')
test_pred = model.predict(sent_submission)

# Round the sigmoid outputs to hard integer labels.
test_data['target'] = test_pred.round().astype(int)
# Keep only the id/target columns and write them without the index.
submission = test_data[['id', 'target']]
submission.to_csv('submission.csv', index=False)

In [None]:
tensorboard --logdir logs/fit