# Pretrained Universal Sentence Encoder from TensorFlow Hub

In [2]:
import pandas as pd
import numpy as np
import sys
sys.path.append('../')
from src.preprocessing.text import *

import matplotlib.pyplot as plt
plt.style.use('ggplot')

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential, layers, regularizers
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

## Preprocessing

In [7]:
def clean_wrapper(text):
    """Run the full text-cleaning pipeline on a single string.

    Applies, in order, ``remove_url``, ``remove_html``, ``remove_emoji`` and
    ``remove_punctuation`` (presumably provided by the star import from
    ``src.preprocessing.text``), feeding each step's result into the next.

    Args:
        text: The raw text to clean.

    Returns:
        The value produced by the last cleaning step.
    """
    cleaning_steps = (remove_url, remove_html, remove_emoji, remove_punctuation)
    for step in cleaning_steps:
        text = step(text)
    return text

In [8]:
# Read the three CSV inputs from ../data, in the same order as before.
train_data, test_data, sample_submission = map(
    pd.read_csv,
    ('../data/train.csv', '../data/test.csv', '../data/sample_submission.csv'),
)

In [9]:
# Clean the `text` column of both frames in place; `clean_wrapper` is passed
# directly since it already takes a single string.
train_data['text'] = train_data['text'].apply(clean_wrapper)
test_data['text'] = test_data['text'].apply(clean_wrapper)

# Plain arrays used downstream: training sentences, their labels, and the
# sentences to predict for the submission.
sent_data = train_data['text'].values
labels_data = train_data['target'].values
sent_submission = test_data['text'].values

## Preparation for modelling

In [10]:
# Integer-encode the cleaned text with a tokenizer fitted on the training
# sentences only, then post-pad every sequence to a fixed length.
# NOTE(review): in the cells shown here the model is fitted on the raw strings
# (sent_data / sent_submission), not on X_train / X_submission.
maxlen = 100

tokenizer = Tokenizer()
tokenizer.fit_on_texts(sent_data)
vocab_size = len(tokenizer.word_index) + 1  # +1 because index 0 is reserved

y_train = labels_data

X_train = pad_sequences(
    tokenizer.texts_to_sequences(sent_data), padding='post', maxlen=maxlen
)
X_submission = pad_sequences(
    tokenizer.texts_to_sequences(sent_submission), padding='post', maxlen=maxlen
)

# Show the first encoded training sentence as a sanity check.
print(X_train[0, :])

[ 109 4493   20    1  826    5   18  241  123 1569 4494   69   38    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0]


In [11]:
# Function to plot model accuracy and model loss
def plot_history(history):
    """Plot training vs. validation accuracy and loss side by side.

    Args:
        history: Object whose ``history`` attribute is a mapping with the keys
            ``'accuracy'``, ``'val_accuracy'``, ``'loss'`` and ``'val_loss'``,
            each holding one value per epoch (e.g. what ``model.fit`` returns).

    Returns:
        None. Draws onto a new 12x5 figure with two panels: accuracy on the
        left, loss on the right.
    """
    logs = history.history
    # Read every series up front so a missing key fails before a figure exists.
    train_acc, valid_acc = logs['accuracy'], logs['val_accuracy']
    train_loss, valid_loss = logs['loss'], logs['val_loss']
    epochs = range(1, len(train_acc) + 1)  # 1-based epoch numbers for the x axis

    panels = (
        (train_acc, valid_acc, 'Training acc', 'Validation acc',
         'Training and validation accuracy'),
        (train_loss, valid_loss, 'Training loss', 'Validation loss',
         'Training and validation loss'),
    )

    plt.figure(figsize=(12, 5))
    for position, panel in enumerate(panels, start=1):
        train_vals, valid_vals, train_label, valid_label, title = panel
        plt.subplot(1, 2, position)
        plt.plot(epochs, train_vals, 'b', label=train_label)
        plt.plot(epochs, valid_vals, 'r', label=valid_label)
        plt.title(title)
        plt.legend()

## Download the pretrained Universal Sentence Encoder

In [12]:
%%time
# TF-Hub handle of the Universal Sentence Encoder (large, version 4); it is
# wrapped as a frozen (non-trainable) Keras layer named 'USE_embedding'.
module_url = 'https://tfhub.dev/google/universal-sentence-encoder-large/4'
embed = hub.KerasLayer(
    module_url,
    name='USE_embedding',
    trainable=False,
)

CPU times: user 35.3 s, sys: 8.9 s, total: 44.2 s
Wall time: 4min 7s


In [16]:
def build_model(embed, dropout_rate=0.5, lr=.0005):
    """Build and compile a binary text classifier on top of a sentence encoder.

    The network takes raw strings, passes them through ``embed`` and then
    through three Dense -> BatchNormalization -> Dropout stages (256, 128 and
    64 units) before a single sigmoid output unit.

    Args:
        embed: Layer placed directly after the string input; it must turn a
            batch of strings into fixed-size embeddings.
        dropout_rate: Dropout rate applied after each hidden stage.
        lr: Learning rate used by the Adam optimizer.

    Returns:
        The compiled ``Sequential`` model (binary cross-entropy loss,
        accuracy metric).
    """
    model = Sequential([
        layers.Input(shape=[], dtype=tf.string),
        embed,
        layers.Dense(256, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(dropout_rate),
        layers.Dense(128, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(dropout_rate),
        layers.Dense(64, activation='relu'),
        layers.BatchNormalization(),
        layers.Dropout(dropout_rate),
        layers.Dense(1, activation='sigmoid'),
    ])
    # `compile` has no `lr` parameter: passing it there is either rejected with
    # a TypeError or silently dropped (depending on the TF version), leaving
    # Adam at its default rate. Configure the optimizer object explicitly so
    # the `lr` argument actually takes effect.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
        loss='binary_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [20]:
# Instantiate the classifier with a dropout rate of 0.6 (the builder's default
# is 0.5) and print its layer summary.
model = build_model(embed=embed, dropout_rate=0.6)
model.summary()













Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
USE_embedding (KerasLayer)   {'outputs': (None, 512)}  147354880 
_________________________________________________________________
dense_7 (Dense)              (None, 256)               131328    
_________________________________________________________________
batch_normalization_5 (Batch (None, 256)               1024      
_________________________________________________________________
dropout_5 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 128)               32896     
_________________________________________________________________
batch_normalization_6 (Batch (None, 128)               512       
_________________________________________________________________
dropout_6 (Dropout)          (None, 128)              

In [None]:
# Write the model to ../models/model.h5 only when the validation loss improves.
checkpoint = ModelCheckpoint(
    filepath='../models/model.h5',
    save_best_only=True,
    monitor='val_loss',
)

# Train on the raw sentences, holding out 15% of the samples for validation.
history = model.fit(
    x=sent_data,
    y=labels_data,
    batch_size=100,
    epochs=20,
    validation_split=0.15,
    callbacks=[checkpoint],
)

# Visualise the accuracy and loss curves of the run.
plot_history(history)

Train on 6471 samples, validate on 1142 samples
Epoch 1/20








Epoch 2/20
Epoch 3/20
Epoch 4/20

In [19]:
# Restore the checkpointed weights and score the submission sentences.
model.load_weights('../models/model.h5')
test_pred = model.predict(sent_submission)

# Round the sigmoid outputs to integer labels and write the id/target pairs.
test_data['target'] = np.round(test_pred).astype(int)
submission = test_data.loc[:, ['id', 'target']]
submission.to_csv('submission.csv', index=False)