In [1]:
import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_hub as hub

import os
# Walk the Kaggle input directory recursively and print the full path of
# every file found, one path per line. The subdirectory list yielded by
# os.walk is not needed here, hence the throwaway `_`.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/test.csv
/kaggle/input/nlp-getting-started/train.csv


In [2]:
# Download the BERT tokenization helper script from the tensorflow/models
# repository into the working directory so it can be imported as a module.
# NOTE(review): the URL tracks the `master` branch, so the file's contents or
# location can change between runs — consider pinning to a tag or commit.
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py
# This import must stay after the download: the module file only exists once
# wget has fetched it.
import tokenization

In [3]:
# Read the three competition files (training set, test set, and the
# submission template) in that order and bind each to its own DataFrame.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/nlp-getting-started/train.csv',
        '/kaggle/input/nlp-getting-started/test.csv',
        '/kaggle/input/nlp-getting-started/sample_submission.csv',
    )
)

In [4]:
def bert_encode(texts, tokenizer, max_len=512):
    """Convert raw strings into fixed-length BERT input arrays.

    Each text is tokenized, truncated so the two special tokens still fit,
    wrapped as "[CLS]" ... "[SEP]", mapped to vocabulary ids, and right-padded
    with zeros up to ``max_len``.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: Length of every encoded row, special tokens included.

    Returns:
        A tuple ``(token_ids, attention_masks, segment_ids)`` of NumPy arrays
        holding one row per input text.
    """
    body_budget = max_len - 2  # two slots are reserved for [CLS] and [SEP]
    id_rows, mask_rows, segment_rows = [], [], []

    for raw_text in texts:
        word_pieces = tokenizer.tokenize(raw_text)[:body_budget]
        sequence = ["[CLS]", *word_pieces, "[SEP]"]
        used = len(sequence)
        padding = [0] * (max_len - used)

        # Vocabulary ids for the real tokens, followed by zero padding.
        ids = tokenizer.convert_tokens_to_ids(sequence)
        ids.extend(padding)
        id_rows.append(ids)

        # Attention mask: 1 marks a real token, 0 marks padding.
        mask_rows.append([1] * used + padding)

        # Single-sentence input, so every position is assigned segment 0.
        segment_rows.append([0] * max_len)

    return np.array(id_rows), np.array(mask_rows), np.array(segment_rows)

In [5]:
 def build_model(bert_layer, max_len = 512, learning_rate = 2e-6):
     """Build and compile a binary classifier on top of a BERT layer.

     Three int32 inputs of shape ``(max_len,)`` (token ids, attention mask,
     segment ids) are fed to ``bert_layer``. The output vector at sequence
     position 0 is passed through a single sigmoid unit.

     Args:
         bert_layer: Callable layer that accepts
             ``[input_word_ids, input_mask, segment_ids]`` and returns a pair
             whose second element is indexed as ``[:, 0, :]`` here, i.e. a
             rank-3 per-token output.
         max_len: Sequence length of each input; must match the length the
             inputs were encoded with.
         learning_rate: Step size for the Adam optimizer. Defaults to the
             value that was previously hard-coded.

     Returns:
         A compiled Keras ``Model`` using binary cross-entropy loss and an
         accuracy metric.
     """
     input_word_ids = Input(shape = (max_len,), dtype = tf.int32, name = "input_word_ids")
     input_mask = Input(shape = (max_len,), dtype = tf.int32, name = "input_mask")
     segment_ids = Input(shape = (max_len,), dtype = tf.int32, name = "segment_ids")

     # The first element of the pair is not used; only the per-token output is kept.
     _, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
     # Take the vector at position 0 of every sequence as the classifier input.
     clf_output = sequence_output[:, 0, :]
     out = Dense(1, activation = 'sigmoid')(clf_output)

     model = Model(inputs = [input_word_ids, input_mask, segment_ids], outputs = out)
     # `learning_rate` replaces the deprecated `lr` alias, which newer Keras
     # releases no longer accept.
     model.compile(
         Adam(learning_rate = learning_rate),
         loss = 'binary_crossentropy',
         metrics = ['accuracy'],
     )
     return model

In [6]:
%%time
# Load the BERT encoder from TensorFlow Hub as a Keras layer; %%time reports
# how long the load takes.
# The module URL selects the uncased English model (L-24 / H-1024 / A-16), version 1.
# trainable=True is passed so the layer can presumably be fine-tuned together
# with the classification head — TODO confirm against the hub.KerasLayer docs.
module_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-24_H-1024_A-16/1"
bert_layer = hub.KerasLayer(module_url, trainable=True)

CPU times: user 1min 27s, sys: 9.15 s, total: 1min 36s
Wall time: 1min 39s


In [7]:
# Build the tokenizer from attributes exposed by the loaded BERT layer itself
# (its vocabulary file path and lower-casing flag) rather than hard-coding them.
# NOTE(review): presumably this keeps tokenization consistent with the
# pretrained weights — confirm against the hub module's documentation.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

In [8]:
# Encode the tweet text of both splits into (token ids, masks, segment ids)
# triples, each row being 160 positions long, and pull out the training labels.
train_input = bert_encode(train['text'].values, tokenizer, max_len=160)
test_input = bert_encode(test['text'].values, tokenizer, max_len=160)
train_labels = train['target'].values

In [9]:
# Build the classifier on top of the loaded BERT layer.
# max_len must equal the max_len passed to the bert_encode calls (160),
# because build_model creates its Input layers with shape (max_len,).
model = build_model(bert_layer, max_len = 160)
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 160)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 160)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 160)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 1024), (None 335141889   input_word_ids[0][0]             
                                                                 input_mask[0][0]             

In [10]:
# Fine-tune for three epochs in mini-batches of 16, with 20% of the training
# rows held out for validation; the returned history object is kept.
train_history = model.fit(
    train_input,
    train_labels,
    validation_split=0.2,
    epochs=3,
    batch_size=16,
)

# Persist the trained model to 'model.h5' in the working directory.
model.save('model.h5')

Train on 6090 samples, validate on 1523 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [11]:
# Run the trained model on the encoded test set. The model ends in a single
# sigmoid unit, so each prediction is a value between 0 and 1.
test_pred = model.predict(test_input)

In [12]:
# Round the sigmoid outputs to hard 0/1 labels and store them in the
# submission template's `target` column (overwrites the column in place).
submission['target'] = test_pred.round().astype(int)
# Write the submission file without the DataFrame index column.
submission.to_csv('submission.csv', index=False)
# Display the first rows as a quick sanity check of the written values.
submission.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
