In [1]:
import tensorflow as tf
import pandas as pd
import tensorflow_hub as hub
import os
import re
import numpy as np
from bert.tokenization import FullTokenizer
from tqdm import tqdm_notebook
from tensorflow.keras import backend as K


W0829 16:40:43.951755 4633806272 __init__.py:56] Some hub symbols are not available because TensorFlow version is less than 1.14


In [2]:
# Create the TF1-style session used below for the Hub tokenization lookup
# and later registered as the Keras backend session.
sess = tf.compat.v1.Session()

# Cache downloaded TF-Hub modules on disk. An already-exported
# TFHUB_CACHE_DIR takes precedence, so the notebook is not tied to one
# machine's directory layout; the path below is only the fallback.
os.environ.setdefault('TFHUB_CACHE_DIR', '/Users/gius/documents/lavoro/bert/tf_cache')

# Params for bert model and tokenization
bert_path = "https://tfhub.dev/google/bert_multi_cased_L-12_H-768_A-12/1"
# Sequence length fed to BERT (the [CLS]/[SEP] markers count towards it);
# also used as the per-text word cap when the raw texts are prepared.
max_seq_length = 23


In [3]:
# Load the already separated training and test CSV files.
# No placeholder frames are created first: if a read fails, the name stays
# unbound instead of silently pointing at an empty DataFrame.
train_df = pd.read_csv('desktop/sentipolc/trainingSentiPolcSuperClean.csv')
test_df = pd.read_csv('desktop/sentipolc/testSentiPolcSuperClean.csv')



In [4]:
# Training texts, each capped at max_seq_length whitespace-separated words.
train_text = [
    ' '.join(raw.split()[:max_seq_length])
    for raw in train_df['text'].tolist()
]

# Augmentation: append a word-reversed copy of every (capped) training text.
lst = [' '.join(reversed(sentence.split())) for sentence in train_text]
train_text += lst

# Column vector of Python strings: shape (n_examples, 1).
train_text = np.array(train_text, dtype=object).reshape(-1, 1)

# The label list is repeated once so the reversed copies keep their labels.
train_label = np.concatenate([train_df['subj'].tolist()] * 2, axis=0)

# Test texts get the same word cap but no augmentation.
test_text = np.array(
    [' '.join(raw.split()[:max_seq_length]) for raw in test_df['text'].tolist()],
    dtype=object,
).reshape(-1, 1)
test_label = test_df['subj'].tolist()


In [5]:
class InputExample:
    """One text, with an optional label, for single-sequence classification.

    Attributes:
      text_a: the raw, untokenized text of the sequence.
      label: the example's label, or None when none was supplied.
    """

    def __init__(self, text_a, label=None):
        """Store the raw text together with its optional label.

        Args:
          text_a: string. Untokenized text of the (only) sequence.
          label: (Optional) label of the example. Defaults to None.
        """
        self.label = label
        self.text_a = text_a

def create_tokenizer_from_hub_module():
    """Build a `FullTokenizer` from the Hub module's vocab and casing info.

    Loads the module at the module-level `bert_path` and evaluates its
    "tokenization_info" signature in the module-level session `sess`.

    Returns:
      A `FullTokenizer` configured with the module's vocab file and its
      reported `do_lower_case` flag.
    """
    bert_module = hub.Module(bert_path)
    tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
    vocab_file, do_lower_case = sess.run(
        [tokenization_info["vocab_file"], tokenization_info["do_lower_case"]]
    )

    # The flag must be a real boolean: a non-empty string such as 'false' is
    # truthy and would switch lower-casing ON for this cased checkpoint.
    return FullTokenizer(vocab_file=vocab_file, do_lower_case=bool(do_lower_case))

def convert_single_example(tokenizer, example, max_seq_length=23):
    """Turn one `InputExample` into fixed-length BERT input lists.

    Args:
      tokenizer: object providing `tokenize(text)` and
        `convert_tokens_to_ids(tokens)`.
      example: object with `text_a` and `label` attributes, or None to get
        an all-padding placeholder.
      max_seq_length: length of every returned list, [CLS]/[SEP] included.

    Returns:
      A tuple `(input_ids, input_mask, segment_ids, label)`. The three lists
      all have `max_seq_length` entries; for a None example they are all
      zeros and the label is 0.
    """
    if example is None:
        # Placeholder example: three independent all-zero lists, label 0.
        return (
            [0] * max_seq_length,
            [0] * max_seq_length,
            [0] * max_seq_length,
            0,
        )

    # Keep room for the two special markers added around the text.
    body_tokens = tokenizer.tokenize(example.text_a)[: max_seq_length - 2]

    tokens = ["[CLS]", *body_tokens, "[SEP]"]
    # Single-sequence task: every position belongs to segment 0.
    segment_ids = [0] * len(tokens)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # 1 marks a real token, 0 marks padding; only real tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Right-pad all three lists with zeros up to the target length.
    missing = max_seq_length - len(input_ids)
    if missing > 0:
        padding = [0] * missing
        input_ids.extend(padding)
        input_mask.extend(padding)
        segment_ids.extend(padding)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    return input_ids, input_mask, segment_ids, example.label

def convert_examples_to_features(tokenizer, examples, max_seq_length=23):
    """Convert a collection of `InputExample`s into stacked NumPy arrays.

    Each example is run through `convert_single_example` (with a notebook
    progress bar) and the per-example results are gathered column-wise.

    Args:
      tokenizer: tokenizer handed through to `convert_single_example`.
      examples: iterable of `InputExample` objects (or None placeholders).
      max_seq_length: target sequence length for every example.

    Returns:
      A tuple `(input_ids, input_masks, segment_ids, labels)` of NumPy
      arrays; `labels` is reshaped to a single column.
    """
    converted = [
        convert_single_example(tokenizer, example, max_seq_length)
        for example in tqdm_notebook(examples, desc="Converting examples to features")
    ]

    # Each converted row is (input_ids, input_mask, segment_ids, label).
    input_ids = np.array([row[0] for row in converted])
    input_masks = np.array([row[1] for row in converted])
    segment_ids = np.array([row[2] for row in converted])
    labels = np.array([row[3] for row in converted]).reshape(-1, 1)

    return input_ids, input_masks, segment_ids, labels

def convert_text_to_examples(texts, labels):
    """Pair texts with labels and wrap each pair in an `InputExample`.

    Args:
      texts: iterable whose items are themselves iterables of strings; each
        item is joined with single spaces to form the example text.
      labels: iterable of labels, matched to `texts` by position.

    Returns:
      A list of `InputExample` objects, one per (text, label) pair.
    """
    return [
        InputExample(text_a=" ".join(text), label=label)
        for text, label in zip(texts, labels)
    ]

# Build the tokenizer from the Hub module's tokenization info
tokenizer = create_tokenizer_from_hub_module()

# Wrap the raw texts and labels in InputExample objects
train_examples = convert_text_to_examples(train_text, train_label)
test_examples = convert_text_to_examples(test_text, test_label)

# Turn the examples into fixed-length id / mask / segment arrays plus labels
train_input_ids, train_input_masks, train_segment_ids, train_labels = convert_examples_to_features(
    tokenizer, train_examples, max_seq_length=max_seq_length
)
test_input_ids, test_input_masks, test_segment_ids, test_labels = convert_examples_to_features(
    tokenizer, test_examples, max_seq_length=max_seq_length
)

class BertLayer(tf.keras.layers.Layer):
    """Keras layer wrapping the TF-Hub BERT module found at `bert_path`.

    Called on `[input_ids, input_mask, segment_ids]`, it returns the
    module's "pooled_output" from the "tokens" signature.
    """

    def __init__(self, n_fine_tune_layers=10, **kwargs):
        """Configure the layer.

        Args:
          n_fine_tune_layers: how many entries, counted from the end of the
            module's variable list (after dropping "/cls/" variables), are
            registered as trainable. Note that this counts variables, not
            transformer blocks. Zero or a negative value registers none.
          **kwargs: forwarded to `tf.keras.layers.Layer.__init__`.
        """
        self.n_fine_tune_layers = n_fine_tune_layers
        self.trainable = True
        # Second dimension reported by compute_output_shape.
        self.output_size = 768
        super(BertLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Instantiate the Hub module and split its variables into
        trainable and non-trainable weights."""
        self.bert = hub.Module(
            bert_path,
            trainable=self.trainable,
            name="{}_module".format(self.name)
        )

        # Remove unused layers
        trainable_vars = [var for var in self.bert.variables if "/cls/" not in var.name]

        # Select how many variables to fine tune. The zero case is handled
        # explicitly because `some_list[-0:]` is the WHOLE list, not an
        # empty one.
        if self.n_fine_tune_layers > 0:
            trainable_vars = trainable_vars[-self.n_fine_tune_layers:]
        else:
            trainable_vars = []

        # Add to trainable weights
        for var in trainable_vars:
            self._trainable_weights.append(var)

        # Everything that was not selected above is kept frozen.
        for var in self.bert.variables:
            if var not in self._trainable_weights:
                self._non_trainable_weights.append(var)

        super(BertLayer, self).build(input_shape)

    def call(self, inputs):
        """Cast the three inputs to int32, run the module and return its
        pooled output."""
        inputs = [K.cast(x, dtype="int32") for x in inputs]
        input_ids, input_mask, segment_ids = inputs
        bert_inputs = dict(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids)
        result = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)["pooled_output"]
        return result

    def compute_output_shape(self, input_shape):
        """Return `(input_shape[0], output_size)`."""
        return (input_shape[0], self.output_size)

# Build model
def build_model(max_seq_length):
    """Assemble and compile the BERT-based binary classifier.

    Three inputs of length `max_seq_length` ("input_ids", "input_masks",
    "segment_ids") feed a `BertLayer`, followed by a 23-unit ReLU layer and
    a single sigmoid output unit. The model is compiled with binary
    cross-entropy, the Adam optimizer and an accuracy metric, and its
    summary is printed.

    Args:
      max_seq_length: length of each of the three input sequences.

    Returns:
      The compiled `tf.keras` model.
    """
    input_names = ("input_ids", "input_masks", "segment_ids")
    bert_inputs = [
        tf.keras.layers.Input(shape=(max_seq_length,), name=input_name)
        for input_name in input_names
    ]

    pooled = BertLayer(n_fine_tune_layers=3)(bert_inputs)
    hidden = tf.keras.layers.Dense(23, activation='relu')(pooled)
    pred = tf.keras.layers.Dense(1, activation='sigmoid')(hidden)

    model = tf.keras.models.Model(inputs=bert_inputs, outputs=pred)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()

    return model


Instructions for updating:
Colocations handled automatically by placer.


W0829 16:40:45.780322 4633806272 deprecation.py:323] From /Users/gius/Library/Python/3.7/lib/python/site-packages/tensorflow/python/ops/control_flow_ops.py:3632: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0829 16:40:48.312029 4633806272 saver.py:1483] Saver not created because there are no variables in the graph to restore


HBox(children=(IntProgress(value=0, description='Converting examples to features', max=14820, style=ProgressSt…




HBox(children=(IntProgress(value=0, description='Converting examples to features', max=1998, style=ProgressSty…




In [6]:
def initialize_vars(sess):
    """Run the local-variable, global-variable and table initializers in
    `sess` (in that order), then register `sess` as the Keras session.

    Args:
      sess: the TensorFlow session to initialize and hand to Keras.
    """
    initializer_factories = (
        tf.local_variables_initializer,
        tf.global_variables_initializer,
        tf.tables_initializer,
    )
    # Each initializer op is created right before it is run.
    for make_initializer in initializer_factories:
        sess.run(make_initializer())
    K.set_session(sess)



In [7]:
# Assemble and compile the Keras model (also prints its summary)
model = build_model(max_seq_length)

# Run the variable/table initializers and register the session with Keras
initialize_vars(sess)

from sklearn.metrics import classification_report, f1_score, roc_auc_score


PING PING PING
INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0829 16:41:00.603934 4633806272 saver.py:1483] Saver not created because there are no variables in the graph to restore


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          (None, 23)           0                                            
__________________________________________________________________________________________________
input_masks (InputLayer)        (None, 23)           0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        (None, 23)           0                                            
__________________________________________________________________________________________________
bert_layer (BertLayer)          (None, 768)          178565115   input_ids[0][0]                  
                                                                 input_masks[0][0]                
          

In [None]:
# Run one epoch of training; the test split is also used as validation data.
model.fit(
    [train_input_ids, train_input_masks, train_segment_ids],
    train_labels,
    validation_data=([test_input_ids, test_input_masks, test_segment_ids], test_labels),
    epochs=1,
    batch_size=32
)

# Model outputs for the test split.
y_test_prediction = model.predict([test_input_ids, test_input_masks, test_segment_ids])

from sklearn.metrics import classification_report, f1_score, roc_auc_score

# Round the outputs to hard 0/1 labels once and reuse them for both scores.
y_test_pred_labels = y_test_prediction.round().astype(int)

# Per-class F1: restricting `labels` to one class scores that class only.
a = f1_score(test_labels, y_test_pred_labels, average='micro', labels=[0])
b = f1_score(test_labels, y_test_pred_labels, average='micro', labels=[1])
print("F1 micro averaging label 0 class 0:", a)
print("F1 micro averaging label 1 class 1:", b)

# Unweighted mean of the two per-class F1 scores.
print("\nF1 score: ", (a + b) / 2)


Train on 14820 samples, validate on 1998 samples
Instructions for updating:
Use tf.cast instead.


W0829 16:41:06.532536 4633806272 deprecation.py:323] From /Users/gius/Library/Python/3.7/lib/python/site-packages/tensorflow/python/ops/math_ops.py:3066: to_int32 (from tensorflow.python.ops.math_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.cast instead.




In [None]:
# Run one epoch of training; the test split is also used as validation data.
model.fit(
    [train_input_ids, train_input_masks, train_segment_ids],
    train_labels,
    validation_data=([test_input_ids, test_input_masks, test_segment_ids], test_labels),
    epochs=1,
    batch_size=32
)

# Model outputs for the test split.
y_test_prediction = model.predict([test_input_ids, test_input_masks, test_segment_ids])

# Round the outputs to hard 0/1 labels once and reuse them for both scores.
y_test_pred_labels = y_test_prediction.round().astype(int)

# Per-class F1: restricting `labels` to one class scores that class only.
a = f1_score(test_labels, y_test_pred_labels, average='micro', labels=[0])
b = f1_score(test_labels, y_test_pred_labels, average='micro', labels=[1])
print("F1 micro averaging label 0 class 0:", a)
print("F1 micro averaging label 1 class 1:", b)

# Unweighted mean of the two per-class F1 scores.
print("\nF1 score: ", (a + b) / 2)

In [None]:
# Run one epoch of training; the test split is also used as validation data.
model.fit(
    [train_input_ids, train_input_masks, train_segment_ids],
    train_labels,
    validation_data=([test_input_ids, test_input_masks, test_segment_ids], test_labels),
    epochs=1,
    batch_size=32
)

# Model outputs for the test split.
y_test_prediction = model.predict([test_input_ids, test_input_masks, test_segment_ids])

# Round the outputs to hard 0/1 labels once and reuse them for both scores.
y_test_pred_labels = y_test_prediction.round().astype(int)

# Per-class F1: restricting `labels` to one class scores that class only.
a = f1_score(test_labels, y_test_pred_labels, average='micro', labels=[0])
b = f1_score(test_labels, y_test_pred_labels, average='micro', labels=[1])
print("F1 micro averaging label 0 class 0:", a)
print("F1 micro averaging label 1 class 1:", b)

# Unweighted mean of the two per-class F1 scores.
print("\nF1 score: ", (a + b) / 2)

In [None]:
# Run one epoch of training; the test split is also used as validation data.
model.fit(
    [train_input_ids, train_input_masks, train_segment_ids],
    train_labels,
    validation_data=([test_input_ids, test_input_masks, test_segment_ids], test_labels),
    epochs=1,
    batch_size=32
)

# Model outputs for the test split.
y_test_prediction = model.predict([test_input_ids, test_input_masks, test_segment_ids])

# Round the outputs to hard 0/1 labels once and reuse them for both scores.
y_test_pred_labels = y_test_prediction.round().astype(int)

# Per-class F1: restricting `labels` to one class scores that class only.
a = f1_score(test_labels, y_test_pred_labels, average='micro', labels=[0])
b = f1_score(test_labels, y_test_pred_labels, average='micro', labels=[1])
print("F1 micro averaging label 0 class 0:", a)
print("F1 micro averaging label 1 class 1:", b)

# Unweighted mean of the two per-class F1 scores.
print("\nF1 score: ", (a + b) / 2)

In [None]:
# Run one epoch of training; the test split is also used as validation data.
model.fit(
    [train_input_ids, train_input_masks, train_segment_ids],
    train_labels,
    validation_data=([test_input_ids, test_input_masks, test_segment_ids], test_labels),
    epochs=1,
    batch_size=32
)

# Model outputs for the test split.
y_test_prediction = model.predict([test_input_ids, test_input_masks, test_segment_ids])

# Round the outputs to hard 0/1 labels once and reuse them for both scores.
y_test_pred_labels = y_test_prediction.round().astype(int)

# Per-class F1: restricting `labels` to one class scores that class only.
a = f1_score(test_labels, y_test_pred_labels, average='micro', labels=[0])
b = f1_score(test_labels, y_test_pred_labels, average='micro', labels=[1])
print("F1 micro averaging label 0 class 0:", a)
print("F1 micro averaging label 1 class 1:", b)

# Unweighted mean of the two per-class F1 scores.
print("\nF1 score: ", (a + b) / 2)