In [None]:
# !pip install bert-tensorflow
# gast is pinned to 0.2.2 to work around a compatibility-layer error
# !pip install gast==0.2.2
# !pip install bert-for-tf2

In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from tqdm import tqdm_notebook 

# TensorFlow 1.x is used in this notebook; upgrading to TF2 should be easy, but pay
# attention to graph handling, model saving and the explicit `sess` calls
# %tensorflow_version 1.x
import tensorflow as tf
import tensorflow_hub as hub
from bert.tokenization import FullTokenizer
from tensorflow.keras import backend as K
# Single TF1 session shared by the notebook: the tokenizer helper evaluates the
# Hub module's tokenization info in it and initialize_vars() registers it with Keras
sess = tf.Session()

# Params for bert model and tokenization
# TF-Hub handle of the uncased BERT module loaded by create_tokenizer_from_hub_module()
bert_path = "https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1"

In [None]:
# Ask TensorFlow for the name of the GPU device (used to check that one exists)
device_name = tf.test.gpu_device_name()

# A present and running GPU is reported as /device:GPU:0. Any other value only
# prints a neutral message (raising SystemError here was deliberately disabled),
# so the notebook keeps running without a GPU.
print(
    'Found GPU at: {}'.format(device_name)
    if device_name == '/device:GPU:0'
    else 'checked'
)

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# Load the labelled phrases used for training (relative path for local runs)
train = pd.read_csv('data/train_skills.csv')
# colab path to data:
# /content/drive/My Drive/train_skills.csv
# Summarise the columns, dtypes and non-null counts of the loaded frame
train.info()

In [None]:
# The skill column has 100 missing values: every NaN in the frame becomes 0
train = train.fillna(0)
train['skill'] = train['skill'].astype('int8')
# With the gathered training data casing cannot be taken into account (that would
# require overhauling the training data), so every phrase is lower-cased
train['phrase'] = train['phrase'].str.lower()
# A label of 10 was found upon inspection of the unique values; map it to 1
train['skill'] = train['skill'].replace(10, 1)
train.tail()

In [None]:
# Class balance check on the training data: counts first, then the ratios
n_skill = train.skill.sum()
n_rows = len(train.skill)
skill_ratio = n_skill / n_rows
print('Number of skill:', n_skill)
print('Number of Not Skill:', n_rows - n_skill)
print('Ratio of classes: Skill=', skill_ratio, ' Not Skill=', 1 - skill_ratio)

# Train Test Split

In [None]:
# Split the data (1st iteration 85/15, 2nd iteration 80/20).
# Shuffling is enforced because of the sequential nature of the training set.
X, y = train.phrase, train.skill

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.2, random_state=42, shuffle=True
)

# Report the size of each split
print('{:>5,} training samples'.format(len(X_train)))
print('{:>5,} validation samples'.format(len(X_test)))

In [None]:
# According to analysis the largest chunk captured by regex was 18 tokens long
max_seq_length = 18
# The model input must be a NumPy array with an extra trailing axis, i.e. shape
# (n_samples, 1). The Series is converted with .to_numpy() before the new axis is
# added: indexing a pandas Series with [:, np.newaxis] directly is deprecated and
# raises on recent pandas versions.
train_text = X_train.to_numpy()[:, np.newaxis]
test_text = X_test.to_numpy()[:, np.newaxis]
# always should check and check again
train_text.shape, test_text.shape

# Tokenization

In [None]:
class PaddingInputExample:
    """Fake example so the number of input examples is a multiple of the batch size.

    When running eval/predict on the TPU, the number of examples has to be padded
    to a multiple of the batch size, because the TPU requires a fixed batch size.
    The alternative is to drop the last batch, which is bad because it means the
    entire output data won't be generated.

    This class is used instead of `None` because treating `None` as padding
    batches could cause silent errors.
    """
  

In [None]:
class InputExample:
    """One training/test example for simple sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Store the fields of the example unchanged.

        Args:
          guid: Unique id for the example.
          text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
          text_b: (Optional) string. The untokenized text of the second sequence.
            Only needs to be specified for sequence pair tasks.
          label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.guid, self.text_a = guid, text_a
        self.text_b, self.label = text_b, label

def create_tokenizer_from_hub_module():
    """Build a `FullTokenizer` from the vocab file and casing flag of the Hub module.

    The module at `bert_path` is loaded, its "tokenization_info" signature is
    evaluated in the notebook-level session `sess`, and the two fetched values
    are handed to the tokenizer.

    Returns:
      A `FullTokenizer` configured with the module's vocab file and casing flag.
    """
    hub_module = hub.Module(bert_path)
    info = hub_module(signature="tokenization_info", as_dict=True)

    # Both values are fetched in a single session call.
    fetches = [info["vocab_file"], info["do_lower_case"]]
    vocab_file, do_lower_case = sess.run(fetches)

    return FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)

def convert_single_example(tokenizer, example, max_seq_length=256):
    """Turn one example into fixed-length BERT inputs.

    Args:
      tokenizer: object providing `tokenize(text)` and `convert_tokens_to_ids(tokens)`.
      example: an `InputExample` (only `text_a` and `label` are read) or a
        `PaddingInputExample`.
      max_seq_length: length of every returned list, including the positions
        taken by "[CLS]" and "[SEP]".

    Returns:
      A tuple `(input_ids, input_mask, segment_ids, label)`; the first three are
      lists of length `max_seq_length`. A `PaddingInputExample` yields all-zero
      lists and label 0.
    """
    if isinstance(example, PaddingInputExample):
        zeros = lambda: [0] * max_seq_length
        return zeros(), zeros(), zeros(), 0

    # Two positions are reserved for the [CLS] and [SEP] markers.
    budget = max_seq_length - 2
    pieces = tokenizer.tokenize(example.text_a)
    if len(pieces) > budget:
        pieces = pieces[:budget]

    tokens = ["[CLS]", *pieces, "[SEP]"]
    # Single-sequence input: every position belongs to segment 0.
    segment_ids = [0] * len(tokens)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask is 1 for real tokens and 0 for padding; only real tokens are
    # attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad all three lists up to the sequence length.
    missing = max_seq_length - len(input_ids)
    if missing > 0:
        input_ids.extend([0] * missing)
        input_mask.extend([0] * missing)
        segment_ids.extend([0] * missing)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    return input_ids, input_mask, segment_ids, example.label

def convert_examples_to_features(tokenizer, examples, max_seq_length=256):
    """Convert a collection of examples into stacked NumPy feature arrays.

    Every example goes through `convert_single_example`; a notebook progress
    bar is shown while iterating.

    Returns:
      A tuple `(input_ids, input_masks, segment_ids, labels)` of NumPy arrays,
      with `labels` reshaped to a single column.
    """
    converted = [
        convert_single_example(tokenizer, example, max_seq_length)
        for example in tqdm_notebook(examples, desc="Converting examples to features")
    ]

    # Split the per-example tuples into one list per feature.
    id_rows = [row[0] for row in converted]
    mask_rows = [row[1] for row in converted]
    segment_rows = [row[2] for row in converted]
    label_rows = [row[3] for row in converted]

    return (
        np.array(id_rows),
        np.array(mask_rows),
        np.array(segment_rows),
        np.array(label_rows).reshape(-1, 1),
    )

def convert_text_to_examples(texts, labels):
    """Pair every text with its label and wrap the pair in an `InputExample`.

    Each element of `texts` is joined with single spaces to form `text_a`;
    `guid` and `text_b` are always None. Pairing stops at the shorter of the
    two inputs.
    """
    return [
        InputExample(guid=None, text_a=" ".join(words), text_b=None, label=label)
        for words, label in zip(texts, labels)
    ]


In [None]:
# Instantiate tokenizer
tokenizer = create_tokenizer_from_hub_module()

# Wrap the raw phrases and their labels in InputExample objects
train_examples = convert_text_to_examples(train_text, y_train)
test_examples = convert_text_to_examples(test_text, y_test)

# Convert the examples to padded id / mask / segment arrays plus a label column
train_input_ids, train_input_masks, train_segment_ids, train_labels = (
    convert_examples_to_features(tokenizer, train_examples, max_seq_length=max_seq_length)
)
test_input_ids, test_input_masks, test_segment_ids, test_labels = (
    convert_examples_to_features(tokenizer, test_examples, max_seq_length=max_seq_length)
)

# Embedding

In [None]:
class BertLayer(tf.keras.layers.Layer):
    """Keras layer wrapping a TF-Hub BERT module; returns one pooled vector per example.

    Only the top `n_fine_tune_layers` encoder layers (plus the pooler dense
    layer when pooling is "first") are registered as trainable weights; every
    other variable of the module is registered as non-trainable.

    `get_config` is overridden so the constructor arguments become part of the
    layer config. It was added (following a suggestion from a GitHub issue) so
    that the model can be saved.

    Args:
      n_fine_tune_layers: number of encoder layers to fine-tune, counted
        downwards from `encoder/layer_11`.
      pooling: "first" returns the module's "pooled_output"; "mean" returns the
        average of "sequence_output" over the positions where the input mask is 1.
      bert_path: TF-Hub handle of the BERT module.
      **kwargs: forwarded to `tf.keras.layers.Layer.__init__`.

    Raises:
      NameError: if `pooling` is neither "first" nor "mean".
    """

    @staticmethod
    def _undefined_pooling_error(pooling):
        """Return the exception raised for an unsupported pooling type."""
        return NameError(
            f"Undefined pooling type (must be either first or mean, but is {pooling})"
        )

    def __init__(
        self,
        n_fine_tune_layers=10,
        pooling="first",
        bert_path="https://tfhub.dev/google/bert_uncased_L-12_H-768_A-12/1",
        **kwargs,
    ):
        self.n_fine_tune_layers = n_fine_tune_layers
        # NOTE(review): assigned before super().__init__(); the base constructor
        # also handles a `trainable` kwarg -- confirm which value is meant to win.
        self.trainable = True
        # Size of the pooled vector; presumably the hidden size of the default
        # module (H-768 in its handle) -- adjust when using another module.
        self.output_size = 768
        self.pooling = pooling
        self.bert_path = bert_path
        if self.pooling not in ["first", "mean"]:
            raise self._undefined_pooling_error(self.pooling)

        super(BertLayer, self).__init__(**kwargs)

    def build(self, input_shape):
        """Load the Hub module and register its variables as (non-)trainable weights."""
        self.bert = hub.Module(
            self.bert_path, trainable=self.trainable, name=f"{self.name}_module"
        )

        # Remove unused layers: the "/cls/" variables are never candidates, and
        # mean pooling does not use the pooler either.
        trainable_vars = self.bert.variables
        if self.pooling == "first":
            trainable_vars = [var for var in trainable_vars if "/cls/" not in var.name]
            trainable_layers = ["pooler/dense"]

        elif self.pooling == "mean":
            trainable_vars = [
                var
                for var in trainable_vars
                if "/cls/" not in var.name and "/pooler/" not in var.name
            ]
            trainable_layers = []
        else:
            raise self._undefined_pooling_error(self.pooling)

        # Select how many layers to fine tune, starting from the last encoder layer
        for i in range(self.n_fine_tune_layers):
            trainable_layers.append(f"encoder/layer_{11 - i}")

        # Update trainable vars to contain only the specified layers
        # (substring match against the variable name)
        trainable_vars = [
            var
            for var in trainable_vars
            if any(layer in var.name for layer in trainable_layers)
        ]

        # Add to trainable weights
        for var in trainable_vars:
            self._trainable_weights.append(var)

        # Everything that was not selected above is tracked as non-trainable
        for var in self.bert.variables:
            if var not in self._trainable_weights:
                self._non_trainable_weights.append(var)

        super(BertLayer, self).build(input_shape)

    def call(self, inputs):
        """Run the module on `[input_ids, input_mask, segment_ids]` and pool the result."""
        inputs = [K.cast(x, dtype="int32") for x in inputs]
        input_ids, input_mask, segment_ids = inputs
        bert_inputs = dict(
            input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids
        )
        if self.pooling == "first":
            pooled = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
                "pooled_output"
            ]
        elif self.pooling == "mean":
            result = self.bert(inputs=bert_inputs, signature="tokens", as_dict=True)[
                "sequence_output"
            ]

            # Masked mean over axis 1: masked positions are zeroed before the sum
            # and the divisor is the mask total; 1e-10 avoids division by zero
            # for an all-zero mask.
            mul_mask = lambda x, m: x * tf.expand_dims(m, axis=-1)
            masked_reduce_mean = lambda x, m: tf.reduce_sum(mul_mask(x, m), axis=1) / (
                    tf.reduce_sum(m, axis=1, keepdims=True) + 1e-10)
            input_mask = tf.cast(input_mask, tf.float32)
            pooled = masked_reduce_mean(result, input_mask)
        else:
            raise self._undefined_pooling_error(self.pooling)

        return pooled

    def compute_output_shape(self, input_shape):
        """Return `(input_shape[0], output_size)`."""
        return (input_shape[0], self.output_size)

    def get_config(self):
        """Return the base layer config extended with this layer's constructor arguments."""
        config = super(BertLayer, self).get_config().copy()
        config.update({
            'n_fine_tune_layers': self.n_fine_tune_layers,
            # 'trainable': self.trainable,
            # 'output_size': self.output_size,
            'pooling': self.pooling,
            'bert_path': self.bert_path,
        })

        return config

# Network Architecture

In [None]:
# Build the model; the lines between the '#' rulers mark where to change the model when tweaking
def build_model(max_seq_length): 
    """Build, compile and summarise the BERT-based binary classifier.

    The model takes three integer inputs of length `max_seq_length`
    ("input_ids", "input_masks", "segment_ids"), feeds them through a
    `BertLayer` and a stack of dense layers, and outputs a single value per
    example.

    Args:
      max_seq_length: length of each of the three input sequences.

    Returns:
      The compiled `tf.keras` model (its summary is printed as a side effect).
    """
    in_id = tf.keras.layers.Input(shape=(max_seq_length,), name="input_ids")
    in_mask = tf.keras.layers.Input(shape=(max_seq_length,), name="input_masks")
    in_segment = tf.keras.layers.Input(shape=(max_seq_length,), name="segment_ids")
    bert_inputs = [in_id, in_mask, in_segment]
    bert_output = BertLayer(n_fine_tune_layers=10, )(bert_inputs)
################################################################################
    x = tf.keras.layers.Dense(128, activation='relu')(bert_output)
    x = tf.keras.layers.Dense(64, activation='relu')(x)
    x = tf.keras.layers.Dense(32, activation='relu')(x)
    x = tf.keras.layers.Dense(24, activation='tanh')(x)
    x = tf.keras.layers.Dense(12, activation='relu')(x)
    # Sigmoid keeps the prediction inside (0, 1), which is what the
    # binary cross-entropy loss below expects; softplus is unbounded above
    # and therefore does not produce a probability.
    pred = tf.keras.layers.Dense(1, activation='sigmoid')(x)
################################################################################
    optimizer = tf.keras.optimizers.Adam(lr=0.00001)
    model = tf.keras.models.Model(inputs=bert_inputs, outputs=pred)
################################################################################  
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
################################################################################
    model.summary()
    return model

# Training

In [None]:
# Function for initializing the variables -- this step must not be forgotten!
def initialize_vars(sess):
    """Run the local, global and table initializers in `sess`, then register `sess` with Keras."""
    initializer_factories = (
        tf.local_variables_initializer,
        tf.global_variables_initializer,
        tf.tables_initializer,
    )
    # Each initializer op is created and run in turn, in the order listed above.
    for make_init_op in initializer_factories:
        sess.run(make_init_op())
    K.set_session(sess)

In [None]:
# 1st iteration: trained for 25 epochs, which made clear that fewer than 10 epochs
# are sufficient; anywhere from 6~8 epochs delivers optimal results. Batch size was
# left unchanged, softplus output.
#
# 2nd iteration: 8 epochs, still only 60% on unseen data. Split changed to 80/20,
# batch size increased to 50, sigmoid in place of softplus.
# NOTE(review): the call below uses epochs=6 and batch_size=60 -- confirm which
# settings are the current ones.
model = build_model(max_seq_length)

# Instantiate variables
initialize_vars(sess)

model.fit(
    x=[train_input_ids, train_input_masks, train_segment_ids],
    y=train_labels,
    batch_size=60,
    epochs=6,
    validation_data=(
        [test_input_ids, test_input_masks, test_segment_ids],
        test_labels,
    ),
)


# Model Save

In [None]:
# After many iterations to find an optimal structure, this is the result.
# TODO: consider passing the input size explicitly for easier saving and loading.
tf.keras.utils.plot_model(model, show_shapes=True, dpi=48)

In [None]:
# found it was necessary to make a directory on colab while saving the model for easier retrieval
# Save first, so that the reload below reads the weights of the model trained
# above instead of depending on a file left over from an earlier run.
model.save('bert_skills_clf.h5', overwrite=True, include_optimizer=True)

# Predictions before the model is cleared and reloaded
pre_save_preds = model.predict(
    [test_input_ids, test_input_masks, test_segment_ids]
)

# Clear the model, rebuild the same architecture and load the saved weights
model = None
model = build_model(max_seq_length)
initialize_vars(sess)
model.load_weights('bert_skills_clf.h5')

# Predictions after the model was cleared and reloaded
post_save_preds = model.predict(
    [test_input_ids, test_input_masks, test_segment_ids]
)

# Are they the same? Compared with a tolerance rather than exact float equality.
np.allclose(pre_save_preds, post_save_preds)

# Evaluation

In [None]:
# Accuracy on 1st iter ~85%; Accuracy on 2nd iter +0.5

# evaluate() returns the loss followed by the compiled metric (accuracy)
accr = model.evaluate(
    x=[test_input_ids, test_input_masks, test_segment_ids],
    y=y_test,
)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(*accr[:2]))


In [None]:
# Test on (assuming) unseen data
unseen = np.array([
    'mongodb',
    'statistics , mathematics , biostatistics',
    'experience in python',
    'language e.g',
    'this is fucking bullshit',
    'the definition',
    'hard-worker',
    'san francisco bay area',
    'many things',
    'passion for data',
])
# Same layout as the training text: one phrase per row, shape (n_phrases, 1)
unseen_text = unseen[:, np.newaxis]

# Instantiate tokenizer
tokenizer = create_tokenizer_from_hub_module()

# Convert data to InputExample format. The unseen phrases themselves must be
# passed here (not train_text); the labels are placeholders of matching length.
unseen_examples = convert_text_to_examples(
    unseen_text, np.zeros((len(unseen_text), 1))
)
assert len(unseen_examples) == len(unseen), "one example per unseen phrase expected"

# Convert to features
(unseen_input_ids, unseen_input_masks, unseen_segment_ids, unseen_labels
) = convert_examples_to_features(tokenizer,
                                 unseen_examples,
                                 max_seq_length=max_seq_length)

In [None]:
# Hand-assigned ground truth, one label per phrase in `unseen` (same order)
# NOTE(review): the labels for 'mongodb' (0) and for the fifth phrase (1) look
# surprising -- confirm they are intended.
unseen_truth = [0, 1, 1, 0, 1, 0, 1, 0, 0, 1]
# Targets are passed as an (n, 1) array, the same layout used for the train and
# test labels, instead of a bare Python list.
accr = model.evaluate([unseen_input_ids,
                       unseen_input_masks,
                       unseen_segment_ids],
                      np.array(unseen_truth).reshape(-1, 1))
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))


In [None]:
# Raw model outputs for the unseen phrases
unseen_predictions = model.predict(
    x=[unseen_input_ids, unseen_input_masks, unseen_segment_ids]
)


In [None]:
# One column per unseen phrase holding its prediction; transposed for display
# so that every phrase becomes a row
predictions = pd.DataFrame(
    {phrase: scores for phrase, scores in zip(unseen, unseen_predictions)}
)
predictions.T