In [529]:
import pyspark
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *

import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_hub as thub
import bert
from tensorflow.keras import backend as K

import pandas as pd
import numpy as np

import re

import random

import os
from tqdm import tqdm

import model_utils

# where to save intermediary checkpoints
checkpoint_dir = os.getcwd() + "/checkpoints"
# where to save final model
saved_model_dir = os.getcwd() + "/saved_models"
# maximum length of token sequences to input to bert model
max_seq_length = 128


In [511]:
def tokenize_sample(context):
    
    """
    To be applied over Spark dataframe.
    Takes a string and converts it to token IDs via bert_tokenizer,
    adding the necessary beginning and end tokens

    Returns: Array of bert token ids for each row of Spark dataframe (requires udf)
    """
    
    tokenized = ["[CLS]"] + tokenizer.tokenize(context) + ["[SEP]"]
    ids = tokenizer.convert_tokens_to_ids(tokenized)
    
    return ids

In [512]:
def generate_sample_df(sarcastic, non_sarcastic, ratio, n_samples):
    
    """
    Returns: Spark df of equal label distribution with text 
    tokenized. Each generated df is to be iterator over multiple 
    times during training
    """
    
    number = 0
    while number < n_samples:
        non_sarc_samp = non_sarcastic.sample(ratio) # making label dist equal
        
        # combine sampled non_sarcastic and whole sarcastic
        sample_df = sarcastic.union(non_sarc_samp)
        
        # tokenize context column via spark udf
        tokenize_sample_udf = F.udf(tokenize_sample, ArrayType(IntegerType()))
        sample_df = sample_df.withColumn("tokens", tokenize_sample_udf(sample_df.context))
        
        # drop context column
        sample_df = sample_df.drop("context")
        
        # yield one call at a time
        yield sample_df
        number += 1

In [513]:
class BertLayer(tf.keras.layers.Layer):
    
    def __init__(self,
        n_fine_tune_layers=2,
        output_type="sequence_output",
        bert_path="https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/1",
        **kwargs):
        
        self.n_fine_tune_layers = n_fine_tune_layers
        self.trainable = True
        self.output_size = 768
        self.output_type = output_type
        self.bert_path = bert_path
        
        if self.output_type not in ["sequence_output", "pooled_output"]:
            raise NameError("Undefined pooling type (must be either sequence_output or pooled_output, but is {self.output_type}")

        super(BertLayer, self).__init__(**kwargs)
        
    def layer_number(self, var):
    
        """
        Get the layer number corresponding to the 
        given variable
        """
        m = re.search(r'/layer_(\d+)/', var)
        
        if m:
            return int(m.group(1))
        else:
            return None


    def build(self, input_shape):

        """
        Creates the variables of the layer (optional, for subclass implementers).
 |      
 |      This is a method that implementers of subclasses of `Layer` or `Model`
 |      can override if they need a state-creation step in-between
 |      layer instantiation and layer call.
 |      
 |      This is typically used to create the weights of `Layer` subclasses.
 
        Called once from `__call__`, when we know the shapes of input and `dtype`.
        """
        
        self.bert_model = thub.KerasLayer(self.bert_path, self.trainable)

        # extract all pre-trained variables and weights
        # note that vars and weight outputs appear to be identical; unclear on the difference
        # note also that when trainable=True, all variables/weights start out as trainable
        trainable_vars = self.bert_model.trainable_variables
        trainable_weights = self.bert_model.trainable_weights
        
        if self.output_type == "pooled_output":
            
            # removing '/cls/' layers (there don't appear to be any) 
            trainable_vars = [var.name for var in trainable_vars if not "/cls/" in var.name]
            
        elif output_type == "sequence_output":
            
            # removing '/cls/' (there don't appear to be any) and '/pooler_transform/' layers 
            trainable_vars = [var.name for var in trainable_vars if not "/cls/" in var.name
                              and not "/pooler_transform" in var.name] 
            
        ### select how many layers to fine tune starting from top-most layer ###
        
        # outputs a list of either Nonetype or layer number
        layer_numbers = list(map(self.layer_number, trainable_vars))
        # returns the total number of layers in pre-trained model (note: layers are zero-indexed)
        n_total_layers = max(n for n in layer_numbers if n is not None) + 1 
        # finally, create list of just layers to be trained
        trainable_vars = [var for n, var in zip(layer_numbers, trainable_vars) if n is not None and n >= n_total_layers - self.n_fine_tune_layers]    
        
        # add variables NOT to be trained to _non_trainable_weights and 
        # remove them from _trainable_weights
        # note: underscore is necessary for accessing the writable object
        
        for var in self.bert_model.variables:
    
            if var.name not in trainable_vars and 'Variable:0' not in var.name:
                
                self.bert_model._non_trainable_weights.append(var)
                self.bert_model._trainable_weights.remove(var)

        super(BertLayer, self).build(input_shape)
        
    def call(self, inputs): 
        
        """
        Called in `__call__` after making sure `build()` has been called
 |      once. Should actually perform the logic of applying the layer to the
 |      input tensors (which should be passed in as the first argument).
        """
        # takes in list of input tensors and casts them in Keras
        inputs = [K.cast(x, dtype="int32") for x in inputs]
        
        #bert-for-tf2 returns (pooled_output,sequence_output) when called
        if self.output_type == "pooled_output":
            
            output = self.bert_model(inputs)[0]
                
        elif self.output_type == "sequence_output":
            
            output = self.bert_model(inputs)[1]
            
        return output
    
    def compute_output_shape(self, input_shape):
        
        return (input_shape[0], self.output_size)

       
        

In [537]:
class training(object):
    
    def __init__(self,
                max_seq_length=128,
                n_epochs=50,
                batch_size=13,
                patience=5,
                validation_split=0.1,
                checkpoint_dir=os.getcwd() + "/checkpoints",
                saved_model_dir=os.getcwd() + "/saved_models",
                pad_by_batch=False):
    
        self.max_seq_length = max_seq_length
        self.n_epochs = n_epochs
        self.batch_size = batch_size
        self.patience = patience
        self.validation_split = validation_split
        self.checkpoint_dir = checkpoint_dir
        self.saved_model_dir = saved_model_dir
        self.pad_by_batch = pad_by_batch

    def build_model(self): 
    
        """
        Defines input shapes of bert input tensors,
        """
        input_word_ids = tf.keras.layers.Input(shape=(self.max_seq_length,), dtype=tf.int32,
                                       name="input_word_ids")
        input_mask = tf.keras.layers.Input(shape=(self.max_seq_length,), dtype=tf.int32,
                                   name="input_mask")
        segment_ids = tf.keras.layers.Input(shape=(self.max_seq_length,), dtype=tf.int32,
                                    name="segment_ids")
        bert_inputs = [input_word_ids, input_mask, segment_ids]
    
        bert_output = BertLayer()(bert_inputs)
        dense_out = tf.keras.layers.Dense(self.max_seq_length, activation='relu')(bert_output)
        dense_out = tf.keras.layers.Dropout(0.5)(dense_out)
        logits = tf.keras.layers.Dense(1, activation='sigmoid')(dense_out)
    
        model = tf.keras.models.Model(inputs=bert_inputs, outputs=logits)
        model.compile(loss='binary_crossentropy', 
                  optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5), 
                  metrics=['accuracy'])
    
        model.summary()
    
        return model
    
    def train_model(self, model, train_inputs, train_labels):
        
        """
        Initiates training process using self.build_model output as input.
        If self.pad_by_batch == False, input should be the entire epoch of 
        training inputs (including masks) and labels, as numpy arrays.
        """
    
        checkpoints = tf.keras.callbacks.ModelCheckpoint(self.checkpoint_dir, verbose=1, 
                                                     save_best_only=False,
                                                     save_weights_only=True, mode='auto', 
                                                     save_freq='epoch')
    
        early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=self.patience)
    
        if self.pad_by_batch == False:
            
            train_input_ids, train_input_masks, train_segment_ids = train_inputs
        
            model.fit([train_input_ids, train_input_masks, train_segment_ids], 
                  train_labels,
                  validation_split = self.validation_split,
                  epochs = self.n_epochs,
                  batch_size = self.batch_size,
                  callbacks = [checkpoints, early_stopping])
    
            model.save(self.saved_model_dir+'/my_model.h5')



In [None]:
# Initialize BERT model and tokenizer

%time bert_layer, tokenizer = model_utils.init_bert()

# Initialize Spark context

%time sc, spark = model_utils.init_spark()


In [8]:
# Read in sarcastic samples, non-sarcastic samples, and the ratio between the two

%time sarcastic, non_sarcastic, ratio = model_utils.load_data(spark, 
                                                              bucket_name="sarc-bucket-5",
                                                              dataset="politics")

CPU times: user 17.3 ms, sys: 1.95 ms, total: 19.3 ms
Wall time: 33.8 s


In [48]:
# Initialize sample_df generator

%time sample_generator = generate_sample_df(sarcastic, non_sarcastic, ratio, 5)

# Output first smaple

%time sample_df = next(sample_generator)


NameError: name 'sarcastic' is not defined

In [None]:
# Initialize training class object and build Bert layer

t = training()
%time model = t.build_model()

In [None]:
# Produce padded tokens, input masks, and segment ids as nparrays

%time padded_tokens, train_labels = model_utils.pad(sample_df, pad_by_batch=False, t.batch_size)

%time input_mask = model_utils.input_mask(padded_tokens)

%time segment_id = model_utils.segment_id(padded_tokens)

train_inputs = [padded_tokens, input_mask, segment_id]


In [515]:
t.train_model(model, train_inputs, train_labels)

   def call(self, inputs): 

       """
       Called in `__call__` after making sure `build()` has been called
|      once. Should actually perform the logic of applying the layer to the
|      input tensors (which should be passed in as the first argument).
       """
       # takes in list of input tensors and casts them in Keras
       inputs = [K.cast(x, dtype="int32") for x in inputs]

       #bert-for-tf2 returns (pooled_output,sequence_output) when called
       if self.output_type == "pooled_output":

           output = self.bert_model(inputs)[0]

       elif self.output_type == "sequence_output":

           output = self.bert_model(inputs)[1]

       return output

This may be caused by multiline strings or comments not indented at the same level as the code.


   def call(self, inputs): 

       """
       Called in `__call__` after making sure `build()` has been called
|      once. Should actually perform the logic of applying the layer to the
|      input tensors (which should be passed in as the first argument).
       """
       # takes in list of input tensors and casts them in Keras
       inputs = [K.cast(x, dtype="int32") for x in inputs]

       #bert-for-tf2 returns (pooled_output,sequence_output) when called
       if self.output_type == "pooled_output":

           output = self.bert_model(inputs)[0]

       elif self.output_type == "sequence_output":

           output = self.bert_model(inputs)[1]

       return output

This may be caused by multiline strings or comments not indented at the same level as the code.


   def call(self, inputs): 

       """
       Called in `__call__` after making sure `build()` has been called
|      once. Should actually perform the logic of applying the layer to the
|      input tensors (which should be passed in as the first argument).
       """
       # takes in list of input tensors and casts them in Keras
       inputs = [K.cast(x, dtype="int32") for x in inputs]

       #bert-for-tf2 returns (pooled_output,sequence_output) when called
       if self.output_type == "pooled_output":

           output = self.bert_model(inputs)[0]

       elif self.output_type == "sequence_output":

           output = self.bert_model(inputs)[1]

       return output

This may be caused by multiline strings or comments not indented at the same level as the code.
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
inp