In [510]:
import pyspark
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *

import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_hub as thub
import bert
from tensorflow.keras import backend as K

import pandas as pd
import numpy as np

import re

import random

import os
from tqdm import tqdm

import model_utils

In [511]:
def tokenize_sample(context):
    
    """
    To be applied over Spark dataframe.
    Takes a string and converts it to token IDs via bert_tokenizer,
    adding the necessary beginning and end tokens

    Returns: Array of bert token ids for each row of Spark dataframe (requires udf)
    """
    
    tokenized = ["[CLS]"] + tokenizer.tokenize(context) + ["[SEP]"]
    ids = tokenizer.convert_tokens_to_ids(tokenized)
    
    return ids

In [512]:
def generate_epoch_df(sarcastic, non_sarcastic, ratio, n_epochs):
    
    """
    Returns: Spark df of equal label distribution with text 
    tokenized. Each generated df is to be iterator over multiple 
    times during training
    """
    
    number = 0
    while number < n_epochs:
        non_sarc_samp = non_sarcastic.sample(ratio) # making label dist equal
        
        # combine sampled non_sarcastic and whole sarcastic
        epoch_df = sarcastic.union(non_sarc_samp)
        
        # tokenize context column via spark udf
        tokenize_sample_udf = F.udf(tokenize_sample, ArrayType(IntegerType()))
        epoch_df = epoch_df.withColumn("tokens", tokenize_sample_udf(epoch_df.context))
        
        # drop context column
        epoch_df = epoch_df.drop("context")
        
        # yield one call at a time
        yield epoch_df
        number += 1

In [513]:
class BertLayer(tf.keras.layers.Layer):
    
    def __init__(self,
        n_fine_tune_layers=2,
        output_type="sequence_output",
        bert_path="https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/1",
        **kwargs):
        
        self.n_fine_tune_layers = n_fine_tune_layers
        self.trainable = True
        self.output_size = 768
        self.output_type = output_type
        self.bert_path = bert_path
        
        if self.output_type not in ["sequence_output", "pooled_output"]:
            raise NameError("Undefined pooling type (must be either sequence_output or pooled_output, but is {self.output_type}")

        super(BertLayer, self).__init__(**kwargs)
        
    def layer_number(self, var):
    
        """
        Get the layer number corresponding to the 
        given variable
        """
        m = re.search(r'/layer_(\d+)/', var)
        
        if m:
            return int(m.group(1))
        else:
            return None


    def build(self, input_shape):

        """
        Creates the variables of the layer (optional, for subclass implementers).
 |      
 |      This is a method that implementers of subclasses of `Layer` or `Model`
 |      can override if they need a state-creation step in-between
 |      layer instantiation and layer call.
 |      
 |      This is typically used to create the weights of `Layer` subclasses.
 
        Called once from `__call__`, when we know the shapes of input and `dtype`.
        """
        
        self.bert_model = thub.KerasLayer(self.bert_path, self.trainable)

        # extract all pre-trained variables and weights
        # note that vars and weight outputs appear to be identical; unclear on the difference
        # note also that when trainable=True, all variables/weights start out as trainable
        trainable_vars = self.bert_model.trainable_variables
        trainable_weights = self.bert_model.trainable_weights
        
        if self.output_type == "pooled_output":
            
            # removing '/cls/' layers (there don't appear to be any) 
            trainable_vars = [var.name for var in trainable_vars if not "/cls/" in var.name]
            
        elif output_type == "sequence_output":
            
            # removing '/cls/' (there don't appear to be any) and '/pooler_transform/' layers 
            trainable_vars = [var.name for var in trainable_vars if not "/cls/" in var.name
                              and not "/pooler_transform" in var.name] 
            
        ### select how many layers to fine tune starting from top-most layer ###
        
        # outputs a list of either Nonetype or layer number
        layer_numbers = list(map(self.layer_number, trainable_vars))
        # returns the total number of layers in pre-trained model (note: layers are zero-indexed)
        n_total_layers = max(n for n in layer_numbers if n is not None) + 1 
        # finally, create list of just layers to be trained
        trainable_vars = [var for n, var in zip(layer_numbers, trainable_vars) if n is not None and n >= n_total_layers - self.n_fine_tune_layers]    
        
        # add variables NOT to be trained to _non_trainable_weights and 
        # remove them from _trainable_weights
        # note: underscore is necessary for accessing the writable object
        
        for var in self.bert_model.variables:
    
            if var.name not in trainable_vars and 'Variable:0' not in var.name:
                
                self.bert_model._non_trainable_weights.append(var)
                self.bert_model._trainable_weights.remove(var)

        super(BertLayer, self).build(input_shape)
        
    def call(self, inputs): 
        
        """
        Called in `__call__` after making sure `build()` has been called
 |      once. Should actually perform the logic of applying the layer to the
 |      input tensors (which should be passed in as the first argument).
        """
        # takes in list of input tensors and casts them in Keras
        inputs = [K.cast(x, dtype="int32") for x in inputs]
        
        #bert-for-tf2 returns (pooled_output,sequence_output) when called
        if self.output_type == "pooled_output":
            
            output = self.bert_model(inputs)[0]
                
        elif self.output_type == "sequence_output":
            
            output = self.bert_model(inputs)[1]
            
        return output
    
    def compute_output_shape(self, input_shape):
        
        return (input_shape[0], self.output_size)

       
        

In [514]:
def build_model(max_seq_length): 
    
    """
    Defines input shapes of bert input tensors,
    """
    input_word_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                       name="input_word_ids")
    input_mask = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                   name="input_mask")
    segment_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32,
                                    name="segment_ids")
    bert_inputs = [input_word_ids, input_mask, segment_ids]
    
    bert_output = BertLayer()(bert_inputs)
    dense_out = tf.keras.layers.Dense(max_seq_length, activation='relu')(bert_output)
    dense_out = tf.keras.layers.Dropout(0.5)(dense_out)
    logits = tf.keras.layers.Dense(1, activation='sigmoid')(dense_out)
    
    model = tf.keras.models.Model(inputs=bert_inputs, outputs=logits)
    model.compile(loss='binary_crossentropy', 
                  optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5), 
                  metrics=['accuracy'])
    model.summary()
    
    return model



In [4]:
# Initialize BERT model and tokenizer

%time bert_layer, tokenizer = model_utils.init_bert()

NotFoundError: Unsuccessful TensorSliceReader constructor: Failed to find any matching files for /var/folders/2n/278rn1t177gb_0815m4_n4800000gn/T/tfhub_modules/aba332320f092d9eba6abca17ad5b73317326ab7/variables/variables

In [7]:
# Initialize Spark context

%time sc, spark = model_utils.init_spark()

CPU times: user 36.1 ms, sys: 24.1 ms, total: 60.2 ms
Wall time: 16.1 s


In [8]:
# Read in sarcastic samples, non-sarcastic samples, and the ratio between the two

%time sarcastic, non_sarcastic, ratio = model_utils.load_data(spark, bucket_name="sarc-bucket-5", dataset="politics")

CPU times: user 17.3 ms, sys: 1.95 ms, total: 19.3 ms
Wall time: 33.8 s


In [48]:
# Initialize epoch_df generator

%time epoch_generator = generate_epoch_df(sarcastic, non_sarcastic, ratio, 5)

NameError: name 'sarcastic' is not defined

In [10]:
%time epoch_df = next(epoch_generator)

CPU times: user 274 ms, sys: 7.13 ms, total: 281 ms
Wall time: 392 ms


In [515]:
max_seq_length = 128

model = build_model(max_seq_length)

   def call(self, inputs): 

       """
       Called in `__call__` after making sure `build()` has been called
|      once. Should actually perform the logic of applying the layer to the
|      input tensors (which should be passed in as the first argument).
       """
       # takes in list of input tensors and casts them in Keras
       inputs = [K.cast(x, dtype="int32") for x in inputs]

       #bert-for-tf2 returns (pooled_output,sequence_output) when called
       if self.output_type == "pooled_output":

           output = self.bert_model(inputs)[0]

       elif self.output_type == "sequence_output":

           output = self.bert_model(inputs)[1]

       return output

This may be caused by multiline strings or comments not indented at the same level as the code.


   def call(self, inputs): 

       """
       Called in `__call__` after making sure `build()` has been called
|      once. Should actually perform the logic of applying the layer to the
|      input tensors (which should be passed in as the first argument).
       """
       # takes in list of input tensors and casts them in Keras
       inputs = [K.cast(x, dtype="int32") for x in inputs]

       #bert-for-tf2 returns (pooled_output,sequence_output) when called
       if self.output_type == "pooled_output":

           output = self.bert_model(inputs)[0]

       elif self.output_type == "sequence_output":

           output = self.bert_model(inputs)[1]

       return output

This may be caused by multiline strings or comments not indented at the same level as the code.


   def call(self, inputs): 

       """
       Called in `__call__` after making sure `build()` has been called
|      once. Should actually perform the logic of applying the layer to the
|      input tensors (which should be passed in as the first argument).
       """
       # takes in list of input tensors and casts them in Keras
       inputs = [K.cast(x, dtype="int32") for x in inputs]

       #bert-for-tf2 returns (pooled_output,sequence_output) when called
       if self.output_type == "pooled_output":

           output = self.bert_model(inputs)[0]

       elif self.output_type == "sequence_output":

           output = self.bert_model(inputs)[1]

       return output

This may be caused by multiline strings or comments not indented at the same level as the code.
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
inp

In [67]:
df = pd.DataFrame([{"label":1, "tokens":[5,7,1,90,76]},
             {"label":0, "tokens":[1,90,100]},
             {"label":0, "tokens":[910,442,85,100]},
             {"label":1, "tokens":[123,886]},
             {"label":0, "tokens":[3335,78787,56,3232,57,7775,4333,56]},
             {"label":0, "tokens":[65,43,6787,6564,23234,5,5]},
             {"label":0, "tokens":[2345,7654,457,5454,3356,776,4]},
             {"label":0, "tokens":[8]},
             {"label":1, "tokens":[3456,6664,3335,78,7,6,43,5,777,4443,456,66543,3467,787765,44]},
             {"label":0, "tokens":[66,444,6778,33,5,77,6554,3]}])

In [68]:
def pad(epoch_df, pad_by_batch=False):

    # convert to list of tuples
    dflist = [(epoch_df['tokens'].iloc[i], epoch_df['label'].iloc[i]) for i in range(len(epoch_df))]

    if pad_by_batch==False:

        # convert to tf dataset via tf generator
        processed_dataset = tf.data.Dataset.from_generator(lambda: dflist, output_types=(tf.int32, tf.int32))

        # call the generator where 'batch' size is just the length of the whole dataset. 
        padded_dataset = processed_dataset.padded_batch(len(epoch_df), padded_shapes=((None,),()))

        padded_tensor = next(iter(padded_dataset))
        
        labels = padded_tensor[1].numpy().reshape(-1,1)
        padded_tokens = padded_tensor[0].numpy()

        return padded_tokens, labels

    elif pad_by_batch==True:

        # convert to tf dataset via tf generator
        processed_dataset = tf.data.Dataset.from_generator(lambda: dflist, output_types=(tf.int32, tf.int32))

        # call the generator where batch size is pre-determined
        padded_dataset = processed_dataset.padded_batch(5, padded_shapes=((),(None,)))
        
        iterator = padded_dataset.make_one_shot_iterator()
        
        padded_tensor = iterator.get_next()

        labels = padded_tensor[1].numpy().reshape(-1,1)
        padded_tokens = padded_tensor[0].numpy()

        return padded_tokens, labels

In [83]:
padded_tokens, labels = pad(df, pad_by_batch=False)


In [70]:
segment_ids = model_utils.segment_id(padded_tokens)
segment_ids = tf.convert_to_tensor(segment_ids, name = "segment_ids")

In [71]:
input_mask = model_utils.input_mask(padded_tokens)
input_mask = tf.convert_to_tensor(input_mask, name = "input_mask")


In [84]:
input_ids = padded_tokens
#input_ids = tf.convert_to_tensor(input_ids, name = "input_ids")


In [73]:
bert_inputs = [input_ids, input_mask, segment_ids]

In [74]:
bert_inputs

[<tf.Tensor: id=368047, shape=(10, 15), dtype=int32, numpy=
 array([[     5,      7,      1,     90,     76,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0],
        [     1,     90,    100,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0],
        [   910,    442,     85,    100,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0],
        [   123,    886,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0],
        [  3335,  78787,     56,   3232,     57,   7775,   4333,     56,
              0,      0,      0,      0,      0,      0,      0],
        [    65,     43,   6787,   6564,  23234,      5,      5,      0,
              0,      0,      0,      0,      0,      0,      0],
        [  2345,   7654,    457,   5454,   3356,    776,      4,      0,
              0,      0,      0, 

In [90]:
from tensorflow.keras import backend as K

In [93]:
K.cast(input_ids, dtype="int32")

<tf.Tensor: id=460082, shape=(10, 15), dtype=int32, numpy=
array([[     5,      7,      1,     90,     76,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0],
       [     1,     90,    100,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0],
       [   910,    442,     85,    100,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0],
       [   123,    886,      0,      0,      0,      0,      0,      0,
             0,      0,      0,      0,      0,      0,      0],
       [  3335,  78787,     56,   3232,     57,   7775,   4333,     56,
             0,      0,      0,      0,      0,      0,      0],
       [    65,     43,   6787,   6564,  23234,      5,      5,      0,
             0,      0,      0,      0,      0,      0,      0],
       [  2345,   7654,    457,   5454,   3356,    776,      4,      0,
             0,      0,      0,      0,      0,