In [1]:
import pyspark
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *

import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_hub as thub
import bert

import pandas as pd
import numpy as np

import re

import random

import os
from tqdm import tqdm

import model_utils

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
def tokenize_sample(context):
    """
    Turn one text string into a sequence of BERT token ids.

    Meant to be wrapped in a Spark UDF and applied to a dataframe column.
    The text is split by the module-level `tokenizer`, the resulting token
    list is bracketed with the "[CLS]" and "[SEP]" marker tokens, and the
    bracketed list is converted to ids.

    Returns: the value produced by `tokenizer.convert_tokens_to_ids` for the
    bracketed token list (one id per token)
    """
    wordpieces = tokenizer.tokenize(context)
    with_markers = ["[CLS]"] + wordpieces + ["[SEP]"]
    return tokenizer.convert_tokens_to_ids(with_markers)

In [3]:
def generate_epoch_df(sarcastic, non_sarcastic, ratio, n_epochs, seed=None):
    """
    Lazily yield one tokenized Spark dataframe per epoch.

    For every epoch the non-sarcastic dataframe is down-sampled with
    `DataFrame.sample(fraction=ratio)` and unioned with the full sarcastic
    dataframe (the intent being a balanced label distribution). The
    `context` column is tokenized into a new `tokens` column through a UDF
    wrapping `tokenize_sample`, and `context` is then dropped. Each yielded
    dataframe is meant to be iterated over multiple times during training.

    Args:
        sarcastic: Spark dataframe of sarcastic samples; must have a
            `context` column.
        non_sarcastic: Spark dataframe of non-sarcastic samples with the
            same columns.
        ratio: sampling fraction applied to `non_sarcastic` each epoch.
        n_epochs: number of dataframes to yield.
        seed: optional integer base seed. When given, epoch `k` samples with
            seed `seed + k`, so runs are reproducible while each epoch still
            draws a different sample. When None (default) Spark picks the
            seed, as before.

    Yields: one Spark dataframe per epoch, with `tokens` in place of `context`.
    """

    # The UDF does not depend on the epoch, so build it once instead of on
    # every loop iteration.
    tokenize_sample_udf = F.udf(tokenize_sample, ArrayType(IntegerType()))

    for epoch in range(n_epochs):
        sample_kwargs = {"fraction": ratio}
        if seed is not None:
            sample_kwargs["seed"] = seed + epoch

        # down-sample the non-sarcastic rows
        non_sarc_samp = non_sarcastic.sample(**sample_kwargs)

        # combine sampled non_sarcastic and whole sarcastic
        epoch_df = sarcastic.union(non_sarc_samp)

        # tokenize the context column, then drop the raw text
        epoch_df = epoch_df.withColumn("tokens", tokenize_sample_udf(epoch_df.context))
        epoch_df = epoch_df.drop("context")

        yield epoch_df

In [4]:
# Initialize BERT model and tokenizer via the project helper. `tokenizer` is the
# module-level name that tokenize_sample() reads.
# NOTE(review): the recorded run of this cell ended with
# "TypeError: load() got an unexpected keyword argument 'trainable'", so this
# execution did not bind `bert_layer` / `tokenizer` — confirm the helper works
# with the installed tensorflow_hub before relying on a fresh-kernel run.

%time bert_layer, tokenizer = model_utils.init_bert()

TypeError: load() got an unexpected keyword argument 'trainable'

In [7]:
# Initialize Spark context through the project helper. The two returned handles
# are bound as `sc` and `spark` (presumably the SparkContext and SparkSession —
# confirm in model_utils); `spark` is passed to load_data() below.

%time sc, spark = model_utils.init_spark()

CPU times: user 36.1 ms, sys: 24.1 ms, total: 60.2 ms
Wall time: 16.1 s


In [8]:
# Read in sarcastic samples, non-sarcastic samples, and the ratio between the two.
# `ratio` is later handed to DataFrame.sample() inside generate_epoch_df() to
# down-sample the non-sarcastic set.

%time sarcastic, non_sarcastic, ratio = model_utils.load_data(spark, bucket_name="sarc-bucket-5", dataset="politics")

CPU times: user 17.3 ms, sys: 1.95 ms, total: 19.3 ms
Wall time: 33.8 s


In [9]:
# Initialize epoch_df generator (5 epochs). Calling a generator function only
# creates the generator object; none of its body runs until next() is called.

%time generator = generate_epoch_df(sarcastic, non_sarcastic, ratio, 5)

CPU times: user 14 µs, sys: 3 µs, total: 17 µs
Wall time: 19.6 µs


In [10]:
# Pull the first epoch's dataframe from the generator. This runs the generator
# body up to its first `yield` (sample, union, tokenize column, drop context).
%time epoch_df = next(generator)

CPU times: user 274 ms, sys: 7.13 ms, total: 281 ms
Wall time: 392 ms


In [97]:
class BertLayer(tf.keras.layers.Layer):
    """
    Keras layer wrapping a TF-Hub BERT encoder with partial fine-tuning.

    Only the last `n_fine_tune_layers` encoder blocks (plus the pooler dense
    variables when `pooling == "first"`) are registered as trainable; every
    other variable of the hub module is registered as non-trainable.

    NOTE(review): the variable bookkeeping lives in `unfreeze_layers`, which
    ends by calling `super().build(...)`. Nothing in this class calls it, so
    it has to be invoked explicitly — confirm whether it was meant to be
    named `build`.
    """

    def __init__(self,
        n_fine_tune_layers=2,
        pooling="first",
        bert_path="https://tfhub.dev/tensorflow/bert_en_wwm_cased_L-24_H-1024_A-16/1",
        n_layers=24,
        **kwargs):
        """
        Args:
            n_fine_tune_layers: number of encoder blocks to fine-tune, counted
                backward from the last block.
            pooling: "first" or "mean".
            bert_path: TF-Hub handle of the BERT SavedModel to load.
            n_layers: total number of encoder blocks in the model at
                `bert_path` (24 for the default L-24 model).
            **kwargs: forwarded to `tf.keras.layers.Layer.__init__`.

        Raises:
            NameError: if `pooling` is neither "first" nor "mean".
        """

        if pooling not in ["first", "mean"]:
            raise NameError(
                f"Undefined pooling type (must be either first or mean, but is {pooling})"
            )

        # Initialise the Keras base class before assigning any attribute.
        # `trainable` is left to Keras: it defaults to True and can still be
        # overridden through kwargs, exactly as before.
        super().__init__(**kwargs)

        self.n_fine_tune_layers = n_fine_tune_layers
        self.n_layers = n_layers
        self.output_size = 1024
        self.pooling = pooling
        self.bert_path = bert_path

    def unfreeze_layers(self, input_shape):
        """
        Load the hub module and split its variables into trainable and
        non-trainable sets, then mark the layer as built.

        Args:
            input_shape: forwarded unchanged to `Layer.build`.

        Raises:
            NameError: if `self.pooling` is neither "first" nor "mean".
        """

        self.bert = thub.KerasLayer(self.bert_path, trainable=self.trainable)

        # Every variable of the pre-trained module
        all_vars = self.bert.variables

        if self.pooling == "first":

            # drop any '/cls/' variables; the pooler dense stays trainable
            # NOTE(review): check "pooler/dense" against the actual names in
            # self.bert.variables for the chosen hub model.
            candidate_vars = [var for var in all_vars if "/cls/" not in var.name]
            trainable_layers = ["pooler/dense"]

        elif self.pooling == "mean":

            # drop any '/cls/' and '/pooler/' variables
            candidate_vars = [var for var in all_vars
                              if "/cls/" not in var.name and "/pooler/" not in var.name]
            trainable_layers = []

        else:

            raise NameError(
                f"Undefined pooling type (must be either first or mean, but is {self.pooling})")

        # Select how many encoder blocks to fine tune, starting from the final
        # block and moving backward. Blocks are numbered from 0, so the last
        # one is n_layers - 1. The trailing "/" keeps e.g. "layer_2" from also
        # matching "layer_20" ... "layer_23".
        for i in range(min(self.n_fine_tune_layers, self.n_layers)):

            trainable_layers.append(f"encoder/layer_{self.n_layers - 1 - i}/")

        # Keep only the variables that belong to one of the selected layers
        # (there are multiple vars per layer)
        trainable_vars = [var for var in candidate_vars
                          if any(l in var.name for l in trainable_layers)]

        # Register the variables to be trained
        for var in trainable_vars:

            self._trainable_weights.append(var)

        # Everything else is frozen. Membership is decided by object identity:
        # `var not in list` would fall back to tf.Variable.__eq__, which is an
        # element-wise comparison in TF2 rather than a plain boolean.
        trainable_ids = {id(var) for var in self._trainable_weights}
        for var in all_vars:

            if id(var) not in trainable_ids:
                self._non_trainable_weights.append(var)

        super().build(input_shape)
        
        
        
        

In [289]:
def pad(epoch_df, pad_by_batch=False, batch_size=5):
    """
    Build a zero-padded, batched tf.data.Dataset from a tokens/label frame.

    Args:
        epoch_df: frame with a `tokens` column (variable-length sequences of
            token ids) and a `label` column (one integer per row).
        pad_by_batch: when falsy (default) the whole frame becomes a single
            batch, so every row is padded to the longest sequence in the
            frame. When truthy, rows are grouped into batches of
            `batch_size` and each batch is padded only to its own longest
            sequence.
        batch_size: rows per batch when `pad_by_batch` is truthy
            (default 5, the previously hard-coded value).

    Returns: a dataset yielding `(tokens, labels)` pairs, where `tokens` is
    an int32 tensor of shape (batch, longest_sequence_in_batch) and `labels`
    an int32 tensor of shape (batch,).
    """

    # one (tokens, label) tuple per row
    examples = list(zip(epoch_df["tokens"], epoch_df["label"]))

    # convert to tf dataset via tf generator
    processed_dataset = tf.data.Dataset.from_generator(
        lambda: examples, output_types=(tf.int32, tf.int32))

    if pad_by_batch:
        rows_per_batch = batch_size
    else:
        # single batch spanning the whole frame; a batch size of 0 is not
        # valid, so an empty frame falls back to 1 (and yields no batches)
        rows_per_batch = max(len(examples), 1)

    # tokens are padded along their only axis, labels are scalars
    return processed_dataset.padded_batch(rows_per_batch, padded_shapes=((None,), ()))

In [290]:
# Pad the toy frame as one single batch (every row padded to the longest row).
# NOTE(review): `df` is only defined further down the notebook (the In [276]
# cell), so this cell fails on Restart & Run All until that cell is moved above.
batched_dataset = pad(df, pad_by_batch=False)


In [291]:
# calling single batch pad
next(iter(batched_dataset))

(<tf.Tensor: id=2619, shape=(10, 15), dtype=int32, numpy=
 array([[     5,      7,      1,     90,     76,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0],
        [     1,     90,    100,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0],
        [   910,    442,     85,    100,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0],
        [   123,    886,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0],
        [  3335,  78787,     56,   3232,     57,   7775,   4333,     56,
              0,      0,      0,      0,      0,      0,      0],
        [    65,     43,   6787,   6564,  23234,      5,      5,      0,
              0,      0,      0,      0,      0,      0,      0],
        [  2345,   7654,    457,   5454,   3356,    776,      4,      0,
              0,      0,      0,   

In [292]:
# Pad batch by batch, then iterate. Plain Python iteration replaces the
# deprecated Dataset.make_one_shot_iterator(); every next(iterator) /
# iterator.get_next() consumes one batch, and the iterator raises once the
# dataset is exhausted — re-run this cell to start over.
batched_dataset = pad(df, pad_by_batch=True)
iterator = iter(batched_dataset)

In [295]:
iterator.get_next()

OutOfRangeError: End of sequence [Op:IteratorGetNextSync]

In [276]:
# Toy frame for the padding experiments: ten rows, an integer `label` and a
# `tokens` list whose length varies from 1 to 15.
df = pd.DataFrame({
    "label": [1, 0, 0, 1, 0, 0, 0, 0, 1, 0],
    "tokens": [
        [5, 7, 1, 90, 76],
        [1, 90, 100],
        [910, 442, 85, 100],
        [123, 886],
        [3335, 78787, 56, 3232, 57, 7775, 4333, 56],
        [65, 43, 6787, 6564, 23234, 5, 5],
        [2345, 7654, 457, 5454, 3356, 776, 4],
        [8],
        [3456, 6664, 3335, 78, 7, 6, 43, 5, 777, 4443, 456, 66543, 3467, 787765, 44],
        [66, 444, 6778, 33, 5, 77, 6554, 3],
    ],
})

In [260]:

# Pair every row position with that row's token list: (position, tokens)
dflist = list(enumerate(df['tokens']))

In [261]:
dflist

[(0, [5, 7, 1, 90, 76]),
 (1, [1, 90, 100]),
 (2, [910, 442, 85, 100]),
 (3, [123, 886]),
 (4, [3335, 78787, 56, 3232, 57, 7775, 4333, 56]),
 (5, [65, 43, 6787, 6564, 23234, 5, 5]),
 (6, [2345, 7654, 457, 5454, 3356, 776, 4]),
 (7, [8]),
 (8,
  [3456,
   6664,
   3335,
   78,
   7,
   6,
   43,
   5,
   777,
   4443,
   456,
   66543,
   3467,
   787765,
   44]),
 (9, [66, 444, 6778, 33, 5, 77, 6554, 3])]

In [262]:
# Wrap the (position, tokens) tuples in a tf.data.Dataset. The lambda makes the
# list look like a generator factory; both tuple members are declared int32.
processed_dataset = tf.data.Dataset.from_generator(lambda: dflist, output_types=(tf.int32, tf.int32))


In [263]:
# Batches of 5: the first tuple member is a scalar (shape ()), the second is a
# 1-D sequence padded to the longest sequence within its own batch (shape (None,)).
batched_dataset = processed_dataset.padded_batch(5, padded_shapes=((),(None,)))

In [269]:
next(iter(batched_dataset))

(<tf.Tensor: id=2417, shape=(5,), dtype=int32, numpy=array([0, 1, 2, 3, 4], dtype=int32)>,
 <tf.Tensor: id=2418, shape=(5, 8), dtype=int32, numpy=
 array([[    5,     7,     1,    90,    76,     0,     0,     0],
        [    1,    90,   100,     0,     0,     0,     0,     0],
        [  910,   442,    85,   100,     0,     0,     0,     0],
        [  123,   886,     0,     0,     0,     0,     0,     0],
        [ 3335, 78787,    56,  3232,    57,  7775,  4333,    56]],
       dtype=int32)>)

In [270]:
# Plain Python iteration replaces the deprecated Dataset.make_one_shot_iterator();
# each next(iterator) / iterator.get_next() consumes one batch.
iterator = iter(batched_dataset)

In [273]:
next_element = iterator.get_next()


In [274]:
next_element

(<tf.Tensor: id=2430, shape=(5,), dtype=int32, numpy=array([5, 6, 7, 8, 9], dtype=int32)>,
 <tf.Tensor: id=2431, shape=(5, 15), dtype=int32, numpy=
 array([[    65,     43,   6787,   6564,  23234,      5,      5,      0,
              0,      0,      0,      0,      0,      0,      0],
        [  2345,   7654,    457,   5454,   3356,    776,      4,      0,
              0,      0,      0,      0,      0,      0,      0],
        [     8,      0,      0,      0,      0,      0,      0,      0,
              0,      0,      0,      0,      0,      0,      0],
        [  3456,   6664,   3335,     78,      7,      6,     43,      5,
            777,   4443,    456,  66543,   3467, 787765,     44],
        [    66,    444,   6778,     33,      5,     77,   6554,      3,
              0,      0,      0,      0,      0,      0,      0]],
       dtype=int32)>)

In [9]:
# Load the BERT SavedModel straight from TF-Hub for inspection. No `trainable`
# argument is passed, so the KerasLayer default applies (trainable=False per the
# signature printed by help() below). Same handle as BertLayer's default bert_path.
bert_layer = thub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_wwm_cased_L-24_H-1024_A-16/1")

In [10]:
help(bert_layer)

Help on KerasLayer in module tensorflow_hub.keras_layer object:

class KerasLayer(tensorflow.python.keras.engine.base_layer.Layer)
 |  KerasLayer(handle, trainable=False, arguments=None, _sentinel=None, tags=None, signature=None, signature_outputs_as_dict=None, output_key=None, output_shape=None, **kwargs)
 |  
 |  Wraps a SavedModel (or a legacy Hub.Module) as a Keras Layer.
 |  
 |  This layer wraps a callable object for use as a Keras layer. The callable
 |  object can be passed directly, or be specified by a Python string with a
 |  handle that gets passed to `hub.load()`.
 |  
 |  This is the preferred API to load a TF2-style SavedModel from TF Hub
 |  into a Keras model. Calling this function requires TF 1.15 or newer.
 |  It can be called both in eager and graph mode.
 |  
 |  The callable object is expected to follow the conventions detailed below.
 |  (These are met by TF2-compatible modules loaded from TensorFlow Hub.)
 |  
 |  The callable is invoked with a single positional