In [11]:
import pyspark
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *

import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_hub as thub
import bert

import pandas as pd
import numpy as np

import re

import random

import os
from tqdm import tqdm

import model_utils

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [3]:
def tokenize_sample(context):
    """
    Convert one text string into BERT token ids.

    Intended to be wrapped in a Spark UDF and applied row by row.
    The text is split by the module-level `tokenizer`, wrapped in the
    "[CLS]" / "[SEP]" marker tokens, and mapped to vocabulary ids.

    Args:
        context: the raw text of a single row.

    Returns: the token ids for that row, as produced by
    `tokenizer.convert_tokens_to_ids`.
    """
    # `tokenizer` is looked up as a global; it must be bound before this runs
    word_pieces = tokenizer.tokenize(context)
    framed_pieces = ["[CLS]"] + word_pieces + ["[SEP]"]
    return tokenizer.convert_tokens_to_ids(framed_pieces)

In [4]:
def generate_epoch_df(sarcastic, non_sarcastic, ratio,
                      n_epochs, bucket_by_lengths=True):
    """
    Lazily produce one tokenized Spark dataframe per epoch.

    For every epoch the non-sarcastic frame is re-sampled with
    `non_sarcastic.sample(ratio)` (down-sampling meant to even out the
    label distribution), unioned with the full sarcastic frame, and the
    `context` column is replaced by a `tokens` column of BERT token ids.
    Each yielded dataframe is meant to be iterated over multiple times
    during training.

    Args:
        sarcastic: Spark dataframe of sarcastic rows (kept whole).
        non_sarcastic: Spark dataframe of non-sarcastic rows (sampled).
        ratio: value passed straight to `non_sarcastic.sample`.
        n_epochs: how many dataframes to yield.
        bucket_by_lengths: accepted but not used by this function.

    Yields: a Spark dataframe with `context` dropped and `tokens` added.
    """
    epochs_left = n_epochs
    while epochs_left > 0:
        # fresh random subset of the majority class for this epoch
        sampled_majority = non_sarcastic.sample(ratio)
        combined = sarcastic.union(sampled_majority)

        # token ids are computed row-wise through a Spark udf
        to_token_ids = F.udf(tokenize_sample, ArrayType(IntegerType()))
        with_tokens = combined.withColumn("tokens", to_token_ids(combined.context))

        # the raw text is no longer needed once it has been tokenized
        yield with_tokens.drop("context")
        epochs_left -= 1

In [2]:
def pad_by_batch_lengths(epoch_df, batch_size=16):
    """
    Sort an epoch by token count and return per-batch padded batches.

    Rows are ordered from longest to shortest token sequence before
    batching, so sequences of similar length end up in the same batch.
    The padded dimension is left as `None`, so `padded_batch` pads per
    batch rather than to one global length.

    Args:
        epoch_df: Spark dataframe with a `tokens` array column and a
            `labels` column.
        batch_size: number of rows per batch (default 16).

    Returns: a `tf.data.Dataset` of `(tokens, label)` batches, both int64.
    """
    # add sequence lengths; `size` lives in pyspark.sql.functions (imported as F)
    epoch_df = epoch_df.withColumn("sequence_length", F.size(epoch_df.tokens))

    # order by sequence length, longest first
    epoch_df = epoch_df.orderBy("sequence_length", ascending=False)

    # the helper column is only needed for sorting
    epoch_df = epoch_df.drop("sequence_length")

    # collect to the driver as a pandas dataframe
    epoch_pdf = epoch_df.toPandas()

    # pair every token list with its label, preserving the sorted order
    # (the original iterated over `len(...)`, an int, which raises TypeError)
    sorted_tokens = list(zip(epoch_pdf['tokens'], epoch_pdf['labels']))

    processed_dataset = tf.data.Dataset.from_generator(lambda: sorted_tokens,
                           output_types=(tf.int64, tf.int64))

    # tokens: variable-length vector padded per batch; labels: scalars
    batch = processed_dataset.padded_batch(batch_size, padded_shapes=((None, ), ()))

    return batch
        
            

In [10]:
def pad_by_overall_length(epoch_df):
    """
    Pad every token sequence of an epoch to the longest sequence in it.

    Args:
        epoch_df: object exposing `toPandas()` (a Spark dataframe) whose
            pandas form has a `tokens` column of integer sequences and a
            `labels` column.

    Returns:
        X: int64 array of shape (n_rows, max_seq_len); each row is the
           original token ids followed by zeros.
        y: array of the labels, in the same row order.

    No sequence is truncated: `max_seq_len` is derived from the data, so
    every row already fits. (The previous `max_seq_len - 2` cut removed
    the trailing tokens of the longest rows.)
    """
    # collect to the driver as a pandas dataframe
    epoch_pdf = epoch_df.toPandas()

    # list() normalises rows that arrive as ndarrays instead of lists,
    # where `+` would mean element-wise addition rather than concatenation
    token_rows = [list(row_tokens) for row_tokens in epoch_pdf["tokens"]]

    # convert response to nparray
    y = np.array(epoch_pdf["labels"])

    # longest sequence in this epoch; 0 for an empty dataframe
    max_seq_len = max((len(row_tokens) for row_tokens in token_rows), default=0)

    # start from an all-zero (padding) matrix and copy each row in
    X = np.zeros((len(token_rows), max_seq_len), dtype=np.int64)
    for row_idx, row_tokens in enumerate(token_rows):
        X[row_idx, :len(row_tokens)] = row_tokens

    return X, y
        
            

In [None]:
def build_model(max_seq_len, bert_ckpt_file):
    """
    Build a Keras classifier on top of a BERT layer and load BERT weights.

    The model takes a fixed-length batch of token ids, runs it through the
    BERT layer, keeps the output at sequence position 0, and feeds it
    through dropout -> dense(768, tanh) -> dropout -> dense(softmax).

    Args:
        max_seq_len: length of the token-id input sequences.
        bert_ckpt_file: checkpoint passed to `load_stock_weights`.

    Returns: the built Keras model, with weights loaded into the BERT layer.

    NOTE(review): `bert_config_file`, `classes`, `StockBertConfig`,
    `map_stock_config_to_params`, `BertModelLayer` and `load_stock_weights`
    are read as module-level names but are not defined or imported in the
    visible part of this notebook -- they must be bound before this is
    called. The helper names look like the bert-for-tf2 API; confirm.
    """
    # only the file read needs the open handle
    with tf.io.gfile.GFile(bert_config_file, "r") as reader:
        bc = StockBertConfig.from_json_string(reader.read())

    bert_params = map_stock_config_to_params(bc)
    bert_params.adapter_size = None
    # named bert_encoder so the imported `bert` module is not shadowed
    bert_encoder = BertModelLayer.from_params(bert_params, name="bert")

    input_tokens = tf.keras.layers.Input(shape=(max_seq_len, ),
                                         dtype='int64',
                                         name="input_tokens")
    bert_output = bert_encoder(input_tokens)

    print("bert shape", bert_output.shape)

    # keep only the output at sequence position 0
    cls_out = tf.keras.layers.Lambda(lambda seq: seq[:, 0, :])(bert_output)
    cls_out = tf.keras.layers.Dropout(0.5)(cls_out)
    logits = tf.keras.layers.Dense(units=768, activation="tanh")(cls_out)
    logits = tf.keras.layers.Dropout(0.5)(logits)
    logits = tf.keras.layers.Dense(
        units=len(classes),
        activation="softmax"
    )(logits)

    model = tf.keras.Model(inputs=input_tokens, outputs=logits)
    model.build(input_shape=(None, max_seq_len))

    # weights are loaded after build()
    load_stock_weights(bert_encoder, bert_ckpt_file)

    return model

In [12]:
# Initialize BERT model and tokenizer.
# tokenize_sample() looks up `tokenizer` as a global, so it must be bound
# before that function is called.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so the
# assignment did not complete in the recorded run.

%time bert_layer, tokenizer = model_utils.init_bert()

KeyboardInterrupt: 

In [6]:
# Initialize Spark context.
# `spark` is the handle passed to model_utils.load_data() when reading the data.

%time sc, spark = model_utils.init_spark()

In [7]:
# Read in sarcastic samples, non-sarcastic samples, and the ratio between the two.
# All three are handed to generate_epoch_df(), which uses `ratio` as the
# argument to non_sarcastic.sample().

%time sarcastic, non_sarcastic, ratio = model_utils.load_data(spark, 
                                                        bucket_name="sarc-bucket-5", 
                                                        dataset="politics")

In [8]:
# Create the epoch generator (the positional 5 is n_epochs).
# generate_epoch_df is a generator function, so nothing is sampled or
# tokenized here; work starts on the first next() call.

generator = generate_epoch_df(sarcastic, non_sarcastic, ratio, 5)

In [9]:
# Pull the first epoch from the generator.
# NOTE(review): generate_epoch_df yields one Spark dataframe per epoch, not an
# (X, y) pair, so this unpacking target looks stale -- presumably the dataframe
# is meant to go through pad_by_overall_length() first. Confirm.
%time X, y = next(generator)

CPU times: user 273 ms, sys: 3.36 ms, total: 277 ms
Wall time: 388 ms


In [14]:
# Scratch: a second init_bert() call, keeping the whole return value in `a`
# instead of unpacking it into bert_layer / tokenizer.
a = model_utils.init_bert()

In [17]:
# Scratch: inspect the first element of the init_bert() result
# (a tensorflow_hub KerasLayer, per the help output that follows).
help(a[0])

Help on KerasLayer in module tensorflow_hub.keras_layer object:

class KerasLayer(tensorflow.python.keras.engine.base_layer.Layer)
 |  KerasLayer(handle, trainable=False, arguments=None, _sentinel=None, tags=None, signature=None, signature_outputs_as_dict=None, output_key=None, output_shape=None, **kwargs)
 |  
 |  Wraps a SavedModel (or a legacy Hub.Module) as a Keras Layer.
 |  
 |  This layer wraps a callable object for use as a Keras layer. The callable
 |  object can be passed directly, or be specified by a Python string with a
 |  handle that gets passed to `hub.load()`.
 |  
 |  This is the preferred API to load a TF2-style SavedModel from TF Hub
 |  into a Keras model. Calling this function requires TF 1.15 or newer.
 |  It can be called both in eager and graph mode.
 |  
 |  The callable object is expected to follow the conventions detailed below.
 |  (These are met by TF2-compatible modules loaded from TensorFlow Hub.)
 |  
 |  The callable is invoked with a single positional