In [1]:
import pyspark
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *

import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_hub as thub
import bert

import pandas as pd
import numpy as np

import re

import random

import os
from tqdm import tqdm

import model_utils

  from ._conv import register_converters as _register_converters


In [2]:
def tokenize_sample(context):
    """
    Turn one text string into a sequence of BERT token ids.

    Meant to be wrapped in a Spark udf and applied to a dataframe column.
    The string is split with the notebook-level ``tokenizer``, the
    ``[CLS]`` and ``[SEP]`` markers are put at the start and end, and the
    resulting tokens are mapped to ids.

    Returns: the ids produced by ``tokenizer.convert_tokens_to_ids`` for the
    marker-wrapped token list
    """
    word_pieces = tokenizer.tokenize(context)

    # BERT expects every sequence to be framed by these two special tokens
    framed = ["[CLS]"] + word_pieces + ["[SEP]"]

    return tokenizer.convert_tokens_to_ids(framed)

In [3]:
def generate_epoch_df(sarcastic, non_sarcastic, ratio,
                      n_epochs, bucket_by_lengths=True, seed=None):
    """
    Lazily yield ``n_epochs`` tokenized Spark dataframes, one per epoch.

    For every epoch a new random subset of ``non_sarcastic`` is drawn and
    unioned with the whole of ``sarcastic``; the ``context`` column is then
    replaced by a ``tokens`` column produced by ``tokenize_sample``. Each
    yielded dataframe is meant to be iterated over multiple times during
    training.

    Args:
        sarcastic: Spark dataframe of sarcastic rows; must have a ``context`` column.
        non_sarcastic: Spark dataframe of non-sarcastic rows with the same columns.
        ratio: sampling fraction passed to ``non_sarcastic.sample``. It is
            intended to make the two label groups similar in size
            (assumption: it is the sarcastic / non-sarcastic size ratio).
        n_epochs: number of dataframes to yield.
        bucket_by_lengths: currently unused; kept so existing calls keep working.
        seed: optional int. When given, epoch ``i`` samples with ``seed + i``
            so a run can be reproduced while each epoch still gets a
            different subset. ``None`` (default) leaves sampling unseeded.

    Yields: Spark dataframe with ``context`` dropped and ``tokens`` added.
    """

    # The udf does not depend on the epoch, so build it once instead of on
    # every iteration.
    tokenize_sample_udf = F.udf(tokenize_sample, ArrayType(IntegerType()))

    for epoch in range(n_epochs):
        epoch_seed = None if seed is None else seed + epoch

        # draw a fresh non-sarcastic subset for this epoch
        non_sarc_samp = non_sarcastic.sample(fraction=ratio, seed=epoch_seed)

        # whole sarcastic set + sampled non-sarcastic set, tokenized, with
        # the raw text column removed
        epoch_df = (
            sarcastic.union(non_sarc_samp)
            .withColumn("tokens", tokenize_sample_udf(F.col("context")))
            .drop("context")
        )

        # hand back one epoch per next() call
        yield epoch_df

In [113]:
def sort_by_comment_length(epoch_df, batch_size=16):
    """
    Collect an epoch dataframe as (tokens, label) pairs, longest sequence first.

    The ordering is done in Spark on the size of the ``tokens`` array; the
    helper length column is dropped again before the data is brought to the
    driver with ``toPandas()``.

    Args:
        epoch_df: Spark dataframe with a ``tokens`` array column and a
            ``label`` column.
        batch_size: currently unused; kept so existing calls keep working.

    Returns: list of ``(tokens, label)`` tuples in descending token-count order.
    """

    ordered_pdf = (
        epoch_df
        # add sequence lengths
        .withColumn("sequence_length", F.size(F.col("tokens")))
        # longest sequences first
        .orderBy("sequence_length", ascending=False)
        # the helper column is only needed for the sort
        .drop("sequence_length")
        # bring the result to the driver as a pandas dataframe
        .toPandas()
    )

    # Pair the two columns in one pass over their underlying arrays rather
    # than doing two positional .iloc lookups per row.
    return list(zip(ordered_pdf["tokens"].values, ordered_pdf["label"].values))
            

    

        
            

In [5]:
def pad_by_overall_length(epoch_df):
    """
    Collect an epoch dataframe and zero-pad every token sequence to the
    length of the longest sequence in the whole epoch.

    Args:
        epoch_df: Spark dataframe with a ``tokens`` array column and a
            ``label`` column.

    Returns:
        X: int64 array of shape (n_rows, max_seq_len); each row holds the
           token ids followed by zeros.
        y: array of the labels, in the same row order as X.
    """

    # bring the epoch to the driver as a pandas dataframe
    epoch_pdf = epoch_df.toPandas()

    tokens, labels = [], []
    max_seq_len = 0

    # walk the two columns by name; total= lets tqdm show real progress
    rows = zip(epoch_pdf["tokens"], epoch_pdf["label"])
    for raw_tokens, label in tqdm(rows, total=len(epoch_pdf)):

        # track the longest sequence seen so far
        max_seq_len = max(max_seq_len, len(raw_tokens))

        tokens.append(raw_tokens)
        labels.append(label)

    # convert response to nparray
    y = np.array(labels)

    # Start from an all-zero matrix and copy each sequence into the front of
    # its row. No truncation is needed: max_seq_len is the observed maximum,
    # and the sequences already carry their [CLS]/[SEP] ids, so cutting to
    # max_seq_len - 2 would drop real tokens from the longest samples.
    # Slice assignment also works whether a sample is a list or an ndarray.
    X = np.zeros((len(tokens), max_seq_len), dtype=np.int64)
    for row_idx, sample in enumerate(tokens):
        X[row_idx, :len(sample)] = sample

    return X, y
        
            

In [None]:
def build_model(max_seq_len, bert_ckpt_file, bert_config_file=None, num_classes=2):
    """
    Build a Keras classifier on top of a BERT encoder and load the
    pre-trained BERT weights into it.

    The first position of the BERT output sequence is passed through
    dropout, a 768-unit tanh layer, dropout again, and a softmax layer.

    Args:
        max_seq_len: fixed length of the token-id input sequences.
        bert_ckpt_file: path of the BERT checkpoint to load weights from.
        bert_config_file: path of the BERT config JSON. When None, a file
            named ``bert_config.json`` next to ``bert_ckpt_file`` is used
            (assumption: the checkpoint directory follows the stock BERT
            release layout — confirm for your checkpoint).
        num_classes: number of output classes of the softmax layer.

    Returns: the assembled ``tf.keras.Model`` with BERT weights loaded.
    """
    # Imported here because the notebook only does a bare `import bert`
    # (assumed to be the bert-for-tf2 package, which provides these names).
    from bert import BertModelLayer
    from bert.loader import (StockBertConfig, load_stock_weights,
                             map_stock_config_to_params)

    if bert_config_file is None:
        bert_config_file = os.path.join(os.path.dirname(bert_ckpt_file),
                                        "bert_config.json")

    # read the stock config; the file only needs to stay open for the read
    with tf.io.gfile.GFile(bert_config_file, "r") as reader:
        bc = StockBertConfig.from_json_string(reader.read())

    bert_params = map_stock_config_to_params(bc)
    bert_params.adapter_size = None
    bert_encoder = BertModelLayer.from_params(bert_params, name="bert")

    input_ids = tf.keras.layers.Input(shape=(max_seq_len, ),
                                      dtype='int64',
                                      name="input_tokens")
    bert_output = bert_encoder(input_ids)

    print("bert shape", bert_output.shape)

    # keep only the first sequence position (the [CLS] slot)
    cls_out = tf.keras.layers.Lambda(lambda seq: seq[:, 0, :])(bert_output)
    cls_out = tf.keras.layers.Dropout(0.5)(cls_out)
    logits = tf.keras.layers.Dense(units=768, activation="tanh")(cls_out)
    logits = tf.keras.layers.Dropout(0.5)(logits)
    logits = tf.keras.layers.Dense(
        units=num_classes,
        activation="softmax"
    )(logits)

    model = tf.keras.Model(inputs=input_ids, outputs=logits)
    model.build(input_shape=(None, max_seq_len))

    # weights are loaded after build so the layer variables already exist
    load_stock_weights(bert_encoder, bert_ckpt_file)

    return model

In [6]:
# Initialize BERT model and tokenizer.
# `tokenizer` has to exist as a notebook-level name before any tokenization
# runs, because tokenize_sample() looks it up as a global.

%time bert_layer, tokenizer = model_utils.init_bert()

CPU times: user 23.6 s, sys: 3.92 s, total: 27.5 s
Wall time: 32.4 s


In [7]:
# Initialize Spark context; `spark` is what gets passed to
# model_utils.load_data() when the data is read.

%time sc, spark = model_utils.init_spark()

CPU times: user 37.3 ms, sys: 14.3 ms, total: 51.6 ms
Wall time: 25.5 s


In [9]:
# Read in sarcastic samples, non-sarcastic samples, and the ratio between the two.
# `ratio` is later used by generate_epoch_df() as the sampling fraction for
# the non-sarcastic dataframe.

%time sarcastic, non_sarcastic, ratio = model_utils.load_data(spark, bucket_name="sarc-bucket-5", dataset="politics")

CPU times: user 11.4 ms, sys: 7.91 ms, total: 19.3 ms
Wall time: 42.2 s


In [10]:
# Initialize epoch_df generator (the trailing 5 is n_epochs).
# generate_epoch_df is a generator function, so this call only creates the
# generator object; its body does not start running until next() is called.

%time generator = generate_epoch_df(sarcastic, non_sarcastic, ratio, 5)

In [11]:
# Pull the first epoch dataframe (sampled, unioned and tokenized) from the generator.
%time epoch_df = next(generator)

CPU times: user 277 ms, sys: 3.41 ms, total: 281 ms
Wall time: 384 ms


In [114]:
# Order the epoch by token count (longest first) and collect it as a Python
# list of (tokens, label) tuples.
# NOTE: sort_by_comment_length accepts batch_size but does not use it.
sorted_tokens = sort_by_comment_length(epoch_df, batch_size=16)

In [115]:
# Feed the pre-sorted (tokens, label) pairs into tf.data; the lambda just
# returns the list, which from_generator then iterates.
sorted_dataset = tf.data.Dataset.from_generator(lambda: sorted_tokens, output_types=(tf.int64, tf.int64))
# Batches of 16: the variable-length token axis (None) is padded within each
# batch, the scalar label () needs no padding.
padded_batch = sorted_dataset.padded_batch(16, padded_shapes=((None, ), ()))

In [118]:
# Peek at the first padded batch: a (token ids, labels) pair of tensors.
# The data was sorted longest-first, so this batch holds the longest sequences.
next(iter(padded_batch))

(<tf.Tensor: shape=(16, 1311), dtype=int64, numpy=
 array([[  101,   107,  3446, ...,  1122,   119,   102],
        [  101,   107,  1212, ...,     0,     0,     0],
        [  101,   107,  2431, ...,     0,     0,     0],
        ...,
        [  101,   107, 15859, ...,     0,     0,     0],
        [  101,   107,  1135, ...,     0,     0,     0],
        [  101,  1789,  1104, ...,     0,     0,     0]])>,
 <tf.Tensor: shape=(16,), dtype=int64, numpy=array([1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1])>)

In [22]:
# Preview the first rows of the tokenized epoch dataframe (label + tokens).
epoch_df.show()

+-----+--------------------+
|label|              tokens|
+-----+--------------------+
|    1|[101, 4208, 117, ...|
|    1|[101, 12357, 112,...|
|    1|[101, 5651, 5797,...|
|    1|[101, 107, 107, 1...|
|    1|[101, 10364, 119,...|
|    1|[101, 107, 107, 1...|
|    1|[101, 18725, 117,...|
|    1|[101, 2814, 117, ...|
|    1|[101, 3046, 1190,...|
|    1|[101, 146, 2810, ...|
|    1|[101, 11205, 119,...|
|    1|[101, 1753, 1155,...|
|    1|[101, 119, 119, 1...|
|    1|[101, 1124, 1108,...|
|    1|[101, 5704, 1103,...|
|    1|[101, 27020, 160,...|
|    1|[101, 107, 14180,...|
|    1|[101, 1103, 1234,...|
|    1|[101, 17135, 2707...|
|    1|[101, 1192, 1274,...|
+-----+--------------------+
only showing top 20 rows



In [58]:
# Scratch check of np.ceil's rounding; the result is not assigned to anything.
np.ceil(1.5)

2.0