In [1]:
import pyspark
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *

import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_hub as thub
import bert

import pandas as pd
import numpy as np

import re

import random

import os
from tqdm import tqdm

import model_utils

  from ._conv import register_converters as _register_converters


In [3]:
def tokenize_sample(context):
    """
    Convert one text string into BERT token ids.

    Intended to be wrapped in a Spark UDF and applied row by row.
    The text is split with the module-level `tokenizer`, framed with the
    "[CLS]" / "[SEP]" marker tokens, and mapped to vocabulary ids.

    Returns: list of token ids for the given string
    """
    wordpieces = tokenizer.tokenize(context)
    framed = ["[CLS]"] + wordpieces + ["[SEP]"]
    return tokenizer.convert_tokens_to_ids(framed)

In [4]:
def generate_epoch_df(sarcastic, non_sarcastic, ratio, n_epochs):
    """
    Yield one training epoch at a time as a pair of NumPy arrays.

    Each epoch is the whole `sarcastic` DataFrame unioned with a fresh random
    sample of `non_sarcastic`, with the `context` column tokenized through a
    Spark UDF around `tokenize_sample`. Each yielded pair is meant to be
    iterated over multiple times (mini-batch gradient descent) before the
    next epoch is requested.

    Args:
        sarcastic: Spark DataFrame with `context` and `label` columns.
        non_sarcastic: Spark DataFrame that can be unioned with `sarcastic`.
        ratio: fraction passed to `non_sarcastic.sample`; presumably chosen so
            both labels end up equally represented (TODO confirm against
            model_utils.load_data).
        n_epochs: number of epochs to generate.

    Yields:
        (X, y): X is an array of shape (n_rows, 1) holding each row's token-id
        sequence, y is an array of shape (n_rows, 1) holding the labels.
    """
    # The UDF wrapper does not depend on the epoch, so build it once.
    tokenize_sample_udf = F.udf(tokenize_sample, ArrayType(IntegerType()))

    for _ in range(n_epochs):
        # Re-sample the majority class for every epoch.
        non_sarc_samp = non_sarcastic.sample(ratio)

        # combine sampled non_sarcastic and whole sarcastic
        epoch_df = sarcastic.union(non_sarc_samp)
        epoch_df = epoch_df.withColumn("tokens", tokenize_sample_udf(epoch_df.context))

        # Collect tokens and labels in a single Spark job. Two separate
        # toPandas() calls would execute the sample/union plan twice and give
        # no hard guarantee that the rows of X and y line up.
        epoch_pdf = epoch_df.select("tokens", "label").toPandas()
        X = np.array(epoch_pdf[["tokens"]])
        y = np.array(epoch_pdf[["label"]])

        yield X, y

In [5]:
# Initialize BERT model and tokenizer
# `tokenizer` is read as a module-level global by tokenize_sample, so this cell
# must run before any epoch is tokenized.
# NOTE(review): `bert_layer` is not used in the visible cells — presumably
# consumed by a later modelling step; confirm.

%time bert_layer, tokenizer = model_utils.init_bert()

In [6]:
# Initialize Spark context
# `spark` is handed to model_utils.load_data when the samples are read.
# NOTE(review): `sc` is presumably the SparkContext and `spark` the
# SparkSession — confirm against model_utils.init_spark.

%time sc, spark = model_utils.init_spark()

In [7]:
# Read in sarcastic samples, non-sarcastic samples, and the ratio between the two.
# All three values are passed straight to generate_epoch_df, where `ratio` is
# used as the sampling fraction for `non_sarcastic`.

%time sarcastic, non_sarcastic, ratio = model_utils.load_data(spark, 
                                                        bucket_name="sarc-bucket-5", 
                                                        dataset="politics")

In [8]:
# Build the epoch generator (5 epochs).
# generate_epoch_df is a generator function, so nothing is sampled or
# tokenized here; the work happens on each next() call.

generator = generate_epoch_df(sarcastic, non_sarcastic, ratio, 5)

In [9]:
# Pull the first epoch's (X, y) arrays out of the generator.
%time X, y = next(generator)

CPU times: user 273 ms, sys: 3.36 ms, total: 277 ms
Wall time: 388 ms


In [None]:
# Wrap the generator function as a tf.data pipeline.
# output_shapes is declared (variable-length 1-D id sequence, scalar label) so
# the padded bucketing step applied to `epoch` knows each component's rank; this
# also matches the other from_generator calls in this notebook.
# NOTE(review): `elements_gen` must already be defined when this cell runs.
epoch = tf.data.Dataset.from_generator(generator=elements_gen,
                                       output_types=(tf.int64, tf.int64),
                                       output_shapes=([None], []))

def element_length_fn(x, y):
    """Return the size of the first dimension of `x`; `y` is accepted but unused."""
    sequence_shape = tf.shape(x)
    return sequence_shape[0]

# Bucket elements by sequence length and batch within each bucket.
# bucket_boundaries holds the upper length boundary of each bucket, and
# bucket_batch_sizes needs one entry per bucket (len(bucket_boundaries) + 1).
# With a single boundary of 2: lengths < 2 are batched 1 at a time,
# lengths >= 2 are batched 16 at a time.
batch = epoch.apply(
    tf.data.experimental.bucket_by_sequence_length(
        element_length_func=element_length_fn,
        bucket_batch_sizes=[1, 16],
        bucket_boundaries=[2],
    )
)



In [197]:
# Exploratory: print the from_generator signature and docstring.
help(tf.data.Dataset.from_generator)

Help on function from_generator in module tensorflow.python.data.ops.dataset_ops:

from_generator(generator, output_types, output_shapes=None, args=None)
    Creates a `Dataset` whose elements are generated by `generator`.
    
    The `generator` argument must be a callable object that returns
    an object that support the `iter()` protocol (e.g. a generator function).
    The elements generated by `generator` must be compatible with the given
    `output_types` and (optional) `output_shapes` arguments.
    
    For example:
    
    ```python
    import itertools
    tf.compat.v1.enable_eager_execution()
    
    def gen():
      for i in itertools.count(1):
        yield (i, [1] * i)
    
    ds = tf.data.Dataset.from_generator(
        gen, (tf.int64, tf.int64), (tf.TensorShape([]), tf.TensorShape([None])))
    
    for value in ds.take(2):
      print value
    # (1, array([1]))
    # (2, array([1, 1]))
    ```
    
    NOTE: The current implementation of `Dataset.from_generator(

In [177]:
def elements_gen():
    """Yield toy (token_ids, label) pairs: variable-length id lists with an int label."""
    text = [[1, 2, 3], [3, 4, 16, 12, 777], [1, 2], [8, 9, 0], [1, 2], [8, 9, 0],
            [1, 2, 3, 4], [90, 50]]
    # One label per sequence (toy values alternating 1/2). zip() stops at the
    # shorter input, so a short label list would silently drop sequences.
    label = [1, 2, 1, 2, 1, 2, 1, 2]
    assert len(text) == len(label), "every sequence needs exactly one label"
    for x, y in zip(text, label):
        yield (x, y)

def element_length_fn(x, y):
    """Return the size of the first dimension of `x`; `y` is accepted but unused."""
    sequence_shape = tf.shape(x)
    return sequence_shape[0]

# Each element is a variable-length 1-D id sequence plus a scalar label.
dataset = tf.data.Dataset.from_generator(generator=elements_gen,
                                         output_shapes=([None], []),
                                         output_types=(tf.int32, tf.int32))

# Bucket elements by sequence length and batch within each bucket.
# bucket_boundaries holds the upper length boundary of each bucket, and
# bucket_batch_sizes needs one entry per bucket (len(bucket_boundaries) + 1).
# With a single boundary of 3: lengths < 3 are batched 1 at a time,
# lengths >= 3 are batched 2 at a time. Shorter sequences in a batch are
# zero-padded to the longest one.
dataset = dataset.apply(
    tf.data.experimental.bucket_by_sequence_length(
        element_length_func=element_length_fn,
        bucket_batch_sizes=[1, 2],
        bucket_boundaries=[3]))

# Dataset.make_one_shot_iterator() is deprecated and missing on TF 2 datasets;
# the compat.v1 helper does the same job on both API versions.
batch = tf.compat.v1.data.make_one_shot_iterator(dataset).get_next()


In [178]:
# Display the first bucketed batch: (padded token ids, labels).
batch

(<tf.Tensor: id=7145, shape=(2, 5), dtype=int32, numpy=
 array([[  1,   2,   3,   0,   0],
        [  3,   4,  16,  12, 777]], dtype=int32)>,
 <tf.Tensor: id=7146, shape=(2,), dtype=int32, numpy=array([1, 2], dtype=int32)>)

In [198]:
def elements_gen():
    """Yield toy (token_ids, label) pairs: variable-length id lists with an int label."""
    text = [[1, 2, 3], [3, 4, 16, 12, 777], [1, 2], [8, 9, 0], [1, 2], [8, 9, 0],
            [1, 2, 3, 4], [90, 50]]
    # One label per sequence (toy values alternating 1/2). zip() stops at the
    # shorter input, so a short label list would silently drop sequences.
    label = [1, 2, 1, 2, 1, 2, 1, 2]
    assert len(text) == len(label), "every sequence needs exactly one label"
    for x, y in zip(text, label):
        yield (x, y)

def element_length_fn(x, y):
    """Return the size of the first dimension of `x`; `y` is accepted but unused."""
    sequence_shape = tf.shape(x)
    return sequence_shape[0]

# Un-batched dataset over the toy generator: each element is a variable-length
# 1-D int32 id sequence paired with a scalar int32 label.
dataset = tf.data.Dataset.from_generator(
    elements_gen,
    output_types=(tf.int32, tf.int32),
    output_shapes=([None], []),
)

In [199]:
# Display the dataset's declared element shapes and dtypes.
dataset

<DatasetV1Adapter shapes: ((None,), ()), types: (tf.int32, tf.int32)>