In [1]:
import pyspark
from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *

import tensorflow as tf
import tensorflow_hub as thub
import bert

import pandas as pd
import numpy as np

import re

import random

import os
from tqdm import tqdm

import model_utils

  from ._conv import register_converters as _register_converters


In [3]:
def tokenize_sample(context):
    """
    Turn one text sample into a list of BERT token ids.

    Intended to be wrapped in a Spark udf and applied row by row to a
    dataframe column. The text is split with the notebook-level
    ``tokenizer`` and the resulting pieces are framed by the "[CLS]" and
    "[SEP]" marker tokens before the id lookup.

    Returns: list of token ids for the given string
    """
    # word pieces for the raw text, as produced by the global tokenizer
    word_pieces = tokenizer.tokenize(context)

    # BERT expects a leading [CLS] and a trailing [SEP] marker
    framed = ["[CLS]"] + word_pieces + ["[SEP]"]

    return tokenizer.convert_tokens_to_ids(framed)

In [4]:
def generate_epoch_df(sarcastic, non_sarcastic, ratio, n_epochs, seed=None):
    
    """
    Generator producing one label-balanced training set per epoch.

    For every epoch a new random subset of ``non_sarcastic`` is drawn with
    sampling fraction ``ratio``, unioned with the whole of ``sarcastic`` and
    extended with a "tokens" column computed by ``tokenize_sample``. Each
    generated pair is meant to be iterated over multiple times during
    training (mini-batch gradient descent).

    Args:
        sarcastic: Spark dataframe of sarcastic rows; must expose the
            "context" and "label" columns.
        non_sarcastic: Spark dataframe of non-sarcastic rows that can be
            unioned with ``sarcastic``.
        ratio: sampling fraction applied to ``non_sarcastic`` each epoch,
            chosen so that the label distribution comes out equal.
        n_epochs: number of (X, y) pairs to generate (int).
        seed: optional int. When given, epoch ``i`` samples with
            ``seed + i`` so runs are repeatable while epochs still differ.
            When omitted, sampling is unseeded as before.

    Yields: tuple (X, y) of single-column Spark dataframes, X holding the
    "tokens" column and y the "label" column. These are NOT numpy arrays;
    the caller has to materialise them (e.g. via ``toPandas()``).
    """
    
    # The udf wrapper is identical for every epoch, so build it once up front
    tokenize_sample_udf = F.udf(tokenize_sample, ArrayType(IntegerType()))
    
    for epoch in range(n_epochs):
        # offset the seed per epoch: repeatable, but not the same subset every time
        epoch_seed = None if seed is None else seed + epoch
        non_sarc_samp = non_sarcastic.sample(fraction=ratio, seed=epoch_seed) # making label dist equal
        
        # combine sampled non_sarcastic and whole sarcastic
        epoch_df = sarcastic.union(non_sarc_samp)
        
        # tokenize context column via spark udf
        epoch_df = epoch_df.withColumn("tokens", tokenize_sample_udf(epoch_df.context))
        
        # NOTE(review): X and y are two separate dataframes over the same plan;
        # presumably their rows line up when materialised independently -- confirm,
        # or collect both columns together downstream.
        # yield one epoch at a time
        yield epoch_df.select('tokens'), epoch_df.select('label')

In [5]:
# Initialize BERT model and tokenizer.
# `tokenizer` must exist as a notebook-level global: tokenize_sample() reads it by name.

bert_layer, tokenizer = model_utils.init_bert()

In [6]:
# Initialize Spark context and session; `spark` is handed to model_utils.load_data below.

sc, spark = model_utils.init_spark()

In [7]:
# Load the sarcastic samples, the non-sarcastic samples, and the ratio between the two

sarcastic, non_sarcastic, ratio = model_utils.load_data(
    spark,
    bucket_name="sarc-bucket-5",
    dataset="politics",
)

In [8]:
# Create the epoch generator (5 epochs); nothing is produced until next() is called on it

generator = generate_epoch_df(sarcastic, non_sarcastic, ratio, n_epochs=5)

In [9]:
# Pull the first epoch's (X, y) pair from the generator.
# NOTE(review): X and y are Spark dataframes, so the short wall time recorded below
# presumably reflects plan construction only -- the tokenization work is deferred.
%time X,y = next(generator)

CPU times: user 273 ms, sys: 3.36 ms, total: 277 ms
Wall time: 388 ms


In [35]:
# Materialise the tokens dataframe as a pandas frame and wrap it in a numpy array.
# NOTE(review): the recorded wall time (~10 s) far exceeds the CPU time (<1 s), so the
# cost presumably sits in the Spark job (sampling + tokenization udf), not in np.array.
%time new = np.array(X.toPandas())

CPU times: user 875 ms, sys: 7.99 ms, total: 883 ms
Wall time: 9.91 s


In [39]:
asdf = X.toPandas()

In [40]:
type(asdf)

pandas.core.frame.DataFrame

In [54]:
asdf.values[0][0]

[101,
 4208,
 117,
 1133,
 6557,
 1110,
 20560,
 113,
 2452,
 1106,
 22679,
 1116,
 114,
 1137,
 14284,
 113,
 2452,
 1106,
 181,
 24851,
 24279,
 1116,
 114,
 119,
 2809,
 21752,
 2059,
 1107,
 153,
 2591,
 13360,
 9741,
 4744,
 119,
 102]

In [51]:
pd.__version__

'0.23.0'

In [56]:
# NOTE(review): the version check after this cell still reports 0.23.0. The upgrade
# went to the user site ("Defaulting to user installation") and pandas was already
# imported in this kernel; presumably a kernel restart is required to pick it up.
# Prefer a pinned `%pip install pandas==<version>` in a setup cell at the top.
!pip install --upgrade pandas

Defaulting to user installation because normal site-packages is not writeable
Collecting pandas
  Downloading pandas-1.0.3-cp36-cp36m-manylinux1_x86_64.whl (10.0 MB)
[K     |████████████████████████████████| 10.0 MB 3.4 MB/s eta 0:00:01
Installing collected packages: pandas
Successfully installed pandas-1.0.3


In [57]:
import pandas as pd

In [62]:
pd.__version__

'0.23.0'

In [60]:
!conda update pandas -y


Solving environment: \ ^C
failed

CondaError: KeyboardInterrupt



In [69]:
# Pin pandas to 0.24.0 -- presumably for DataFrame.to_numpy(), which is called further down.
# NOTE(review): the recorded output shows pandas 0.24.0 being uninstalled, not 0.23.0,
# so this cell appears to have been run more than once; the kernel must be restarted
# afterwards for the new version to be imported.
!pip uninstall pandas==0.23.0 -y
!pip install pandas==0.24.0

Found existing installation: pandas 0.24.0
Uninstalling pandas-0.24.0:
  Successfully uninstalled pandas-0.24.0
Defaulting to user installation because normal site-packages is not writeable
Collecting pandas==0.24.0
  Using cached pandas-0.24.0-cp36-cp36m-manylinux1_x86_64.whl (10.1 MB)
Installing collected packages: pandas
Successfully installed pandas-0.24.0


In [66]:
import pandas as pd

In [2]:
pd.__version__

'0.24.0'

In [11]:
asdf = X.toPandas()

In [12]:
asdf.to_numpy()

array([[list([101, 4208, 117, 1133, 6557, 1110, 20560, 113, 2452, 1106, 22679, 1116, 114, 1137, 14284, 113, 2452, 1106, 181, 24851, 24279, 1116, 114, 119, 2809, 21752, 2059, 1107, 153, 2591, 13360, 9741, 4744, 119, 102])],
       [list([101, 12357, 112, 189, 1142, 170, 1632, 22275, 117, 8343, 2256, 1150, 5115, 1256, 21699, 1176, 3379, 2001, 12541, 1106, 1712, 1122, 3589, 1105, 4594, 1656, 1104, 1172, 136, 146, 112, 182, 1612, 1774, 1106, 8429, 2490, 112, 188, 2489, 1110, 1280, 1106, 4989, 170, 9908, 15867, 1309, 5940, 1254, 119, 2160, 117, 1133, 1517, 1195, 1838, 8077, 1158, 1172, 1107, 170, 8539, 117, 1105, 1173, 17400, 1172, 2469, 1106, 2218, 1614, 1115, 2999, 1234, 1138, 2469, 1106, 117, 1152, 1209, 5397, 1838, 1909, 1977, 1106, 5890, 1152, 1336, 1138, 170, 4910, 2463, 1105, 1209, 5397, 1136, 1712, 1122, 5346, 1181, 1656, 119, 102])],
       [list([101, 5651, 5797, 1114, 4067, 11981, 1107, 24993, 1337, 2869, 171, 15243, 5815, 106, 102])],
       ...,
       [list([101, 1192, 1137, 1