In [12]:
import re
import string

from spacy.en import English
from spacy.symbols import ORTH, LEMMA, POS, SYM, TAG

import pandas as pd

from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType, FloatType, IntegerType

from src.spacy_transformer import SpacyTokenize_Transformer

from pyspark.ml import Pipeline
from pyspark.ml.pipeline import Transformer

from pyspark.ml.feature import Tokenizer
# from pyspark.ml.feature import RegexTokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import IDF
from pyspark.ml.feature import Word2Vec
from pyspark.ml.feature import NGram

%autoreload 2

In [2]:
# The sparkjupyter launch script has already imported pyspark and created
# `spark` and `sc`; print each one to confirm the session is wired up.
startup_objects = (
    ("sql session setup by script:\t", (spark,)),
    ("spark context setup by script:\t", (sc,)),
    # the module repr is long, so only its first 56 characters are shown
    ("pyspark imported by script:\t", (str(pyspark)[:56], "...")),
)

for label, values in startup_objects:
    print(label, *values)

sql session setup by script:	 <pyspark.sql.session.SparkSession object at 0x109fd7fd0>
spark context setup by script:	 <pyspark.context.SparkContext object at 0x1020dc278>
pyspark imported by script:	 <module 'pyspark' from '/usr/local/Cellar/apache-spark/2 ...


In [3]:
# Read the JSON data file into a Spark DataFrame.
data_file = 'data/data.json'
raw_df = spark.read.json(data_file)

# Working handle used by the rest of the notebook. Note this is a second name
# for the same DataFrame object, not a copy; raw_df remains available as well.
df = raw_df

# Quick look at what was loaded: schema, row count, and the first three rows.
raw_df.printSchema()
row_total = raw_df.count()
print("row count: ", row_total)
raw_df.show(3)


root
 |-- author: string (nullable = true)
 |-- excerpt: string (nullable = true)
 |-- title: string (nullable = true)

row count:  9050
+--------------+--------------------+---------------+
|        author|             excerpt|          title|
+--------------+--------------------+---------------+
|CharlesDickens|A CHRISTMAS CAROL...|AChristmasCarol|
|CharlesDickens|Mind! I don't mea...|AChristmasCarol|
|CharlesDickens|Scrooge never pai...|AChristmasCarol|
+--------------+--------------------+---------------+
only showing top 3 rows



In [4]:
# a tiny sample dataframe for testing
# A fixed seed makes the sample (and every output derived from it) the same on
# each run. The fraction keeps roughly 1 row in 1000 and limit(5) caps the
# result at five rows, so fewer than five rows can come back.
tiny_df = df.sample(False, 1/1000, seed=42).limit(5)
print(type(tiny_df))
print(tiny_df.count())
tiny_df.show()

<class 'pyspark.sql.dataframe.DataFrame'>
5
+--------------+--------------------+--------------------+
|        author|             excerpt|               title|
+--------------+--------------------+--------------------+
|     MarkTwain|The guards were l...|AConnecticutYanke...|
|CharlesDickens|I looked at the o...|    DavidCopperfield|
|CharlesDickens|‘Now, let me see,...|    DavidCopperfield|
|CharlesDickens|This avenging pha...|   GreatExpectations|
|CharlesDickens|So now, as an inf...|   GreatExpectations|
+--------------+--------------------+--------------------+



## Spacy: a brief aside

spaCy is a production-oriented Natural Language Processing package with (among other things) very nice tokenization options. I use spaCy here because it tokenizes punctuation and contractions better than Spark's tokenizer.

Here we will wrap the tokenization in a Spark UDF. Later we will include it in our customized transformer.

In [5]:
%%time
# timing to ensure spaCy is set up properly (should take ~100ms)

# Driver-side spaCy English parser; the exploratory parsing cell below calls it directly.
parser = English()


CPU times: user 76.6 ms, sys: 3.36 ms, total: 79.9 ms
Wall time: 86.2 ms


In [6]:
# Grab a couple excerpts for testing

# Fetch the first 100 rows once and index into that list, rather than running
# a separate take(100) job for each excerpt.
first_rows = df.take(100)
excerpt = first_rows[80]['excerpt']
excerpt2 = first_rows[99]['excerpt']

In [7]:
%%time
# Run the spaCy parser over a single excerpt.
parsedData = parser(excerpt)

# (Optional) sentence-level view of the same parse:
# for sent in parsedData.sents:
#     print(sent.string.strip(), '\n')

# Collect the lower-cased text of every token in the parse.
tokens = list(tok.lower_ for tok in parsedData)

# Peek at the first eight tokens.
print(tokens[:8])

['but', 'they', 'did', "n't", 'devote', 'the', 'whole', 'evening']
CPU times: user 11.9 ms, sys: 2.56 ms, total: 14.5 ms
Wall time: 24.1 ms


## UDF demonstration
A quick way to create a User Defined Function (UDF) in spark:

Get (or create) a function in Python and use a lambda function to insert it into "udf(  )".

Don't forget to define your Spark DataType!

```
Other excerpt metadata to include via UDF:
num_chars, num_words, num_sent, num_para
(use these to calc word_len, word_per_sent, word_per_para, sent_per_para . . . etc.
per excerpt, book and author)
```

In [8]:
%%time

# Created on first use by tokenize(); re-running this cell resets it to None.
# NOTE(review): keep this None on the driver when the UDF is built -- a parser
# cached here would presumably be serialized along with the function; confirm.
_spacy_parser = None


def tokenize(text):
    """Split `text` into lower-cased spaCy tokens.

    The spaCy `English` parser is built on the first call and reused after
    that, instead of being constructed again for every row the UDF handles.

    Args:
        text: the string to tokenize, or None for a null column value.

    Returns:
        A list of lower-cased token strings; an empty list when `text` is None.
    """
    global _spacy_parser
    if text is None:
        # the excerpt column is nullable, so guard against null rows
        return []
    if _spacy_parser is None:
        _spacy_parser = English()
    return [tok.lower_ for tok in _spacy_parser(text)]


# Spark needs the return type declared up front: an array of strings.
tokenize_udf = udf(lambda x: tokenize(x), ArrayType(StringType()))

# Reference the column through tiny_df, the frame actually being transformed.
df_tokens = tiny_df.withColumn("tokens", tokenize_udf(tiny_df.excerpt))
df_tokens.show(3)

+--------------+--------------------+--------------------+--------------------+
|        author|             excerpt|               title|              tokens|
+--------------+--------------------+--------------------+--------------------+
|     MarkTwain|The guards were l...|AConnecticutYanke...|[the, guards, wer...|
|CharlesDickens|I looked at the o...|    DavidCopperfield|[i, looked, at, t...|
|CharlesDickens|‘Now, let me see,...|    DavidCopperfield|[‘, now, ,, let, ...|
+--------------+--------------------+--------------------+--------------------+
only showing top 3 rows

CPU times: user 24.4 ms, sys: 5.94 ms, total: 30.3 ms
Wall time: 5.44 s


# Transformers
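A transformer is a pipeline stage that takes a DataFrame and returns a new DataFrame, usually with one or more columns appended.

**TODO:** add a fuller explanation and example of transformers.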

## Native Transformers

Many of the transformers in Spark's ML lib are great. (Sadly, `Tokenizer` leaves punctuation attached to the neighbouring word, as the output below shows.)

In [9]:
# Spark's built-in Tokenizer, shown for comparison with the spaCy tokens.
# It gets its own name so it cannot be mistaken for the spaCy-based
# `tokenizer` that later cells define and use.
spark_tokenizer = Tokenizer(inputCol="excerpt", outputCol="tokenized")
df_spark_tokens = spark_tokenizer.transform(tiny_df)
df_spark_tokens.show()

+--------------+--------------------+--------------------+--------------------+
|        author|             excerpt|               title|           tokenized|
+--------------+--------------------+--------------------+--------------------+
|     MarkTwain|The guards were l...|AConnecticutYanke...|[the, guards, wer...|
|CharlesDickens|I looked at the o...|    DavidCopperfield|[i, looked, at, t...|
|CharlesDickens|‘Now, let me see,...|    DavidCopperfield|[‘now,, let, me, ...|
|CharlesDickens|This avenging pha...|   GreatExpectations|[this, avenging, ...|
|CharlesDickens|So now, as an inf...|   GreatExpectations|[so, now,, as, an...|
+--------------+--------------------+--------------------+--------------------+



## Customized Transformers
Luckily, we can make our own transformers as well.

Here we build the spaCy tokenizer into a Spark transformer.

### Spacy Transformer:

In [13]:
%%time
# Custom transformer imported from src.spacy_transformer, configured to read
# the 'excerpt' column and write its result to a new 'words' column.
tokenizer = SpacyTokenize_Transformer(inputCol='excerpt', outputCol='words')

CPU times: user 898 µs, sys: 620 µs, total: 1.52 ms
Wall time: 2.37 ms


In [14]:
%%time

# Apply the spaCy transformer to the full DataFrame and preview the result.
# This rebinds df_tokens, which the UDF demo earlier assigned from tiny_df.
df_tokens = tokenizer.transform(df)
df_tokens.show(3)


+--------------+--------------------+---------------+--------------------+
|        author|             excerpt|          title|               words|
+--------------+--------------------+---------------+--------------------+
|CharlesDickens|A CHRISTMAS CAROL...|AChristmasCarol|[A, CHRISTMAS, CA...|
|CharlesDickens|Mind! I don't mea...|AChristmasCarol|[Mind, !, I, do, ...|
|CharlesDickens|Scrooge never pai...|AChristmasCarol|[Scrooge, never, ...|
+--------------+--------------------+---------------+--------------------+
only showing top 3 rows

CPU times: user 26.2 ms, sys: 6.18 ms, total: 32.4 ms
Wall time: 16.6 s


# Pipeline

Pipelines allow for multiple transformers to be strung together efficiently.

By using ".getOutputCol( )" column names can be set in a single location.

Columns can then be added/dropped simply by adding or removing the corresponding transformers from the "stages" list in the Pipeline.


```python

# Illustrative sketch of a longer pipeline (appears to be shown for reference
# rather than executed). RegexTokenizer's import is commented out in the
# imports cell and Stemming_Transformer is not imported there, so both would
# need to be imported before this could run.
tokenizer = RegexTokenizer(inputCol="parsed_text", outputCol="raw_tokens"
            , pattern="\\W", minTokenLength=3)
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='tokens_stop')
stemmer = Stemming_Transformer(inputCol=remover.getOutputCol(), outputCol='tokens')
bigram = NGram(inputCol=stemmer.getOutputCol(), outputCol='bigrams'
         , n=2)
trigram = NGram(inputCol=stemmer.getOutputCol(), outputCol='trigrams'
          , n=3)
cv = CountVectorizer(inputCol=stemmer.getOutputCol(), outputCol='token_countvector'
     , minDF=10.0)
idf = IDF(inputCol=cv.getOutputCol(), outputCol='token_idf'
      , minDocFreq=10)
w2v_2d = Word2Vec(vectorSize=2, minCount=2, inputCol=stemmer.getOutputCol()
         , outputCol='word2vec_2d')
w2v_large = Word2Vec(vectorSize=250, minCount=2, inputCol=stemmer.getOutputCol()
            , outputCol='word2vec_large')

# NOTE: bigram and trigram are defined above but are not in this stages list,
# so the pipeline as written would not add the 'bigrams'/'trigrams' columns.
pipe = Pipeline(stages=[tokenizer, remover, stemmer, cv, idf, w2v_2d, w2v_large])
```

In [16]:
# Pipeline stages. Each stage reads the previous stage's output column via
# getOutputCol(), so every column name is spelled out in exactly one place.

# excerpt text -> list of tokens
tokenizer = SpacyTokenize_Transformer(
    inputCol='excerpt',
    outputCol='words',
)

# tokens -> term-frequency vector
countvec = CountVectorizer(
    inputCol=tokenizer.getOutputCol(),
    outputCol='termfreq',
)

# term frequencies -> tf-idf vector
idf = IDF(
    inputCol=countvec.getOutputCol(),
    outputCol='tfidf',
)



In [17]:
%%time
# Assemble the three stages into a single pipeline.
stage_list = [tokenizer, countvec, idf]
pipeline = Pipeline(stages=stage_list)

# Fit on the tiny sample, then apply the fitted pipeline to that same sample.
fitted_pipeline = pipeline.fit(tiny_df)
data = fitted_pipeline.transform(tiny_df)

data.show(3)



+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|        author|             excerpt|               title|               words|            termfreq|               tfidf|
+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|     MarkTwain|The guards were l...|AConnecticutYanke...|[The, guards, wer...|(609,[0,1,2,3,4,5...|(609,[0,1,2,3,4,5...|
|CharlesDickens|I looked at the o...|    DavidCopperfield|[I, looked, at, t...|(609,[0,1,2,3,4,5...|(609,[0,1,2,3,4,5...|
|CharlesDickens|‘Now, let me see,...|    DavidCopperfield|[‘, Now, ,, let, ...|(609,[0,1,2,3,4,5...|(609,[0,1,2,3,4,5...|
+--------------+--------------------+--------------------+--------------------+--------------------+--------------------+
only showing top 3 rows

CPU times: user 48.5 ms, sys: 9.45 ms, total: 58 ms
Wall time: 7.45 s


### Note:
%%time output for full df:
```
CPU times: user 136 ms, sys: 80.6 ms, total: 217 ms
Wall time: 22min 14s
```