In [1]:
import re
import string

from spacy.en import English
from spacy.symbols import ORTH, LEMMA, POS, SYM, TAG

import pandas as pd

from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType, FloatType

from src.spacy_transformer import Tokenize_Transformer

from pyspark.ml.feature import Tokenizer
# from pyspark.ml.feature import RegexTokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import CountVectorizer
from pyspark.ml.feature import IDF
from pyspark.ml.feature import Word2Vec
from pyspark.ml.feature import NGram

%autoreload 2

In [None]:
# when starting jupyter with the sparkjupyter script, pyspark is already imported

print("sql session setup by script:\t", spark)
print("spark context setup by script:\t", sc)
print("pyspark imported by script:\t", str(pyspark)[:56], "...")

In [2]:
data_file = 'data/data.json'
raw_df = spark.read.json(data_file)

raw_df.printSchema()
print("row count: ", raw_df.count())
raw_df.show(3)

root
 |-- author: string (nullable = true)
 |-- excerpt: string (nullable = true)
 |-- title: string (nullable = true)

row count:  9050
+--------------+--------------------+---------------+
|        author|             excerpt|          title|
+--------------+--------------------+---------------+
|CharlesDickens|A CHRISTMAS CAROL...|AChristmasCarol|
|CharlesDickens|Mind! I don't mea...|AChristmasCarol|
|CharlesDickens|Scrooge never pai...|AChristmasCarol|
+--------------+--------------------+---------------+
only showing top 3 rows



In [3]:
# create copy of raw_df incase I mess things up :P
df = raw_df

## UDF demonstration
add explanation and example of spark udf

In [None]:
def word_count(text):
    return len(text.split())

wordcount_udf = udf(lambda x: word_count(x))

df_with_wordcount = df.withColumn("word_count", wordcount_udf(df.excerpt).cast(FloatType()))
df_with_wordcount.show(3)

If we only wanted to add this column to our dataframe it could be added to our dataframe via:
```python 
df = df.withColumn("word_count", wordcount_udf(df.excerpt).cast(FloatType()))
```
But for now we will wait and add our UDFs to our pipeline later

## Transformer demonstration
add explanation and example of transformers

### Native Transformers

These are great, sadly tokenizer leaves punctuation attached to the preceding word

In [None]:
tokenizer = Tokenizer(inputCol="excerpt", outputCol="tokenized")
df_spark_tokenized = tokenizer.transform(df)
df_spark_tokenized.show(3)

### Customized Transformers
luckily, we can make our own transformers as well
here we borrow the spaCy tokenizer

In [None]:
# run david's stemming transformer on df_spark_tokenized
# to make sure it works

In [4]:
spacy_tokenizer = Tokenize_Transformer(inputCol="excerpt", outputCol="tokenized")
df_spacy_tokens = spacy_tokenizer(df)
df_spacy_tokens.show(3)

TypeError: 'Tokenize_Transformer' object is not callable

In [None]:
type(English)