In [6]:
import pyspark as ps
from pyspark.ml import Pipeline
from pyspark.ml.pipeline import Transformer
from spacy.en import English
from src.SpacyTransformer import SpacyTransformer
from pyspark.ml.feature import Tokenizer, CountVectorizer, IDF
from pyspark.ml.classification import NaiveBayes

## Read data.json into a Spark DataFrame

In [7]:
# Location of the JSON corpus, relative to the notebook's working directory.
data_file = 'data/data.json'

# Load the file through the session's DataFrame reader.
# NOTE(review): `spark` is not created in the visible cells -- presumably the
# kernel/session provides it; confirm before running on a fresh kernel.
df = spark.read.json(path=data_file)

In [8]:
# Quick sanity check of the loaded data: schema, row count, first rows.
# printSchema() writes the schema to stdout itself and returns None, so it is
# called bare -- wrapping it in print() emits a spurious "None" line.
df.printSchema()
print(df.count())
df.show(3)

root
 |-- author: string (nullable = true)
 |-- excerpt: string (nullable = true)
 |-- title: string (nullable = true)

None
9050
+----------+--------------------+-----------------+
|    author|             excerpt|            title|
+----------+--------------------+-----------------+
|JaneAusten|Chapter 1 || It i...|PrideAndPrejudice|
|JaneAusten|“What is his name...|PrideAndPrejudice|
|JaneAusten|“In such cases, a...|PrideAndPrejudice|
+----------+--------------------+-----------------+
only showing top 3 rows



## Create pipeline and process data

In [13]:
# Configure the three feature-extraction stages. Each stage reads the column
# written by the previous one, so the column names are chained through
# getOutputCol() rather than repeated as literals.

# Stage 1: project-local SpacyTransformer, 'excerpt' -> 'words'.
tokenizer = SpacyTransformer(
    inputCol='excerpt',
    outputCol='words',
)

# Stage 2: CountVectorizer over the tokenizer's output column -> 'termfreq'.
countvec = CountVectorizer(
    inputCol=tokenizer.getOutputCol(),
    outputCol='termfreq',
)

# Stage 3: IDF over the count vectors -> 'tfidf'.
idf = IDF(
    inputCol=countvec.getOutputCol(),
    outputCol='tfidf',
)

In [14]:
%%time
# Build pipeline and run pipeline
pipeline = Pipeline(stages=[tokenizer, countvec, idf])
data = pipeline.fit(df).transform(df)
data.show(3)

+----------+--------------------+-----------------+--------------------+--------------------+--------------------+
|    author|             excerpt|            title|               words|            termfreq|               tfidf|
+----------+--------------------+-----------------+--------------------+--------------------+--------------------+
|JaneAusten|Chapter 1 || It i...|PrideAndPrejudice|[Chapter, 1, ||, ...|(50727,[0,1,2,3,4...|(50727,[0,1,2,3,4...|
|JaneAusten|“What is his name...|PrideAndPrejudice|[“, What, is, his...|(50727,[0,1,2,3,4...|(50727,[0,1,2,3,4...|
|JaneAusten|“In such cases, a...|PrideAndPrejudice|[“, In, such, cas...|(50727,[0,1,2,3,4...|(50727,[0,1,2,3,4...|
+----------+--------------------+-----------------+--------------------+--------------------+--------------------+
only showing top 3 rows

CPU times: user 72 ms, sys: 24 ms, total: 96 ms
Wall time: 14min 24s


## Make train/test split

In [9]:
# 75% / 25% train/test split. A fixed seed makes the split reproducible
# across kernel restarts; without it every run samples different rows and
# downstream metrics are not comparable between runs.
training, test = data.randomSplit([0.75, 0.25], seed=42)