In [1]:
import pyspark as ps
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType, IntegerType
from pyspark.ml import Pipeline
from pyspark.ml.pipeline import Transformer

from src.SpacyTransformer import SpacyTransformer
from src.AuthorLabelTransformer import AuthorLabeler
from src.TitleLabelTransformer import TitleLabeler

from pyspark.ml.feature import Tokenizer, CountVectorizer, IDF, Word2Vec, NGram
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

## Read data.json into Spark SQL context

In [2]:
# Obtain the Spark session explicitly instead of relying on a `spark` global
# injected by the kernel/pyspark shell. getOrCreate() returns the session that
# is already running when there is one, so behaviour under a pyspark kernel is
# unchanged, while a plain Python kernel no longer fails with a NameError.
spark = ps.sql.SparkSession.builder.getOrCreate()

# Path is relative to the notebook's working directory.
data_file = 'data/data.json'
# NOTE(review): spark.read.json reads line-delimited JSON by default --
# presumably data.json is in that layout; confirm against the data producer.
df = spark.read.json(data_file)

In [3]:
# Quick sanity check of the raw data: schema, row count, and a few rows.
# printSchema() writes the schema to stdout itself and returns None, so it is
# called directly; wrapping it in print() emitted a stray "None" line.
df.printSchema()
print(df.count())
df.show(3)

root
 |-- author: string (nullable = true)
 |-- excerpt: string (nullable = true)
 |-- title: string (nullable = true)

None
9050
+----------+--------------------+-----------------+
|    author|             excerpt|            title|
+----------+--------------------+-----------------+
|JaneAusten|Chapter 1 || It i...|PrideAndPrejudice|
|JaneAusten|“What is his name...|PrideAndPrejudice|
|JaneAusten|“In such cases, a...|PrideAndPrejudice|
+----------+--------------------+-----------------+
only showing top 3 rows



In [4]:
# Small seeded subset of the raw frame for quick debugging runs, drawn without
# replacement and cached so repeated experiments reuse it.
# Note: the pipeline cells visible in this notebook operate on the full `df`,
# not on this sample.
df5 = df.sample(fraction=0.02, seed=42, withReplacement=False).persist()
df5

DataFrame[author: string, excerpt: string, title: string]

## Create pipeline and process data

In [5]:
# Configure every pipeline stage up front; nothing is fitted in this cell.

# Project-local labelers: write integer label columns for author and title.
author_labeler = AuthorLabeler(inputCol='author', outputCol='author_label')
title_labeler = TitleLabeler(inputCol='title', outputCol='title_label')

# Project-local spaCy-based transformer: excerpt text -> 'words' column.
tokenizer = SpacyTransformer(inputCol='excerpt', outputCol='words')
token_col = tokenizer.getOutputCol()

# Bag-of-words branch: term counts, then TF-IDF weighting of those counts.
countvec = CountVectorizer(inputCol=token_col, outputCol='termfreq')
idf = IDF(inputCol=countvec.getOutputCol(), outputCol='tfidf')

# Embedding branch: two Word2Vec stages over the same tokens -- a 2-dimensional
# one and a 250-dimensional one. Both ignore tokens seen fewer than 2 times.
w2v_2d = Word2Vec(
    vectorSize=2,
    minCount=2,
    inputCol=token_col,
    outputCol='w2v_2d',
)
w2v_large = Word2Vec(
    vectorSize=250,
    minCount=2,
    inputCol=token_col,
    outputCol='w2v_large',
)

In [6]:
%%time
# Build pipeline and run pipeline
pipeline = Pipeline(stages=[author_labeler, title_labeler,
                            tokenizer, countvec, 
                            idf, w2v_2d, w2v_large])
data = pipeline.fit(df).transform(df)
data.persist()

CPU times: user 224 ms, sys: 52 ms, total: 276 ms
Wall time: 40min 38s


In [7]:
%%time
# Create new data frame without extraneous columns
data2 = data.select(['words', 'tfidf', 'w2v_2d', 'w2v_large', 
                      'author_label', 'title_label'])

data2.persist()
print(data.printSchema())
print(data2.printSchema())

root
 |-- author: string (nullable = true)
 |-- excerpt: string (nullable = true)
 |-- title: string (nullable = true)
 |-- author_label: integer (nullable = true)
 |-- title_label: integer (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- termfreq: vector (nullable = true)
 |-- tfidf: vector (nullable = true)
 |-- w2v_2d: vector (nullable = true)
 |-- w2v_large: vector (nullable = true)

None
root
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- tfidf: vector (nullable = true)
 |-- w2v_2d: vector (nullable = true)
 |-- w2v_large: vector (nullable = true)
 |-- author_label: integer (nullable = true)
 |-- title_label: integer (nullable = true)

None
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 57.1 ms


## Save processed data to parquet file

In [8]:
%%time
# Save data frame
data2.write.mode('overwrite').save('data/processed_data.parquet', format='parquet')

CPU times: user 116 ms, sys: 32 ms, total: 148 ms
Wall time: 31min 39s


In [None]:
# Export a narrow frame holding only the TF-IDF vectors and the author labels,
# overwriting any earlier export at the same path.
author_tfidf_cols = ['tfidf', 'author_label']
data_aut_tfidf = data.select(author_tfidf_cols)
data_aut_tfidf.write.format('parquet').mode('overwrite').save('data/data_aut_tfidf.parquet')