In [1]:
import pyspark as ps
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, NumericType, StringType, IntegerType
from pyspark.ml import Pipeline
from pyspark.ml.pipeline import Transformer

from spacy.en import English

from src.SpacyTransformer import SpacyTransformer
from src.LabelTransformer import LabelTransformer
from src.label_udfs import author_labels, title_labels 

from pyspark.ml.feature import Tokenizer, CountVectorizer, IDF, Word2Vec, NGram
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

## Read data.json into Spark SQL context

In [2]:
data_file = 'data/data.json'
df = spark.read.json(data_file)

In [3]:
print(df.printSchema())
print(df.count())
df.show(3)

root
 |-- author: string (nullable = true)
 |-- excerpt: string (nullable = true)
 |-- title: string (nullable = true)

None
9050
+----------+--------------------+-----------------+
|    author|             excerpt|            title|
+----------+--------------------+-----------------+
|JaneAusten|Chapter 1 || It i...|PrideAndPrejudice|
|JaneAusten|“What is his name...|PrideAndPrejudice|
|JaneAusten|“In such cases, a...|PrideAndPrejudice|
+----------+--------------------+-----------------+
only showing top 3 rows



In [4]:
# Create debugging data
df5 = df.sample(withReplacement=False, fraction=0.02, seed=42)
df5.persist()

DataFrame[author: string, excerpt: string, title: string]

## Create pipeline and process data

In [5]:
# Set up transformers
tokenizer = SpacyTransformer(inputCol='excerpt', outputCol='words')
countvec = CountVectorizer(inputCol=tokenizer.getOutputCol(), outputCol='termfreq')
idf = IDF(inputCol=countvec.getOutputCol(), outputCol='tfidf')
w2v_2d = Word2Vec(vectorSize=2, minCount=2, inputCol=tokenizer.getOutputCol(), outputCol='w2v_2d')
w2v_large = Word2Vec(vectorSize=250, minCount=2, inputCol=tokenizer.getOutputCol(), outputCol='w2v_large')

In [6]:
%%time
# Build pipeline and run pipeline
pipeline = Pipeline(stages=[tokenizer, countvec, idf, w2v_2d, w2v_large])
data = pipeline.fit(df).transform(df)
data.persist()
data.show(3)

+----------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|    author|             excerpt|            title|               words|            termfreq|               tfidf|              w2v_2d|           w2v_large|
+----------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|JaneAusten|Chapter 1 || It i...|PrideAndPrejudice|[Chapter, 1, ||, ...|(50727,[0,1,2,3,4...|(50727,[0,1,2,3,4...|[-0.1537176836475...|[-0.0043571270131...|
|JaneAusten|“What is his name...|PrideAndPrejudice|[“, What, is, his...|(50727,[0,1,2,3,4...|(50727,[0,1,2,3,4...|[-0.1880662840808...|[0.00572840133550...|
|JaneAusten|“In such cases, a...|PrideAndPrejudice|[“, In, such, cas...|(50727,[0,1,2,3,4...|(50727,[0,1,2,3,4...|[-0.1603233745282...|[0.01362464603545...|
+----------+--------------------+-----------------+-------

In [7]:
%%time
# Create int labels for author and title
udf_author_labels = udf(author_labels, IntegerType())
udf_title_labels = udf(title_labels, IntegerType())

data2 = data.withColumn('author_label', udf_author_labels('author'))
data3 = data2.withColumn('title_label', udf_title_labels('title'))

data3.persist()
data.unpersist()
data3.show(3)

+----------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------+-----------+
|    author|             excerpt|            title|               words|            termfreq|               tfidf|              w2v_2d|           w2v_large|author_label|title_label|
+----------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+------------+-----------+
|JaneAusten|Chapter 1 || It i...|PrideAndPrejudice|[Chapter, 1, ||, ...|(50727,[0,1,2,3,4...|(50727,[0,1,2,3,4...|[-0.1537176836475...|[-0.0043571270131...|           0|          4|
|JaneAusten|“What is his name...|PrideAndPrejudice|[“, What, is, his...|(50727,[0,1,2,3,4...|(50727,[0,1,2,3,4...|[-0.1880662840808...|[0.00572840133550...|           0|          4|
|JaneAusten|“In such cases, a...|PrideAndPrejudice|[“, In, such, cas...|(50727,[0,1,2,3,4.

In [8]:
%%time
# Create new data frame without extraneous columns
data4 = data3.select(['words', 'tfidf', 'w2v_2d', 'w2v_large', 
                      'author_label', 'title_label'])

print(data3.printSchema())
print(data4.printSchema())

data4.persist()
data3.unpersist()

root
 |-- author: string (nullable = true)
 |-- excerpt: string (nullable = true)
 |-- title: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- termfreq: vector (nullable = true)
 |-- tfidf: vector (nullable = true)
 |-- w2v_2d: vector (nullable = true)
 |-- w2v_large: vector (nullable = true)
 |-- author_label: integer (nullable = true)
 |-- title_label: integer (nullable = true)

None
root
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- tfidf: vector (nullable = true)
 |-- w2v_2d: vector (nullable = true)
 |-- w2v_large: vector (nullable = true)
 |-- author_label: integer (nullable = true)
 |-- title_label: integer (nullable = true)

None
CPU times: user 4 ms, sys: 4 ms, total: 8 ms
Wall time: 59.3 ms


In [9]:
%%time
# Save data frame
data4.write.mode('overwrite').save('data/processed_data.parquet', format='parquet')

CPU times: user 136 ms, sys: 16 ms, total: 152 ms
Wall time: 32min 19s


## Make train/test split

In [10]:
%%time
splits = data4.randomSplit(weights=[0.75, 0.25], seed=42)
train = splits[0]
test = splits[1]

train.persist()
test.persist()
data4.unpersist()

CPU times: user 0 ns, sys: 4 ms, total: 4 ms
Wall time: 139 ms


## Naive Bayes Classifier for author

In [11]:
%%time
nb1 = NaiveBayes(smoothing=1.0, 
                 modelType='multinomial',
                 labelCol='author_label',
                 featuresCol='tfidf')
model1 = nb1.fit(train)

CPU times: user 124 ms, sys: 28 ms, total: 152 ms
Wall time: 32min 13s


In [12]:
%%time
predictions1 = model1.transform(test)
predictions1.persist()
predictions1.show(3)

+--------------------+--------------------+--------------------+--------------------+------------+-----------+--------------------+--------------------+----------+
|               words|               tfidf|              w2v_2d|           w2v_large|author_label|title_label|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+--------------------+------------+-----------+--------------------+--------------------+----------+
|[ , ||, Appendix,...|(50727,[0,1,3,4,5...|[-0.0391773918012...|[-0.0073203907892...|           2|         15|[-13587.688600909...|   [0.0,0.0,1.0,0.0]|       2.0|
|[ , ||, CHAPTER, ...|(50727,[0,1,2,3,4...|[-0.1021926742108...|[0.00169267584128...|           1|          7|[-6328.0520035366...|[1.93642110382305...|       1.0|
|[ , ||, CHAPTER, ...|(50727,[0,1,2,3,4...|[-0.1324708268134...|[0.03663685781380...|           0|          5|[-5214.6952657203...|   [1.0,0.0,0.0,0.0]|       0.0|
+---------------

In [13]:
%%time
evaluator1 = MulticlassClassificationEvaluator(labelCol='author_label',
                                               predictionCol='prediction')

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 10 ms


In [14]:
accuracy1 = evaluator1.evaluate(predictions1)
print('Author test set accuracy = ' + str(accuracy1))
predictions1.unpersist()

Author test set accuracy = 0.9920443502350802


DataFrame[words: array<string>, tfidf: vector, w2v_2d: vector, w2v_large: vector, author_label: int, title_label: int, rawPrediction: vector, probability: vector, prediction: double]

## Naive Bayes Classifier for title

In [15]:
%%time
nb2 = NaiveBayes(smoothing=1.0, 
                 modelType='multinomial',
                 labelCol='title_label',
                 featuresCol='tfidf')
model2 = nb2.fit(train)

CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 697 ms


In [16]:
%%time
predictions2 = model2.transform(test)
predictions2.persist()
predictions2.show(3)

+--------------------+--------------------+--------------------+--------------------+------------+-----------+--------------------+--------------------+----------+
|               words|               tfidf|              w2v_2d|           w2v_large|author_label|title_label|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+--------------------+------------+-----------+--------------------+--------------------+----------+
|[ , ||, Appendix,...|(50727,[0,1,3,4,5...|[-0.0391773918012...|[-0.0073203907892...|           2|         15|[-12646.429028809...|[0.0,0.0,0.0,0.0,...|      14.0|
|[ , ||, CHAPTER, ...|(50727,[0,1,2,3,4...|[-0.1021926742108...|[0.00169267584128...|           1|          7|[-6257.4194521038...|[5.19451039268454...|       8.0|
|[ , ||, CHAPTER, ...|(50727,[0,1,2,3,4...|[-0.1324708268134...|[0.03663685781380...|           0|          5|[-5571.2612098319...|[5.66222742076090...|       4.0|
+---------------

In [17]:
%%time
evaluator2 = MulticlassClassificationEvaluator(labelCol='title_label',
                                               predictionCol='prediction')

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 6.21 ms


In [18]:
accuracy2 = evaluator2.evaluate(predictions2)
print('Title test set accuracy = ' + str(accuracy2))

predictions2.unpersist()
train.unpersist()
test.unpersist()

Title test set accuracy = 0.07133608564630645


DataFrame[words: array<string>, tfidf: vector, w2v_2d: vector, w2v_large: vector, author_label: int, title_label: int]