In [45]:
import pyspark as ps
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, NumericType, StringType, IntegerType
from pyspark.ml import Pipeline
from pyspark.ml.pipeline import Transformer
from spacy.en import English
from src.SpacyTransformer import SpacyTransformer
from src.LabelTransformer import LabelTransformer
from pyspark.ml.feature import Tokenizer, CountVectorizer, IDF, Word2Vec, NGram
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

## Read data.json into Spark SQL context

In [46]:
data_file = 'data/data.json'
df = spark.read.json(data_file)

In [47]:
print(df.printSchema())
print(df.count())
df.show(3)

root
 |-- author: string (nullable = true)
 |-- excerpt: string (nullable = true)
 |-- title: string (nullable = true)

None
9050
+----------+--------------------+-----------------+
|    author|             excerpt|            title|
+----------+--------------------+-----------------+
|JaneAusten|Chapter 1 || It i...|PrideAndPrejudice|
|JaneAusten|“What is his name...|PrideAndPrejudice|
|JaneAusten|“In such cases, a...|PrideAndPrejudice|
+----------+--------------------+-----------------+
only showing top 3 rows



In [48]:
# Create debugging data
df2 = df.sample(withReplacement=False, fraction=0.02)

## Create pipeline and process data

In [21]:
# Set up transformers
tokenizer = SpacyTransformer(inputCol='excerpt', outputCol='words')
countvec = CountVectorizer(inputCol=tokenizer.getOutputCol(), outputCol='termfreq')
idf = IDF(inputCol=countvec.getOutputCol(), outputCol='tfidf')
w2v_2d = Word2Vec(vectorSize=2, minCount=2, inputCol=tokenizer.getOutputCol(), outputCol='w2v_2d')
w2v_large = Word2Vec(vectorSize=250, minCount=2, inputCol=tokenizer.getOutputCol(), outputCol='w2v_large')

In [49]:
%%time
# Build pipeline and run pipeline
pipeline = Pipeline(stages=[tokenizer, countvec, idf, w2v_2d, w2v_large])
data = pipeline.fit(df).transform(df)
data.show(3)

+----------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|    author|             excerpt|            title|               words|            termfreq|               tfidf|              w2v_2d|           w2v_large|
+----------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+
|JaneAusten|Chapter 1 || It i...|PrideAndPrejudice|[Chapter, 1, ||, ...|(50727,[0,1,2,3,4...|(50727,[0,1,2,3,4...|[-0.1537176836475...|[-0.0043571270131...|
|JaneAusten|“What is his name...|PrideAndPrejudice|[“, What, is, his...|(50727,[0,1,2,3,4...|(50727,[0,1,2,3,4...|[-0.1880662840808...|[0.00572840133550...|
|JaneAusten|“In such cases, a...|PrideAndPrejudice|[“, In, such, cas...|(50727,[0,1,2,3,4...|(50727,[0,1,2,3,4...|[-0.1603233745282...|[0.01362464603545...|
+----------+--------------------+-----------------+-------

In [50]:
def make_labels(s):
    labels = {'JaneAusten': 0,
              'CharlesDickens': 1,
              'JohnMuir': 2,
              'MarkTwain': 3}
    return labels[s]

In [51]:
udf_make_labels = udf(make_labels, IntegerType())
data = data.withColumn('label', udf_make_labels('author'))
data.show(3)

+----------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|    author|             excerpt|            title|               words|            termfreq|               tfidf|              w2v_2d|           w2v_large|label|
+----------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----+
|JaneAusten|Chapter 1 || It i...|PrideAndPrejudice|[Chapter, 1, ||, ...|(50727,[0,1,2,3,4...|(50727,[0,1,2,3,4...|[-0.1537176836475...|[-0.0043571270131...|    0|
|JaneAusten|“What is his name...|PrideAndPrejudice|[“, What, is, his...|(50727,[0,1,2,3,4...|(50727,[0,1,2,3,4...|[-0.1880662840808...|[0.00572840133550...|    0|
|JaneAusten|“In such cases, a...|PrideAndPrejudice|[“, In, such, cas...|(50727,[0,1,2,3,4...|(50727,[0,1,2,3,4...|[-0.1603233745282...|[0.01362464603545...|    0|
+----------+----------

In [52]:
drop_list = ['author', 'title', 'excerpt', 'termfreq']
data2 = data.select([column for column in data.columns if column not in drop_list])

In [55]:
print(data2.printSchema())
print(data2.count())

root
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- tfidf: vector (nullable = true)
 |-- w2v_2d: vector (nullable = true)
 |-- w2v_large: vector (nullable = true)
 |-- label: integer (nullable = true)

None
9050


In [59]:
data2.write.mode('overwrite').save('data/processed_data.parquet', format='parquet')

## Make train/test split

In [60]:
splits = data2.randomSplit([0.75, 0.25])
train = splits[0]
test = splits[1]

In [61]:
nb = NaiveBayes(smoothing=1.0, 
                modelType='multinomial', 
                featuresCol='tfidf')
model = nb.fit(train)

In [62]:
predictions = model.transform(test)
predictions.show(3)

+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+----------+
|               words|               tfidf|              w2v_2d|           w2v_large|label|       rawPrediction|         probability|prediction|
+--------------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+----------+
|[ , ||, Appendix,...|(50727,[0,1,3,4,5...|[-0.0391773918012...|[-0.0073203907892...|    2|[-13573.726286432...|   [0.0,0.0,1.0,0.0]|       2.0|
|[ , ||, CHAPTER, ...|(50727,[0,1,2,3,4...|[-0.1349814746155...|[9.93143534166291...|    1|[-6880.3404623496...|[2.48589989306050...|       1.0|
|[ , ||, CHAPTER, ...|(50727,[0,1,2,3,4...|[-0.1320070894340...|[0.02039432470268...|    1|[-4926.8195221126...|[2.98839071910880...|       1.0|
+--------------------+--------------------+--------------------+--------------------+-----+--------------------+------------------

In [63]:
evaluator = MulticlassClassificationEvaluator(labelCol='label',
                                              predictionCol='prediction')

In [64]:
accuracy = evaluator.evaluate(predictions)
print('Test set accuracy = ' + str(accuracy))

Test set accuracy = 0.9876035953221631
