In [25]:
import pyspark as ps
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, NumericType, StringType, IntegerType
from pyspark.ml import Pipeline
from pyspark.ml.pipeline import Transformer
from spacy.en import English
from src.SpacyTransformer import SpacyTransformer
from src.LabelTransformer import LabelTransformer
from pyspark.ml.feature import Tokenizer, CountVectorizer, IDF
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

## Read data.json into Spark SQL context

In [2]:
data_file = 'data/data.json'
df = spark.read.json(data_file)

In [3]:
print(df.printSchema())
print(df.count())
df.show(3)

root
 |-- author: string (nullable = true)
 |-- excerpt: string (nullable = true)
 |-- title: string (nullable = true)

None
9050
+----------+--------------------+-----------------+
|    author|             excerpt|            title|
+----------+--------------------+-----------------+
|JaneAusten|Chapter 1 || It i...|PrideAndPrejudice|
|JaneAusten|“What is his name...|PrideAndPrejudice|
|JaneAusten|“In such cases, a...|PrideAndPrejudice|
+----------+--------------------+-----------------+
only showing top 3 rows



In [6]:
# Create debugging data
df2 = df.sample(withReplacement=False, fraction=0.02)

## Create pipeline and process data

In [7]:
# Set up transformers
tokenizer = SpacyTransformer(inputCol='excerpt', outputCol='words')
countvec = CountVectorizer(inputCol=tokenizer.getOutputCol(), outputCol='termfreq')
idf = IDF(inputCol=countvec.getOutputCol(), outputCol='tfidf')

In [8]:
%%time
# Build pipeline and run pipeline
pipeline = Pipeline(stages=[tokenizer, countvec, idf])
data = pipeline.fit(df2).transform(df2)
data.show(3)

+----------+--------------------+-----------------+--------------------+--------------------+--------------------+
|    author|             excerpt|            title|               words|            termfreq|               tfidf|
+----------+--------------------+-----------------+--------------------+--------------------+--------------------+
|JaneAusten|He paused in hope...|PrideAndPrejudice|[He, paused, in, ...|(7583,[0,1,2,3,4,...|(7583,[0,1,2,3,4,...|
|JaneAusten|Elizabeth, feelin...|PrideAndPrejudice|[Elizabeth, ,, fe...|(7583,[0,1,2,3,4,...|(7583,[0,1,2,3,4,...|
|JaneAusten|Elizabeth took up...|PrideAndPrejudice|[Elizabeth, took,...|(7583,[0,1,2,3,4,...|(7583,[0,1,2,3,4,...|
+----------+--------------------+-----------------+--------------------+--------------------+--------------------+
only showing top 3 rows

CPU times: user 44 ms, sys: 0 ns, total: 44 ms
Wall time: 47.6 s


In [86]:
# labeler = LabelTransformer(inputCol='author', outputCol='label')
# pipeline = Pipeline(stages=[labeler])

In [9]:
data = data.withColumnRenamed('tfidf', 'features')
# data = data.withColumnRenamed('label', 'author')
data.show(3)

+----------+--------------------+-----------------+--------------------+--------------------+--------------------+
|    author|             excerpt|            title|               words|            termfreq|            features|
+----------+--------------------+-----------------+--------------------+--------------------+--------------------+
|JaneAusten|He paused in hope...|PrideAndPrejudice|[He, paused, in, ...|(7583,[0,1,2,3,4,...|(7583,[0,1,2,3,4,...|
|JaneAusten|Elizabeth, feelin...|PrideAndPrejudice|[Elizabeth, ,, fe...|(7583,[0,1,2,3,4,...|(7583,[0,1,2,3,4,...|
|JaneAusten|Elizabeth took up...|PrideAndPrejudice|[Elizabeth, took,...|(7583,[0,1,2,3,4,...|(7583,[0,1,2,3,4,...|
+----------+--------------------+-----------------+--------------------+--------------------+--------------------+
only showing top 3 rows



In [30]:
def make_labels(s):
    labels = {'JaneAusten': 0,
              'CharlesDickens': 1,
              'JohnMuir': 2,
              'MarkTwain': 3}
    return labels[s]

In [31]:
udf_make_labels = udf(make_labels, IntegerType())
data = data.withColumn('label', udf_make_labels('author'))
data.show(5)

+----------+--------------------+-----------------+--------------------+--------------------+--------------------+-----+
|    author|             excerpt|            title|               words|            termfreq|            features|label|
+----------+--------------------+-----------------+--------------------+--------------------+--------------------+-----+
|JaneAusten|He paused in hope...|PrideAndPrejudice|[He, paused, in, ...|(7583,[0,1,2,3,4,...|(7583,[0,1,2,3,4,...|    0|
|JaneAusten|Elizabeth, feelin...|PrideAndPrejudice|[Elizabeth, ,, fe...|(7583,[0,1,2,3,4,...|(7583,[0,1,2,3,4,...|    0|
|JaneAusten|Elizabeth took up...|PrideAndPrejudice|[Elizabeth, took,...|(7583,[0,1,2,3,4,...|(7583,[0,1,2,3,4,...|    0|
|JaneAusten|“Your examination...|PrideAndPrejudice|[“, Your, examina...|(7583,[0,1,2,3,4,...|(7583,[0,1,2,3,4,...|    0|
|JaneAusten|It was a journey ...|PrideAndPrejudice|[It, was, a, jour...|(7583,[0,1,2,3,4,...|(7583,[0,1,2,3,4,...|    0|
+----------+--------------------

In [32]:
print(data.printSchema())
print(data.count())

root
 |-- author: string (nullable = true)
 |-- excerpt: string (nullable = true)
 |-- title: string (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- termfreq: vector (nullable = true)
 |-- features: vector (nullable = true)
 |-- label: integer (nullable = true)

None
194


## Make train/test split

In [38]:
splits = data.randomSplit([0.75, 0.25])
train = splits[0]
test = splits[1]

In [39]:
nb = NaiveBayes(smoothing=1.0, modelType='multinomial')
model = nb.fit(train)

In [40]:
predictions = model.transform(test)
predictions.show(5)

+--------------+--------------------+----------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+----------+
|        author|             excerpt|           title|               words|            termfreq|            features|label|       rawPrediction|         probability|prediction|
+--------------+--------------------+----------------+--------------------+--------------------+--------------------+-----+--------------------+--------------------+----------+
|CharlesDickens|"Humbug!" said Sc...| AChristmasCarol|[", Humbug, !, ",...|(7583,[0,1,2,3,4,...|(7583,[0,1,2,3,4,...|    1|[-3415.3845827307...|[8.10128717079561...|       1.0|
|CharlesDickens|"My dear!" was Bo...| AChristmasCarol|[", My, dear, !, ...|(7583,[0,1,2,3,4,...|(7583,[0,1,2,3,4,...|    1|[-5455.2786476798...|[6.80632576888663...|       1.0|
|CharlesDickens|'I will do my end...|     OliverTwist|[', I, will, do, ...|(7583,[0,1,2,3,4,...|(7583,[0,1,2,3,4,..

In [41]:
evaluator = MulticlassClassificationEvaluator(labelCol='label',
                                              predictionCol='prediction')

In [42]:
accuracy = evaluator.evaluate(predictions)
print('Test set accuracy = ' + str(accuracy))

Test set accuracy = 0.7525571174312593
