In [1]:
import pyspark as ps
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

## Load data from parquet file

In [2]:
data = spark.read.parquet("data/sample_data.parquet/")
print(data.printSchema())

root
 |-- author: string (nullable = true)
 |-- title: string (nullable = true)
 |-- excerpt: string (nullable = true)
 |-- excerpt_number: long (nullable = true)
 |-- author_id: double (nullable = true)
 |-- title_id: double (nullable = true)
 |-- id_vector: vector (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- termfreq: vector (nullable = true)
 |-- tfidf: vector (nullable = true)
 |-- w2v: vector (nullable = true)
 |-- w2v_2d: vector (nullable = true)

None


In [None]:
# data = spark.read.parquet("data/data.parquet/")
# print(data.printSchema())

## Make train/test split

In [3]:
%%time
splits = data.randomSplit(weights=[0.75, 0.25], seed=42)
train = splits[0].persist()
test = splits[1].persist()


CPU times: user 6.56 ms, sys: 3.51 ms, total: 10.1 ms
Wall time: 1.97 s


In [None]:
# Create debugging data
data_sample = data.sample(withReplacement=False, fraction=0.05, seed=42)
data_sample.persist()

## Naive Bayes Classifier for author

In [4]:
%%time
nb1 = NaiveBayes(smoothing=1.0, 
                 modelType='multinomial',
                 labelCol='author_id',
                 featuresCol='tfidf')
model_1 = nb1.fit(train)

CPU times: user 11.7 ms, sys: 3.18 ms, total: 14.8 ms
Wall time: 3.26 s


In [5]:
%%time
predictions = model_1.transform(test)
predictions.persist()
print(predictions.printSchema())

root
 |-- author: string (nullable = true)
 |-- title: string (nullable = true)
 |-- excerpt: string (nullable = true)
 |-- excerpt_number: long (nullable = true)
 |-- author_id: double (nullable = true)
 |-- title_id: double (nullable = true)
 |-- id_vector: vector (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- termfreq: vector (nullable = true)
 |-- tfidf: vector (nullable = true)
 |-- w2v: vector (nullable = true)
 |-- w2v_2d: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = true)

None
CPU times: user 3.48 ms, sys: 1.92 ms, total: 5.4 ms
Wall time: 1.12 s


In [8]:
%%time
evaluator = MulticlassClassificationEvaluator(labelCol='author_id',
                                               predictionCol='prediction')

CPU times: user 2.39 ms, sys: 1.64 ms, total: 4.02 ms
Wall time: 11.5 ms


In [9]:
accuracy = evaluator.evaluate(predictions)
print('Author test set accuracy = ' + str(accuracy))
predictions.unpersist()

Author test set accuracy = 0.9674671445639187


DataFrame[author: string, title: string, excerpt: string, excerpt_number: bigint, author_id: double, title_id: double, id_vector: vector, words: array<string>, termfreq: vector, tfidf: vector, w2v: vector, w2v_2d: vector, rawPrediction: vector, probability: vector, prediction: double]

## Decision Tree Classifier for author

In [10]:
%%time
dtc = DecisionTreeClassifier(labelCol='author_id',
                             featuresCol='tfidf',
                             maxMemoryInMB=2056)

CPU times: user 2.88 ms, sys: 4.68 ms, total: 7.56 ms
Wall time: 143 ms


In [11]:
%%time

# Spark only supports max depth of 30

paramGrid = ParamGridBuilder().addGrid(dtc.maxDepth, 
                                       [5,10,15,20,25,30]).build()

evaluator = MulticlassClassificationEvaluator(labelCol='author_id',
                                              predictionCol='prediction',
                                              metricName='f1')

crossval = CrossValidator(estimator=dtc,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3)

cv_dtc = crossval.fit(data)

model = cv_dtc.bestModel
print(model)

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_43e39b9baf6148917504) of depth 5 with 43 nodes
CPU times: user 534 ms, sys: 138 ms, total: 671 ms
Wall time: 1min 16s


In [12]:
%%time
predictions = model.transform(test)
predictions.persist()
print(predictions.printSchema())

root
 |-- author: string (nullable = true)
 |-- title: string (nullable = true)
 |-- excerpt: string (nullable = true)
 |-- excerpt_number: long (nullable = true)
 |-- author_id: double (nullable = true)
 |-- title_id: double (nullable = true)
 |-- id_vector: vector (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- termfreq: vector (nullable = true)
 |-- tfidf: vector (nullable = true)
 |-- w2v: vector (nullable = true)
 |-- w2v_2d: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = true)

None
CPU times: user 3.15 ms, sys: 2.64 ms, total: 5.79 ms
Wall time: 140 ms


In [13]:
accuracy = evaluator.evaluate(predictions)
print('Author test set accuracy = ' + str(accuracy))
predictions.unpersist()

Author test set accuracy = 0.8318656340032748


DataFrame[author: string, title: string, excerpt: string, excerpt_number: bigint, author_id: double, title_id: double, id_vector: vector, words: array<string>, termfreq: vector, tfidf: vector, w2v: vector, w2v_2d: vector, rawPrediction: vector, probability: vector, prediction: double]

## Random Forrest Classifier for author

In [14]:
%%time
rfc = RandomForestClassifier(labelCol='author_id',
                             featuresCol='tfidf',
                             numTrees=50,
                             maxMemoryInMB=2056)

CPU times: user 2.53 ms, sys: 1.19 ms, total: 3.72 ms
Wall time: 23 ms


In [15]:
%%time

# Spark only supports max depth of 30

paramGrid = ParamGridBuilder().addGrid(rfc.maxDepth, 
                                       [5,10,15,20,25,30]).build()

evaluator = MulticlassClassificationEvaluator(labelCol='author_id',
                                              predictionCol='prediction',
                                              metricName='f1')

crossval = CrossValidator(estimator=rfc,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3)

cv_rfc = crossval.fit(data)

model = cv_rfc.bestModel
print(model)

RandomForestClassificationModel (uid=rfc_6d150c91dd4e) with 50 trees
CPU times: user 565 ms, sys: 258 ms, total: 823 ms
Wall time: 1min 35s


In [16]:
%%time
predictions = model.transform(test)
predictions.persist()
print(predictions.printSchema())

root
 |-- author: string (nullable = true)
 |-- title: string (nullable = true)
 |-- excerpt: string (nullable = true)
 |-- excerpt_number: long (nullable = true)
 |-- author_id: double (nullable = true)
 |-- title_id: double (nullable = true)
 |-- id_vector: vector (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- termfreq: vector (nullable = true)
 |-- tfidf: vector (nullable = true)
 |-- w2v: vector (nullable = true)
 |-- w2v_2d: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = true)

None
CPU times: user 2.51 ms, sys: 1.84 ms, total: 4.34 ms
Wall time: 118 ms


In [17]:
accuracy = evaluator.evaluate(predictions)
print('Author test set accuracy = ' + str(accuracy))
predictions.unpersist()

Author test set accuracy = 1.0


DataFrame[author: string, title: string, excerpt: string, excerpt_number: bigint, author_id: double, title_id: double, id_vector: vector, words: array<string>, termfreq: vector, tfidf: vector, w2v: vector, w2v_2d: vector, rawPrediction: vector, probability: vector, prediction: double]

##  Naives Bayes Classifier for title

In [18]:
%%time
nb2 = NaiveBayes(smoothing=1.0, 
                 modelType='multinomial',
                 labelCol='title_id',
                 featuresCol='tfidf')
model2 = nb2.fit(train)

CPU times: user 5.82 ms, sys: 1.73 ms, total: 7.55 ms
Wall time: 258 ms


In [19]:
%%time
predictions = model2.transform(test)
predictions.persist()
print(predictions.printSchema())

root
 |-- author: string (nullable = true)
 |-- title: string (nullable = true)
 |-- excerpt: string (nullable = true)
 |-- excerpt_number: long (nullable = true)
 |-- author_id: double (nullable = true)
 |-- title_id: double (nullable = true)
 |-- id_vector: vector (nullable = true)
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- termfreq: vector (nullable = true)
 |-- tfidf: vector (nullable = true)
 |-- w2v: vector (nullable = true)
 |-- w2v_2d: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = true)

None
CPU times: user 2.67 ms, sys: 1.41 ms, total: 4.08 ms
Wall time: 107 ms


In [20]:
%%time
evaluator2 = MulticlassClassificationEvaluator(labelCol='title_id',
                                               predictionCol='prediction')

CPU times: user 1.72 ms, sys: 1.05 ms, total: 2.77 ms
Wall time: 7.1 ms


In [21]:
accuracy2 = evaluator2.evaluate(predictions)
print('Title test set accuracy = ' + str(accuracy2))

predictions.unpersist()

Title test set accuracy = 0.9397465437788016


DataFrame[author: string, title: string, excerpt: string, excerpt_number: bigint, author_id: double, title_id: double, id_vector: vector, words: array<string>, termfreq: vector, tfidf: vector, w2v: vector, w2v_2d: vector, rawPrediction: vector, probability: vector, prediction: double]