In [5]:
import pyspark as ps
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

## Load data from parquet file

In [8]:
data = spark.read.parquet("data/processed_data.parquet/")
print(data.printSchema())

root
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- tfidf: vector (nullable = true)
 |-- w2v_2d: vector (nullable = true)
 |-- w2v_large: vector (nullable = true)
 |-- author_label: integer (nullable = true)
 |-- title_label: integer (nullable = true)

None


## Make train/test split

In [9]:
%%time
splits = data.randomSplit(weights=[0.75, 0.25], seed=42)
train = splits[0]
test = splits[1]

train.persist()
test.persist()

CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 41 ms


In [10]:
# Create debugging data
data_sample = data.sample(withReplacement=False, fraction=0.05, seed=42)
data_sample.persist()

DataFrame[words: array<string>, tfidf: vector, w2v_2d: vector, w2v_large: vector, author_label: int, title_label: int]

## Naive Bayes Classifier for author

In [5]:
%%time
nb1 = NaiveBayes(smoothing=1.0, 
                 modelType='multinomial',
                 labelCol='author_label',
                 featuresCol='tfidf')
model1 = nb1.fit(train)

CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 3.34 s


In [6]:
%%time
predictions = model1.transform(test)
predictions.persist()
print(predictions.printSchema())

root
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- tfidf: vector (nullable = true)
 |-- w2v_2d: vector (nullable = true)
 |-- w2v_large: vector (nullable = true)
 |-- author_label: integer (nullable = true)
 |-- title_label: integer (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = true)

None
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 556 ms


In [7]:
%%time
evaluator = MulticlassClassificationEvaluator(labelCol='author_label',
                                               predictionCol='prediction')

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 9.01 ms


In [8]:
accuracy = evaluator.evaluate(predictions)
print('Author test set accuracy = ' + str(accuracy))
predictions.unpersist()

Author test set accuracy = 0.9916345991881657


DataFrame[words: array<string>, tfidf: vector, w2v_2d: vector, w2v_large: vector, author_label: int, title_label: int, rawPrediction: vector, probability: vector, prediction: double]

## Decision Tree Classifier for author

In [26]:
%%time
dtc = DecisionTreeClassifier(labelCol='author_label',
                             featuresCol='tfidf',
                             maxMemoryInMB=2056)

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 10.1 ms


In [27]:
%%time

# Spark only supports max depth of 30

paramGrid = ParamGridBuilder().addGrid(dtc.maxDepth, 
                                       [5,10,15,20,25,30]).build()

evaluator = MulticlassClassificationEvaluator(labelCol='author_label',
                                              predictionCol='prediction',
                                              metricName='f1')

crossval = CrossValidator(estimator=dtc,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3)

cv_dtc = crossval.fit(data)

model = cv_dtc.bestModel
print(model)

DecisionTreeClassificationModel (uid=DecisionTreeClassifier_4ffa8c7ce1f4d6824579) of depth 15 with 975 nodes
CPU times: user 688 ms, sys: 192 ms, total: 880 ms
Wall time: 1h 10min 19s


In [28]:
%%time
predictions = model.transform(test)
predictions.persist()
print(predictions.printSchema())

root
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- tfidf: vector (nullable = true)
 |-- w2v_2d: vector (nullable = true)
 |-- w2v_large: vector (nullable = true)
 |-- author_label: integer (nullable = true)
 |-- title_label: integer (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = true)

None
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 63.2 ms


In [29]:
accuracy = evaluator.evaluate(predictions)
print('Author test set accuracy = ' + str(accuracy))
predictions.unpersist()

Author test set accuracy = 0.9651498620622281


DataFrame[words: array<string>, tfidf: vector, w2v_2d: vector, w2v_large: vector, author_label: int, title_label: int, rawPrediction: vector, probability: vector, prediction: double]

## Random Forrest Classifier for author

In [22]:
%%time
rfc = RandomForestClassifier(labelCol='author_label',
                             featuresCol='tfidf',
                             numTrees=50,
                             maxMemoryInMB=2056)

CPU times: user 0 ns, sys: 4 ms, total: 4 ms
Wall time: 5.06 ms


In [23]:
%%time

# Spark only supports max depth of 30

paramGrid = ParamGridBuilder().addGrid(rfc.maxDepth, 
                                       [5,10,15,20,25,30]).build()

evaluator = MulticlassClassificationEvaluator(labelCol='author_label',
                                              predictionCol='prediction',
                                              metricName='f1')

crossval = CrossValidator(estimator=rfc,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3)

cv_rfc = crossval.fit(data)

model = cv_rfc.bestModel
print(model)

RandomForestClassificationModel (uid=rfc_8675b823e430) with 50 trees
CPU times: user 784 ms, sys: 204 ms, total: 988 ms
Wall time: 1h 14min 30s


In [24]:
%%time
predictions = model.transform(test)
predictions.persist()
print(predictions.printSchema())

root
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- tfidf: vector (nullable = true)
 |-- w2v_2d: vector (nullable = true)
 |-- w2v_large: vector (nullable = true)
 |-- author_label: integer (nullable = true)
 |-- title_label: integer (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = true)

None
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 263 ms


In [25]:
accuracy = evaluator.evaluate(predictions)
print('Author test set accuracy = ' + str(accuracy))
predictions.unpersist()

Author test set accuracy = 0.9977948607099778


DataFrame[words: array<string>, tfidf: vector, w2v_2d: vector, w2v_large: vector, author_label: int, title_label: int, rawPrediction: vector, probability: vector, prediction: double]

##  Naives Bayes Classifier for title

In [None]:
%%time
nb2 = NaiveBayes(smoothing=1.0, 
                 modelType='multinomial',
                 labelCol='title_label',
                 featuresCol='tfidf')
model2 = nb2.fit(train)

In [25]:
%%time
predictions = model2.transform(test)
predictions.persist()
print(predictions.printSchema())

root
 |-- words: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- tfidf: vector (nullable = true)
 |-- w2v_2d: vector (nullable = true)
 |-- w2v_large: vector (nullable = true)
 |-- author_label: integer (nullable = true)
 |-- title_label: integer (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = true)

None
CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 98.9 ms


In [28]:
%%time
evaluator2 = MulticlassClassificationEvaluator(labelCol='title_label',
                                               predictionCol='prediction')

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 4.8 ms


In [29]:
accuracy2 = evaluator2.evaluate(predictions)
print('Title test set accuracy = ' + str(accuracy2))

predictions.unpersist()

Title test set accuracy = 0.06740951396632552


DataFrame[words: array<string>, tfidf: vector, w2v_2d: vector, w2v_large: vector, author_label: int, title_label: int, rawPrediction: vector, probability: vector, prediction: double]