## 0. Install Packages

In [0]:
%sh
pip install nltk
pip install stop-words
pip install pyspellchecker



## 1.1. Load Additional Labeled Data

In [0]:
import pandas as pd
import numpy as np
# File location and type
file_location = "/FileStore/tables/additional.csv"
file_type = "csv"

# CSV options
infer_schema = "false"
first_row_is_header = "false"
delimiter = ","

df = spark.read.format(file_type).option("inferSchema", infer_schema).option("header", "true").option("sep", delimiter).load(file_location)

pandasDF_news = df.select('news').toPandas()
pandasDF_target = df.select('target').toPandas()

## 1.2. Load Original Data

In [0]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
import numpy as np
import re
from pyspark.sql import SQLContext

categories = ['rec.autos', 'rec.sport.baseball', 'comp.graphics', 'comp.sys.mac.hardware', 
              'sci.space', 'sci.crypt', 'talk.politics.guns', 'talk.religion.misc']
newsgroup = fetch_20newsgroups(subset='train',categories= categories , shuffle=True, random_state=42)

df_news = pd.DataFrame(data=newsgroup.data, columns=['news']) 
df_news = df_news.append(pandasDF_news, ignore_index=True)
df_news = df_news.replace(re.compile(r"From: \S*@\S*\s?"),"")
df_news = df_news.replace(re.compile('\s+')," ")
df_news = df_news.replace(re.compile("\'"),"")

df_target = pd.DataFrame(data=newsgroup.target, columns=['target'])
df_target = df_target.append(pandasDF_target, ignore_index=True)
df_target['target']=df_target.target.astype('int64')
df_binary_labels = pd.DataFrame(np.where (df_target < 10, 0, 1), columns=['Binary Label'])

sqlContext = SQLContext(sc)
df_newsgroup = sqlContext.createDataFrame(pd.concat([df_news, df_target, df_binary_labels], axis=1))

## 2. Pipeline

In [0]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover
from pyspark.ml.feature import Tokenizer, HashingTF, IDF, StringIndexer
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml import Pipeline

regexTokenizer = RegexTokenizer(inputCol="news", outputCol="news_words", pattern="\\W")
add_stopwords = ["http","https","amp","rt","t","c","the","subject","re",'.',',','', 'i i','?','\'\'',"''",'y','*','out','==','df','e.g.','\'m','\[',"'m",':', ')', '(','n\'t', '\'','``','``','\'s', 'https://','-'] 
stopwordsRemover = StopWordsRemover(inputCol=regexTokenizer.getOutputCol(), outputCol="filtered").setStopWords(add_stopwords)
hashingTF = HashingTF(inputCol=stopwordsRemover.getOutputCol(), outputCol="rawFeatures", numFeatures=10000)
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
string_indexer = StringIndexer(inputCol = "target", outputCol = "target_indexed")

pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, hashingTF, idf, string_indexer])
pipelineFit = pipeline.fit(df_newsgroup)
dataset = pipelineFit.transform(df_newsgroup)
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)

dtc = DecisionTreeClassifier(labelCol=string_indexer.getOutputCol(), maxDepth=10)
dtc_mod = dtc.fit(trainingData)
predictions = dtc_mod.transform(testData)

## 3. Evaluate Decision Tree Classifier Model

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="target_indexed", predictionCol="prediction")
score = evaluator.evaluate(predictions)
print("score = %s" % (score) + "\n")

print ("accuracy: " + str (evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})) )
print ("weightedPrecision: " + str (evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})) )
print ("weightedRecall: " + str (evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"}))  )
print ("weightedTruePositiveRate: " + str (evaluator.evaluate(predictions, {evaluator.metricName: "weightedTruePositiveRate"})) )
print ("weightedFalsePositiveRate: " + str (evaluator.evaluate(predictions, {evaluator.metricName: "weightedFalsePositiveRate"})) )
print ("weightedFMeasure: " + str (evaluator.evaluate(predictions, {evaluator.metricName: "weightedFMeasure"})) )
print ("truePositiveRateByLabel: " + str (evaluator.evaluate(predictions, {evaluator.metricName: "truePositiveRateByLabel"})))
print ("falsePositiveRateByLabel: " + str (evaluator.evaluate(predictions, {evaluator.metricName: "falsePositiveRateByLabel"})) )
print ("precisionByLabel: " + str (evaluator.evaluate(predictions, {evaluator.metricName: "precisionByLabel"})) )
print ("recallByLabel: " + str (evaluator.evaluate(predictions, {evaluator.metricName: "recallByLabel"})) )
print ("fMeasureByLabel: " + str (evaluator.evaluate(predictions, {evaluator.metricName: "fMeasureByLabel"})) )
print ("hammingLoss: " + str (evaluator.evaluate(predictions, {evaluator.metricName: "hammingLoss"})) )

score = 0.5829143672986536

accuracy: 0.5556213017751479
weightedPrecision: 0.7346671946678965
weightedRecall: 0.555621301775148
weightedTruePositiveRate: 0.555621301775148
weightedFalsePositiveRate: 0.05878185485447813
weightedFMeasure: 0.5829143672986536
truePositiveRateByLabel: 0.565625
falsePositiveRateByLabel: 0.03722627737226277
precisionByLabel: 0.7801724137931034
recallByLabel: 0.565625
fMeasureByLabel: 0.6557971014492754
hammingLoss: 0.4443786982248521


## 4. Cross-Validation and Hyperparameter Tuning

In [0]:
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator

trainingData1 = trainingData.drop("news_words","news_tf","news_tfidf","rawPrediction","probability","prediction","filtered","rawFeatures","CrossValidator_2b30ebf36fbb_rand")
testData1 = testData.drop("news_words","news_tf","news_tfidf","rawPrediction","probability","prediction","filtered","rawFeatures","CrossValidator_2b30ebf36fbb_rand")
trainingData1.show(5)

# Grid for DecisionTreeClassifier
grid = (ParamGridBuilder().baseOn([evaluator.metricName, 'precision']) \
        .addGrid(dtc.maxDepth, [2, 5, 10, 20, 30]) \
        .addGrid(dtc.maxBins, [10, 20, 40, 80, 100]) \
        .build()) \


# Instantiation of a CrossValidator
cv = CrossValidator(estimator=dtc, estimatorParamMaps=grid, evaluator=evaluator, numFolds=3)

# Transform the data and train the classifier on the training set
cv_model = cv.fit(trainingData1)

# Transform the data and perform predictions on the test set
df_test_pred1 = cv_model.transform(testData1)

# Evaluate the predictions done on the test set
evaluator.evaluate(df_test_pred1)

+--------------------+------+------------+--------------------+--------------+
|                news|target|Binary Label|            features|target_indexed|
+--------------------+------+------------+--------------------+--------------+
| (Peter van der V...|     0|           0|(10000,[42,66,120...|           5.0|
|( Nikan B Firoozy...|     5|           0|(10000,[55,222,26...|           4.0|
|( Phil Mueller ) ...|     2|           0|(10000,[15,78,207...|           1.0|
|("Imaging Club") ...|     0|           0|(10000,[78,452,48...|           5.0|
|("RWTMS2::MUNIZB"...|     5|           0|(10000,[66,78,86,...|           4.0|
+--------------------+------+------------+--------------------+--------------+
only showing top 5 rows

Exception: You haven't configured the CLI yet! Please configure by entering `/databricks/python_shell/scripts/db_ipykernel_launcher.py configure`
Out[30]: 0.690259126923159

## 5. Best Parameters and Metric Scores

In [0]:
bestModel = cv_model.bestModel

print ('Best Param (maxDepth): ', bestModel._java_obj.getMaxDepth())
print ('Best Param (maxBins): ', bestModel._java_obj.getMaxBins())

print ("accuracy: " + str (evaluator.evaluate(df_test_pred1, {evaluator.metricName: "accuracy"})) )
print ("weightedPrecision: " + str (evaluator.evaluate(df_test_pred1, {evaluator.metricName: "weightedPrecision"})) )
print ("weightedRecall: " + str (evaluator.evaluate(df_test_pred1, {evaluator.metricName: "weightedRecall"}))  )
print ("weightedTruePositiveRate: " + str (evaluator.evaluate(df_test_pred1, {evaluator.metricName: "weightedTruePositiveRate"})) )
print ("weightedFalsePositiveRate: " + str (evaluator.evaluate(df_test_pred1, {evaluator.metricName: "weightedFalsePositiveRate"})) )
print ("weightedFMeasure: " + str (evaluator.evaluate(df_test_pred1, {evaluator.metricName: "weightedFMeasure"})) )
print ("truePositiveRateByLabel: " + str (evaluator.evaluate(df_test_pred1, {evaluator.metricName: "truePositiveRateByLabel"})))
print ("falsePositiveRateByLabel: " + str (evaluator.evaluate(df_test_pred1, {evaluator.metricName: "falsePositiveRateByLabel"})) )
print ("precisionByLabel: " + str (evaluator.evaluate(df_test_pred1, {evaluator.metricName: "precisionByLabel"})) )
print ("recallByLabel: " + str (evaluator.evaluate(df_test_pred1, {evaluator.metricName: "recallByLabel"})) )
print ("fMeasureByLabel: " + str (evaluator.evaluate(df_test_pred1, {evaluator.metricName: "fMeasureByLabel"})) )
print ("hammingLoss: " + str (evaluator.evaluate(df_test_pred1, {evaluator.metricName: "hammingLoss"})) )

Best Param (maxDepth):  30
Best Param (maxBins):  20
accuracy: 0.6715976331360947
weightedPrecision: 0.7508143446549618
weightedRecall: 0.6715976331360947
weightedTruePositiveRate: 0.6715976331360947
weightedFalsePositiveRate: 0.039398178961806814
weightedFMeasure: 0.690259126923159
truePositiveRateByLabel: 0.7375
falsePositiveRateByLabel: 0.041605839416058395
precisionByLabel: 0.8054607508532423
recallByLabel: 0.7375
fMeasureByLabel: 0.7699836867862969
hammingLoss: 0.32840236686390534
