####Install

In [0]:
%sh
pip install nltk
pip install stop-words
pip install pyspellchecker

You should consider upgrading via the '/databricks/python3/bin/python -m pip install --upgrade pip' command.
You should consider upgrading via the '/databricks/python3/bin/python -m pip install --upgrade pip' command.
You should consider upgrading via the '/databricks/python3/bin/python -m pip install --upgrade pip' command.


####TBD

1. Tokenization into words
2. Stop words removal
3. Noise reduction (e.g., removal of punctuation)
4. Stemmin

#### 1. Load Data

In [0]:
import pandas as pd
import numpy as np
# File location and type
file_location = "/FileStore/tables/additional.csv"
file_type = "csv"

# CSV options
infer_schema = "false"
first_row_is_header = "false"
delimiter = ","

df = spark.read.format(file_type).option("inferSchema", infer_schema).option("header", "true").option("sep", delimiter).load(file_location)


pandasDF_news = df.select('news').toPandas()
pandasDF_target = df.select('target').toPandas()

In [0]:
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
import numpy as np
import re
from pyspark.sql import SQLContext

categories = ['rec.autos', 'rec.sport.baseball', 'comp.graphics', 'comp.sys.mac.hardware', 
              'sci.space', 'sci.crypt', 'talk.politics.guns', 'talk.religion.misc']
newsgroup = fetch_20newsgroups(subset='train',categories= categories , shuffle=True, random_state=42)

df_news = pd.DataFrame(data=newsgroup.data, columns=['news']) 

df_news = df_news.append(pandasDF_news, ignore_index=True)

df_news = df_news.replace(re.compile(r"From: \S*@\S*\s?"),"")
df_news = df_news.replace(re.compile('\s+')," ")
df_news = df_news.replace(re.compile("\'"),"")

#df_news = df_news.dropna()

df_target = pd.DataFrame(data=newsgroup.target, columns=['target'])

df_target = df_target.append(pandasDF_target, ignore_index=True)
#df_target = df_target.dropna()

df_target['target']=df_target.target.astype('int64')

df_binary_labels = pd.DataFrame(np.where (df_target < 10, 0, 1), columns=['Binary Label'])

sqlContext = SQLContext(sc)
df_newsgroup = sqlContext.createDataFrame(pd.concat([df_news, df_target, df_binary_labels], axis=1))

#### 2. Pipeline

In [0]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover
from pyspark.ml.feature import Tokenizer, HashingTF, IDF, StringIndexer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import LinearSVC
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline


regexTokenizer = RegexTokenizer(inputCol="news", outputCol="news_words", pattern="\\W")
add_stopwords = ["http","https","amp","rt","t","c","the","subject","re",'.',',','', 'i i','?','\'\'',"''",'y','*','out','==','df','e.g.','\'m','\[',"'m",':', ')', '(','n\'t', '\'','``','``','\'s', 'https://','-'] 
stopwordsRemover = StopWordsRemover(inputCol="news_words", outputCol="filtered").setStopWords(add_stopwords)
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=10000)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
string_indexer = StringIndexer(inputCol = "target", outputCol = "target_indexed")

pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, hashingTF, idf, string_indexer])
pipelineFit = pipeline.fit(df_newsgroup)

dataset = pipelineFit.transform(df_newsgroup)
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed = 100)
#lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
rf = RandomForestClassifier(featuresCol=idf.getOutputCol(), labelCol=string_indexer.getOutputCol(), maxDepth=10)
rf_mod = rf.fit(trainingData)
#lrModel = lr.fit(trainingData)
#predictions = lrModel.transform(testData)
predictions = rf_mod.transform(testData)

# evaluator = MulticlassClassificationEvaluator(labelCol="target_indexed", predictionCol="prediction")
# evaluator.evaluate(predictions)

#### 3. Evaluate ML Model

In [0]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="target_indexed", predictionCol="prediction")
evaluator.evaluate(predictions)

accuracy = evaluator.evaluate(predictions)
print("Accuracy = %s" % (accuracy))
print("Test Error = %s" % (1.0 - accuracy))

Accuracy = 0.7032946941465295
Test Error = 0.29670530585347055


In [0]:
print ("Accuracy: " + str (evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})) )
print ("weightedPrecision: " + str (evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})) )
print ("weightedRecall: " + str (evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"}))  )
print ("weightedTruePositiveRate: " + str (evaluator.evaluate(predictions, {evaluator.metricName: "weightedTruePositiveRate"})) )
print ("weightedFalsePositiveRate: " + str (evaluator.evaluate(predictions, {evaluator.metricName: "weightedFalsePositiveRate"})) )
print ("weightedFMeasure: " + str (evaluator.evaluate(predictions, {evaluator.metricName: "weightedFMeasure"})) )
print ("truePositiveRateByLabel: " + str (evaluator.evaluate(predictions, {evaluator.metricName: "truePositiveRateByLabel"})))
print ("falsePositiveRateByLabel: " + str (evaluator.evaluate(predictions, {evaluator.metricName: "falsePositiveRateByLabel"})) )
print ("precisionByLabel: " + str (evaluator.evaluate(predictions, {evaluator.metricName: "precisionByLabel"})) )
print ("recallByLabel: " + str (evaluator.evaluate(predictions, {evaluator.metricName: "recallByLabel"})) )
print ("fMeasureByLabel: " + str (evaluator.evaluate(predictions, {evaluator.metricName: "fMeasureByLabel"})) )
print ("hammingLoss: " + str (evaluator.evaluate(predictions, {evaluator.metricName: "hammingLoss"})) )

Accuracy: 0.714792899408284
weightedPrecision: 0.7482196872419422
weightedRecall: 0.714792899408284
weightedTruePositiveRate: 0.714792899408284
weightedFalsePositiveRate: 0.04777471565434986
weightedFMeasure: 0.7032946941465295
truePositiveRateByLabel: 0.909375
falsePositiveRateByLabel: 0.1437956204379562
precisionByLabel: 0.5963114754098361
recallByLabel: 0.909375


#### 4. Parameter tuning

In [0]:
from pyspark.ml.tuning import ParamGridBuilder
from pyspark.ml.tuning import CrossValidator

trainingData1 = trainingData.drop("news_words","news_tf","news_tfidf","rawPrediction","probability","prediction","filtered","rawFeatures","CrossValidator_2b30ebf36fbb_rand")
testData1 = testData.drop("news_words","news_tf","news_tfidf","rawPrediction","probability","prediction","filtered","rawFeatures","CrossValidator_2b30ebf36fbb_rand")


trainingData1.show(5)

#grid for randomforest
grid = (ParamGridBuilder().baseOn([evaluator.metricName, 'precision']).addGrid(rf.maxDepth, [10, 20]).build())


# Instanciation of a CrossValidator
cv = CrossValidator(estimator=rf, estimatorParamMaps=grid, evaluator=evaluator, numFolds=3)

# Transform the data and train the classifier on the training set
cv_model = cv.fit(trainingData1)

# Transform the data and perform predictions on the test set
df_test_pred1 = cv_model.transform(testData1)

# Evaluate the predictions done on the test set
evaluator.evaluate(df_test_pred1)

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-454663714469487>[0m in [0;36m<module>[0;34m[0m
[1;32m      2[0m [0;32mfrom[0m [0mpyspark[0m[0;34m.[0m[0mml[0m[0;34m.[0m[0mtuning[0m [0;32mimport[0m [0mCrossValidator[0m[0;34m[0m[0;34m[0m[0m
[1;32m      3[0m [0;34m[0m[0m
[0;32m----> 4[0;31m [0mtrainingData1[0m [0;34m=[0m [0mtrainingData[0m[0;34m.[0m[0mdrop[0m[0;34m([0m[0;34m"news_words"[0m[0;34m,[0m[0;34m"news_tf"[0m[0;34m,[0m[0;34m"news_tfidf"[0m[0;34m,[0m[0;34m"rawPrediction"[0m[0;34m,[0m[0;34m"probability"[0m[0;34m,[0m[0;34m"prediction"[0m[0;34m,[0m[0;34m"filtered"[0m[0;34m,[0m[0;34m"rawFeatures"[0m[0;34m,[0m[0;34m"CrossValidator_2b30ebf36fbb_rand"[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m      5[0m [0mtestData1[0m [0;34m=[0m [0mtestData[0m[0;34m.[0

In [0]:
print ("Accuracy: " + str (evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})) )
print ("weightedPrecision: " + str (evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})) )
print ("weightedRecall: " + str (evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"}))  )
print ("weightedTruePositiveRate: " + str (evaluator.evaluate(predictions, {evaluator.metricName: "weightedTruePositiveRate"})) )
print ("weightedFalsePositiveRate: " + str (evaluator.evaluate(predictions, {evaluator.metricName: "weightedFalsePositiveRate"})) )
print ("weightedFMeasure: " + str (evaluator.evaluate(predictions, {evaluator.metricName: "weightedFMeasure"})) )
print ("truePositiveRateByLabel: " + str (evaluator.evaluate(predictions, {evaluator.metricName: "truePositiveRateByLabel"})))
print ("falsePositiveRateByLabel: " + str (evaluator.evaluate(predictions, {evaluator.metricName: "falsePositiveRateByLabel"})) )
print ("precisionByLabel: " + str (evaluator.evaluate(predictions, {evaluator.metricName: "precisionByLabel"})) )
print ("recallByLabel: " + str (evaluator.evaluate(predictions, {evaluator.metricName: "recallByLabel"})) )
print ("fMeasureByLabel: " + str (evaluator.evaluate(predictions, {evaluator.metricName: "fMeasureByLabel"})) )
print ("hammingLoss: " + str (evaluator.evaluate(predictions, {evaluator.metricName: "hammingLoss"})) )