<a href="https://colab.research.google.com/github/dalgual/aidatasci/blob/main/waze5ClassPipelineRF_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Author: Jongwook Woo, Dalya Manatova
Created at 04/07/2023


# Waze Traffic Data Classification with Random Forest __PySpark__ with GPU
# Multiclassification

Waze traffic density is used here as an example of binary/multi-class classification. The notebook imports the xgboost classifier, but the model it actually trains is a Spark ML Random Forest classifier. This notebook will show you how to load the data, train the model, and use the model to predict how dense the traffic at a location is.

## Load libraries
First, load some common libraries that will be used by both the GPU version and the CPU version of xgboost.

In [None]:
%pyspark

from pyspark.sql.types import *
from pyspark.sql.functions import *

from ml.dmlc.xgboost4j.scala.spark import XGBoostClassificationModel, XGBoostClassifier
#from ml.dmlc.xgboost4j.scala.spark.rapids import GpuDataReader
from ml.dmlc.xgboost4j.scala.spark import XGBoostRegressionModel, XGBoostRegressor

from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier
from pyspark.ml.feature import StringIndexer, VectorIndexer, MinMaxScaler, SQLTransformer, Normalizer
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit, CrossValidator

from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics # # performance metrics
from pyspark.mllib.evaluation import MultilabelMetrics

from pyspark.storagelevel import StorageLevel

import os


In [None]:
%pyspark

from pyspark.context import SparkContext
from pyspark.sql.session import SparkSession

from time import time


## Build the schema and parameters
The Waze data has 18 columns: 17 features and 1 label. "level" is the label column. The schema will be used to load the data later.

The next block also defines some key parameters used in the training process.

In [None]:
%pyspark

# Run-mode switches consulted by later cells.
IS_CPU = True       # alternative value: False
IS_SAMPLE = False   # True selects the "sample_mid_data" input directory
IS_HALF = False     # True selects "mid_half_data" and down-samples the rows
IS_SHOW = True      # True prints schemas and preview rows

# Tuner choice: True -> TrainValidationSplit (TSV), False -> CrossValidator (CV)
IS_TSV = True

### Schema (column names, kept as a bare no-op expression for reference)
"location_x", "location_y", "sin_weekday", "cos_weekday", "sin_month", "cos_month", "sin_day", "cos_day", "sin_hour", "cos_hour", "sin_min", "cos_min", "sin_sec", "cos_sec", "is_rush", "is_weekend", "is_holiday", "level", "trueLabel"



In [None]:
%pyspark

# Name under which the held-out (test) DataFrame exposes the target column.
labelColName = "trueLabel"

# Explicit read schema: fourteen double-typed columns followed by four
# integer-typed columns.  The last field, "level", is the target.
schema = StructType(
    [
        StructField(name, DoubleType())
        for name in (
            "location_x", "location_y",
            "sin_weekday", "cos_weekday",
            "sin_month", "cos_month",
            "sin_day", "cos_day",
            "sin_hour", "cos_hour",
            "sin_min", "cos_min",
            "sin_sec", "cos_sec",
        )
    ]
    + [
        StructField(name, IntegerType())
        for name in ("is_rush", "is_weekend", "is_holiday", "level")
    ]
)

# Every schema column except the target.
featureNames = [field.name for field in schema if field.name != "level"]


In [None]:
%pyspark

# Set the Spark log verbosity for this session.
# Accepted levels: ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE, WARN
LOG_LEVEL = "OFF"  # alternative used while debugging: "INFO"
spark.sparkContext.setLogLevel(LOG_LEVEL)

In [None]:
%pyspark

# CSV reader options.
# first_row_is_header is passed to the active reader; infer_schema and
# delimiter are only referenced by the disabled reader kept as a string
# in the data-loading cell.
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","


In [None]:
%pyspark

# Base path of the data set; the sub-directory depends on the run flags.
file_location = "/user/hadoop/waze/"
# Every variant is read as CSV (a parquet option for the half set is disabled).
file_type = "csv"

# NOTE(review): `dir` shadows the Python builtin, but other cells read this
# name, so it is kept unchanged here.
if IS_SAMPLE == True:
    dir = "sample_mid_data"
elif IS_HALF == True:
    dir = "mid_half_data"
else:
    dir = "mid_data"



In [None]:
%pyspark

# Disabled alternative (schema inference), kept as an inert string literal.
'''dataTrans = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location + dir)
'''
# Active reader: apply the explicit schema instead of inferring column types.
dataTrans = (
    spark.read
    .schema(schema)
    .option('header', first_row_is_header)
    .csv(file_location + dir)
)

In [None]:
%pyspark

# Register the loaded DataFrame as a temporary view under this name,
# replacing any existing view with the same name.
temp_table_name = "jampredictclean_1m_100mb_csv"
dataTrans.createOrReplaceTempView(temp_table_name)

In [None]:
%pyspark

# Mark the DataFrame for persistence at the MEMORY_AND_DISK storage level,
# then print its schema.
dataTrans.persist(StorageLevel.MEMORY_AND_DISK )
dataTrans.printSchema()

In [None]:
%pyspark

# When the "half" data set is selected, keep only a 0.1 fraction of its rows.
if IS_HALF == True:
    dataTrans = dataTrans.sample(fraction=0.1)

In [None]:
%pyspark

# Random split of the loaded rows with weights 0.75 / 0.25.
splits = dataTrans.randomSplit([0.75, 0.25])

# The training side exposes the target as "label"; the held-out side exposes
# it under labelColName so it is kept apart from the model's label column.
train, test = (
    splits[0].withColumnRenamed("level", "label"),
    splits[1].withColumnRenamed("level", labelColName),
)

print("Training Rows:", train.count(), " Testing Rows:", test.count())

# Optional preview of the first training rows.
if IS_SHOW == True:
    train.show(5)


In [None]:
%pyspark

# Aliases used by the remaining cells; evaluation and transformation both
# run on the held-out split.
trainSet, evalSet, transSet = train, test, test

In [None]:
%pyspark

# Optional debug output: schema of the training DataFrame.
if IS_SHOW == True:
    trainSet.printSchema()

In [None]:
%pyspark

# Optional debug output: schema of the DataFrame that will be scored.
if IS_SHOW == True:
    transSet.printSchema()

In [None]:
%pyspark

# Feature columns = every schema field except the target.  The schema names
# the target "level" (only the train/test DataFrames rename it), so filtering
# on "label" would leave the target column in the feature list.
featureNames = [field.name for field in schema if field.name != "level"]


In [None]:
%pyspark

# Optional debug output: the list of feature column names.
if IS_SHOW == True:
    print("schema", featureNames)

In [None]:
%pyspark

# Columns with a finite set of numeric values: assemble them into one vector,
# then let VectorIndexer index that vector.
catVect = VectorAssembler(
    inputCols=[
        "sin_weekday", "cos_weekday",
        "sin_month", "cos_month",
        "sin_day", "cos_day",
        "sin_hour", "cos_hour",
        "sin_min", "cos_min",
        "sin_sec", "cos_sec",
        "is_rush", "is_weekend", "is_holiday",
    ],
    outputCol="catFeatures",
)  # .setHandleInvalid("skip") is available but not enabled
catIdx = VectorIndexer(
    inputCol=catVect.getOutputCol(),
    outputCol="idxCatFeatures",
)


In [None]:
%pyspark

# Location coordinates are unbounded numbers, so they are handled as numeric
# features: assemble them, then rescale the vector with MinMaxScaler.
numVect = VectorAssembler(
    inputCols=['location_x', 'location_y'],
    outputCol="numFeatures",
)
minMax = MinMaxScaler(
    inputCol=numVect.getOutputCol(),
    outputCol="normFeatures",
)

# Merge the indexed finite-valued vector and the scaled numeric vector into
# the single "features" column.
featVect = VectorAssembler(
    inputCols=["idxCatFeatures", "normFeatures"],
    outputCol="features",
)



In [None]:
%pyspark

# The classifier is trained on the "label" column of the training split
# (not on "trueLabel", which only exists on the held-out split).
rf = RandomForestClassifier(featuresCol="features", labelCol="label")

# Feature preparation stages followed by the classifier.
pipeline = Pipeline(
    stages=[
        catVect,
        catIdx,
        numVect,
        minMax,
        featVect,
        rf,
    ]
)

In [None]:
%pyspark

# Hyper-parameter grid: 2 x 2 x 2 combinations.
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [10, 20])
    .addGrid(rf.minInfoGain, [0.0, 0.01])
    .addGrid(rf.maxDepth, [5, 10])
    .build()
)

# Fold count, used only by the CrossValidator branch.
# TODO: K = 2, you may test it with 5, 10
K = 2

# Pick the tuner according to IS_TSV; both use a default-configured
# MulticlassClassificationEvaluator.
if IS_TSV == True:
    cv = TrainValidationSplit(
        estimator=pipeline,
        estimatorParamMaps=paramGrid,
        evaluator=MulticlassClassificationEvaluator(),
        trainRatio=0.8,
    )
else:
    cv = CrossValidator(
        estimator=pipeline,
        estimatorParamMaps=paramGrid,
        evaluator=MulticlassClassificationEvaluator(),
        numFolds=K,
    )

# Fit the tuner on the training split.
model = cv.fit(trainSet)

In [None]:
%pyspark

# Disabled alternative kept as an inert string literal: an untuned pipeline
# fitted directly.  It refers to `assembler`, which is not defined in the
# visible cells -- NOTE(review): confirm before re-enabling.
'''
rf2 = RandomForestClassifier(labelCol="level", featuresCol="features")
pipeline = Pipeline(stages=[assembler, rf2])
model = pipeline.fit(trainSet)
'''

In [None]:
%pyspark

# Disabled timing helpers, kept as inert string literals.  The first wraps an
# action and prints how many seconds it took; the second times cv.fit inline.
#from time import time
'''def with_benchmark(phrase, action):
    start = time()
    result = action()
    end = time()
    print('{} takes {} seconds'.format(phrase, round(end - start, 2)))
    return result

model = with_benchmark('Training', lambda: cv.fit(trainSet))'''

'''start = time()
model = cv.fit(trainSet)
end = time()
print('{} takes {} seconds'.format('Training', round(end - start, 2)))'''

In [None]:
%pyspark

def transform():
    """Score ``transSet`` with ``model`` and return the cached result.

    A no-op is run over every partition before returning, so the
    transformation has been executed by the time the caller gets the
    DataFrame.
    """
    scored = model.transform(transSet)
    scored = scored.cache()
    scored.foreachPartition(lambda _partition: None)
    return scored

# Score the held-out rows with the fitted model (the benchmarked variant,
# with_benchmark('Transformation', transform), is disabled).
result = model.transform(transSet)
predicted = result

# Preview the true label next to the model outputs.
preview = (labelColName, 'rawPrediction', 'probability', 'prediction')
predicted.select(*preview).show(5)
del preview


'''prediction = model.transform(test)
#predicted = prediction.select("features", "prediction", "trueLabel")
predicted = prediction.select("normFeatures", "prediction", "trueLabel")'''

# Show the scored rows with all columns.
predicted.show()



In [None]:
%pyspark

# Disabled benchmarked variant, kept as an inert string literal.
'''accuracy = with_benchmark(
    'Evaluation',
    lambda: MulticlassClassificationEvaluator().setLabelCol(labelColName).evaluate(predicted))
'''
# MulticlassClassificationEvaluator defaults to metricName="f1", so the
# metric must be requested explicitly for the printed value to be accuracy.
accuracy = MulticlassClassificationEvaluator(
    labelCol=labelColName,
    metricName="accuracy",
).evaluate(predicted)

print('Accuracy is ' + str(accuracy))



In [None]:
%pyspark
# The evaluator's default metric is "f1"; request "accuracy" explicitly so
# the value matches the message printed below.
evaluator = MulticlassClassificationEvaluator(
    labelCol=labelColName,
    predictionCol="prediction",
    metricName="accuracy",
)
print("Test-set Accuracy is : ", evaluator.evaluate(predicted))

In [None]:
%pyspark

from pyspark.sql.types import FloatType

# Build the two-column (prediction, label) input for MulticlassMetrics.
# Author's note: the label must be cast to float and the rows ordered by
# prediction, otherwise this does not work.
preds_and_labels = (
    predicted
    .select('prediction', labelColName)
    .withColumn(labelColName, col(labelColName).cast(FloatType()))
    .orderBy('prediction')
)

# Each Row becomes a plain tuple before being handed to MulticlassMetrics.
metrics = MulticlassMetrics(preds_and_labels.rdd.map(tuple))

print(metrics.confusionMatrix().toArray())

In [None]:
%pyspark

# Distinct label values found in the training split.
labels = train.rdd.map(lambda row: row.label).distinct().collect()

# Report precision and recall for each class, in ascending label order.
for label in sorted(labels):
    print("Class %s precision = %s" % (label, metrics.precision(label)))
    print("Class %s recall = %s" % (label, metrics.recall(label)))
    # F1 per class is available through metrics.fMeasure(label, beta=1.0)

In [None]:
%pyspark

# Overall accuracy plus precision for the hard-coded classes 1..5.
accuracy = metrics.accuracy
precision1, precision2, precision3, precision4, precision5 = (
    metrics.precision(cls) for cls in (1, 2, 3, 4, 5)
)

print("accuracy = ", accuracy)
print(
    " precision1 = ", precision1,
    " precision2 = ", precision2,
    " precision3 = ", precision3,
    " precision4 = ", precision4,
    " precision5 = ", precision5,
)

In [None]:
%pyspark

# Recall for the hard-coded classes 1..5.
recall1, recall2, recall3, recall4, recall5 = (
    metrics.recall(cls) for cls in (1, 2, 3, 4, 5)
)

print(
    " recall1 = ", recall1,
    " recall2 = ", recall2,
    " recall3 = ", recall3,
    " recall4 = ", recall4,
    " recall5 = ", recall5,
)


In [None]:
%pyspark

# Weighted summary statistics reported by MulticlassMetrics.
print("Weighted recall = %s" % (metrics.weightedRecall,))
print("Weighted precision = %s" % (metrics.weightedPrecision,))
print("Weighted F(1) Score = %s" % (metrics.weightedFMeasure(),))
print("Weighted F(0.5) Score = %s" % (metrics.weightedFMeasure(beta=0.5),))
print("Weighted false positive rate = %s" % (metrics.weightedFalsePositiveRate,))


In [None]:
%pyspark
import seaborn as sns
import matplotlib.pyplot as plt

# Axis tick labels for the heat map (hard-coded classes 1..5).
labels = [1, 2, 3, 4, 5]

# Draw the confusion matrix as an annotated 7x7-inch heat map without a
# colour bar.
_ = plt.figure(figsize=(7, 7))
sns.heatmap(
    metrics.confusionMatrix().toArray(),
    annot=True,
    fmt='0',
    cmap='viridis',
    cbar=False,
    xticklabels=labels,
    yticklabels=labels,
)


In [None]:
%pyspark


# Disabled display calls kept as an inert string literal.  They refer to
# `z`, `results`, `pipelineModel` and `predDF`, none of which are defined in
# the visible cells -- NOTE(review): confirm before re-enabling.
'''z.show(results.select("trueLabel", labelColName,"rawPrediction","probability","prediction").limit(10))
display(pipelineModel.stages[-1], predDF.drop("prediction", "rawPrediction", "probability"), "ROC")
'''

__References__
1. Multiclass Classification, https://spark.apache.org/docs/3.3.0/mllib-evaluation-metrics.html#multiclass-classification
1. https://github.com/NVIDIA/spark-rapids-examples/blob/main/examples/XGBoost-Examples/mortgage/notebooks/scala/mortgage_gpu_crossvalidation.ipynb
1. MultiClass Classification using PySPark, https://www.kaggle.com/code/ashokkumarpalivela/multiclass-classification-using-pyspark
1. XGBoost Distributed Training and Parallel Predictions with Apache Spark, https://medium.com/cloudzone/xgboost-distributed-training-and-predicting-with-apache-spark-1127cdfb31ae
1. Use XGBoost on Databricks, https://docs.databricks.com/machine-learning/train-model/xgboost.html