In [2]:
import findspark
findspark.init('/home/ubuntu/spark-2.1.1-bin-hadoop2.7')
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, MultilayerPerceptronClassifier, LogisticRegression
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer, VectorAssembler, Normalizer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
import pandas as pd
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

spark = SparkSession.builder.appName('transforming').getOrCreate()


##### Common to all models' pipelines

# Fixed seed for the train/test split. Without it randomSplit draws a fresh
# random seed on every run, so the accuracies printed by the model cells
# cannot be reproduced or compared with each other across kernel restarts.
SPLIT_SEED = 42

# Read in data (header row present, column types inferred from the values).
df = spark.read.csv("Datasets/forestfires_merged.csv", inferSchema=True, header=True)
# Duplicate the target column under the name 'label'; the cross-validation
# cells build BinaryClassificationEvaluator without a labelCol and therefore
# depend on a column with the library's default label name being present.
df = df.withColumn('label', df.fire)

# Transformer that maps the string-valued 'day' column to a numeric index
# ('dayIndex') so it can be placed in the feature vector.
indexerDay = StringIndexer(inputCol='day', outputCol='dayIndex')

# Transformer that packs the numeric input columns into one 'features' vector.
# The target ('fire' / 'label') is deliberately not part of the inputs.
vector_assembler = VectorAssembler(inputCols = ['X', 'Y', 'FFMC', 'month nominal', 'dayIndex','DMC', 'DC', 'ISI', 'temp', 'RH', 'wind', 'rain'], outputCol = 'features')

# Transformer that rescales each row's feature vector to unit L1 norm (p=1.0).
# Note this is per-row normalization, not per-feature standardization.
normalizer = Normalizer(inputCol="features", outputCol="normFeatures", p=1.0)

# Split the data into training and test sets (roughly 20% held out for
# testing). The seed makes the partition repeatable for the same input file.
(trainingData, testData) = df.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

trainingData.show(10)
# Uncomment to inspect the class balance of the target:
#df.groupby('fire').count().toPandas()


+---+---+---+---+-----+---+----+-----+-----+----+----+---+----+----+----+-------------+-----+
|id1|id2|  X|  Y|month|day|FFMC|  DMC|   DC| ISI|temp| RH|wind|rain|fire|month nominal|label|
+---+---+---+---+-----+---+----+-----+-----+----+----+---+----+----+----+-------------+-----+
|  2|  3|  8|  6|  mar|fri|91.7| 33.3| 77.5| 9.0| 8.3| 42| 4.0| 0.2|   0|            3|    0|
|  3|  4|  8|  6|  mar|sun|89.3| 51.3|102.2| 9.6|11.4| 42| 1.8| 0.0|   0|            3|    0|
|  4|  5|  8|  6|  aug|sun|92.3| 85.3|488.0|14.7|22.2| 29| 5.4| 0.0|   0|            8|    0|
|  5|  6|  8|  6|  aug|mon|92.3| 88.9|495.6| 8.5|24.1| 27| 3.1| 0.0|   0|            8|    0|
|  6|  7|  8|  6|  aug|mon|91.5|145.4|608.2|10.7| 8.0| 42| 2.2| 0.0|   0|            8|    0|
|  8|  9|  7|  5|  sep|sat|92.5| 88.0|698.6| 7.1|22.8| 40| 4.0| 0.0|   0|            9|    0|
| 10| 11|  7|  5|  sep|sat|92.8| 73.2|713.0| 8.4|19.3| 38| 4.0| 0.0|   0|            9|    0|
| 11| 12|  6|  5|  aug|fri|91.6| 70.8|665.3| 0.8|17.0| 72| 6

In [3]:
##### Random forest

# Random forest classifier predicting "fire" from the normalized feature vector.
rf = RandomForestClassifier(featuresCol="normFeatures", labelCol="fire")

# Shared preprocessing stages followed by the classifier.
pipeline = Pipeline(stages=[
    indexerDay,
    vector_assembler,
    normalizer,
    rf,
])

# Fit the whole pipeline on the training split; the preprocessing stages are
# run as part of this call.
model = pipeline.fit(trainingData)

# Apply the fitted pipeline to the held-out split and peek at the full output.
predictions = model.transform(testData)
predictions.show(5)

# Narrow view: predicted class, true class and the assembled features.
predictions.select("prediction", "fire", "features").show(5)

# Accuracy of "prediction" against "fire" on the held-out split.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="fire")
accuracy = evaluator.evaluate(predictions)
print("Accuracy = %g" % accuracy)


+---+---+---+---+-----+---+----+-----+-----+---+----+---+----+----+----+-------------+-----+--------+--------------------+--------------------+--------------------+--------------------+----------+
|id1|id2|  X|  Y|month|day|FFMC|  DMC|   DC|ISI|temp| RH|wind|rain|fire|month nominal|label|dayIndex|            features|        normFeatures|       rawPrediction|         probability|prediction|
+---+---+---+---+-----+---+----+-----+-----+---+----+---+----+----+----+-------------+-----+--------+--------------------+--------------------+--------------------+--------------------+----------+
|  0|  0|  7|  5|  mar|fri|86.2| 26.2| 94.3|5.1| 8.2| 51| 6.7| 0.0|   0|            3|    0|     1.0|[7.0,5.0,86.2,3.0...|[0.02383384405856...|[12.9829852208010...|[0.64914926104005...|       0.0|
|  1|  2|  7|  4|  oct|sat|90.6| 43.7|686.9|6.7|14.6| 33| 1.3| 0.0|   0|           10|    0|     2.0|[7.0,4.0,90.6,10....|[0.00777950655701...|[9.32861058405763...|[0.46643052920288...|       1.0|
|  7|  8|  8|  

In [5]:
##### Gradient Boosted Tree

# Gradient-boosted tree classifier predicting "fire" from the normalized
# feature vector.
gbt = GBTClassifier(featuresCol="normFeatures", labelCol="fire")

# Same preprocessing stages as the random forest pipeline, ending in the GBT.
pipeline2 = Pipeline(stages=[
    indexerDay,
    vector_assembler,
    normalizer,
    gbt,
])

# Fit the whole pipeline on the training split; the preprocessing stages are
# run as part of this call.
model2 = pipeline2.fit(trainingData)

# Apply the fitted pipeline to the held-out split.
predictions2 = model2.transform(testData)

# Narrow view: predicted class, true class and the assembled features.
predictions2.select("prediction", "fire", "features").show(5)

# Accuracy of "prediction" against "fire" on the held-out split.
evaluator2 = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="fire")
accuracy2 = evaluator2.evaluate(predictions2)
print("Accuracy = %g" % accuracy2)

+----------+----+--------------------+
|prediction|fire|            features|
+----------+----+--------------------+
|       0.0|   0|[7.0,5.0,86.2,3.0...|
|       1.0|   0|[7.0,4.0,90.6,10....|
|       1.0|   0|[8.0,6.0,91.0,9.0...|
|       0.0|   0|[7.0,5.0,92.5,9.0...|
|       0.0|   0|[7.0,4.0,90.9,9.0...|
+----------+----+--------------------+
only showing top 5 rows

Accuracy = 0.843882


In [16]:
##### Cross validation for Random Forest

# Parameter grid over the random forest stage. Each list holds a single value,
# so the grid contains exactly one parameter combination.
paramGrid = (
    ParamGridBuilder()
    .addGrid(rf.numTrees, [30])
    .addGrid(rf.maxDepth, [30])
    .addGrid(rf.minInstancesPerNode, [5])
    .build()
)

# 5-fold cross-validation of the random forest pipeline. The evaluator is
# built with no arguments, so it uses its default column names and metric.
crossval = CrossValidator(
    estimator=pipeline,
    estimatorParamMaps=paramGrid,
    evaluator=BinaryClassificationEvaluator(),
    numFolds=5)

# Cross-validate on the training split, then score the held-out split.
cvModel = crossval.fit(trainingData)
cvPrediction = cvModel.transform(testData)

# Accuracy of "prediction" against "fire" on the held-out split.
cvEvaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="fire")
cvAccuracy = cvEvaluator.evaluate(cvPrediction)
print("Accuracy = %g" % cvAccuracy)

Accuracy = 0.82243


In [26]:
##### Cross Validation for Gradient Boosted Tree

# Parameter grid over the GBT stage. Each list holds a single value, so the
# grid contains exactly one parameter combination.
paramGrid2 = (
    ParamGridBuilder()
    .addGrid(gbt.maxIter, [20])
    .addGrid(gbt.maxDepth, [10])
    .addGrid(gbt.minInstancesPerNode, [5])
    .build()
)

# 5-fold cross-validation of the GBT pipeline.
# NOTE(review): the evaluator reads the hard 'prediction' column as its raw
# score instead of a rawPrediction column - presumably because the GBT model
# in this Spark version does not emit one; confirm before relying on the
# cross-validation metric.
crossval2 = CrossValidator(
    estimator=pipeline2,
    estimatorParamMaps=paramGrid2,
    evaluator=BinaryClassificationEvaluator(rawPredictionCol='prediction'),
    numFolds=5)

# Cross-validate on the training split, then score the held-out split.
cvModel2 = crossval2.fit(trainingData)
cvPrediction2 = cvModel2.transform(testData)

# Accuracy of "prediction" against "fire" on the held-out split.
cvEvaluator2 = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="fire")
cvAccuracy2 = cvEvaluator2.evaluate(cvPrediction2)
print("Accuracy = %g" % cvAccuracy2)

Accuracy = 0.88785


In [31]:
# Collect the cross-validated GBT predictions to the driver as a pandas
# DataFrame and write them out as CSV.
cvPrediction2.toPandas().to_csv("Datasets/gbt_results.csv")

In [None]:
##### 10-fold Cross Validation for Gradient Boosted Tree

# Same single-combination grid as the 5-fold GBT run. This cell rebinds
# paramGrid2, crossval2, cvModel2, cvPrediction2, cvEvaluator2 and cvAccuracy2.
paramGrid2 = (
    ParamGridBuilder()
    .addGrid(gbt.maxIter, [20])
    .addGrid(gbt.maxDepth, [10])
    .addGrid(gbt.minInstancesPerNode, [5])
    .build()
)

# 10-fold cross-validation of the GBT pipeline.
# NOTE(review): the evaluator reads the hard 'prediction' column as its raw
# score instead of a rawPrediction column - presumably because the GBT model
# in this Spark version does not emit one; confirm before relying on the
# cross-validation metric.
crossval2 = CrossValidator(
    estimator=pipeline2,
    estimatorParamMaps=paramGrid2,
    evaluator=BinaryClassificationEvaluator(rawPredictionCol='prediction'),
    numFolds=10)

# Cross-validate on the training split, then score the held-out split.
cvModel2 = crossval2.fit(trainingData)
cvPrediction2 = cvModel2.transform(testData)

# Accuracy of "prediction" against "fire" on the held-out split.
cvEvaluator2 = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="fire")
cvAccuracy2 = cvEvaluator2.evaluate(cvPrediction2)
print("Accuracy = %g" % cvAccuracy2)