In [109]:
import pyspark
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark.sql.types import FloatType, StructType, StringType, TimestampType, StructField
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.mllib.evaluation import BinaryClassificationMetrics

In [110]:
spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

weather_schema = StructType([
    StructField('Date', TimestampType()),
    StructField('Location', StringType()),
    StructField('MinTemp', FloatType()),
    StructField('MaxTemp', FloatType()),
    StructField('Rainfall', FloatType()),
    StructField('Evaporation', FloatType()),
    StructField('Sunshine', FloatType()),
    StructField('WindGustDir', StringType()),
    StructField('WindGustSpeed', FloatType()),
    StructField('WindDir9am', StringType()),
    StructField('WindDir3pm', StringType()),
    StructField('WindSpeed9am', FloatType()),
    StructField('WindSpeed3pm', FloatType()),
    StructField('Humidity9am', FloatType()),
    StructField('Humidity3pm', FloatType()),
    StructField('Pressure9am', FloatType()),
    StructField('Pressure3pm', FloatType()),
    StructField('Cloud9am', FloatType()),
    StructField('Cloud3pm', FloatType()),
    StructField('Temp9am', FloatType()),
    StructField('Temp3pm', FloatType()),
    StructField('RainToday', StringType()),
    StructField('RainTomorrow', StringType())
])

In [111]:
df = spark.read.options(header = 'True').schema(weather_schema).csv('weatherAUS.csv')

df = df.drop('Date')
df = df.na.drop()

df = df.filter(df.Location != 'NA')
df = df.filter(df.WindGustDir != 'NA')
df = df.filter(df.WindDir9am != 'NA')
df = df.filter(df.WindDir3pm != 'NA')
df = df.filter(df.RainToday != 'NA')
df = df.filter(df.RainTomorrow != 'NA')

In [112]:
categoricalCols = (['Location', 'WindGustDir', 'WindDir9am',
                    'WindDir3pm', 'RainToday', 'RainTomorrow'])

stages = []

for col in categoricalCols:
    stringIndexer = StringIndexer(inputCol = col, outputCol = col + "Index")
    encoder = OneHotEncoder(inputCols = [stringIndexer.getOutputCol()], outputCols = [col + 'classVec'])
    stages += [stringIndexer, encoder]
    
label_stringIdx = StringIndexer(inputCol = 'RainTomorrow', outputCol = 'label')
stages += [label_stringIdx]

numericCols = (['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine', 'WindGustSpeed', 'WindSpeed9am',
               'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
               'Temp3pm'])
assemblerInputs = [c + 'classVec' for c in categoricalCols] + numericCols
assembler = VectorAssembler(inputCols = assemblerInputs, outputCol = 'features')

stages += [assembler]

partialPipeline = Pipeline().setStages(stages)
pipelineModel = partialPipeline.fit(df)
preppedDataDF = pipelineModel.transform(df)

(trainingData, testData) = preppedDataDF.randomSplit([.8, .2], seed = 12345)

In [113]:
dt = DecisionTreeClassifier(labelCol = 'label', featuresCol = 'features')

dtparamGrid = (ParamGridBuilder()
             .addGrid(dt.maxBins, [5, 10, 15])
             .addGrid(dt.maxDepth, [3, 5, 7])
             .addGrid(dt.minInfoGain, [0.0, 0.2, 0.4])
             .addGrid(dt.impurity, ['gini', 'entropy'])
             .build())

In [153]:
dtEvaluator = BinaryClassificationEvaluator()

dtCV = CrossValidator(estimator = dt,
                     estimatorParamMaps = dtparamGrid,
                     evaluator = dtEvaluator,
                     numFolds = 3)

dtCVmodel = dtCV.fit(trainingData)

                                                                                

In [154]:
dtPredictions = dtCVmodel.transform(testData)

print('Accuracy: ', dtEvaluator.evaluate(dtPredictions))
print('AUC: ', BinaryClassificationMetrics(dtPredictions['label', 'prediction'].rdd).areaUnderROC)

#this seems wrong, but I'm not sure why it's doing this

Accuracy:  1.0




AUC:  1.0


In [198]:
best_model = dtCVmodel.bestModel

print('Impurity: ' + best_model.getImpurity())
print('Max Bins: ' + str(best_model.getMaxBins()))
print('Depth: ' + str(best_model.depth))
print('Min Info Gain: ' + str(best_model.getMinInfoGain()))
print('Number of Nodes: ' + str(best_model.numNodes))

Impurity: gini
Max Bins: 5
Depth: 1
Min Info Gain: 0.0
Number of Nodes: 3
