In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import RandomForestClassifier

In [2]:
def indexStringColumns(df, cols):
    """Replace each column named in ``cols`` with its StringIndexer encoding.

    For every column name ``c`` a ``StringIndexer`` is fitted on the current
    frame, the encoded values are written to a temporary ``c + "-num"``
    column, the original column is dropped, and the temporary column is
    renamed back to ``c``.

    Args:
        df: DataFrame holding the columns to encode.
        cols: Iterable of column names to encode.

    Returns:
        A tuple ``(newdf, sm)`` where ``newdf`` is the transformed DataFrame
        and ``sm`` is the model fitted for the LAST column in ``cols``
        (models for earlier columns are not returned). When ``cols`` is
        empty, ``df`` is returned unchanged and ``sm`` is ``None``.
    """
    # newdf is rebound once per processed column.
    newdf = df
    # Stays None when `cols` is empty; without this default the return
    # statement below raised UnboundLocalError for an empty `cols`.
    sm = None

    for c in cols:
        # Fit an indexer for this column on the frame as it currently stands.
        si = StringIndexer(inputCol=c, outputCol=c + "-num")
        sm = si.fit(newdf)
        # Write the encoded values to "<c>-num", drop the original column,
        # then give the encoded column the original name.
        newdf = sm.transform(newdf).drop(c)
        newdf = newdf.withColumnRenamed(c + "-num", c)
    return (newdf, sm)

In [3]:
def renameColumns(df, cols):
    """Swap each column in ``cols`` for its already-encoded ``"-num"`` twin.

    For every name ``c`` the column ``c`` is dropped and the column
    ``c + "-num"`` is renamed to ``c``. No indexer is fitted or applied here;
    the ``"-num"`` columns are expected to exist on ``df`` already.

    Args:
        df: DataFrame that has both ``c`` and ``c + "-num"`` columns.
        cols: Iterable of base column names to process.

    Returns:
        The DataFrame after all drops and renames.
    """
    result = df
    for name in cols:
        encoded_name = name + "-num"
        # Drop the original column first so the rename cannot collide with it.
        result = result.drop(name).withColumnRenamed(encoded_name, name)
    return result

In [4]:
def toFloatSafe(x):
    """Convert ``x`` to ``float``, returning ``None`` if it cannot be converted.

    Args:
        x: Value to convert (typically a string field from a CSV line).

    Returns:
        ``float(x)`` on success, otherwise ``None``.
    """
    try:
        return float(x)
    except (TypeError, ValueError):
        # ValueError: a string that is not a number (e.g. "abc" or "").
        # TypeError: a value float() does not accept at all (e.g. None).
        return None

## Create Data Frames

In [5]:
# Obtain a SparkSession via the builder's getOrCreate().
ss = SparkSession.builder.getOrCreate()
# SparkContext handle, used below to read the CSV files as RDDs.
sc = ss.sparkContext

In [6]:
# Relative paths to the training and test CSV files.
training_data = "../Data/Week4_Discussion_Iris/iris.csv"
test_data = "../Data/Week4_Discussion_Iris/iris_test.csv"

In [7]:
# Read each file as an RDD of lines (requesting 4 partitions) and split every
# line on commas into a list of string fields.
iris_raw = sc.textFile(training_data, minPartitions=4).map(lambda line: line.split(","))
iris_test_raw = sc.textFile(test_data, minPartitions=4).map(lambda line: line.split(","))

# Four float measurement columns followed by the string class label; every
# field is declared non-nullable.
# NOTE(review): toFloatSafe can return None, which a non-nullable field
# presumably rejects -- confirm the input never contains unparsable values.
irisschema = StructType(
    [StructField(name, FloatType(), False)
     for name in ("sepal_length", "sepal_width", "petal_length", "petal_width")]
    + [StructField("class", StringType(), False)]
)

In [8]:
# Build the training DataFrame: the first four fields of each row go through
# toFloatSafe, the fifth (the class label) is kept as the raw string.
dfIris = ss.createDataFrame(
    iris_raw.map(lambda row: tuple(toFloatSafe(v) for v in row[:4]) + (row[4],)),
    irisschema)

In [9]:
# Build the test DataFrame with the same row conversion and schema as the
# training DataFrame.
dfIris_test = ss.createDataFrame(
    iris_test_raw.map(lambda row: tuple(toFloatSafe(v) for v in row[:4]) + (row[4],)),
    irisschema)

## Print the schema

In [10]:
# Show column names, types and nullability of the training DataFrame.
dfIris.printSchema()

root
 |-- sepal_length: float (nullable = false)
 |-- sepal_width: float (nullable = false)
 |-- petal_length: float (nullable = false)
 |-- petal_width: float (nullable = false)
 |-- class: string (nullable = false)



## Create a string indexer and transform data

In [11]:
# Replace the string "class" column of the training frame with its indexed
# values; `sm` is the fitted indexer model, kept for reuse on the test data.
dfIrisnumeric, sm = indexStringColumns(dfIris, ["class"])

## Make sure to apply the same string indexer to the validation set

In [12]:
# Apply the indexer fitted on the training data; its output column was
# configured as "class-num" inside indexStringColumns.
# NOTE(review): this rebinds dfIris_test, so re-running this cell after the
# following rename cell operates on the already-renamed frame.
dfIris_test = sm.transform(dfIris_test)

In [13]:
# Drop the original string "class" column and rename "class-num" to "class".
dfIris_test = renameColumns(dfIris_test, ["class"])

## Create a feature vector

In [14]:
# Assemble every column of dfIris except the last one ("class") into a single
# "features" vector column.
va = VectorAssembler(inputCols=dfIris.columns[:-1], outputCol="features")

# Keep only the feature vector and the indexed class, exposing the latter as
# "label". The training points are cached; the test points are not.
irispoints = (
    va.transform(dfIrisnumeric)
    .select("features", "class")
    .withColumnRenamed("class", "label")
    .cache()
)
irispoints_test = (
    va.transform(dfIris_test)
    .select("features", "class")
    .withColumnRenamed("class", "label")
)

## Create and apply Random Forest

In [15]:
# A very small forest: 2 trees of depth 1 with 2 bins, and a fixed seed.
rf = RandomForestClassifier(
    maxDepth=1,
    numTrees=2,
    maxBins=2,
    seed=7,
)
# Fit on the cached training points.
rfmodel = rf.fit(irispoints)

## Evaluate the model using the validation set

In [16]:
# Score the test points with the fitted model.
rfpredicts = rfmodel.transform(irispoints_test)

# Compute the F1 metric from the "label" and "prediction" columns.
evaluator = MulticlassClassificationEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="f1",
)
f1 = evaluator.evaluate(rfpredicts)
print("F1 = %.4f" % f1)

F1 = 0.3347


In [17]:
# Stop the SparkSession created at the top of the notebook.
ss.stop()