In [None]:
import findspark
findspark.init()

import pyspark # only run after findspark.init()
from pyspark.sql import SparkSession
from pyspark.sql.types import *

spark = SparkSession.builder.getOrCreate()

In [None]:
import random

NUM_SAMPLES = 1000000000

def inside(p):
    x, y = random.random(), random.random()
    return x*x + y*y < 1

sc = spark.sparkContext

count = sc.parallelize(range(0, NUM_SAMPLES)).filter(inside).count()

pi = 4 * count/NUM_SAMPLES

print("Pi is roughly: ", pi)


In [None]:
spark = SparkSession.builder.appName('iris_clf').getOrCreate()

In [None]:
df = spark.read.csv('iris.csv', header=True, inferSchema=True)
df.printSchema()

In [None]:
df.show(3)

In [None]:
schema = StructType([
    StructField('sepal_length', DoubleType()),
    StructField('sepal_width', DoubleType()),
    StructField('petal_length', DoubleType()),
    StructField('petal_width', DoubleType()),
    StructField('type', StringType())
])

In [None]:
df2 = spark.read.csv('iris.csv', header=True, schema=schema)
df2.show(5)

In [None]:
from pyspark.ml.feature import VectorAssembler

In [None]:
input_cols = ['sepal_length','sepal_width','petal_length','petal_width']
output_col = ['type']
vectorizer = VectorAssembler(inputCols=input_cols, outputCol="features") # "features" is the default name and does not need to be added as a param

df = vectorizer.transform(df2)

df.show(5)

In [None]:
from pyspark.ml.feature import StringIndexer

In [None]:
indexer = StringIndexer(inputCol='type', outputCol='indexed_type')

df = indexer.fit(df).transform(df)

df.show(5)

In [None]:
# df_train, df_val, df_test = df.randomSplit([0.7, 0.1, 0.2], seed=0)

In [None]:
df_train, df_test = df.randomSplit([0.7, 0.2], seed=0)

In [None]:
from pyspark.ml.classification import RandomForestClassifier

In [None]:
rf_clf = RandomForestClassifier(featuresCol='features', labelCol='indexed_type')

In [None]:
rf_clf = rf_clf.fit(df_train)

In [None]:
df_test = rf_clf.transform(df_test)
df_test.show(3)

In [None]:
df_test.select('type', 'features', 'indexed_type', 'rawPrediction', 'probability', 'prediction').show()

In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [44]:
criterion = MulticlassClassificationEvaluator(labelCol='indexed_type', predictionCol='prediction')
accuracy = criterion.evaluate(df_test)
accuracy

0.9375