In [1]:
from pyspark.sql import SparkSession

# Create (or reuse) the notebook-wide SparkSession: "local" master,
# fixed application name, and 1gb of executor memory.
spark = (
    SparkSession.builder
    .master("local")
    .appName("First demo SparkML")
    .config("spark.executor.memory", "1gb")
    .getOrCreate()
)

# Handle to the SparkContext that backs the session.
sc = spark.sparkContext

In [2]:
# Load the Iris CSV. Without `inferSchema` every column (including the four
# measurements) is read as a string, and later cells would only work through
# implicit string-to-float coercion; inferring the schema yields proper
# numeric columns while leaving the `species` column as a string.
df = spark.read.csv(
    '/Users/Storage/Vicohub/Data/Iris.csv',
    header=True,
    inferSchema=True,
)
df.show(5)

+------------+-----------+------------+-----------+-----------+
|sepal_length|sepal_width|petal_length|petal_width|    species|
+------------+-----------+------------+-----------+-----------+
|         5.1|        3.5|         1.4|        0.2|Iris-setosa|
|         4.9|        3.0|         1.4|        0.2|Iris-setosa|
|         4.7|        3.2|         1.3|        0.2|Iris-setosa|
|         4.6|        3.1|         1.5|        0.2|Iris-setosa|
|         5.0|        3.6|         1.4|        0.2|Iris-setosa|
+------------+-----------+------------+-----------+-----------+
only showing top 5 rows



In [3]:
from pyspark.ml.linalg import DenseVector
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import RandomForestClassifier

In [6]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [4]:
def _to_features_and_label(row):
    """Return (DenseVector of the row's first four fields, its fifth field)."""
    return DenseVector(row[:4]), row[4]


# Pack each row of `df` into a (feature vector, label) pair ...
data = df.rdd.map(_to_features_and_label)

# ... and expose the pairs as a new two-column DataFrame: "X" holds the
# feature vector, "Y" the label. (`df` itself is left untouched.)
df1 = spark.createDataFrame(data, schema=["X", "Y"])
df1.show(5)

+-----------------+-----------+
|                X|          Y|
+-----------------+-----------+
|[5.1,3.5,1.4,0.2]|Iris-setosa|
|[4.9,3.0,1.4,0.2]|Iris-setosa|
|[4.7,3.2,1.3,0.2]|Iris-setosa|
|[4.6,3.1,1.5,0.2]|Iris-setosa|
|[5.0,3.6,1.4,0.2]|Iris-setosa|
+-----------------+-----------+
only showing top 5 rows



In [5]:
# Encode the label column "Y" as a numeric index column "Yn":
# fit the indexer on `df1`, then append the encoded column to it.
indexer = StringIndexer(inputCol="Y", outputCol="Yn")
indexer_model = indexer.fit(df1)
indexed = indexer_model.transform(df1)
indexed.show(60)

+-----------------+---------------+---+
|                X|              Y| Yn|
+-----------------+---------------+---+
|[5.1,3.5,1.4,0.2]|    Iris-setosa|0.0|
|[4.9,3.0,1.4,0.2]|    Iris-setosa|0.0|
|[4.7,3.2,1.3,0.2]|    Iris-setosa|0.0|
|[4.6,3.1,1.5,0.2]|    Iris-setosa|0.0|
|[5.0,3.6,1.4,0.2]|    Iris-setosa|0.0|
|[5.4,3.9,1.7,0.4]|    Iris-setosa|0.0|
|[4.6,3.4,1.4,0.3]|    Iris-setosa|0.0|
|[5.0,3.4,1.5,0.2]|    Iris-setosa|0.0|
|[4.4,2.9,1.4,0.2]|    Iris-setosa|0.0|
|[4.9,3.1,1.5,0.1]|    Iris-setosa|0.0|
|[5.4,3.7,1.5,0.2]|    Iris-setosa|0.0|
|[4.8,3.4,1.6,0.2]|    Iris-setosa|0.0|
|[4.8,3.0,1.4,0.1]|    Iris-setosa|0.0|
|[4.3,3.0,1.1,0.1]|    Iris-setosa|0.0|
|[5.8,4.0,1.2,0.2]|    Iris-setosa|0.0|
|[5.7,4.4,1.5,0.4]|    Iris-setosa|0.0|
|[5.4,3.9,1.3,0.4]|    Iris-setosa|0.0|
|[5.1,3.5,1.4,0.3]|    Iris-setosa|0.0|
|[5.7,3.8,1.7,0.3]|    Iris-setosa|0.0|
|[5.1,3.8,1.5,0.3]|    Iris-setosa|0.0|
|[5.4,3.4,1.7,0.2]|    Iris-setosa|0.0|
|[5.1,3.7,1.5,0.4]|    Iris-setosa|0.0|


In [20]:
import numpy as np
# Accuracy scorer: compares the numeric label column "Yn" with the
# "prediction" column of a transformed DataFrame.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol="Yn",
    predictionCol="prediction",
)

In [21]:
# Repeated hold-out evaluation of a random forest on `indexed`:
# ten 70/30 train/test splits, test accuracy recorded per round,
# mean accuracy printed at the end.

# The estimator's configuration is the same in every round, so build it once.
rf = RandomForestClassifier(labelCol="Yn",
                            featuresCol="X",
                            numTrees=10)

acc = []
for i in range(10):
    # Seed each split with the round number so re-running the cell on the
    # same DataFrame draws the same partitions instead of fresh random ones.
    (trainingData, testData) = indexed.randomSplit([0.7, 0.3], seed=i)
    model = rf.fit(trainingData)
    predictions = model.transform(testData)

    # The evaluator reads the "Yn" and "prediction" columns directly.
    accuracy = evaluator.evaluate(predictions)
    print("{} - Test Accuracy = {}".format(i, accuracy))
    acc.append(accuracy)
print('AVG Acc: ', np.mean(acc))

0 - Test Accuracy = 0.9333333333333333
1 - Test Accuracy = 0.9523809523809523
2 - Test Accuracy = 0.975
3 - Test Accuracy = 0.9629629629629629
4 - Test Accuracy = 0.9512195121951219
5 - Test Accuracy = 0.9259259259259259
6 - Test Accuracy = 0.9285714285714286
7 - Test Accuracy = 0.9423076923076923
8 - Test Accuracy = 0.9285714285714286
9 - Test Accuracy = 0.9444444444444444
AVG Acc:  0.9444717680693291


In [24]:
# from pyspark.ml.classification import GBTClassifier
from pyspark.ml.classification import LogisticRegression

In [25]:
# Repeated hold-out evaluation of an elastic-net logistic regression on
# `indexed`: ten 70/30 train/test splits, test accuracy recorded per round,
# mean accuracy printed at the end.

# The estimator's configuration is the same in every round, so build it once.
lr = LogisticRegression(labelCol="Yn",
                        featuresCol="X",
                        maxIter=10,
                        regParam=0.3,
                        elasticNetParam=0.8)

acc = []
for i in range(10):
    # Seed each split with the round number so re-running the cell on the
    # same DataFrame draws the same partitions instead of fresh random ones.
    (trainingData, testData) = indexed.randomSplit([0.7, 0.3], seed=i)
    model = lr.fit(trainingData)
    predictions = model.transform(testData)

    # The evaluator reads the "Yn" and "prediction" columns directly.
    accuracy = evaluator.evaluate(predictions)
    print("{} - Test Accuracy = {}".format(i, accuracy))
    acc.append(accuracy)
print('AVG Acc: ', np.mean(acc))

0 - Test Accuracy = 0.5227272727272727
1 - Test Accuracy = 0.7608695652173914
2 - Test Accuracy = 0.8367346938775511
3 - Test Accuracy = 0.62
4 - Test Accuracy = 0.6511627906976745
5 - Test Accuracy = 0.7619047619047619
6 - Test Accuracy = 0.64
7 - Test Accuracy = 0.6545454545454545
8 - Test Accuracy = 0.5471698113207547
9 - Test Accuracy = 0.6666666666666666
AVG Acc:  0.6661781016957528


## White Wine Quality

In [27]:
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import DoubleType, IntegerType, StringType

# Explicit schema for the wine-quality CSV: eleven double-typed measurement
# columns followed by an integer "quality" column.
_measurement_columns = [
    "fixed acidity",
    "volatile acidity",
    "citric acid",
    "residual sugar",
    "chlorides",
    "free sulfur dioxide",
    "total sulfur dioxide",
    "density",
    "pH",
    "sulphates",
    "alcohol",
]

schema = StructType(
    [StructField(column, DoubleType()) for column in _measurement_columns]
    + [StructField("quality", IntegerType())]
)

# Load the white-wine CSV with Spark's built-in CSV reader (the same
# `spark.read.csv` entry point used for the Iris file) rather than the legacy
# "com.databricks.spark.csv" format name. The file has a header row, uses
# ';' as the field separator, and is parsed with the explicit `schema`.
df = spark.read.csv(
    "/Users/Storage/Vicohub/Data/winequality/winequality-white.csv",
    schema=schema,
    header=True,
    sep=";",
)
df.show(5)

+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+
|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density|  pH|sulphates|alcohol|quality|
+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+
|          7.0|            0.27|       0.36|          20.7|    0.045|               45.0|               170.0|  1.001| 3.0|     0.45|    8.8|      6|
|          6.3|             0.3|       0.34|           1.6|    0.049|               14.0|               132.0|  0.994| 3.3|     0.49|    9.5|      6|
|          8.1|            0.28|        0.4|           6.9|     0.05|               30.0|                97.0| 0.9951|3.26|     0.44|   10.1|      6|
|          7.2|            0.23|       0.32|           8.5|    0.058|               47.0|           

In [28]:
def _wine_features_and_label(row):
    """Return (DenseVector of the row's first eleven fields, its twelfth field)."""
    return DenseVector(row[:11]), row[11]


# Pack each row of `df` into a (feature vector, label) pair ...
data = df.rdd.map(_wine_features_and_label)

# ... expose the pairs as a two-column DataFrame ("X" features, "Y" label),
# then re-fit the existing `indexer` on it to add the numeric label "Yn".
df2 = spark.createDataFrame(data, schema=["X", "Y"])
wine_indexer_model = indexer.fit(df2)
indexed = wine_indexer_model.transform(df2)
indexed.show(5)

+--------------------+---+---+
|                   X|  Y| Yn|
+--------------------+---+---+
|[7.0,0.27,0.36,20...|  6|0.0|
|[6.3,0.3,0.34,1.6...|  6|0.0|
|[8.1,0.28,0.4,6.9...|  6|0.0|
|[7.2,0.23,0.32,8....|  6|0.0|
|[7.2,0.23,0.32,8....|  6|0.0|
+--------------------+---+---+
only showing top 5 rows



In [50]:
# Single hold-out evaluation of a larger random forest on `indexed`:
# 100 trees, depth up to 25, gini impurity, 70/30 train/test split.
rf = RandomForestClassifier(labelCol="Yn",
                            featuresCol="X",
                            numTrees=100,
                            featureSubsetStrategy="auto",
                            impurity='gini', maxDepth=25, maxBins=32)

# A fixed seed makes the split repeatable on the same DataFrame, so the
# reported accuracy does not change from one run of the cell to the next.
(trainingData, testData) = indexed.randomSplit([0.7, 0.3], seed=42)
model = rf.fit(trainingData)
predictions = model.transform(testData)

# The evaluator reads the "Yn" and "prediction" columns directly.
accuracy = evaluator.evaluate(predictions)
print("Test Accuracy = {}".format(accuracy))

Test Accuracy = 0.6933701657458563


In [45]:
# Number of distinct label indices. Counted inside Spark so that only the
# final integer reaches the driver, instead of converting the whole
# DataFrame to pandas first.
indexed.select("Yn").distinct().count()

7