Code based on: Bowles, M. (2019). Machine Learning with Spark and Python: Essential Techniques for Predictive Analytics. John Wiley & Sons.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import DataFrame
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
# Get (or start) the Spark session used by every later cell.
spark = (
    SparkSession.builder
    .appName("winequality_gbm")
    .getOrCreate()
)

In [None]:
from pyspark.sql.types import DoubleType, StructField, StructType

# Explicit schema for the red-wine quality CSV: twelve nullable double
# columns, listed in file order. The names below are the short aliases
# used by the rest of the notebook.
wineq_schema = StructType(
    [
        StructField(col_name, DoubleType(), True)
        for col_name in (
            "FAcid",  # 1 - fixed acidity
            "VAcid",  # 2 - volatile acidity
            "Citr",   # 3 - citric acid
            "ReSug",  # 4 - residual sugar
            "Chlo",   # 5 - chlorides
            "FrSO2",  # 6 - free sulfur dioxide
            "TSO2",   # 7 - total sulfur dioxide
            "Den",    # 8 - density
            "pH",     # 9 - pH
            "Sulph",  # 10 - sulphates
            "Alc",    # 11 - alcohol
            "Qual",   # 12 - quality (score between 0 and 10)
        )
    ]
)

# Load the semicolon-separated file, declaring that it has a header line and
# applying the schema above instead of letting Spark infer column types.
wineq = (
    spark.read.format("csv")
    .option("header", "True")
    .option("sep", ";")
    .schema(wineq_schema)
    .load("Data/winequality/winequality-red.csv")
)

In [None]:
# Assembler that will pack the predictor columns into a single vector column
# named "features"; its input columns are configured separately.
vecAssembler = VectorAssembler().setOutputCol("features")

In [None]:
# Use the eleven physico-chemical measurements as predictors; "Qual" is left
# out because it is the regression target.
vecAssembler.setInputCols(
    [
        "FAcid", "VAcid", "Citr", "ReSug", "Chlo", "FrSO2",
        "TSO2", "Den", "pH", "Sulph", "Alc",
    ]
)

In [None]:
# Run the assembler over the wine DataFrame to add its vector output column.
v_wineq = vecAssembler.transform(dataset=wineq)

In [None]:
# Keep only what the regressor needs: the assembled vector and the label.
inp_wineq = v_wineq.select("features", "Qual")

In [None]:
# Hold out roughly one third of the rows for testing. The weights are the
# relative sizes of the two splits. A fixed seed is supplied so that the
# train/test partition is repeatable from run to run; without it every
# execution draws a different split and the reported metrics cannot be
# reproduced.
# NOTE(review): repeatability also assumes the input DataFrame is read with
# the same partitioning each time - true for a single local CSV, confirm on
# a cluster.
train, test = inp_wineq.randomSplit([0.67, 0.33], seed=42)

In [None]:
# Gradient-boosted tree regressor predicting "Qual" from "features".
gb_tree = GBTRegressor(
    labelCol="Qual",
    featuresCol="features",
    maxDepth=5,           # depth limit for each individual tree
    maxIter=100,          # number of boosting iterations
    stepSize=0.1,         # learning rate applied to each tree's contribution
    subsamplingRate=0.5,  # fraction of the training rows used per tree
)

In [None]:
# Fit on the training split, then score the held-out test split.
gb_tree_mod = gb_tree.fit(dataset=train)
gb_tree_preds = gb_tree_mod.transform(dataset=test)

# Quick sanity check: first three predictions next to the true quality score.
gb_tree_preds.select(["prediction", "Qual"]).show(n=3)

In [None]:
# Evaluator that scores "prediction" against "Qual" using RMSE.
gb_tree_eval = RegressionEvaluator(
    metricName="rmse",
    predictionCol="prediction",
    labelCol="Qual",
)

In [None]:
# Apply the current evaluator to the test-set predictions.
RMSE = gb_tree_eval.evaluate(dataset=gb_tree_preds)

In [None]:
# Rebind gb_tree_eval to an evaluator that reports R-squared for the same
# label and prediction columns.
gb_tree_eval = RegressionEvaluator(
    metricName="r2",
    predictionCol="prediction",
    labelCol="Qual",
)

In [None]:
# Apply the current evaluator to the test-set predictions.
R2 = gb_tree_eval.evaluate(dataset=gb_tree_preds)

In [None]:
# Report both test-set metrics, one per line. The first label ends with a
# space, so together with print's default separator its line shows two
# spaces before the value.
for metric_label, metric_value in (
    ("RMSE (testing): ", RMSE),
    ("R Squared (testing):", R2),
):
    print(metric_label, metric_value)