Code based on: Bowles, M. (2019). Machine Learning with Spark and Python: Essential Techniques for Predictive Analytics. John Wiley & Sons.

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from pandas import DataFrame
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.regression import LinearRegression

In [None]:
# Obtain the Spark session for this notebook, creating it if none exists yet.
spark = (
    SparkSession.builder
    .appName("winequality")
    .getOrCreate()
)

In [None]:
from pyspark.sql.types import DoubleType, StructField, StructType

# Explicit schema for the wine-quality file: twelve nullable double columns,
# listed in file order (eleven predictors followed by the quality score).
wineq_schema = StructType([
    StructField(col_name, DoubleType(), True)
    for col_name in (
        "FAcid",   # 1 - fixed acidity
        "VAcid",   # 2 - volatile acidity
        "Citr",    # 3 - citric acid
        "ReSug",   # 4 - residual sugar
        "Chlo",    # 5 - chlorides
        "FrSO2",   # 6 - free sulfur dioxide
        "TSO2",    # 7 - total sulfur dioxide
        "Den",     # 8 - density
        "pH",      # 9 - pH
        "Sulph",   # 10 - sulphates
        "Alc",     # 11 - alcohol
        "Qual",    # 12 - quality (score between 0 and 10)
    )
])

# Read the semicolon-separated CSV (first row is a header) using that schema.
wineq = (
    spark.read.format("csv")
    .option("header", "True")
    .option("sep", ";")
    .schema(wineq_schema)
    .load("Data/winequality/winequality-red.csv")
)

In [None]:
# Peek at the first three rows of the raw data.
wineq.show(n=3)

In [None]:
# Assembler that writes its combined vector into the "features" column;
# its input columns are configured separately.
vecAssembler = VectorAssembler(
    outputCol="features",
)

In [None]:
# The eleven predictor columns to be packed into the feature vector
# (the label column "Qual" is deliberately left out).
vecAssembler.setInputCols([
    "FAcid", "VAcid", "Citr", "ReSug", "Chlo", "FrSO2",
    "TSO2", "Den", "pH", "Sulph", "Alc",
])

In [None]:
# Run the assembler over the raw table to add the vector column.
v_wineq = vecAssembler.transform(dataset=wineq)

In [None]:
# Peek at the first three rows after vector assembly.
v_wineq.show(n=3)

In [None]:
# Scaler reading the assembled "features" vector and writing "normFeatures".
normaliser = StandardScaler(
    inputCol="features",
    outputCol="normFeatures",
)
# NOTE(review): the scaler is fitted on every row of v_wineq, i.e. before the
# data is split into train and test further down, so held-out rows contribute
# to the scaling statistics -- confirm this is acceptable.
normModel = normaliser.fit(dataset=v_wineq)

In [None]:
# Apply the fitted scaler, adding the "normFeatures" column.
wineq_norm = normModel.transform(dataset=v_wineq)

In [None]:
# Peek at the first three rows after scaling.
wineq_norm.show(n=3)

In [None]:
# Keep only what the regression needs: the scaled feature vector and the label.
inp_wineq_norm = wineq_norm.select("normFeatures", "Qual")

In [None]:
# Peek at the first three rows of the model input.
inp_wineq_norm.show(n=3)

In [None]:
# Roughly 67% / 33% random train/test split.  The seed is fixed so that the
# partition -- and therefore every RMSE computed from it -- can be reproduced
# when the notebook is re-run on the same input.
train, test = inp_wineq_norm.randomSplit([0.67, 0.33], seed=42)

In [None]:
# Regularisation strengths to try: powers of ten from 1 down to 1e-6.
lambda_pars = [1e0, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5, 1e-6]

In [None]:
# Per-lambda results, appended in the same order as lambda_pars.
weights, intercepts, RMSE = [], [], []

for lambda_par in lambda_pars:
    # elasticNetParam=0.0 selects a pure L2 (ridge) penalty of strength lambda_par.
    ridge = LinearRegression(
        featuresCol="normFeatures",
        labelCol="Qual",
        maxIter=100,
        regParam=lambda_par,
        elasticNetParam=0.0,
    )
    ridge_mod = ridge.fit(train)

    # Score the fitted model on the held-out rows, then record its parameters.
    RMSE.append(ridge_mod.evaluate(test).rootMeanSquaredError)
    weights.append(ridge_mod.coefficients.toArray())
    intercepts.append(ridge_mod.intercept)

In [None]:
# Two-column report: the test-set RMSE next to the lambda that produced it.
# Each RMSE is padded to the same 18-character field as the header, so the
# lambda column stays aligned even when a value prints with fewer digits.
print('{:18}'.format("RMSE"), "lambda")
for rmse, lam in zip(RMSE, lambda_pars):
    print('{:18}'.format(str(rmse)), lam)

In [None]:
n_weights = len(weights[0])

# Pair each coefficient of the third fitted model (weights[2]) with its
# position, then order the pairs by absolute coefficient size, largest first.
ordered_idx = sorted(
    zip(range(n_weights), weights[2]),
    key=lambda pair: -abs(pair[1]),
)

# Print the column name at each ranked position.
for entry in ordered_idx:
    print(wineq.columns[entry[0]])

In [None]:
import math

# Take the x positions from the regularisation strengths themselves so the
# axis really shows -log10(lambda); a plain 0..n-1 index only matches that
# label while lambda_pars is exactly 1, 0.1, 0.01, ...
x = [-math.log10(lam) for lam in lambda_pars]

# RMSE as a red line against -log10(lambda).
plt.plot(x, RMSE, 'r')
plt.xlabel('-log lambda')
plt.ylabel('RMSE')
# Save before show(): the figure is written to disk first, then displayed.
plt.savefig("../Plots/winequality_regression.png", dpi=600)
plt.show()