Code based on:
Bowles, M. (2019). Machine Learning with Spark and Python: Essential Techniques for Predictive Analytics. John Wiley & Sons.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas import DataFrame
from pyspark.sql import SparkSession
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler, OneHotEncoder, StringIndexer
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
# Obtain the Spark session for this notebook (created if none is active yet),
# registered under the application name "abalone".
spark = (
    SparkSession.builder
    .appName("abalone")
    .getOrCreate()
)

In [None]:
from pyspark.sql.types import (
    DoubleType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

# Explicit schema for the header-less abalone CSV: one string column ("Sex"),
# seven double-valued columns, and the integer "Rings" column. Every field is
# declared nullable.
aba_schema = StructType([
    StructField("Sex", StringType(), True),
    StructField("Length", DoubleType(), True),
    StructField("Diameter", DoubleType(), True),
    StructField("Height", DoubleType(), True),
    StructField("Whole", DoubleType(), True),
    StructField("Shucked wt", DoubleType(), True),
    StructField("Viscera wt", DoubleType(), True),
    StructField("Shell wt", DoubleType(), True),
    StructField("Rings", IntegerType(), True),
])

# Load the data file using the schema above. The "inferSchema" option is
# deliberately not set: Spark does not infer column types when a
# user-specified schema is supplied, so the option had no effect and only
# suggested that the types were being inferred from the data.
aba = (
    spark.read.format("csv")
    .option("header", "False")
    .schema(aba_schema)
    .load("Data/abalone/abalone.data")
)

In [None]:
# Quick sanity check: print the first three rows of the loaded DataFrame.
aba.show(n=3)

In [None]:
# Names of every column except the first and the last one; with the schema
# used to load `aba` these are the columns between "Sex" and "Rings".
num_cols = aba.columns[1:-1]

In [None]:
# Pipeline stage that indexes the string column "Sex" into the new column
# "SexIdx".
string_indexer = StringIndexer(
    inputCol="Sex",
    outputCol="SexIdx",
)

In [None]:
# Pipeline stage that one-hot encodes the indexer's output column into
# "SexVec". The input column name is read from the indexer so the two stages
# stay in sync.
encoder = OneHotEncoder(
    inputCols=[string_indexer.getOutputCol()],
    outputCols=["SexVec"],
)

In [None]:
# Feature columns fed to the assembler: the encoded "SexVec" column first,
# followed by the columns listed in num_cols.
inputs = ["SexVec", *num_cols]

In [None]:
# Pipeline stage that packs all input columns into one vector column named
# "features".
vec_assembler = VectorAssembler(
    inputCols=inputs,
    outputCol="features",
)

In [None]:
# Ordered pipeline stages; each stage consumes a column produced by the one
# before it.
stages = [
    string_indexer,  # "Sex" -> "SexIdx"
    encoder,         # "SexIdx" -> "SexVec"
    vec_assembler,   # inputs -> "features"
]

In [None]:
# Fit the preprocessing pipeline on the full DataFrame and apply it to that
# same DataFrame.
pipeline = Pipeline(stages=stages)
pipeline_mod = pipeline.fit(aba)
aba_transfd = pipeline_mod.transform(aba)

# Keep only the assembled feature vector and the "Rings" column.
inp_aba = aba_transfd.select("features", "Rings")

In [None]:
# Inspect the first four rows of the features/"Rings" DataFrame.
inp_aba.show(n=4)

In [None]:
# Random split with weights 0.7 (train) and 0.3 (test), using a fixed seed.
train_aba, test_aba = inp_aba.randomSplit(
    weights=[0.7, 0.3],
    seed=1234,
)

In [None]:
# Linear regression estimator that predicts "Rings" from "features", with the
# iteration limit, regularisation strength and elastic-net mixing set below.
regr_aba = LinearRegression(
    featuresCol="features",
    labelCol="Rings",
    maxIter=10,
    regParam=0.003,
    elasticNetParam=0.8,
)

In [None]:
# Fit the regression on the training split, then report the fitted
# coefficients and intercept.
regr_mod_aba = regr_aba.fit(train_aba)
print(
    "Weights and intercept: ",
    regr_mod_aba.coefficients,
    regr_mod_aba.intercept,
)

In [None]:
# Report the fitted model's summary metrics, labelled as training metrics.
train_aba_summary = regr_mod_aba.summary
print(
    "RMSE (training): ",
    train_aba_summary.rootMeanSquaredError,
)
print(
    "R Squared (training):",
    train_aba_summary.r2,
)

In [None]:
# Apply the fitted model to the test split, then show the first eight rows
# with the "prediction" column next to "Rings" and "features".
regr_aba_preds = regr_mod_aba.transform(test_aba)
(
    regr_aba_preds
    .select("prediction", "Rings", "features")
    .show(n=8)
)

In [None]:
# Evaluate the test-split predictions against "Rings" using the "r2" metric.
regr_eval = RegressionEvaluator(
    predictionCol="prediction",
    labelCol="Rings",
    metricName="r2",
)
print(
    "R Squared (test):",
    regr_eval.evaluate(regr_aba_preds),
)