In [None]:
from pyspark.sql import SparkSession
from pyspark.ml.stat import Correlation
from pyspark.sql.functions import avg, round
from pyspark.ml.feature import RFormula, VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
# Start (or reuse) the Spark session that backs this notebook.
spark = (
    SparkSession.builder
    .appName("multilinear_regression_car_horsepower")
    .getOrCreate()
)

# Read the semicolon-separated CSV: the first row supplies the column names
# and Spark infers each column's type from the data.
cars = (
    spark.read
    .format('csv')
    .options(header=True, inferSchema=True, sep=';')
    .load('../data/raw/mtcars.csv')
)

# Report the row count, then preview the rows.
print("Number of instances in this dataset: ", cars.count())
cars.show()

In [None]:
# Correlation.corr works on a single vector column, so pack every column of
# `cars` into one vector first.
# NOTE(review): this assumes every column in the CSV is of a type
# VectorAssembler accepts — confirm against the data.
vector_col = 'corr_features'
assembler = VectorAssembler(outputCol=vector_col, inputCols=cars.columns)
cars_assembled = assembler.transform(cars).select(vector_col)

# The result is a one-row DataFrame whose "pearson(<column>)" cell holds the
# whole correlation matrix; pull it out and convert it to an array.
pearson_row = Correlation.corr(cars_assembled, vector_col).collect()[0]
pearson_values = pearson_row[f"pearson({vector_col})"].toArray()

# Rebuild a DataFrame (one row and one column per original column) so the
# matrix can be displayed with readable headers.
corr_matrix = spark.createDataFrame(pearson_values.tolist(), cars.columns)

# `round` here is pyspark.sql.functions.round (imported at the top of the
# notebook), not the Python builtin; it rounds each column to 3 decimals.
rounded_cols = [round(name, 3).alias(name) for name in corr_matrix.columns]
corr_matrix.select(rounded_cols).show()

In [None]:
# Build the model input with an R-style formula: HP becomes the `label`
# column and the four listed predictors are assembled into `features`.
r_formula = RFormula(formula="HP ~ MilesPerGallon + Cylinders + VShapeOrStraightLine + Carburetor")
cars_rf = r_formula.fit(cars).transform(cars)
cars_rf.select('features', 'label').show(10)

# Pin the seed so the train/test split — and therefore the fitted model and
# every metric computed from it — is the same on each Restart & Run All.
# Without a seed, randomSplit draws a fresh random split on every run.
SPLIT_SEED = 42

# The weights are target proportions; rows are assigned randomly, so the
# resulting counts are only approximately 75% / 25%.
cars_train, cars_test = cars_rf.randomSplit([0.75, 0.25], seed=SPLIT_SEED)
print("Number of training instances: ", cars_train.count())
print("Number of testing instances: ", cars_test.count())

In [None]:
# Fit a linear regression on the training split. The column names are the
# estimator's defaults, spelled out here so the link to the RFormula output
# columns is visible.
regressor = LinearRegression(
    featuresCol='features',
    labelCol='label',
    maxIter=1000,
)
model = regressor.fit(cars_train)

# Score the held-out split and show actual vs. predicted values side by side.
pred = model.transform(cars_test)
pred.select(['label', 'prediction']).show()

In [None]:
print("Regression Evaluation Metrics: ")

# One evaluator is reused for every metric; only its metric name changes
# between evaluations of the same prediction DataFrame.
evaluator = RegressionEvaluator(metricName='rmse')

# (printed caption, RegressionEvaluator metric name), in display order.
metric_table = (
    ("RMSE: ", 'rmse'),
    ("R2: ", 'r2'),
    ("MAE: ", 'mae'),
    ("Explained variance: ", 'var'),
)

for caption, metric in metric_table:
    evaluator.setMetricName(metric)
    print(caption, evaluator.evaluate(pred))