# Car Horsepower Multilinear Regression with PySpark

* Regressing car horsepower using purely PySpark's API;
* Small dataset with 32 instances having 10 independent variables and 1 dependent variable ("HP");
* 

In [29]:
import findspark
from pyspark.sql import SparkSession
from pyspark.ml.stat import Correlation
from pyspark.sql.functions import avg, round
from pyspark.ml.feature import RFormula, VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

In [30]:
findspark.init()
spark = SparkSession.builder.appName("multilinear_regression_car_horsepower").getOrCreate()

cars = spark.read.load('../data/raw/mtcars.csv', format='csv', header=True,
                       inferSchema=True, sep=';')
print("Number of instances in this dataset: ", cars.count())
cars.show()

Number of instances in this dataset:  32
+--------------+---------+------------+-------------+------+---------------+--------------------+-----------------+-----+----------+---+
|MilesPerGallon|Cylinders|Displacement|RearAxleRatio|Weight|QuarterMileTime|VShapeOrStraightLine|AutomaticOrManual|Gears|Carburetor| HP|
+--------------+---------+------------+-------------+------+---------------+--------------------+-----------------+-----+----------+---+
|            21|        6|         160|           39|   262|           1646|                   0|                1|    4|         4|110|
|            21|        6|         160|           39|  2875|           1702|                   0|                1|    4|         4|110|
|           228|        4|         108|          385|   232|           1861|                   1|                1|    4|         1| 93|
|           214|        6|         258|          308|  3215|           1944|                   1|                0|    3|         1|110|


In [31]:
assembler = VectorAssembler(inputCols=cars.columns, outputCol='corr_features')
cars_assembled = assembler.transform(cars).select('corr_features')

corr_matrix = Correlation.corr(cars_assembled, 'corr_features')
corr_matrix = corr_matrix.collect()[0][corr_matrix.columns[0]].toArray()
corr_matrix = spark.createDataFrame(corr_matrix.tolist(), cars.columns)
corr_matrix.select([round(c, 3).alias(c) for c in corr_matrix.columns]).show()

+--------------+---------+------------+-------------+------+---------------+--------------------+-----------------+------+----------+------+
|MilesPerGallon|Cylinders|Displacement|RearAxleRatio|Weight|QuarterMileTime|VShapeOrStraightLine|AutomaticOrManual| Gears|Carburetor|    HP|
+--------------+---------+------------+-------------+------+---------------+--------------------+-----------------+------+----------+------+
|           1.0|   -0.539|       0.092|        0.493|-0.028|         -0.073|               0.725|            0.089| 0.031|    -0.623|-0.549|
|        -0.539|      1.0|       0.055|       -0.258| 0.172|          0.115|              -0.811|           -0.523|-0.493|     0.527| 0.832|
|         0.092|    0.055|         1.0|        0.101|-0.252|         -0.468|               0.022|           -0.368|-0.138|     -0.03|-0.065|
|         0.493|   -0.258|       0.101|          1.0|-0.414|         -0.451|               0.323|            0.226| 0.382|    -0.135|-0.111|
|        -0.0

In [32]:
r_formula = RFormula(formula="HP ~ MilesPerGallon + Cylinders + VShapeOrStraightLine + Carburetor")
cars_rf = r_formula.fit(cars).transform(cars)
cars_rf.select('features', 'label').show(10)

cars_train, cars_test = cars_rf.randomSplit([0.75, 0.25])
print("Number of training instances: ", cars_train.count())
print("Number of testing instances: ", cars_test.count())

+-------------------+-----+
|           features|label|
+-------------------+-----+
| [21.0,6.0,0.0,4.0]|110.0|
| [21.0,6.0,0.0,4.0]|110.0|
|[228.0,4.0,1.0,1.0]| 93.0|
|[214.0,6.0,1.0,1.0]|110.0|
|[187.0,8.0,0.0,2.0]|175.0|
|[181.0,6.0,1.0,1.0]|105.0|
|[143.0,8.0,0.0,4.0]|245.0|
|[244.0,4.0,1.0,2.0]| 62.0|
|[228.0,4.0,1.0,2.0]| 95.0|
|[192.0,6.0,1.0,4.0]|123.0|
+-------------------+-----+
only showing top 10 rows

Number of training instances:  26
Number of testing instances:  6


In [33]:
regressor = LinearRegression(maxIter=1000)
model = regressor.fit(cars_train)

pred = model.transform(cars_test)
pred.select('label', 'prediction').show()

+-----+------------------+
|label|        prediction|
+-----+------------------+
|245.0|210.64666631126815|
|230.0|210.80027219520244|
|150.0|170.83126397174686|
|123.0|172.55393646930358|
| 95.0| 83.08784420496343|
| 66.0| 66.69387763019927|
+-----+------------------+



In [34]:
print("Regression Evaluation Metrics: ")

evaluator = RegressionEvaluator(metricName='rmse')
print("RMSE: ", evaluator.evaluate(pred))

evaluator.setMetricName('r2')
print("R2: ", evaluator.evaluate(pred))

evaluator.setMetricName('mae')
print("MAE: ", evaluator.evaluate(pred))

evaluator.setMetricName('var')
print("Explained variance: ", evaluator.evaluate(pred))

Regression Evaluation Metrics: 
RMSE:  27.630587469393905
R2:  0.8253076222404148
MAE:  22.757382559969283
Explained variance:  3284.019646281066


In [49]:
gaussian_regressor = GeneralizedLinearRegression(family='gaussian', maxIter=1000)
model = gaussian_regressor.fit(cars_train)
pred = model.transform(cars_test)
print("Gaussian Residual Distribution Regression")
evaluator = RegressionEvaluator(metricName='rmse')
print("RMSE: ", evaluator.evaluate(pred))
evaluator.setMetricName('r2')
print("R2: ", evaluator.evaluate(pred))
evaluator.setMetricName('mae')
print("MAE: ", evaluator.evaluate(pred))
evaluator.setMetricName('var')
print("Explained variance: ", evaluator.evaluate(pred))
print()

poisson_regressor = GeneralizedLinearRegression(family='poisson', maxIter=1000)
model = poisson_regressor.fit(cars_train)
pred = model.transform(cars_test)
print("Poisson Residual Distribution Regression")
evaluator = RegressionEvaluator(metricName='rmse')
print("RMSE: ", evaluator.evaluate(pred))
evaluator.setMetricName('r2')
print("R2: ", evaluator.evaluate(pred))
evaluator.setMetricName('mae')
print("MAE: ", evaluator.evaluate(pred))
evaluator.setMetricName('var')
print("Explained variance: ", evaluator.evaluate(pred))
print()

gamma_regressor = GeneralizedLinearRegression(family='gamma', maxIter=1000)
model = gamma_regressor.fit(cars_train)
pred = model.transform(cars_test)
print("Gamma Residual Distribution Regression")
evaluator = RegressionEvaluator(metricName='rmse')
print("RMSE: ", evaluator.evaluate(pred))
evaluator.setMetricName('r2')
print("R2: ", evaluator.evaluate(pred))
evaluator.setMetricName('mae')
print("MAE: ", evaluator.evaluate(pred))
evaluator.setMetricName('var')
print("Explained variance: ", evaluator.evaluate(pred))
print()

gamma_regressor = GeneralizedLinearRegression(family='tweedie', maxIter=1000,
                                              variancePower=1.5)
model = gamma_regressor.fit(cars_train)
pred = model.transform(cars_test)
print("Compound Poisson-Gamma Distribution Regression")
evaluator = RegressionEvaluator(metricName='rmse')
print("RMSE: ", evaluator.evaluate(pred))
evaluator.setMetricName('r2')
print("R2: ", evaluator.evaluate(pred))
evaluator.setMetricName('mae')
print("MAE: ", evaluator.evaluate(pred))
evaluator.setMetricName('var')
print("Explained variance: ", evaluator.evaluate(pred))
print()

Gaussian Residual Distribution Regression
RMSE:  27.630587469393905
R2:  0.8253076222404148
MAE:  22.757382559969283
Explained variance:  3284.019646281066

Poisson Residual Distribution Regression
RMSE:  21.23746351840282
R2:  0.8967954106297144
MAE:  19.802578787382718
Explained variance:  3096.802624550415

Gamma Residual Distribution Regression
RMSE:  18.441348199419693
R2:  0.9221821810165918
MAE:  16.177232177334023
Explained variance:  3063.180593656929

Compound Poisson-Gamma Distribution Regression
RMSE:  19.06491733397372
R2:  0.916830599404782
MAE:  17.893252434062966
Explained variance:  3055.6827521341424



In [50]:
gamma_regressor = GeneralizedLinearRegression(family='gamma', maxIter=1000)
model = gamma_regressor.fit(cars_train)
pred = model.transform(cars_test)
pred.select('label', 'prediction').show()

+-----+------------------+
|label|        prediction|
+-----+------------------+
|245.0|214.11628383884303|
|230.0| 214.9070615605769|
|150.0| 171.3834428569978|
|123.0| 125.0113413225654|
| 95.0| 83.81672237643872|
| 66.0| 82.50867666029959|
+-----+------------------+

