In [35]:
import findspark
from pyspark.sql import SparkSession
from pyspark.ml.stat import Correlation
from pyspark.sql.functions import avg, round
from pyspark.ml.feature import RFormula, VectorAssembler
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml.evaluation import RegressionEvaluator

In [36]:
# Make the local Spark installation importable and open (or reuse) the
# session used by the rest of this notebook.
# NOTE(review): findspark.init() is normally called before pyspark is
# imported; the imports cell has already imported pyspark, so confirm this
# call is still needed here.
findspark.init()
session_builder = SparkSession.builder.appName("multilinear_regression_car_horsepower")
spark = session_builder.getOrCreate()

# Read the semicolon-separated mtcars file: column names come from the header
# row and column types are inferred by Spark.
csv_reader = spark.read.format('csv').options(header=True, inferSchema=True, sep=';')
cars = csv_reader.load('../data/raw/mtcars.csv')

# NOTE(review): the preview printed by this cell shows integer values such as
# 21 / 228 / 214 for MilesPerGallon and 262 / 2875 for Weight, which suggests
# decimal separators may have been lost in the CSV -- confirm against the raw
# data before trusting the correlations and models below.
instance_count = cars.count()
print("Number of instances in this dataset: ", instance_count)
cars.show()

Number of instances in this dataset:  32
+--------------+---------+------------+-------------+------+---------------+--------------------+-----------------+-----+----------+---+
|MilesPerGallon|Cylinders|Displacement|RearAxleRatio|Weight|QuarterMileTime|VShapeOrStraightLine|AutomaticOrManual|Gears|Carburetor| HP|
+--------------+---------+------------+-------------+------+---------------+--------------------+-----------------+-----+----------+---+
|            21|        6|         160|           39|   262|           1646|                   0|                1|    4|         4|110|
|            21|        6|         160|           39|  2875|           1702|                   0|                1|    4|         4|110|
|           228|        4|         108|          385|   232|           1861|                   1|                1|    4|         1| 93|
|           214|        6|         258|          308|  3215|           1944|                   1|                0|    3|         1|110|


In [37]:
# Correlation.corr() operates on a single vector column, so pack every
# column of `cars` into one vector first.
assembler = VectorAssembler(inputCols=cars.columns, outputCol='corr_features')
cars_assembled = assembler.transform(cars).select('corr_features')

# corr() is called with its default method. The matrix is read from the first
# cell of the first row of the returned dataframe and converted to a nested
# Python list.
corr_result = Correlation.corr(cars_assembled, 'corr_features')
corr_rows = corr_result.collect()[0][0].toArray().tolist()

# Rebuild a dataframe whose columns carry the original variable names.
# Rows follow the same variable order as the columns but are not labelled.
corr_matrix = spark.createDataFrame(corr_rows, cars.columns)

# Show every correlation rounded to three decimals. `round` here is
# pyspark.sql.functions.round (imported at the top), not the Python builtin.
rounded_columns = [round(name, 3).alias(name) for name in corr_matrix.columns]
corr_matrix.select(rounded_columns).show()

+--------------+---------+------------+-------------+------+---------------+--------------------+-----------------+------+----------+------+
|MilesPerGallon|Cylinders|Displacement|RearAxleRatio|Weight|QuarterMileTime|VShapeOrStraightLine|AutomaticOrManual| Gears|Carburetor|    HP|
+--------------+---------+------------+-------------+------+---------------+--------------------+-----------------+------+----------+------+
|           1.0|   -0.539|       0.092|        0.493|-0.028|         -0.073|               0.725|            0.089| 0.031|    -0.623|-0.549|
|        -0.539|      1.0|       0.055|       -0.258| 0.172|          0.115|              -0.811|           -0.523|-0.493|     0.527| 0.832|
|         0.092|    0.055|         1.0|        0.101|-0.252|         -0.468|               0.022|           -0.368|-0.138|     -0.03|-0.065|
|         0.493|   -0.258|       0.101|          1.0|-0.414|         -0.451|               0.323|            0.226| 0.382|    -0.135|-0.111|
|        -0.0

In [38]:
# Model HP as a function of five predictors. The fitted RFormula adds the
# 'features' and 'label' columns selected below.
hp_formula = "HP ~ MilesPerGallon + Cylinders + VShapeOrStraightLine + AutomaticOrManual + Carburetor"
r_formula = RFormula(formula=hp_formula)
formula_model = r_formula.fit(cars)
cars_rf = formula_model.transform(cars)
cars_rf.select('features', 'label').show(10)

# Seeded 80/20 random split so the train/test partition is repeatable.
split_weights = [0.8, 0.2]
cars_train, cars_test = cars_rf.randomSplit(split_weights, seed=10)
for caption, subset in (("Number of training instances: ", cars_train),
                        ("Number of testing instances: ", cars_test)):
    print(caption, subset.count())

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[21.0,6.0,0.0,1.0...|110.0|
|[21.0,6.0,0.0,1.0...|110.0|
|[228.0,4.0,1.0,1....| 93.0|
|[214.0,6.0,1.0,0....|110.0|
|[187.0,8.0,0.0,0....|175.0|
|[181.0,6.0,1.0,0....|105.0|
|[143.0,8.0,0.0,0....|245.0|
|[244.0,4.0,1.0,0....| 62.0|
|[228.0,4.0,1.0,0....| 95.0|
|[192.0,6.0,1.0,0....|123.0|
+--------------------+-----+
only showing top 10 rows

Number of training instances:  26
Number of testing instances:  6


In [39]:
# Fit a linear regression with default settings on the training split and
# score it on those same training rows (these are training metrics, not
# held-out metrics).
regressor = LinearRegression()
model = regressor.fit(cars_train)
pred = model.transform(cars_train)

print("Regression Evaluation Metrics: ")

# (evaluator metric name, caption printed in front of the value)
metric_captions = (
    ('rmse', "RMSE: "),
    ('r2', "R2: "),
    ('mae', "MAE: "),
    ('var', "Explained variance: "),
)

# A single evaluator is reused; only its metric name changes per iteration.
evaluator = RegressionEvaluator(metricName='rmse')
for metric_name, caption in metric_captions:
    evaluator.setMetricName(metric_name)
    print(caption, evaluator.evaluate(pred))

Regression Evaluation Metrics: 
RMSE:  26.975872244566172
R2:  0.8517468606749382
MAE:  21.909180729505163
Explained variance:  4180.783085875607


In [40]:
# Compare generalized linear models that differ only in the assumed error
# distribution. Each one is fitted on the training split and scored on those
# same training rows, so the numbers printed here are training metrics.
gaussian_regressor = GeneralizedLinearRegression(family='gaussian')
poisson_regressor = GeneralizedLinearRegression(family='poisson')
gamma_regressor = GeneralizedLinearRegression(family='gamma')
# Tweedie family with variancePower=1.5, reported as the compound
# Poisson-Gamma model.
tweedie_regressor = GeneralizedLinearRegression(family='tweedie', variancePower=1.5)

# (report title, estimator) pairs, in the order they are reported.
glm_candidates = [
    ("Gaussian Residual Distribution Regression", gaussian_regressor),
    ("Poisson Residual Distribution Regression", poisson_regressor),
    ("Gamma Residual Distribution Regression", gamma_regressor),
    ("Compound Poisson-Gamma Distribution Regression", tweedie_regressor),
]

# (evaluator metric name, caption printed in front of the value)
metric_captions = [
    ('rmse', "RMSE: "),
    ('r2', "R2: "),
    ('mae', "MAE: "),
    ('var', "Explained variance: "),
]

for position, (title, glm) in enumerate(glm_candidates):
    if position > 0:
        print()  # blank line between reports, none after the last one
    # `model` and `pred` are rebound on every pass, so after the loop they
    # refer to the last candidate (the Tweedie model).
    model = glm.fit(cars_train)
    pred = model.transform(cars_train)
    print(title)
    evaluator = RegressionEvaluator(metricName='rmse')
    for metric_name, caption in metric_captions:
        evaluator.setMetricName(metric_name)
        print(caption, evaluator.evaluate(pred))

Gaussian Residual Distribution Regression
RMSE:  26.975872244566172
R2:  0.8517468606749382
MAE:  21.909180729505163
Explained variance:  4180.783085875607

Poisson Residual Distribution Regression
RMSE:  20.005159436976776
R2:  0.918466339603979
MAE:  16.979841561499327
Explained variance:  4322.4636381456285

Gamma Residual Distribution Regression
RMSE:  19.538256870365164
R2:  0.9222277728120352
MAE:  16.34857490526664
Explained variance:  4698.910854089208

Compound Poisson-Gamma Distribution Regression
RMSE:  18.865193241449965
R2:  0.9274937536134987
MAE:  16.154007033455237
Explained variance:  4490.801127783104


In [41]:
# Refit the Tweedie (variancePower=1.5) model on the training split and score
# it on the held-out test split.
# Despite its name, `gamma_regressor` holds a Tweedie-family estimator here.
gamma_regressor = GeneralizedLinearRegression(family='tweedie', variancePower=1.5)
model = gamma_regressor.fit(cars_train)
pred = model.transform(cars_test)

# Side-by-side view of actual vs. predicted values for the test rows.
pred.select('label', 'prediction').show()

print("Compound Poisson-Gamma Distribution Regression")

# (evaluator metric name, caption printed in front of the value)
metric_captions = (
    ('rmse', "RMSE: "),
    ('r2', "R2: "),
    ('mae', "MAE: "),
    ('var', "Explained variance: "),
)
evaluator = RegressionEvaluator(metricName='rmse')
for metric_name, caption in metric_captions:
    evaluator.setMetricName(metric_name)
    print(caption, evaluator.evaluate(pred))

+-----+------------------+
|label|        prediction|
+-----+------------------+
|110.0|129.76006773629265|
| 91.0| 76.56169271747196|
|245.0|211.21480680614903|
|180.0|191.02054034801478|
|180.0|192.35208258360223|
|105.0|103.47114459316559|
+-----+------------------+

Compound Poisson-Gamma Distribution Regression
RMSE:  18.33361178201924
R2:  0.8874017814632057
MAE:  15.480841091853847
Explained variance:  2532.585976846807


In [42]:
# Rebuild the modelling dataframe with only two predictors (Cylinders and
# Carburetor). The fitted RFormula adds the 'features' and 'label' columns
# selected below.
reduced_formula = "HP ~ Cylinders + Carburetor"
r_formula = RFormula(formula=reduced_formula)
formula_model = r_formula.fit(cars)
cars_rf = formula_model.transform(cars)
cars_rf.select('features', 'label').show(10)

# Seeded 80/20 random split so the train/test partition is repeatable.
split_weights = [0.8, 0.2]
cars_train, cars_test = cars_rf.randomSplit(split_weights, seed=10)
for caption, subset in (("Number of training instances: ", cars_train),
                        ("Number of testing instances: ", cars_test)):
    print(caption, subset.count())

+---------+-----+
| features|label|
+---------+-----+
|[6.0,4.0]|110.0|
|[6.0,4.0]|110.0|
|[4.0,1.0]| 93.0|
|[6.0,1.0]|110.0|
|[8.0,2.0]|175.0|
|[6.0,1.0]|105.0|
|[8.0,4.0]|245.0|
|[4.0,2.0]| 62.0|
|[4.0,2.0]| 95.0|
|[6.0,4.0]|123.0|
+---------+-----+
only showing top 10 rows

Number of training instances:  26
Number of testing instances:  6


In [43]:
# Fit a linear regression with default settings on the current training split
# and score it on those same training rows (training metrics, not held-out
# metrics).
regressor = LinearRegression()
model = regressor.fit(cars_train)
pred = model.transform(cars_train)

print("Regression Evaluation Metrics: ")

# (evaluator metric name, caption printed in front of the value)
metric_captions = (
    ('rmse', "RMSE: "),
    ('r2', "R2: "),
    ('mae', "MAE: "),
    ('var', "Explained variance: "),
)

# A single evaluator is reused; only its metric name changes per iteration.
evaluator = RegressionEvaluator(metricName='rmse')
for metric_name, caption in metric_captions:
    evaluator.setMetricName(metric_name)
    print(caption, evaluator.evaluate(pred))

Regression Evaluation Metrics: 
RMSE:  28.156912645679043
R2:  0.8384812395097463
MAE:  22.452116900327816
Explained variance:  4115.669039494376


In [44]:
# Compare generalized linear models that differ only in the assumed error
# distribution. Each one is fitted on the current training split and scored
# on those same training rows, so the numbers printed here are training
# metrics.
gaussian_regressor = GeneralizedLinearRegression(family='gaussian')
poisson_regressor = GeneralizedLinearRegression(family='poisson')
gamma_regressor = GeneralizedLinearRegression(family='gamma')
# Tweedie family with variancePower=1.5, reported as the compound
# Poisson-Gamma model.
tweedie_regressor = GeneralizedLinearRegression(family='tweedie', variancePower=1.5)

# (report title, estimator) pairs, in the order they are reported.
glm_candidates = [
    ("Gaussian Residual Distribution Regression", gaussian_regressor),
    ("Poisson Residual Distribution Regression", poisson_regressor),
    ("Gamma Residual Distribution Regression", gamma_regressor),
    ("Compound Poisson-Gamma Distribution Regression", tweedie_regressor),
]

# (evaluator metric name, caption printed in front of the value)
metric_captions = [
    ('rmse', "RMSE: "),
    ('r2', "R2: "),
    ('mae', "MAE: "),
    ('var', "Explained variance: "),
]

for title, glm in glm_candidates:
    # `model` and `pred` are rebound on every pass, so after the loop they
    # refer to the last candidate (the Tweedie model).
    model = glm.fit(cars_train)
    pred = model.transform(cars_train)
    print(title)
    evaluator = RegressionEvaluator(metricName='rmse')
    for metric_name, caption in metric_captions:
        evaluator.setMetricName(metric_name)
        print(caption, evaluator.evaluate(pred))
    print()  # blank line after every report, including the last one

Gaussian Residual Distribution Regression
RMSE:  28.156912645679043
R2:  0.8384812395097463
MAE:  22.452116900327816
Explained variance:  4115.669039494376

Poisson Residual Distribution Regression
RMSE:  21.445864854886782
R2:  0.9062999039830922
MAE:  17.690059312220498
Explained variance:  4256.556342512145

Gamma Residual Distribution Regression
RMSE:  21.93738200607269
R2:  0.9019556657332528
MAE:  17.952050147725455
Explained variance:  4767.5057407427275

Compound Poisson-Gamma Distribution Regression
RMSE:  20.361971403065272
R2:  0.9155319336242966
MAE:  17.134449940662535
Explained variance:  4467.983970447927



In [45]:
# Refit the Tweedie (variancePower=1.5) model on the current training split
# and score it on the held-out test split.
# Despite its name, `gamma_regressor` holds a Tweedie-family estimator here.
gamma_regressor = GeneralizedLinearRegression(family='tweedie', variancePower=1.5)
model = gamma_regressor.fit(cars_train)
pred = model.transform(cars_test)

# Side-by-side view of actual vs. predicted values for the test rows.
pred.select('label', 'prediction').show()

print("Compound Poisson-Gamma Distribution Regression")

# (evaluator metric name, caption printed in front of the value)
metric_captions = (
    ('rmse', "RMSE: "),
    ('r2', "R2: "),
    ('mae', "MAE: "),
    ('var', "Explained variance: "),
)
evaluator = RegressionEvaluator(metricName='rmse')
for metric_name, caption in metric_captions:
    evaluator.setMetricName(metric_name)
    print(caption, evaluator.evaluate(pred))

+-----+------------------+
|label|        prediction|
+-----+------------------+
|110.0|136.78035272999264|
| 91.0| 82.45982735478307|
|245.0| 213.5221532280251|
|180.0|191.78183479976053|
|180.0|191.78183479976053|
|105.0|106.65765121430641|
+-----+------------------+

Compound Poisson-Gamma Distribution Regression
RMSE:  18.53531650929676
R2:  0.8849105616564172
MAE:  15.33661549350199
Explained variance:  2346.2323084243108
