In [27]:
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("Python Spark regression example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()
df = spark.read.format('csv').options(header='true', inferschema='true').\
load('data/project/regression/house-prices-advanced-regression-techniques/train.csv')

<font size=5> Show statistics of each column, including feature columns and label column (medv)  </font>

In [29]:
df.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- MSSubClass: integer (nullable = true)
 |-- MSZoning: string (nullable = true)
 |-- LotFrontage: string (nullable = true)
 |-- LotArea: integer (nullable = true)
 |-- Street: string (nullable = true)
 |-- Alley: string (nullable = true)
 |-- LotShape: string (nullable = true)
 |-- LandContour: string (nullable = true)
 |-- Utilities: string (nullable = true)
 |-- LotConfig: string (nullable = true)
 |-- LandSlope: string (nullable = true)
 |-- Neighborhood: string (nullable = true)
 |-- Condition1: string (nullable = true)
 |-- Condition2: string (nullable = true)
 |-- BldgType: string (nullable = true)
 |-- HouseStyle: string (nullable = true)
 |-- OverallQual: integer (nullable = true)
 |-- OverallCond: integer (nullable = true)
 |-- YearBuilt: integer (nullable = true)
 |-- YearRemodAdd: integer (nullable = true)
 |-- RoofStyle: string (nullable = true)
 |-- RoofMatl: string (nullable = true)
 |-- Exterior1st: string (nullable = true)
 |--

In [30]:
catcols=[]
for i in df.dtypes:
    if i[1] == 'string':
        catcols.append(i[0])

In [31]:
num_cols=[]
for i in df.dtypes:
    if i[1] != 'string' and i[0] != 'SalePrice':
        num_cols.append(i[0])

In [44]:
labelCol='SalePrice'

In [45]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql.functions import col

# categorical columns
categorical_columns = catcols

In [46]:
indexers = [ StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c)) for c in categorical_columns ]

In [47]:
encoders = [ OneHotEncoder(inputCol=indexer.getOutputCol(), \
                           outputCol="{0}_encoded".format(indexer.getOutputCol())) for indexer in indexers ]

In [48]:
assembler = VectorAssembler(inputCols=[encoder.getOutputCol() \
                                       for encoder in encoders] + num_cols, outputCol="features")

In [49]:
pipeline = Pipeline(stages=indexers + encoders + [assembler])
model=pipeline.fit(df)
data = model.transform(df)
data = data.withColumn('label',col(labelCol))
data=data.select('features','label')
data.show(5, truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+
|features                                                                                                                                                                                                                                                                                                                                                                           

In [50]:
from pyspark.ml.feature import VectorIndexer
# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous. 
# Update metadata accordingly.
featureIndexer =VectorIndexer(inputCol="features", \
                                  outputCol="indexedFeatures", \
                                  maxCategories=4).fit(data)


In [51]:
data=featureIndexer.transform(data)
data.show(5)

+--------------------+------+--------------------+
|            features| label|     indexedFeatures|
+--------------------+------+--------------------+
|(793,[0,10,114,11...|208500|(793,[0,10,114,11...|
|(793,[0,7,114,115...|181500|(793,[0,7,114,115...|
|(793,[0,17,114,11...|223500|(793,[0,17,114,11...|
|(793,[0,5,114,115...|140000|(793,[0,5,114,115...|
|(793,[0,46,114,11...|250000|(793,[0,46,114,11...|
+--------------------+------+--------------------+
only showing top 5 rows



In [52]:
# Split the data into training and test sets (40% held out for testing)
(trainingData, testData) = data.randomSplit([0.6, 0.4])
trainingData.show(5,False)
testData.show(5,False)

+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

<font size=5>

Let's do Linear Regression first, fit the Linear Regression model with train_df
    
</font>

In [53]:
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(featuresCol = 'indexedFeatures', labelCol='label', maxIter=10, regParam=0.3, elasticNetParam=0.8)
lr_model = lr.fit(trainingData)
print("Coefficients: " + str(lr_model.coefficients))
print("Intercept: " + str(lr_model.intercept))

23/05/18 14:55:31 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
23/05/18 14:55:31 WARN InstanceBuilder$NativeBLAS: Failed to load implementation from:dev.ludovic.netlib.blas.ForeignLinkerBLAS
23/05/18 14:55:32 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
23/05/18 14:55:32 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
Coefficients: [1917.160030379038,-1689.6978474747243,3435.4329150546705,1037.3754544569008,684.4296777375985,-5148.310450455826,1800.0343155607347,-1207.077225463191,-2966.1692788449154,8796.771942097905,-3829.021458911261,-2792.556154566117,-2000.972891761346,-1562.5804532307363,10173.762498840331,-3201.6420557683955,3380.7677221978784,-722.8250477465193,-5741.096100784033,674.0254566063854,3584.6765394678796,4292.232046093475,0.0,2572.509103791946,-0.0,8781.701002106041,586.4578111663332,-2339.811125507324,1223.67016592706,-14861.53669

<font size=5>
Linear Regression produced slope coefficients and intercept

y=a1 X x1 + a2 X x2 +...+ an X xn + b

a1,a2,...an are coefficients for the xn in their space
b is intercept

x1, x2, ... xn are independent variables

</font>

In [54]:
trainingSummary = lr_model.summary
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

RMSE: 11092.356683
r2: 0.981226


In [55]:
trainingData.describe().show()

+-------+------------------+
|summary|             label|
+-------+------------------+
|  count|               865|
|   mean|182455.25317919074|
| stddev|  81001.6225020652|
|    min|             34900|
|    max|            745000|
+-------+------------------+



<font size=5>

Test the model with test_df, testing produces metrics that evaluates the performance of the regressor with RMSE and R2 score.

  
    
</font>

In [56]:
lr_predictions = lr_model.transform(testData)
lr_predictions.select("prediction","label","indexedFeatures").show(5)
from pyspark.ml.evaluation import RegressionEvaluator
lr_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="label",metricName="r2")
print("R Squared (R2) on test data = %g" % lr_evaluator.evaluate(lr_predictions))

+------------------+------+--------------------+
|        prediction| label|     indexedFeatures|
+------------------+------+--------------------+
| 221698.8891367484|180000|(793,[0,4,114,115...|
|230726.33736688783|272000|(793,[0,4,114,115...|
|254888.22204350145|236500|(793,[0,4,114,115...|
|190714.77210748335|228500|(793,[0,4,114,115...|
|238250.00716324686|287000|(793,[0,4,114,115...|
+------------------+------+--------------------+
only showing top 5 rows

R Squared (R2) on test data = 0.782314


In [57]:
test_result = lr_model.evaluate(testData)
print("Root Mean Squared Error (RMSE) on test data = %g" % test_result.rootMeanSquaredError)

Root Mean Squared Error (RMSE) on test data = 35957.5


In [58]:
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show(2)

numIterations: 10
objectiveHistory: [0.5, 0.334514953040227, 0.14759293589408232, 0.08313392503822171, 0.0438587159572454, 0.03263985570710769, 0.017880267778532046, 0.014395661604855682, 0.011906319499620252, 0.010370229455823547, 0.009406911580501684]
+------------------+
|         residuals|
+------------------+
|18742.939005149063|
| 787.3177377092652|
+------------------+
only showing top 2 rows



<font size=5>
    
Now try Gradient Boost Tree Regressor with the same train_df and test_df 
    
    
</font>

In [59]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator


In [61]:
gbt = GBTRegressor(featuresCol="indexedFeatures",labelCol='label', maxIter=10)
gbt_model = gbt.fit(trainingData)


In [62]:
gbt_predictions = gbt_model.transform(testData)
gbt_predictions.select("prediction","label","indexedFeatures").show(5)

+------------------+------+--------------------+
|        prediction| label|     indexedFeatures|
+------------------+------+--------------------+
|298270.68872449966|180000|(793,[0,4,114,115...|
|249595.29012726084|272000|(793,[0,4,114,115...|
|309128.73236294644|236500|(793,[0,4,114,115...|
| 286623.7316972314|228500|(793,[0,4,114,115...|
| 288485.1788028957|287000|(793,[0,4,114,115...|
+------------------+------+--------------------+
only showing top 5 rows



<font size=5>

Test the model with testData, testing produces metrics that evaluates the performance of the regressor with RMSE and R2 score.

Looks like the metrics of Gradient Boost Tree are better that those of Linear Regressor
    
    
</font>

In [63]:
from pyspark.ml.evaluation import RegressionEvaluator
gbt_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="label",metricName="r2")
print("R Squared (R2) on test data = %g" % gbt_evaluator.evaluate(gbt_predictions))

R Squared (R2) on test data = 0.623158


In [64]:
gbt_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="label",metricName="rmse")

In [65]:
print("RMSE on test data = %g" % gbt_evaluator.evaluate(gbt_predictions))

RMSE on test data = 47310.1


<font size=5>

Now try Random Forest Regressor with the same train_df and testData
    
</font>

In [66]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator


In [68]:

rf = RandomForestRegressor(featuresCol="indexedFeatures",labelCol='label', maxDepth=3)
rf_model = rf.fit(trainingData)

In [70]:
rf_predictions = rf_model.transform(testData)
rf_predictions.select("prediction","label","indexedFeatures").show(5)

+------------------+------+--------------------+
|        prediction| label|     indexedFeatures|
+------------------+------+--------------------+
|196813.88408157713|180000|(793,[0,4,114,115...|
|245360.10733130894|272000|(793,[0,4,114,115...|
|272346.38704260724|236500|(793,[0,4,114,115...|
|251881.43152475887|228500|(793,[0,4,114,115...|
| 266022.5401291818|287000|(793,[0,4,114,115...|
+------------------+------+--------------------+
only showing top 5 rows



<font size=5>
    
Test the model with testData, testing produces metrics that evaluates the performance of the regressor with RMSE and R2 score.

Looks like the metrics of Random Forest are better that those of Linear Regressor, but similar to those of Gradient Boost Tree
    
</font>

In [71]:
from pyspark.ml.evaluation import RegressionEvaluator
rf_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="label",metricName="r2")
print("R Squared (R2) on test data = %g" % rf_evaluator.evaluate(rf_predictions))

R Squared (R2) on test data = 0.748724


In [72]:
rf_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="label",metricName="rmse")

In [73]:
print("RMSE on test data = %g" % gbt_evaluator.evaluate(gbt_predictions))

RMSE on test data = 47310.1


<font size=5>

Finally, try Decision Tree regressor with the same train_df and test_df
    
    
</font>

In [74]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator


In [75]:
dt = DecisionTreeRegressor(featuresCol="indexedFeatures",labelCol='label', maxDepth=3)
dt_model = dt.fit(trainingData)

In [76]:
dt_predictions = dt_model.transform(testData)
dt_predictions.select("prediction","label","indexedFeatures").show(5)

+------------------+------+--------------------+
|        prediction| label|     indexedFeatures|
+------------------+------+--------------------+
|213025.18867924527|180000|(793,[0,4,114,115...|
|251076.86440677967|272000|(793,[0,4,114,115...|
|          349955.3|236500|(793,[0,4,114,115...|
|251076.86440677967|228500|(793,[0,4,114,115...|
|251076.86440677967|287000|(793,[0,4,114,115...|
+------------------+------+--------------------+
only showing top 5 rows



<font size=5>
    
Test the model with test_df, testing produces metrics that evaluates the performance of the regressor with RMSE and R2 score.

Looks like the metrics of Decision Tree Regressor are slightly better than that those of Linear Regressor, but not as good as Gradient Boost Tree and Random Forest

</font>

In [77]:
from pyspark.ml.evaluation import RegressionEvaluator
dt_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="label",metricName="r2")
print("R Squared (R2) on test data = %g" % dt_evaluator.evaluate(dt_predictions))

R Squared (R2) on test data = 0.604514


In [78]:
dt_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="label",metricName="rmse")

In [79]:
print("RMSE on test data = %g" % dt_evaluator.evaluate(dt_predictions))

RMSE on test data = 48466.3


<font size=5>

This concludes the testing of Spark ML regressors

</font>