# DATA PREPARATION

In [3]:
import pyspark
from pyspark.sql.functions import *
from pyspark.ml.feature import StringIndexer, IndexToString, RFormula, OneHotEncoderEstimator, VectorAssembler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline, Model






#required imports
from sklearn.neural_network import MLPRegressor
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sn
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error  
from math import sqrt 

## RMSE = Root Mean Square Error 
is a frequently used measure of the differences between values predicted by a model. 

It is the standard deviation of prediction errors.<br>
[RMSE](https://www.statisticshowto.datasciencecentral.com/rmse/) answers the question of how concentrated the data is around the line of best fit.
![](img/rmse.png)
Where:<br>
f = forecasts (expected values or unknown results)<br>
o = observed values (known results)

In [4]:
#load features
features = pd.read_csv("raw_files/features_newest.csv")
features.head()

Unnamed: 0,Month,Season,Weekday,Holiday,Daily Weather,Daily Weather (Past),Humidity,Humidity (Past),Windspeed,Windspeed (Past),Apparent Temperature (Avg),Apparent Temperature (Avg) (Past),Rented Bikes,Rented Bikes (Future)
0,January,Winter,Monday,False,partly-cloudy-day,fog,0.88,0.94,1.59,0.55,46.74,36.295,281,279.0
1,January,Winter,Tuesday,False,partly-cloudy-day,partly-cloudy-day,0.86,0.88,2.07,1.59,42.15,46.74,279,274.0
2,January,Winter,Wednesday,False,clear-day,partly-cloudy-day,0.86,0.86,4.13,2.07,45.45,42.15,274,161.0
3,January,Winter,Thursday,False,rain,clear-day,0.87,0.86,3.6,4.13,46.2,45.45,161,270.0
4,January,Winter,Friday,False,partly-cloudy-day,rain,0.81,0.87,7.43,3.6,56.085,46.2,270,62.0


In [5]:
#target variable
label = features['Rented Bikes'].tolist()

In [6]:
len(label)

1514

In [7]:
#transorm non-numerical labels to numerical labels
le = LabelEncoder()
features.loc[:,'Holiday'] = le.fit_transform(features['Holiday'])
features.head()

Unnamed: 0,Month,Season,Weekday,Holiday,Daily Weather,Daily Weather (Past),Humidity,Humidity (Past),Windspeed,Windspeed (Past),Apparent Temperature (Avg),Apparent Temperature (Avg) (Past),Rented Bikes,Rented Bikes (Future)
0,January,Winter,Monday,0,partly-cloudy-day,fog,0.88,0.94,1.59,0.55,46.74,36.295,281,279.0
1,January,Winter,Tuesday,0,partly-cloudy-day,partly-cloudy-day,0.86,0.88,2.07,1.59,42.15,46.74,279,274.0
2,January,Winter,Wednesday,0,clear-day,partly-cloudy-day,0.86,0.86,4.13,2.07,45.45,42.15,274,161.0
3,January,Winter,Thursday,0,rain,clear-day,0.87,0.86,3.6,4.13,46.2,45.45,161,270.0
4,January,Winter,Friday,0,partly-cloudy-day,rain,0.81,0.87,7.43,3.6,56.085,46.2,270,62.0


In [8]:
#transorm non-numerical labels to numerical labels
cleanup_nums = {"Month":   {"January": 1, "February": 2, "March": 3, "April": 4, "May": 5, "June": 6, "July": 7, "August": 8, "September": 9, "October": 10, "November": 11, "December": 12},
               "Season": {"Spring": 1, "Summer": 2, "Autumn": 3, "Winter": 4},
               "Weekday": {"Monday": 1, "Tuesday": 2, "Wednesday": 3, "Thursday": 4,
                                  "Friday": 5, "Saturday": 6, "Sunday":7 },
               "Daily Weather": {"clear-day": 1, "partly-cloudy-day": 2,"cloudy":3,"wind":4, "rain": 5, "fog": 6, "snow": 7},
               "Daily Weather (Past)": {"clear-day": 1, "partly-cloudy-day": 2,"cloudy":3,"wind":4, "rain": 5,"fog": 6, "snow": 7} }

In [9]:
features.replace(cleanup_nums, inplace=True)

In [10]:
features.head()

Unnamed: 0,Month,Season,Weekday,Holiday,Daily Weather,Daily Weather (Past),Humidity,Humidity (Past),Windspeed,Windspeed (Past),Apparent Temperature (Avg),Apparent Temperature (Avg) (Past),Rented Bikes,Rented Bikes (Future)
0,1,4,1,0,2,6.0,0.88,0.94,1.59,0.55,46.74,36.295,281,279.0
1,1,4,2,0,2,2.0,0.86,0.88,2.07,1.59,42.15,46.74,279,274.0
2,1,4,3,0,1,2.0,0.86,0.86,4.13,2.07,45.45,42.15,274,161.0
3,1,4,4,0,5,1.0,0.87,0.86,3.6,4.13,46.2,45.45,161,270.0
4,1,4,5,0,2,5.0,0.81,0.87,7.43,3.6,56.085,46.2,270,62.0


In [11]:
features.shape

(1514, 14)

In [12]:
#Feature Matrix without past
X=features.loc[:,['Month','Season','Weekday','Daily Weather','Humidity','Windspeed', 'Apparent Temperature (Avg)','Rented Bikes (Future)']]
X.head()

Unnamed: 0,Month,Season,Weekday,Daily Weather,Humidity,Windspeed,Apparent Temperature (Avg),Rented Bikes (Future)
0,1,4,1,2,0.88,1.59,46.74,279.0
1,1,4,2,2,0.86,2.07,42.15,274.0
2,1,4,3,1,0.86,4.13,45.45,161.0
3,1,4,4,5,0.87,3.6,46.2,270.0
4,1,4,5,2,0.81,7.43,56.085,62.0


In [14]:
#X['label'] = y

In [68]:
#X.to_csv('raw_files/encoded_last.csv')

In [13]:


# Below code is Spark 2+
spark = pyspark.sql.SparkSession.builder.appName('test').getOrCreate()
df = spark.read.csv('raw_files/encoded_last.csv', header = True, inferSchema = True)


In [14]:
df.show(5)

+---+-----+------+-------+-------------+--------+---------+--------------------------+---------------------+-----+
|_c0|Month|Season|Weekday|Daily Weather|Humidity|Windspeed|Apparent Temperature (Avg)|Rented Bikes (Future)|label|
+---+-----+------+-------+-------------+--------+---------+--------------------------+---------------------+-----+
|  0|    1|     4|      1|            2|    0.88|     1.59|                     46.74|                279.0|  281|
|  1|    1|     4|      2|            2|    0.86|     2.07|         42.15000000000001|                274.0|  279|
|  2|    1|     4|      3|            1|    0.86|     4.13|                     45.45|                161.0|  274|
|  3|    1|     4|      4|            5|    0.87|      3.6|                      46.2|                270.0|  161|
|  4|    1|     4|      5|            2|    0.81|     7.43|                    56.085|                 62.0|  270|
+---+-----+------+-------+-------------+--------+---------+---------------------

In [15]:
#cols = df.columns
#categoricalColumns = [, "Daily Weather (Past)"]
cols = [ "Humidity", "Windspeed", "Apparent Temperature (Avg)", "Rented Bikes (Future)", "Daily Weather", "Month", "Season", "Weekday" ]
stages = [] # stages in our Pipeline


assembler = VectorAssembler(inputCols= cols , outputCol="features").setHandleInvalid("skip")

preppedDataDF = assembler.transform(df)





In [16]:
preppedDataDF.show(5)

+---+-----+------+-------+-------------+--------+---------+--------------------------+---------------------+-----+--------------------+
|_c0|Month|Season|Weekday|Daily Weather|Humidity|Windspeed|Apparent Temperature (Avg)|Rented Bikes (Future)|label|            features|
+---+-----+------+-------+-------------+--------+---------+--------------------------+---------------------+-----+--------------------+
|  0|    1|     4|      1|            2|    0.88|     1.59|                     46.74|                279.0|  281|[0.88,1.59,46.74,...|
|  1|    1|     4|      2|            2|    0.86|     2.07|         42.15000000000001|                274.0|  279|[0.86,2.07,42.150...|
|  2|    1|     4|      3|            1|    0.86|     4.13|                     45.45|                161.0|  274|[0.86,4.13,45.45,...|
|  3|    1|     4|      4|            5|    0.87|      3.6|                      46.2|                270.0|  161|[0.87,3.6,46.2,27...|
|  4|    1|     4|      5|            2|    0.81

In [17]:
preppedDataDF.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- Season: integer (nullable = true)
 |-- Weekday: integer (nullable = true)
 |-- Daily Weather: integer (nullable = true)
 |-- Humidity: double (nullable = true)
 |-- Windspeed: double (nullable = true)
 |-- Apparent Temperature (Avg): double (nullable = true)
 |-- Rented Bikes (Future): double (nullable = true)
 |-- label: integer (nullable = true)
 |-- features: vector (nullable = true)



In [18]:
# Keep relevant columns
selectedcols = ["features", "label"]
dataset = preppedDataDF.select(selectedcols)
display(dataset)



DataFrame[features: vector, label: int]

In [19]:
dataset.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[0.88,1.59,46.74,...|  281|
|[0.86,2.07,42.150...|  279|
|[0.86,4.13,45.45,...|  274|
|[0.87,3.6,46.2,27...|  161|
|[0.81,7.43,56.085...|  270|
+--------------------+-----+
only showing top 5 rows



In [20]:
### Randomly split data into training and test sets. set seed for reproducibility
(trainingData, testData) = dataset.randomSplit([0.7, 0.3], seed=100)
print(trainingData.count())
print(testData.count())

1055
458


In [23]:
trainingData.show(5)

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[0.41,10.85,70.53...|  383|
|[0.48,5.68,70.039...|  361|
|[0.49,9.74,56.395...|  374|
|[0.49,11.71,67.15...|  374|
|[0.51,2.63,65.36,...|   89|
+--------------------+-----+
only showing top 5 rows



  #  Multinominal logistic regression

In [25]:
from pyspark.ml.classification import LogisticRegression


lr = LogisticRegression(labelCol="label", featuresCol="features",maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Fit the model
lrModel = lr.fit(dataset)

# Print the coefficients and intercept for multinomial logistic regression
print("Coefficients: \n" + str(lrModel.coefficientMatrix))
print("Intercept: " + str(lrModel.interceptVector))

trainingSummary = lrModel.summary

# Obtain the objective per iteration
objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective in objectiveHistory:
    print(objective)

# for multiclass, we can inspect metrics on a per-label basis
print("False positive rate by label:")
for i, rate in enumerate(trainingSummary.falsePositiveRateByLabel):
    print("label %d: %s" % (i, rate))

print("True positive rate by label:")
for i, rate in enumerate(trainingSummary.truePositiveRateByLabel):
    print("label %d: %s" % (i, rate))

print("Precision by label:")
for i, prec in enumerate(trainingSummary.precisionByLabel):
    print("label %d: %s" % (i, prec))

print("Recall by label:")
for i, rec in enumerate(trainingSummary.recallByLabel):
    print("label %d: %s" % (i, rec))

print("F-measure by label:")
for i, f in enumerate(trainingSummary.fMeasureByLabel()):
    print("label %d: %s" % (i, f))

accuracy = trainingSummary.accuracy
falsePositiveRate = trainingSummary.weightedFalsePositiveRate
truePositiveRate = trainingSummary.weightedTruePositiveRate
fMeasure = trainingSummary.weightedFMeasure()
precision = trainingSummary.weightedPrecision
recall = trainingSummary.weightedRecall
print("Accuracy: %s\nFPR: %s\nTPR: %s\nF-measure: %s\nPrecision: %s\nRecall: %s"
      % (accuracy, falsePositiveRate, truePositiveRate, fMeasure, precision, recall))

Coefficients: 
839 X 8 CSCMatrix

Intercept: [-1.462615163343616,-1.462615163343616,-1.462615163343616,-1.462615163343616,-1.462615163343616,-1.462615163343616,-1.462615163343616,-1.462615163343616,-1.462615163343616,-1.462615163343616,-1.462615163343616,-1.462615163343616,0.5815132999508911,-1.462615163343616,-1.462615163343616,-1.462615163343616,0.5815132999508911,1.2922070125915213,2.260642846094094,-1.462615163343616,0.5815132999508881,-1.462615163343616,0.5815132999508881,0.5815132999508883,2.260642846094097,-1.462615163343616,1.2922070125915068,2.260642846094111,1.2922070125915566,2.833845900892422,2.7379891427734884,2.260642846094104,2.514663006163527,1.292207012591538,2.514663006163527,2.2606428460940964,2.9983243937426014,2.260642846094149,3.2313365366756144,2.833845900892425,2.8338459008924297,2.5146630061635276,2.514663006163527,3.231336536675633,2.737989142773429,2.514663006163525,3.3651899764184336,-1.462615163343616,2.260642846094159,2.833845900892419,2.260642846094061,2.

label 0: 0.0
label 1: 0.0
label 2: 0.0
label 3: 0.0
label 4: 0.0
label 5: 0.0
label 6: 0.0
label 7: 0.0
label 8: 0.0
label 9: 0.0
label 10: 0.0
label 11: 0.0
label 12: 0.0
label 13: 0.0
label 14: 0.0
label 15: 0.0
label 16: 0.0
label 17: 0.0
label 18: 0.0
label 19: 0.0
label 20: 0.0
label 21: 0.0
label 22: 0.0
label 23: 0.0
label 24: 0.0
label 25: 0.0
label 26: 0.0
label 27: 0.0
label 28: 0.0
label 29: 0.0
label 30: 0.0
label 31: 0.0
label 32: 0.0
label 33: 0.0
label 34: 0.0
label 35: 0.0
label 36: 0.0
label 37: 0.0
label 38: 0.0
label 39: 0.0
label 40: 0.0
label 41: 0.0
label 42: 0.0
label 43: 0.0
label 44: 0.0
label 45: 0.0
label 46: 0.0
label 47: 0.0
label 48: 0.0
label 49: 0.0
label 50: 0.0
label 51: 0.0
label 52: 0.0
label 53: 0.0
label 54: 0.0
label 55: 0.0
label 56: 0.0
label 57: 0.0
label 58: 0.0
label 59: 0.0
label 60: 0.0
label 61: 0.0
label 62: 0.0
label 63: 0.0
label 64: 0.0
label 65: 0.0
label 66: 0.0
label 67: 0.0
label 68: 0.0
label 69: 0.0
label 70: 0.0
label 71: 0.0
la

In [30]:
predictions = lrModel.transform(testData)
predictions.select("label","features","prediction").toPandas().head()

Unnamed: 0,label,features,prediction
0,360,"[0.5, 0.75, 69.465, 387.0, 1.0, 6.0, 2.0, 1.0]",374.0
1,102,"[0.51, 8.14, 74.225, 383.0, 1.0, 7.0, 2.0, 7.0]",374.0
2,421,"[0.52, 2.63, 78.645, 350.0, 2.0, 7.0, 2.0, 3.0]",374.0
3,410,"[0.53, 2.21, 56.19499999999999, 368.0, 1.0, 5....",374.0
4,838,"[0.55, 1.77, 62.97, 359.0, 2.0, 7.0, 2.0, 4.0]",374.0


In [31]:
dfs= predictions.toPandas()

In [32]:
a=dfs.loc[:,'label']
b=dfs.loc[:,'prediction']
rmse=sqrt(mean_squared_error(a, b))  
"The root mean square error of the model is {0:.3f}".format(rmse) 

'The root mean square error of the model is 181.051'

  #  NAIVE BAYES
  
  Naive Bayes can be trained very efficiently. With a single pass over the training data, it computes the conditional probability distribution of each feature given each label. For prediction, it applies Bayes’ theorem to compute the conditional probability distribution of each label given an observatio

In [33]:

from pyspark.ml.classification import NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


# create the trainer and set its parameters
nb = NaiveBayes(labelCol="label", featuresCol="features",smoothing=1.0, modelType="multinomial")

# train the model
model = nb.fit(trainingData)

# select example rows to display.
predictions = model.transform(testData)
predictions.show()

# compute accuracy on the test set
evaluator = MulticlassClassificationEvaluator(
                                              metricName="accuracy")
accuracy = evaluator.evaluate(predictions)
print("Test set accuracy = " + str(accuracy))

+--------------------+-----+--------------------+--------------------+----------+
|            features|label|       rawPrediction|         probability|prediction|
+--------------------+-----+--------------------+--------------------+----------+
|[0.5,0.75,69.465,...|  360|[-503.39013414973...|[1.76103730584544...|     267.0|
|[0.51,8.14,74.225...|  102|[-551.43867179154...|[4.38931506436329...|     195.0|
|[0.52,2.63,78.645...|  421|[-494.37295733011...|[8.32206870204753...|     251.0|
|[0.53,2.21,56.194...|  410|[-479.90857080891...|[9.31814034778108...|     249.0|
|[0.55,1.77,62.97,...|  838|[-487.33677132064...|[1.19584488718303...|     267.0|
|[0.55,4.18,74.36,...|   80|[-258.32644246764...|[5.52391797803166...|     224.0|
|[0.55,4.72,61.725...|  359|[-224.20973119386...|[1.51376703753594...|      49.0|
|[0.57,1.06,68.235...|  404|[-444.16443544783...|[7.71590975311418...|     267.0|
|[0.57,4.0,42.395,...|  374|[-461.98928714617...|[1.12571477313646...|     252.0|
|[0.57,5.72,67.7

In [34]:
dfs= predictions.toPandas()


In [35]:
a=dfs.loc[:,'label']
b=dfs.loc[:,'prediction']
rmse=sqrt(mean_squared_error(a, b))  
"The root mean square error of the model is {0:.3f}".format(rmse) 

'The root mean square error of the model is 145.654'

  #  Generaliszed linear regression
  
  Contrasted with linear regression where the output is assumed to follow a Gaussian distribution, generalized linear models (GLMs) are specifications of linear models where the response variable Yi follows some distribution from the exponential family of distributions. 
  
  Spark currently only supports up to 4096 features through its GeneralizedLinearRegression interface

In [38]:
from pyspark.ml.regression import GeneralizedLinearRegression

glr = GeneralizedLinearRegression(family="gaussian", link="identity", maxIter=10, regParam=0.3)

# Fit the model
model = glr.fit(dataset)

# Print the coefficients and intercept for generalized linear regression model
print("Coefficients: " + str(model.coefficients))
print("Intercept: " + str(model.intercept))

# Summarize the model over the training set and print out some metrics
summary = model.summary
print("Coefficient Standard Errors: " + str(summary.coefficientStandardErrors))
print("T Values: " + str(summary.tValues))
print("P Values: " + str(summary.pValues))
print("Dispersion: " + str(summary.dispersion))
print("Null Deviance: " + str(summary.nullDeviance))
print("Residual Degree Of Freedom Null: " + str(summary.residualDegreeOfFreedomNull))
print("Deviance: " + str(summary.deviance))
print("Residual Degree Of Freedom: " + str(summary.residualDegreeOfFreedom))
print("AIC: " + str(summary.aic))
print("Deviance Residuals: ")
summary.residuals().show()

Coefficients: [-168.14083236129397,-2.851978822244318,1.5585176168533887,0.08754512234866518,-3.8360147309195183,-0.28577816703179526,2.8103070551708647,-43.28625659841644]
Intercept: 465.43731047315106
Coefficient Standard Errors: [33.54686978682021, 0.7772676570016662, 0.2746747565911543, 0.020113183443095234, 1.756219325058362, 0.7370166523537957, 2.440094164406971, 1.3105860301613133, 32.702366783907394]
T Values: [-5.012116880942275, -3.6692364548473733, 5.6740475032920425, 4.352623869630097, -2.184245826353176, -0.3877499458378737, 1.1517207393731335, -33.0281687750697, 14.232526763236889]
P Values: [6.020876093248972e-07, 0.0002517662550869382, 1.6702176752758646e-08, 1.4359620283954655e-05, 0.02909817321000152, 0.6982559591332509, 0.24961889322891917, 0.0, 0.0]
Dispersion: 7950.5953440406865
Null Deviance: 26577824.753469933
Residual Degree Of Freedom Null: 1512
Deviance: 11957695.397437192
Residual Degree Of Freedom: 1504
AIC: 17892.937290351514
Deviance Residuals: 
+---------

In [45]:
predictions = model.transform(testData)
predictions.select("label","features","prediction").toPandas().head()


Unnamed: 0,label,features,prediction
0,360,"[0.5, 0.75, 69.465, 387.0, 1.0, 6.0, 2.0, 1.0]",478.153973
1,102,"[0.51, 8.14, 74.225, 383.0, 1.0, 7.0, 2.0, 7.0]",202.461486
2,421,"[0.52, 2.63, 78.645, 350.0, 2.0, 7.0, 2.0, 3.0]",389.803152
3,410,"[0.53, 2.21, 56.19499999999999, 368.0, 1.0, 5....",314.217674
4,838,"[0.55, 1.77, 62.97, 359.0, 2.0, 7.0, 2.0, 4.0]",320.283515


In [46]:
dfs_glr= predictions.toPandas()
a=dfs_glr.loc[:,'label']
b=dfs_glr.loc[:,'prediction']
rmse=sqrt(mean_squared_error(a, b))  
"The root mean square error of the model is {0:.3f}".format(rmse) 

'The root mean square error of the model is 95.392'

# Decision tree regression

In [50]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator



# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(dataset)


# Train a DecisionTree model.
dt = DecisionTreeRegressor(featuresCol="indexedFeatures")

# Chain indexer and tree in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, dt])

# Train model.  This also runs the indexer.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

treeModel = model.stages[1]
# summary only
print(treeModel)

+------------------+-----+--------------------+
|        prediction|label|            features|
+------------------+-----+--------------------+
| 361.1818181818182|  360|[0.5,0.75,69.465,...|
| 92.93333333333334|  102|[0.51,8.14,74.225...|
|388.30526315789473|  421|[0.52,2.63,78.645...|
|388.30526315789473|  410|[0.53,2.21,56.194...|
|388.30526315789473|  838|[0.55,1.77,62.97,...|
+------------------+-----+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 66.7557
DecisionTreeRegressionModel (uid=DecisionTreeRegressor_58ea3674f6aa) of depth 5 with 63 nodes


# Random forest regression

In [54]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator


# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(dataset)



# Train a RandomForest model.
rf = RandomForestRegressor(featuresCol="indexedFeatures")

# Chain indexer and forest in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, rf])

# Train model.  This also runs the indexer.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

rfModel = model.stages[1]
print(rfModel)  # summary only

+------------------+-----+--------------------+
|        prediction|label|            features|
+------------------+-----+--------------------+
|370.21593153596564|  360|[0.5,0.75,69.465,...|
|130.59313410056097|  102|[0.51,8.14,74.225...|
| 361.3742856113027|  421|[0.52,2.63,78.645...|
|361.10228822788997|  410|[0.53,2.21,56.194...|
|360.65701037478044|  838|[0.55,1.77,62.97,...|
+------------------+-----+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 65.4324
RandomForestRegressionModel (uid=RandomForestRegressor_960abfea5bd4) with 20 trees


# Gradient-boosted tree regression

In [57]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import GBTRegressor
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator



# Automatically identify categorical features, and index them.
# Set maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =\
    VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(dataset)


# Train a GBT model.
gbt = GBTRegressor(featuresCol="indexedFeatures", maxIter=10)

# Chain indexer and GBT in a Pipeline
pipeline = Pipeline(stages=[featureIndexer, gbt])

# Train model.  This also runs the indexer.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Select example rows to display.
predictions.select("prediction", "label", "features").show(5)

# Select (prediction, true label) and compute test error
evaluator = RegressionEvaluator(
    labelCol="label", predictionCol="prediction", metricName="rmse")
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % rmse)

gbtModel = model.stages[1]
print(gbtModel)  # summary only

+------------------+-----+--------------------+
|        prediction|label|            features|
+------------------+-----+--------------------+
|   370.80126713324|  360|[0.5,0.75,69.465,...|
| 92.30893779412457|  102|[0.51,8.14,74.225...|
|385.27381530022654|  421|[0.52,2.63,78.645...|
| 393.3158747084903|  410|[0.53,2.21,56.194...|
| 393.9794239160642|  838|[0.55,1.77,62.97,...|
+------------------+-----+--------------------+
only showing top 5 rows

Root Mean Squared Error (RMSE) on test data = 66.0122
GBTRegressionModel (uid=GBTRegressor_3b634f05a358) with 10 trees
