In [68]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [69]:
!ls

gdrive	sample_data  spark-2.4.4-bin-hadoop2.7	spark-2.4.4-bin-hadoop2.7.tgz


In [0]:
DIR = '/content/gdrive/My Drive/Spark/data/'

In [0]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://www-eu.apache.org/dist/spark/spark-2.4.4/spark-2.4.4-bin-hadoop2.7.tgz
!tar xf spark-2.4.4-bin-hadoop2.7.tgz
!pip install -q findspark

In [0]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.4-bin-hadoop2.7"

In [0]:
import findspark
findspark.init("spark-2.4.4-bin-hadoop2.7")# SPARK_HOME

In [0]:
from pyspark.sql import SparkSession

spark = SparkSession \
        .builder \
        .appName('Python Spark regression example')\
        .config('spark.some.config.option','some-value')\
        .getOrCreate()

In [75]:
file_loc = DIR + 'Advertising.csv'
#print(file_loc)
df = spark.read.csv(file_loc, inferSchema=True, header=True)
print(type(df))

<class 'pyspark.sql.dataframe.DataFrame'>


In [76]:
df.show()
df.printSchema()

+---+-----+-----+---------+-----+
|_c0|   TV|Radio|Newspaper|Sales|
+---+-----+-----+---------+-----+
|  1|230.1| 37.8|     69.2| 22.1|
|  2| 44.5| 39.3|     45.1| 10.4|
|  3| 17.2| 45.9|     69.3|  9.3|
|  4|151.5| 41.3|     58.5| 18.5|
|  5|180.8| 10.8|     58.4| 12.9|
|  6|  8.7| 48.9|     75.0|  7.2|
|  7| 57.5| 32.8|     23.5| 11.8|
|  8|120.2| 19.6|     11.6| 13.2|
|  9|  8.6|  2.1|      1.0|  4.8|
| 10|199.8|  2.6|     21.2| 10.6|
| 11| 66.1|  5.8|     24.2|  8.6|
| 12|214.7| 24.0|      4.0| 17.4|
| 13| 23.8| 35.1|     65.9|  9.2|
| 14| 97.5|  7.6|      7.2|  9.7|
| 15|204.1| 32.9|     46.0| 19.0|
| 16|195.4| 47.7|     52.9| 22.4|
| 17| 67.8| 36.6|    114.0| 12.5|
| 18|281.4| 39.6|     55.8| 24.4|
| 19| 69.2| 20.5|     18.3| 11.3|
| 20|147.3| 23.9|     19.1| 14.6|
+---+-----+-----+---------+-----+
only showing top 20 rows

root
 |-- _c0: integer (nullable = true)
 |-- TV: double (nullable = true)
 |-- Radio: double (nullable = true)
 |-- Newspaper: double (nullable = true)
 |-- 

In [0]:
from pyspark.sql import Row
from pyspark.ml.linalg import Vectors

# method 1 (good for small feature):
#def transData(row):
# return Row(label=row["Sales"],
# features=Vectors.dense([row["TV"],
# row["Radio"],
# row["Newspaper"]]))

# Method 2 (good for large features):

def transData(data):
  return data.rdd.map(lambda r: [Vectors.dense(r[1:-1]),r[-1]]).toDF(['features','label'])

In [78]:
transformed= transData(df)
transformed.show(5)

+-----------------+-----+
|         features|label|
+-----------------+-----+
|[230.1,37.8,69.2]| 22.1|
| [44.5,39.3,45.1]| 10.4|
| [17.2,45.9,69.3]|  9.3|
|[151.5,41.3,58.5]| 18.5|
|[180.8,10.8,58.4]| 12.9|
+-----------------+-----+
only showing top 5 rows



In [0]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

featureIndexer = VectorIndexer(inputCol = 'features', \
                               outputCol = 'indexedFeatures', \
                               maxCategories = 4).fit(transformed)

data = featureIndexer.transform(transformed)              

In [80]:
data.show(5,True)

+-----------------+-----+-----------------+
|         features|label|  indexedFeatures|
+-----------------+-----+-----------------+
|[230.1,37.8,69.2]| 22.1|[230.1,37.8,69.2]|
| [44.5,39.3,45.1]| 10.4| [44.5,39.3,45.1]|
| [17.2,45.9,69.3]|  9.3| [17.2,45.9,69.3]|
|[151.5,41.3,58.5]| 18.5|[151.5,41.3,58.5]|
|[180.8,10.8,58.4]| 12.9|[180.8,10.8,58.4]|
+-----------------+-----+-----------------+
only showing top 5 rows



In [0]:
# Split the data

(trainingData, testData) = transformed.randomSplit([0.6,0.4])

In [82]:
trainingData.show(5)
testData.show(5)

+----------------+-----+
|        features|label|
+----------------+-----+
|  [4.1,11.6,5.7]|  3.2|
| [7.3,28.1,41.4]|  5.5|
| [7.8,38.9,50.6]|  6.6|
|  [8.4,27.2,2.1]|  5.7|
|[11.7,36.9,45.2]|  7.3|
+----------------+-----+
only showing top 5 rows

+---------------+-----+
|       features|label|
+---------------+-----+
| [0.7,39.6,8.7]|  1.6|
| [5.4,29.9,9.4]|  5.3|
|  [8.6,2.1,1.0]|  4.8|
|[8.7,48.9,75.0]|  7.2|
|[13.1,0.4,25.6]|  5.3|
+---------------+-----+
only showing top 5 rows



In [0]:
from pyspark.ml.regression import LinearRegression

lr = LinearRegression()
pipeline = Pipeline(stages=[featureIndexer,lr])

model = pipeline.fit(trainingData)

In [84]:
predictions = model.transform(testData)
predictions.select('features','label','prediction').show(5)

+---------------+-----+------------------+
|       features|label|        prediction|
+---------------+-----+------------------+
| [0.7,39.6,8.7]|  1.6|10.623733334918189|
| [5.4,29.9,9.4]|  5.3| 8.917258844485561|
|  [8.6,2.1,1.0]|  4.8| 3.554151288083581|
|[8.7,48.9,75.0]|  7.2|12.882628817673519|
|[13.1,0.4,25.6]|  5.3| 3.442348044013835|
+---------------+-----+------------------+
only showing top 5 rows



In [85]:
from pyspark.ml.evaluation import RegressionEvaluator

evaluator = RegressionEvaluator(labelCol='label',predictionCol='prediction',
                                metricName='rmse')

rmse = evaluator.evaluate(predictions)
print('Root Mean Squared Error (RMSE) on test data = %g' %rmse)

Root Mean Squared Error (RMSE) on test data = 1.83065


In [86]:
y_true = predictions.select('label').toPandas()
y_pred = predictions.select('prediction').toPandas()

import sklearn.metrics

r2_score = sklearn.metrics.r2_score(y_true,y_pred)
print('r2_score: {0}'.format(r2_score))

r2_score: 0.8833252889477862


## Generalized Linear Regression

In [87]:
df.describe().show()

+-------+------------------+-----------------+------------------+------------------+------------------+
|summary|               _c0|               TV|             Radio|         Newspaper|             Sales|
+-------+------------------+-----------------+------------------+------------------+------------------+
|  count|               200|              200|               200|               200|               200|
|   mean|             100.5|         147.0425|23.264000000000024|30.553999999999995|14.022500000000003|
| stddev|57.879184513951124|85.85423631490805|14.846809176168728| 21.77862083852283| 5.217456565710477|
|    min|                 1|              0.7|               0.0|               0.3|               1.6|
|    max|               200|            296.4|              49.6|             114.0|              27.0|
+-------+------------------+-----------------+------------------+------------------+------------------+



In [88]:
transformed.show(5)

+-----------------+-----+
|         features|label|
+-----------------+-----+
|[230.1,37.8,69.2]| 22.1|
| [44.5,39.3,45.1]| 10.4|
| [17.2,45.9,69.3]|  9.3|
|[151.5,41.3,58.5]| 18.5|
|[180.8,10.8,58.4]| 12.9|
+-----------------+-----+
only showing top 5 rows



In [89]:
from pyspark.ml import Pipeline
from pyspark.ml.regression import LinearRegression
from pyspark.ml.feature import VectorIndexer
from pyspark.ml.evaluation import RegressionEvaluator

featureIndexer = VectorIndexer(inputCol = 'features', \
                               outputCol = 'indexedFeatures', \
                               maxCategories = 4).fit(transformed)

data = featureIndexer.transform(transformed)  

data.show()

+-----------------+-----+-----------------+
|         features|label|  indexedFeatures|
+-----------------+-----+-----------------+
|[230.1,37.8,69.2]| 22.1|[230.1,37.8,69.2]|
| [44.5,39.3,45.1]| 10.4| [44.5,39.3,45.1]|
| [17.2,45.9,69.3]|  9.3| [17.2,45.9,69.3]|
|[151.5,41.3,58.5]| 18.5|[151.5,41.3,58.5]|
|[180.8,10.8,58.4]| 12.9|[180.8,10.8,58.4]|
|  [8.7,48.9,75.0]|  7.2|  [8.7,48.9,75.0]|
| [57.5,32.8,23.5]| 11.8| [57.5,32.8,23.5]|
|[120.2,19.6,11.6]| 13.2|[120.2,19.6,11.6]|
|    [8.6,2.1,1.0]|  4.8|    [8.6,2.1,1.0]|
| [199.8,2.6,21.2]| 10.6| [199.8,2.6,21.2]|
|  [66.1,5.8,24.2]|  8.6|  [66.1,5.8,24.2]|
| [214.7,24.0,4.0]| 17.4| [214.7,24.0,4.0]|
| [23.8,35.1,65.9]|  9.2| [23.8,35.1,65.9]|
|   [97.5,7.6,7.2]|  9.7|   [97.5,7.6,7.2]|
|[204.1,32.9,46.0]| 19.0|[204.1,32.9,46.0]|
|[195.4,47.7,52.9]| 22.4|[195.4,47.7,52.9]|
|[67.8,36.6,114.0]| 12.5|[67.8,36.6,114.0]|
|[281.4,39.6,55.8]| 24.4|[281.4,39.6,55.8]|
| [69.2,20.5,18.3]| 11.3| [69.2,20.5,18.3]|
|[147.3,23.9,19.1]| 14.6|[147.3,

In [90]:
trainingData.show(5)
testData.show(5)

+----------------+-----+
|        features|label|
+----------------+-----+
|  [4.1,11.6,5.7]|  3.2|
| [7.3,28.1,41.4]|  5.5|
| [7.8,38.9,50.6]|  6.6|
|  [8.4,27.2,2.1]|  5.7|
|[11.7,36.9,45.2]|  7.3|
+----------------+-----+
only showing top 5 rows

+---------------+-----+
|       features|label|
+---------------+-----+
| [0.7,39.6,8.7]|  1.6|
| [5.4,29.9,9.4]|  5.3|
|  [8.6,2.1,1.0]|  4.8|
|[8.7,48.9,75.0]|  7.2|
|[13.1,0.4,25.6]|  5.3|
+---------------+-----+
only showing top 5 rows



In [0]:
from pyspark.ml.regression import GeneralizedLinearRegression

glr = GeneralizedLinearRegression(family = 'gaussian', link= 'identity', 
                                  maxIter = 10, regParam=0.3)

In [0]:
pipeline = Pipeline(stages= [featureIndexer,glr])
model = pipeline.fit(trainingData)

In [93]:
predictions = model.transform(testData)
predictions.select('features','label','prediction').show(5)

+---------------+-----+------------------+
|       features|label|        prediction|
+---------------+-----+------------------+
| [0.7,39.6,8.7]|  1.6|10.752336427950446|
| [5.4,29.9,9.4]|  5.3| 9.149377508974801|
|  [8.6,2.1,1.0]|  4.8| 4.084229810748363|
|[8.7,48.9,75.0]|  7.2|13.035654514301717|
|[13.1,0.4,25.6]|  5.3| 4.037971217653756|
+---------------+-----+------------------+
only showing top 5 rows



In [94]:
evaluator = RegressionEvaluator(labelCol='label',predictionCol='prediction',
                                metricName='rmse')

rmse = evaluator.evaluate(predictions)
print('Root Mean Squared Error (RMSE) on test data = %g' %rmse)

y_true = predictions.select('label').toPandas()
y_pred = predictions.select('prediction').toPandas()

r2_score = sklearn.metrics.r2_score(y_true,y_pred)
print('r2_score: {0}'.format(r2_score))

Root Mean Squared Error (RMSE) on test data = 1.83519
r2_score: 0.8827459913263899


Better than Linear Regression.

## Decision Tree

In [0]:
from pyspark.ml.regression import DecisionTreeRegressor

dt = DecisionTreeRegressor(featuresCol="indexedFeatures")

In [0]:
pipeline = Pipeline(stages=[featureIndexer,dt])
model = pipeline.fit(trainingData)

In [97]:
predictions = model.transform(testData)
predictions.select('features','label','prediction').show(5)

+---------------+-----+-----------------+
|       features|label|       prediction|
+---------------+-----+-----------------+
| [0.7,39.6,8.7]|  1.6|6.599999999999998|
| [5.4,29.9,9.4]|  5.3|6.599999999999998|
|  [8.6,2.1,1.0]|  4.8|              3.2|
|[8.7,48.9,75.0]|  7.2|6.599999999999998|
|[13.1,0.4,25.6]|  5.3|              6.4|
+---------------+-----+-----------------+
only showing top 5 rows



In [98]:
evaluator = RegressionEvaluator(labelCol='label',predictionCol='prediction',
                                metricName='rmse')

rmse = evaluator.evaluate(predictions)
print('Root Mean Squared Error (RMSE) on test data = %g' %rmse)


y_true = predictions.select('label').toPandas()
y_pred = predictions.select('prediction').toPandas()

r2_score = sklearn.metrics.r2_score(y_true,y_pred)
print('r2_score: {0}'.format(r2_score))

Root Mean Squared Error (RMSE) on test data = 1.45007
r2_score: 0.9267942064512279


## Random Forest

In [0]:
from pyspark.ml.regression import RandomForestRegressor

rf = RandomForestRegressor()

In [100]:
pipeline = Pipeline(stages=[featureIndexer,rf])
model = pipeline.fit(trainingData)

predictions = model.transform(testData)
predictions.select('features','label','prediction').show(5)

+---------------+-----+------------------+
|       features|label|        prediction|
+---------------+-----+------------------+
| [0.7,39.6,8.7]|  1.6| 9.909248015873015|
| [5.4,29.9,9.4]|  5.3|10.215575396825395|
|  [8.6,2.1,1.0]|  4.8|7.9985366826156294|
|[8.7,48.9,75.0]|  7.2|12.659500000000003|
|[13.1,0.4,25.6]|  5.3| 9.067425919110129|
+---------------+-----+------------------+
only showing top 5 rows



In [101]:
evaluator = RegressionEvaluator(labelCol='label',predictionCol='prediction',
                                metricName='rmse')

rmse = evaluator.evaluate(predictions)
print('Root Mean Squared Error (RMSE) on test data = %g' %rmse)


y_true = predictions.select('label').toPandas()
y_pred = predictions.select('prediction').toPandas()

r2_score = sklearn.metrics.r2_score(y_true,y_pred)
print('r2_score: {0}'.format(r2_score))

Root Mean Squared Error (RMSE) on test data = 2.1145
r2_score: 0.844338169137798


In [102]:
# Feature Importance

model.stages[-1].featureImportances


SparseVector(3, {0: 0.4455, 1: 0.3943, 2: 0.1603})

In [103]:
model.stages[-1].trees

[DecisionTreeRegressionModel (uid=dtr_69aace303462) of depth 5 with 53 nodes,
 DecisionTreeRegressionModel (uid=dtr_76f9ba74505f) of depth 5 with 43 nodes,
 DecisionTreeRegressionModel (uid=dtr_532d3f92b818) of depth 5 with 55 nodes,
 DecisionTreeRegressionModel (uid=dtr_321555d04eef) of depth 5 with 55 nodes,
 DecisionTreeRegressionModel (uid=dtr_f33f701adbf9) of depth 5 with 53 nodes,
 DecisionTreeRegressionModel (uid=dtr_b4137ae342e7) of depth 5 with 33 nodes,
 DecisionTreeRegressionModel (uid=dtr_1a6e49d93434) of depth 5 with 47 nodes,
 DecisionTreeRegressionModel (uid=dtr_c9329163030b) of depth 5 with 47 nodes,
 DecisionTreeRegressionModel (uid=dtr_e72d19c682da) of depth 5 with 41 nodes,
 DecisionTreeRegressionModel (uid=dtr_204664024b7b) of depth 5 with 39 nodes,
 DecisionTreeRegressionModel (uid=dtr_a4bbed7306b3) of depth 5 with 37 nodes,
 DecisionTreeRegressionModel (uid=dtr_ebb8daaec0f4) of depth 5 with 55 nodes,
 DecisionTreeRegressionModel (uid=dtr_d12f6164b716) of depth 5 w

## Gradient-Boosted Tree

In [0]:
from pyspark.ml.regression import GBTRegressor

rf = GBTRegressor()

In [105]:
pipeline = Pipeline(stages=[featureIndexer,rf])
model = pipeline.fit(trainingData)

predictions = model.transform(testData)
predictions.select('features','label','prediction').show(5)

+---------------+-----+------------------+
|       features|label|        prediction|
+---------------+-----+------------------+
| [0.7,39.6,8.7]|  1.6| 6.494396620443975|
| [5.4,29.9,9.4]|  5.3| 6.416864731576496|
|  [8.6,2.1,1.0]|  4.8|3.3401425150254864|
|[8.7,48.9,75.0]|  7.2| 7.403813987416391|
|[13.1,0.4,25.6]|  5.3| 5.812799066973829|
+---------------+-----+------------------+
only showing top 5 rows



In [106]:
evaluator = RegressionEvaluator(labelCol='label',predictionCol='prediction',
                                metricName='rmse')

rmse = evaluator.evaluate(predictions)
print('Root Mean Squared Error (RMSE) on test data = %g' %rmse)


y_true = predictions.select('label').toPandas()
y_pred = predictions.select('prediction').toPandas()

r2_score = sklearn.metrics.r2_score(y_true,y_pred)
print('r2_score: {0}'.format(r2_score))

Root Mean Squared Error (RMSE) on test data = 1.43339
r2_score: 0.9284681680569912


In [107]:
model.stages[-1].featureImportances

SparseVector(3, {0: 0.3357, 1: 0.3609, 2: 0.3034})

In [108]:
model.stages[-1].trees

[DecisionTreeRegressionModel (uid=dtr_94268ff33219) of depth 5 with 59 nodes,
 DecisionTreeRegressionModel (uid=dtr_da79b1aaef95) of depth 5 with 43 nodes,
 DecisionTreeRegressionModel (uid=dtr_74571332da31) of depth 5 with 37 nodes,
 DecisionTreeRegressionModel (uid=dtr_1ba191c6ab49) of depth 5 with 37 nodes,
 DecisionTreeRegressionModel (uid=dtr_253fe6f25c6e) of depth 5 with 39 nodes,
 DecisionTreeRegressionModel (uid=dtr_50fa6495dd06) of depth 5 with 39 nodes,
 DecisionTreeRegressionModel (uid=dtr_64aefa3262d3) of depth 5 with 41 nodes,
 DecisionTreeRegressionModel (uid=dtr_ac5608a102ce) of depth 5 with 59 nodes,
 DecisionTreeRegressionModel (uid=dtr_cf85a37a72c5) of depth 5 with 37 nodes,
 DecisionTreeRegressionModel (uid=dtr_c908bc272b38) of depth 5 with 37 nodes,
 DecisionTreeRegressionModel (uid=dtr_affcc3acf8e7) of depth 5 with 49 nodes,
 DecisionTreeRegressionModel (uid=dtr_1f799520271d) of depth 5 with 35 nodes,
 DecisionTreeRegressionModel (uid=dtr_7fc57644cdf7) of depth 5 w