In [None]:
# this workbook demonstrates how to use PySparkML to do single variable continuous regression
# we'll use per capita 2007 national gdp to predict national 2007 life expectancy
# using
# 1) linear regression
# 2) decision tree regression
# 3) random forest regression

In [None]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext

from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.sql.session import SparkSession

In [None]:
# NOTE: you only need to create a local context if you are not 
# running this on a spark cluster.
# if you are on a spark cluster, a spark session will be 
# initiated in the background and will be accessible as "spark"

In [1]:
# Create (or reuse) a Spark session and expose its SparkContext as `sc`.
# getOrCreate() keeps this cell safe to re-run and safe on a cluster where a
# session already exists: SparkContext('local') raises ValueError when a
# context is already active in the process. The 'local' master only takes
# effect when a new context actually has to be started.
spark = SparkSession.builder.master('local').getOrCreate()
sc = spark.sparkContext

In [4]:
# SQLContext bound to the SparkContext `sc`; used below to read the CSV file.
sqlContext = SQLContext(sparkContext=sc)

In [5]:
# Read the data through the sqlContext reader.
# This keeps the result consistent with Spark DataFrames that come from a
# SQL query rather than from a CSV file upload.
# The first row is used as the header and column types are inferred.

df = (
    sqlContext.read
    .format("csv")
    .options(inferschema="true", header="true", delimiter=",")
    .load("gapminder_all_binary.csv")
)

In [18]:
# Print the DataFrame schema: each column's name, inferred data type and
# nullability (the column names come from the CSV header row).
df.printSchema()

root
 |-- continent: string (nullable = true)
 |-- country: string (nullable = true)
 |-- gdpPercap_1952: double (nullable = true)
 |-- gdpPercap_1957: double (nullable = true)
 |-- gdpPercap_1962: double (nullable = true)
 |-- gdpPercap_1967: double (nullable = true)
 |-- gdpPercap_1972: double (nullable = true)
 |-- gdpPercap_1977: double (nullable = true)
 |-- gdpPercap_1982: double (nullable = true)
 |-- gdpPercap_1987: double (nullable = true)
 |-- gdpPercap_1992: double (nullable = true)
 |-- gdpPercap_1997: double (nullable = true)
 |-- gdpPercap_2002: double (nullable = true)
 |-- gdpPercap_2007: double (nullable = true)
 |-- lifeExp_1952: double (nullable = true)
 |-- lifeExp_1957: double (nullable = true)
 |-- lifeExp_1962: double (nullable = true)
 |-- lifeExp_1967: double (nullable = true)
 |-- lifeExp_1972: double (nullable = true)
 |-- lifeExp_1977: double (nullable = true)
 |-- lifeExp_1982: double (nullable = true)
 |-- lifeExp_1987: double (nullable = true)
 |-- lifeEx

In [19]:
# Build the regression input: pack the predictor column(s) into a single
# vector column named 'features' and keep it next to the target column.
# Note: this is a single-variable regression, but more predictor columns can
# be listed in inputCols.

vectorAssembler = VectorAssembler(outputCol='features', inputCols=['gdpPercap_2007'])
va = vectorAssembler.transform(df).select('features', 'lifeExp_2007')
va.show(3)

+-------------+------------+
|     features|lifeExp_2007|
+-------------+------------+
|[6223.367465]|      72.301|
|[4797.231267]|      42.731|
|[1441.284873]|      56.728|
+-------------+------------+
only showing top 3 rows



In [None]:
# create a regression model

In [8]:
# Linear regression of lifeExp_2007 on the assembled 'features' vector,
# limited to 10 iterations, with regParam=0.3 and elasticNetParam=0.8.
lr = LinearRegression(
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
    featuresCol='features',
    labelCol='lifeExp_2007',
)

In [20]:
# fit the model: regress the label column (set via labelCol above) on the 'features' column built by the vector assembler

In [9]:
# Fit the linear model on the assembled data, then report the learned
# slope(s) and intercept.
lr_model = lr.fit(va)
print("Coefficients: {}".format(lr_model.coefficients))
print("Intercept: {}".format(lr_model.intercept))

Coefficients: [0.0006153365628308622]
Intercept: 59.820247287949805


In [21]:
# predictions based on the original regression
# note that we didn't split into a train and test set
# so we are doing predictions on the same data used to fit the regression model

In [22]:
# Score the same rows the model was fitted on; the result carries an
# additional 'prediction' column.
lr_predictions = lr_model.transform(dataset=va)

In [23]:
# Display the first 20 rows (show()'s default row count, spelled out here).
lr_predictions.show(n=20)

+-------------+------------+------------------+
|     features|lifeExp_2007|        prediction|
+-------------+------------+------------------+
|[6223.367465]|      72.301| 63.64971283309632|
|[4797.231267]|      42.731|62.772159086890326|
|[1441.284873]|      56.728| 60.70712256776174|
|[12569.85177]|      50.728| 67.55493667139504|
|[1217.032994]|      52.295|60.569132187329515|
|[430.0706916]|       49.58| 60.08488550909324|
| [2042.09524]|       50.43| 61.07682315390467|
| [706.016537]|      44.741| 60.25468507712913|
|[1704.063724]|      50.651|60.868820002720724|
|[986.1478792]|      65.152| 60.42706013437968|
|[277.5518587]|      46.462| 59.99103509468958|
|[3632.557798]|      55.322| 62.05549291765557|
|[1544.750112]|      48.328| 60.77078851230048|
|[2082.481567]|      54.791|61.101674337546214|
|[5581.180998]|      71.338|63.254552019796044|
|[12154.08975]|      51.579| 67.29910309905262|
|[641.3695236]|       58.04|  60.2149054061063|
|[690.8055759]|      52.947| 60.24532521

In [24]:
# create and train a decision tree regressor

In [27]:
# Decision tree regressor using the same features/label columns as the
# linear model.
dt = DecisionTreeRegressor(
    labelCol='lifeExp_2007',
    featuresCol='features',
)

In [28]:
# Train the decision tree on the assembled data.
dt_model = dt.fit(dataset=va)

In [29]:
# Predict on the training rows themselves (no train/test split is used).
dt_predictions = dt_model.transform(dataset=va)

In [30]:
# Display the first 20 rows (show()'s default row count, spelled out here).
dt_predictions.show(n=20)

+-------------+------------+------------------+
|     features|lifeExp_2007|        prediction|
+-------------+------------+------------------+
|[6223.367465]|      72.301| 73.22146153846154|
|[4797.231267]|      42.731| 62.27840000000001|
|[1441.284873]|      56.728|55.833111111111116|
|[12569.85177]|      50.728| 69.74871428571429|
|[1217.032994]|      52.295|55.833111111111116|
|[430.0706916]|       49.58|          46.30175|
| [2042.09524]|       50.43|53.694500000000005|
| [706.016537]|      44.741|           51.7966|
|[1704.063724]|      50.651| 59.39999999999998|
|[986.1478792]|      65.152|55.833111111111116|
|[277.5518587]|      46.462|          46.30175|
|[3632.557798]|      55.322|           66.9268|
|[1544.750112]|      48.328|55.833111111111116|
|[2082.481567]|      54.791|53.694500000000005|
|[5581.180998]|      71.338| 73.22146153846154|
|[12154.08975]|      51.579| 69.74871428571429|
|[641.3695236]|       58.04|           51.7966|
|[690.8055759]|      52.947|           5

In [None]:
# train a random forest regressor

In [31]:
# Random forest regressor using the same features/label columns as the other
# models. Forest training is randomised, so a fixed seed is set to make the
# fitted model and its predictions reproducible across kernel restarts.
rf = RandomForestRegressor(featuresCol='features', labelCol='lifeExp_2007', seed=42)

In [32]:
# Train the random forest on the assembled data.
rf_model = rf.fit(dataset=va)

In [33]:
# Predict on the training rows themselves (no train/test split is used).
rf_predictions = rf_model.transform(dataset=va)

In [34]:
# Display the first 20 rows (show()'s default row count, spelled out here).
rf_predictions.show(n=20)

+-------------+------------+------------------+
|     features|lifeExp_2007|        prediction|
+-------------+------------+------------------+
|[6223.367465]|      72.301| 73.10251915241271|
|[4797.231267]|      42.731|62.846878571428576|
|[1441.284873]|      56.728| 56.29677115811309|
|[12569.85177]|      50.728| 70.90033271868529|
|[1217.032994]|      52.295| 56.33857485466508|
|[430.0706916]|       49.58|46.946000952380956|
| [2042.09524]|       50.43|55.265737142857134|
| [706.016537]|      44.741| 52.57729714610658|
|[1704.063724]|      50.651|59.383401164021166|
|[986.1478792]|      65.152| 55.77750256510278|
|[277.5518587]|      46.462|46.946000952380956|
|[3632.557798]|      55.322| 66.33972017551892|
|[1544.750112]|      48.328|55.788333984533985|
|[2082.481567]|      54.791|55.265737142857134|
|[5581.180998]|      71.338| 73.00404920410443|
|[12154.08975]|      51.579| 70.11608932124942|
|[641.3695236]|       58.04| 52.57729714610658|
|[690.8055759]|      52.947| 52.57729714