<a href="https://colab.research.google.com/github/geoffswc/ML-Course-Notes/blob/master/pyspark_exercise_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Install a headless Java 8 JDK; apt's stdout is discarded to keep the cell quiet.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
import os
# Export JAVA_HOME for this kernel process, pointing at the Java 8 install directory
# (presumably so the Spark JVM launched later picks up this JDK).
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
# Make the Java 8 binary the system-wide `java` alternative.
!update-alternatives --set java /usr/lib/jvm/java-8-openjdk-amd64/jre/bin/java
# Sanity check: print the Java version that is now active.
!java -version

openjdk version "1.8.0_222"
OpenJDK Runtime Environment (build 1.8.0_222-8u222-b10-1ubuntu1~18.04.1-b10)
OpenJDK 64-Bit Server VM (build 25.222-b10, mixed mode)


In [0]:
!pip install pyspark



In [0]:
from pyspark.sql import SQLContext

from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import DecisionTreeRegressor
from pyspark.sql.session import SparkSession

In [0]:
# Start (or reuse) a Spark session that runs locally, using every available core.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [0]:
# SQLContext's first positional argument is a SparkContext, not a SparkSession.
# Pass the session's underlying context, and hand the session itself over
# explicitly so the SQLContext wraps exactly the `spark` session in use here
# instead of relying on attribute look-alikes and an implicit getOrCreate().
sqlContext = SQLContext(spark.sparkContext, sparkSession=spark)

In [0]:
import pandas as pd

# Remote CSV with the Gapminder table used throughout this exercise.
GAPMINDER_CSV_URL = 'https://raw.githubusercontent.com/geoffswc/IntroToPythonPart2/master/data/gapminder_all.csv'

# Download and parse the CSV into a pandas DataFrame.
data = pd.read_csv(GAPMINDER_CSV_URL)

In [0]:
# Convert the pandas frame into a Spark DataFrame. This goes through the
# SparkSession directly: SQLContext is a deprecated compatibility wrapper whose
# createDataFrame simply forwards to the session, so the result is the same.
df = spark.createDataFrame(data)

In [0]:
# Print the Spark DataFrame's schema as a tree (column name, type, nullability).
df.printSchema()

root
 |-- continent: string (nullable = true)
 |-- country: string (nullable = true)
 |-- gdpPercap_1952: double (nullable = true)
 |-- gdpPercap_1957: double (nullable = true)
 |-- gdpPercap_1962: double (nullable = true)
 |-- gdpPercap_1967: double (nullable = true)
 |-- gdpPercap_1972: double (nullable = true)
 |-- gdpPercap_1977: double (nullable = true)
 |-- gdpPercap_1982: double (nullable = true)
 |-- gdpPercap_1987: double (nullable = true)
 |-- gdpPercap_1992: double (nullable = true)
 |-- gdpPercap_1997: double (nullable = true)
 |-- gdpPercap_2002: double (nullable = true)
 |-- gdpPercap_2007: double (nullable = true)
 |-- lifeExp_1952: double (nullable = true)
 |-- lifeExp_1957: double (nullable = true)
 |-- lifeExp_1962: double (nullable = true)
 |-- lifeExp_1967: double (nullable = true)
 |-- lifeExp_1972: double (nullable = true)
 |-- lifeExp_1977: double (nullable = true)
 |-- lifeExp_1982: double (nullable = true)
 |-- lifeExp_1987: double (nullable = true)
 |-- lifeEx

In [0]:
# Pack the single predictor column into a vector column named 'features',
# which is the column the regression is later told to read via featuresCol.
vectorAssembler = VectorAssembler(
    inputCols=['gdpPercap_2007'],
    outputCol='features',
)

# Assemble the features and keep only the feature vector and the label column.
va = vectorAssembler.transform(df).select(['features', 'lifeExp_2007'])

# Preview the first three rows.
va.show(3)

+-------------+------------+
|     features|lifeExp_2007|
+-------------+------------+
|[6223.367465]|      72.301|
|[4797.231267]|      42.731|
|[1441.284873]|      56.728|
+-------------+------------+
only showing top 3 rows



In [0]:
# Configure (but do not yet fit) a linear regression of 2007 life expectancy
# on the assembled feature vector. The regularisation settings and iteration
# cap are passed through unchanged to the estimator.
lr = LinearRegression(
    featuresCol='features',
    labelCol='lifeExp_2007',
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)

In [0]:
# Fit the configured estimator on the assembled (features, lifeExp_2007) frame.
# NOTE(review): the whole frame is used for fitting; no train/test split is visible here.
lr_model = lr.fit(va)

In [0]:
# Score the same frame the model was fitted on; the result carries an extra
# 'prediction' column next to 'features' and 'lifeExp_2007'.
lr_predictions = lr_model.transform(va)

In [0]:
# Report the fitted slope vector and intercept of the linear model.
print(f"Coefficients: {lr_model.coefficients}")
print(f"Intercept: {lr_model.intercept}")

Coefficients: [0.0006153365628308675]
Intercept: 59.82024728794976


In [0]:
# Display the leading rows (show()'s default row limit) with features,
# the actual label and the model's prediction side by side.
lr_predictions.show()

+--------------------+------------------+------------------+
|            features|      lifeExp_2007|        prediction|
+--------------------+------------------+------------------+
|       [6223.367465]|            72.301| 63.64971283309631|
|       [4797.231267]|            42.731| 62.77215908689031|
|       [1441.284873]|            56.728| 60.70712256776171|
|[12569.851770000001]|            50.728| 67.55493667139505|
|       [1217.032994]|            52.295| 60.56913218732948|
|[430.07069160000003]|             49.58|  60.0848855090932|
|[2042.0952399999999]|             50.43|61.076823153904634|
|        [706.016537]| 44.74100000000001|60.254685077129096|
|[1704.0637239999999]|            50.651| 60.86882000272069|
| [986.1478792000001]|            65.152| 60.42706013437964|
|[277.55185869999997]|46.461999999999996|59.991035094689536|
|       [3632.557798]|55.321999999999996| 62.05549291765555|
|       [1544.750112]|            48.328| 60.77078851230044|
|[2082.4815670000003]|54