In [None]:
# Sample CSV shipped with the Databricks datasets mount.
filepath = '/databricks-datasets/samples/population-vs-price/data_geo.csv'

# Build the CSV reader with the `header` and `inferSchema` options switched on,
# then load the file and mark the resulting DataFrame for caching.
csv_reader = spark.read.format('csv')
csv_reader = csv_reader.option('header', 'true')
csv_reader = csv_reader.option('inferSchema', 'true')
data = csv_reader.load(filepath)
data.cache()

In [None]:
# Show the loaded DataFrame. NOTE(review): `display` is not defined in this
# file; presumably it is the Databricks notebook built-in — confirm.
display(data)

In [None]:
# `col` was used below without ever being imported; import it explicitly so the
# cell runs on a fresh kernel.
from pyspark.sql.functions import col
from pyspark.ml.linalg import Vectors, VectorUDT

# Discard rows that contain a missing value.
data = data.dropna()

# One aliasing expression per column: same column, but with spaces in the
# name replaced by underscores so the names can be used in selectExpr below.
exprs = [
    col(column).alias(column.replace(' ', '_'))
    for column in data.columns
]

# Register a SQL-callable UDF that wraps a single value in a one-element
# dense vector (declared return type: VectorUDT).
spark.udf.register('oneElementVec',
    lambda d: Vectors.dense([d]),
    returnType = VectorUDT()
)

# Training frame with exactly two columns:
#   features - the 2014 population estimate wrapped in a vector
#   label    - the 2015 median sales price
tdata = data.select(*exprs).selectExpr(
    "oneElementVec(2014_Population_estimate) as features",
    "2015_median_sales_price as label"
)

In [None]:
# Show the prepared features/label DataFrame. NOTE(review): `display` is not
# defined in this file; presumably it is the Databricks notebook built-in — confirm.
display(tdata)

In [None]:
from pyspark.ml.regression import LinearRegression
lr = LinearRegression()

# Fit on `tdata`, not the raw `data` frame: LinearRegression reads the
# `features` and `label` columns by default, and only `tdata` defines them.
# The param map overrides regParam to 100.0 for this fit only.
model = lr.fit(tdata, {lr.regParam:100.0})

In [None]:
# Score `tdata` rather than the raw `data` frame: the model needs a `features`
# vector column, which exists only on `tdata`. The result keeps the input
# columns and adds the model's `prediction` column.
predictions = model.transform(tdata)
display(predictions)

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluate the predictions using the 'rmse' metric and report the score.
evaluator = RegressionEvaluator(metricName='rmse')
rmse = evaluator.evaluate(predictions)
report = 'rmse = ' + str(rmse)
print(report)

In [None]:
import numpy as np
from pandas import *
from ggplot import *

# Pull all plotted columns to the driver in one action. Spark 2+ DataFrames
# have no .map(), and the raw `data` frame has no `features`/`label` columns,
# so read them from `predictions`. A single collect also keeps the three
# lists aligned row-for-row.
rows = predictions.select('features', 'label', 'prediction').collect()
pop = [row['features'][0] for row in rows]
price = [row['label'] for row in rows]
pred = [row['prediction'] for row in rows]

# Create a Pandas DataFrame; the prediction column is named 'pred' so that it
# matches the aesthetic used by geom_line below.
pydf = DataFrame({'pop': pop, 'price': price, 'pred': pred})

# Parentheses are required for the `+` chain to span several lines.
# Blue points: actual price vs population. Red line: model prediction.
p = (
    ggplot(pydf, aes('pop', 'price'))
    + geom_point(color='blue')
    + geom_line(pydf, aes('pop', 'pred'), color='red')
    + scale_x_log10()
    + scale_y_log10()
)
display(p)