In [2]:
from pyspark.sql import (
    functions as f,
    SparkSession,
    types as t
)
from pyspark.ml.regression import LinearRegression
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# Start (or reuse) the Spark session used by the rest of this example.
spark = SparkSession.builder.appName("ml_lr_example").getOrCreate()

# Load the customer CSV: the first row supplies the column names and the
# column types are inferred from the data.
df = spark.read.csv(
    "file:///home/jovyan/work/sample/ecommerce_customers.csv",
    inferSchema=True,
    header=True,
)

# schema
df.printSchema()

# show the first one
# NOTE: a bare expression is only echoed when it is the last statement of a
# notebook cell (and never in a plain script), so print it explicitly.
print(df.head(1))

# get columns
print(df.columns)

# VectorAssembler(*, inputCols=None, outputCol=None, handleInvalid='error')
# is a feature transformer that merges several columns into a single
# vector column.
feature_cols = [
    'avg_session',
    'time_on_app',
    'time_on_website',
    'membership_period',
]
asbl = VectorAssembler(inputCols=feature_cols, outputCol='feature_vectors')

# Apply the assembler; the result is `df` plus the 'feature_vectors' column.
tr_data = asbl.transform(df)

# Verify that the new column is present in the schema.
tr_data.printSchema()

# Keep only the assembled features and the label column for modelling.
data = tr_data.select('feature_vectors', 'year_spent')
data.show()

# split: roughly 70% of the rows for training, 30% for testing.
# A fixed seed makes the split repeatable, so the metrics computed from it
# can be compared between runs.
train, test = data.randomSplit([0.7, 0.3], seed=42)

# confirm the count (describe() reports it among its summary statistics)
train.describe().show()
test.describe().show()

# Linear regression reading the assembled vector column as features and
# 'year_spent' as the label.
lr = LinearRegression(featuresCol='feature_vectors', labelCol='year_spent')

# Fit the model on the training split.
lr_model = lr.fit(train)

# Evaluate the fitted model on the held-out split.
test_output = lr_model.evaluate(test)

# Residuals: difference between the actual label and the prediction.
test_output.residuals.show()

# Root-mean-squared error, then R^2, one value per line.
print(test_output.rootMeanSquaredError, test_output.r2, sep="\n")

# Summary statistics of the full data set, for putting the error in context.
data.describe().show()

# Add the model's predictions to the test rows and display them.
predictions = lr_model.transform(test)
predictions.show()

root
 |-- name: string (nullable = true)
 |-- avg_session: double (nullable = true)
 |-- time_on_app: double (nullable = true)
 |-- time_on_website: double (nullable = true)
 |-- membership_period: double (nullable = true)
 |-- year_spent: double (nullable = true)

root
 |-- name: string (nullable = true)
 |-- avg_session: double (nullable = true)
 |-- time_on_app: double (nullable = true)
 |-- time_on_website: double (nullable = true)
 |-- membership_period: double (nullable = true)
 |-- year_spent: double (nullable = true)
 |-- feature_vectors: vector (nullable = true)

+--------------------+------------------+
|     feature_vectors|        year_spent|
+--------------------+------------------+
|[41.3967212701347...| 652.6256699049252|
|[38.3115264316322...| 435.3474761232019|
|[39.6010977067712...|  541.177730402894|
|[41.1666679557066...| 645.8561018790921|
|[39.9968070283756...|  665.340762170797|
|[40.6452454552103...| 707.1837171857322|
|[38.4259146016643...| 578.9451139811881|
|