# Spark ML

### Code to be executed before lecture

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, expr

import pandas as pd
import numpy as np

from pyspark.ml.regression import DecisionTreeRegressor, DecisionTreeRegressionModel
from pyspark.ml.feature import VectorAssembler

In [None]:
# Configure the session one step at a time, then create it (or reuse one
# that already exists, per getOrCreate).
session_builder = SparkSession.builder.appName("cs544").master("spark://boss:7077")
session_builder = session_builder.config("spark.executor.memory", "512M")
session_builder = session_builder.config(
    "spark.sql.warehouse.dir", "hdfs://nn:9000/user/hive/warehouse"
)
spark = session_builder.enableHiveSupport().getOrCreate()

In [None]:
# Synthetic regression data: 100 rows, x1 drawn from the integers 0..9 and
# x2 from 0..2 (both stored as floats); y is x1 + x2 plus noise in [0, 1).
df = pd.DataFrame(
    {
        "x1": np.random.randint(0, 10, 100).astype(float),
        "x2": np.random.randint(0, 3, 100).astype(float),
    }
).assign(y=lambda frame: frame["x1"] + frame["x2"] + np.random.rand(len(frame)))

Let's convert the pandas DataFrame to a Spark DataFrame.

In [None]:
# Rebind `df`: from this cell on it names the Spark DataFrame built from the
# pandas one, not the pandas object itself.
df = spark.createDataFrame(df)
# Bare expression so the notebook echoes the new DataFrame.
df

Recall that a seed in Spark is not truly deterministic overall (because we might get different partitions every time); it is only deterministic at the partition level.

In [None]:
# Randomly split the rows into train/test using weights 0.75 / 0.25 and a
# fixed seed of 42.
train, test = df.randomSplit([0.75, 0.25], seed=42)
# Print rows of the test split for inspection.
test.show()

Let's write the data in Parquet format and then read it back from the Parquet files.
We use `mode("ignore")` so that the write is skipped when the files already exist; this makes sure that we keep working with the same (deterministic) sample.

In [None]:
# Persist each split as Parquet on HDFS. With save mode "ignore", a path that
# already exists is left untouched, so re-running this cell keeps the split
# that was written first.
train.write.mode("ignore").save("hdfs://nn:9000/train.parquet", format="parquet")
test.write.mode("ignore").save("hdfs://nn:9000/test.parquet", format="parquet")

In [None]:
# Rebind train/test to the data read back from the Parquet paths, so later
# cells work from what is stored on HDFS.
train = spark.read.load("hdfs://nn:9000/train.parquet", format="parquet")
test = spark.read.load("hdfs://nn:9000/test.parquet", format="parquet")

### Decision Trees

- `DecisionTreeRegressor`: unfitted model
- `DecisionTreeRegressionModel`: fitted model
    - In Spark, names ending in "Model" are the fitted ones

In [None]:
# Assembler: packs the x1 and x2 columns into a single column, "features".
va = VectorAssembler(outputCol="features", inputCols=["x1", "x2"])
# Unfitted tree regressor that reads "features" and uses "y" as the label.
dt = DecisionTreeRegressor(labelCol="y", featuresCol="features")

# The tree is fit on the assembler's output rather than on `train` directly,
# because only the assembled frame carries the "features" column.
train_features = va.transform(train)
model = dt.fit(train_features)

In [None]:
# Echo the class of the estimator next to the class of the object fit() returned.
type(dt), type(model)

### Lecture starts here

### Pipelines

- `Pipeline`: unfitted model
- `PipelineModel`: fitted model

In [None]:
from pyspark.ml.pipeline import Pipeline, PipelineModel

In [None]:
# Chain the assembler (`va`) and the tree regressor (`dt`) as the two stages,
# in that order, of a single pipeline.
pipe = Pipeline(stages=[va, dt])

In [None]:
# Fit on the raw `train` frame: `va` is a stage of the pipeline, so there is
# no separate va.transform() call here. Note that this rebinds `model`.
model = pipe.fit(train)

In [None]:
# Echo the class of the pipeline next to the class of the object fit() returned.
type(pipe), type(model)

### Pipeline stages

In [None]:
# Show the list of stages held by the fitted pipeline.
print(model.stages)

In [None]:
# Index 1 is the second stage; the pipeline was built with stages=[va, dt],
# so this is the position of the tree regressor.
print(model.stages[1])

In [None]:
# toDebugString is read as an attribute (no call parentheses); printing it
# gives a text description of the second stage.
print(model.stages[1].toDebugString)

### Saving pipeline to HDFS

In [None]:
# Save the fitted pipeline to HDFS; overwrite() is requested on the writer so
# that an existing copy at this path is replaced when the cell is re-run.
model.write().overwrite().save("hdfs://nn:9000/model")

Let's try `ls` on HDFS.

In [None]:
# Shell escape: list the entries directly under the saved model's path.
!hdfs dfs -ls hdfs://nn:9000/model

In [None]:
# Shell escape: list the entries under the model's `stages` subdirectory.
!hdfs dfs -ls hdfs://nn:9000/model/stages

Let's load the model from HDFS.

In [None]:
# Rebind `model` to the pipeline read back from the same path passed to save().
model = PipelineModel.load("hdfs://nn:9000/model")

### Predictions

In [None]:
# Echo the test DataFrame object (no .show() call, so no rows are printed by this code).
test

In [None]:
# Echo the DataFrame returned by running the pipeline on `test`; again no
# .show() call here.
model.transform(test)

In [None]:
# Same transform as the previous cell, but .show() prints rows of the result.
model.transform(test).show()

### Evaluating the model

In [None]:
from pyspark.ml.evaluation import RegressionEvaluator

In [None]:
# Evaluator configured for the "r2" metric, comparing the "prediction" column
# against the label column "y".
r2score = RegressionEvaluator(predictionCol="prediction", labelCol="y", metricName="r2")
# Echo the evaluator object.
r2score

In [None]:
# Run the pipeline on the test split and score its output with the r2 evaluator.
r2score.evaluate(model.transform(test))