In [1]:
from pyspark.sql import SparkSession
spark = (SparkSession.builder.appName("cs544")
         .master("spark://boss:7077")
         .config("spark.executor.memory", "512M")
         .config("spark.sql.warehouse.dir", "hdfs://nn:9000/user/hive/warehouse")
         .enableHiveSupport()
         .getOrCreate())

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/01/09 03:42:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
spark

# Spark ML

In [4]:
import pandas as pd
import numpy as np
df = pd.DataFrame({"x1": np.random.randint(0, 10, 100).astype(float), 
                   "x2": np.random.randint(0, 3, 100).astype(float)})
df["y"] = df["x1"] + df["x2"] + np.random.rand(len(df))
df = spark.createDataFrame(df)
df

  if should_localize and is_datetime64tz_dtype(s.dtype) and s.dt.tz is not None:


DataFrame[x1: double, x2: double, y: double]

In [5]:
# not truly deterministic, just at the partition level
train, test = df.randomSplit([0.75, 0.25], seed=42)
# test.show()

In [6]:
test.show()

                                                                                

+---+---+-------------------+
| x1| x2|                  y|
+---+---+-------------------+
|0.0|1.0| 1.5620703518901484|
|1.0|0.0|  1.983979871081822|
|1.0|1.0|  2.441462744275578|
|1.0|1.0|  2.828844447464115|
|1.0|2.0| 3.6823179116631923|
|3.0|0.0|  3.497976967674114|
|3.0|1.0|  4.805389402628514|
|4.0|0.0|   4.85519547382836|
|4.0|1.0|  5.786243218075029|
|5.0|1.0|  6.101739917350784|
|6.0|0.0|  6.448502843616108|
|6.0|1.0|  7.949110493383305|
|8.0|2.0|  10.75513580685739|
|9.0|0.0|  9.969437496079284|
|9.0|1.0| 10.467944755784531|
|9.0|1.0| 10.488435233510081|
|9.0|2.0| 11.905645720427248|
|0.0|0.0|0.34232772829828007|
|1.0|1.0|   2.92452276854095|
|1.0|2.0| 3.0705857232948293|
+---+---+-------------------+
only showing top 20 rows



In [7]:
train.write.mode("ignore").format("parquet").save("hdfs://nn:9000/train.parquet")
test.write.mode("ignore").format("parquet").save("hdfs://nn:9000/test.parquet")

In [8]:
train = spark.read.format("parquet").load("hdfs://nn:9000/train.parquet")
test = spark.read.format("parquet").load("hdfs://nn:9000/test.parquet")

                                                                                

In [9]:
train.count(), test.count()

(68, 32)

In [10]:
from pyspark.ml.regression import DecisionTreeRegressionModel, DecisionTreeRegressor
# DecisionTreeRegresson: unfit model
# DecisionTreeRegressionModel: fitted model
# In Spark, names ending in "Model" are the fitted ones

In [11]:
from pyspark.ml.feature import VectorAssembler

In [12]:
va = VectorAssembler(inputCols=["x1", "x2"], outputCol="features")
va.transform(train).show()

+---+---+-------------------+---------+
| x1| x2|                  y| features|
+---+---+-------------------+---------+
|0.0|0.0|0.42795247222945243|(2,[],[])|
|0.0|0.0|  0.615325546798749|(2,[],[])|
|0.0|0.0| 0.7510810525206312|(2,[],[])|
|0.0|1.0|  1.441809826102813|[0.0,1.0]|
|1.0|1.0| 2.2411650044352642|[1.0,1.0]|
|1.0|1.0|  2.517033442787486|[1.0,1.0]|
|1.0|1.0| 2.9313522167571646|[1.0,1.0]|
|1.0|2.0|  3.575044572194423|[1.0,2.0]|
|1.0|2.0| 3.8372546684902686|[1.0,2.0]|
|2.0|0.0| 2.5335274983123837|[2.0,0.0]|
|2.0|2.0|  4.019470241685458|[2.0,2.0]|
|2.0|2.0|  4.119169268090317|[2.0,2.0]|
|3.0|0.0|  3.187608117646062|[3.0,0.0]|
|3.0|1.0|  4.351293075015204|[3.0,1.0]|
|3.0|2.0|  5.136326091287162|[3.0,2.0]|
|3.0|2.0|  5.492773502085889|[3.0,2.0]|
|4.0|0.0|  4.359096735387636|[4.0,0.0]|
|4.0|1.0|  5.937079775490419|[4.0,1.0]|
|5.0|2.0|   7.69739206252476|[5.0,2.0]|
|6.0|0.0|  6.346458133471592|[6.0,0.0]|
+---+---+-------------------+---------+
only showing top 20 rows



                                                                                

In [13]:
va = VectorAssembler(inputCols=["x1", "x2"], outputCol="features") # transformer

dt = DecisionTreeRegressor(featuresCol="features", labelCol="y") # estimator

model = dt.fit(va.transform(train))

                                                                                

In [14]:
type(dt), type(model)

(pyspark.ml.regression.DecisionTreeRegressor,
 pyspark.ml.regression.DecisionTreeRegressionModel)

## Pipeline

In [15]:
from pyspark.ml.pipeline import Pipeline, PipelineModel
# Pipeline: unfit
# PipelineModel: fitted

In [16]:
pipe = Pipeline(stages=[va, dt])

In [17]:
model = pipe.fit(train)

In [18]:
type(pipe), type(model)

(pyspark.ml.pipeline.Pipeline, pyspark.ml.pipeline.PipelineModel)

In [19]:
model.stages

[VectorAssembler_d73f3fd7ff02,
 DecisionTreeRegressionModel: uid=DecisionTreeRegressor_e67bf5f43a4b, depth=5, numNodes=45, numFeatures=2]

In [20]:
print(model.stages[1].toDebugString)

DecisionTreeRegressionModel: uid=DecisionTreeRegressor_e67bf5f43a4b, depth=5, numNodes=45, numFeatures=2
  If (feature 0 <= 4.5)
   If (feature 0 <= 1.5)
    If (feature 0 <= 0.5)
     If (feature 1 <= 0.5)
      Predict: 0.6800623532608492
     Else (feature 1 > 0.5)
      Predict: 1.2758604904864002
    Else (feature 0 > 0.5)
     If (feature 1 <= 1.5)
      If (feature 1 <= 0.5)
       Predict: 1.2806370199330332
      Else (feature 1 > 0.5)
       Predict: 2.43259924623944
     Else (feature 1 > 1.5)
      Predict: 3.7061496203423454
   Else (feature 0 > 1.5)
    If (feature 1 <= 0.5)
     If (feature 0 <= 2.5)
      Predict: 2.7206653128880154
     Else (feature 0 > 2.5)
      If (feature 0 <= 3.5)
       Predict: 3.503111558841243
      Else (feature 0 > 3.5)
       Predict: 4.208164441278279
    Else (feature 1 > 0.5)
     If (feature 0 <= 2.5)
      Predict: 4.069319754887887
     Else (feature 0 > 2.5)
      If (feature 0 <= 3.5)
       Predict: 5.23852006111295
      Else (fe

In [21]:
model.write().overwrite().save("hdfs://nn:9000/model")

In [22]:
!hdfs dfs -ls hdfs://nn:9000/model

Found 2 items
drwxr-xr-x   - root supergroup          0 2024-01-08 23:26 hdfs://nn:9000/model/metadata
drwxr-xr-x   - root supergroup          0 2024-01-08 23:26 hdfs://nn:9000/model/stages


In [23]:
!hdfs dfs -ls hdfs://nn:9000/model/stages

Found 2 items
drwxr-xr-x   - root supergroup          0 2024-01-08 23:26 hdfs://nn:9000/model/stages/0_VectorAssembler_d73f3fd7ff02
drwxr-xr-x   - root supergroup          0 2024-01-08 23:26 hdfs://nn:9000/model/stages/1_DecisionTreeRegressor_e67bf5f43a4b


In [24]:
model = PipelineModel.load("hdfs://nn:9000/model")

In [25]:
test.show()

+---+---+------------------+
| x1| x2|                 y|
+---+---+------------------+
|0.0|2.0|2.9356223148472123|
|2.0|1.0|  3.54328674785907|
|3.0|0.0| 3.881907196033559|
|3.0|1.0| 4.893077548435018|
|3.0|2.0| 5.768574692347921|
|5.0|2.0| 7.139042554735778|
|6.0|1.0| 7.035251729622586|
|7.0|2.0| 9.651840006787452|
|7.0|2.0| 9.886778810986943|
|8.0|0.0| 8.637928816213646|
|8.0|1.0| 9.339865475153195|
|8.0|1.0| 9.964248436219453|
|9.0|0.0|  9.97279471809389|
|9.0|1.0|10.790392521915205|
|9.0|1.0|10.886620753695986|
|9.0|1.0|10.937779415298065|
|9.0|2.0| 11.46570470467693|
|0.0|0.0|0.2735665401574404|
|0.0|2.0|2.2193379958720127|
|1.0|0.0|1.8610933247303851|
+---+---+------------------+
only showing top 20 rows



In [26]:
model.transform(test).show()

+---+---+------------------+---------+------------------+
| x1| x2|                 y| features|        prediction|
+---+---+------------------+---------+------------------+
|0.0|2.0|2.9356223148472123|[0.0,2.0]|1.2758604904864002|
|2.0|1.0|  3.54328674785907|[2.0,1.0]| 4.069319754887887|
|3.0|0.0| 3.881907196033559|[3.0,0.0]| 3.503111558841243|
|3.0|1.0| 4.893077548435018|[3.0,1.0]|  5.23852006111295|
|3.0|2.0| 5.768574692347921|[3.0,2.0]|  5.23852006111295|
|5.0|2.0| 7.139042554735778|[5.0,2.0]| 7.254421819513839|
|6.0|1.0| 7.035251729622586|[6.0,1.0]| 7.906268580027442|
|7.0|2.0| 9.651840006787452|[7.0,2.0]| 9.111488331104255|
|7.0|2.0| 9.886778810986943|[7.0,2.0]| 9.111488331104255|
|8.0|0.0| 8.637928816213646|[8.0,0.0]| 8.447735046347407|
|8.0|1.0| 9.339865475153195|[8.0,1.0]|  9.49637899800742|
|8.0|1.0| 9.964248436219453|[8.0,1.0]|  9.49637899800742|
|9.0|0.0|  9.97279471809389|[9.0,0.0]| 9.166631007955068|
|9.0|1.0|10.790392521915205|[9.0,1.0]|10.512980061987529|
|9.0|1.0|10.88

### evaluate

In [27]:
from pyspark.ml.evaluation import RegressionEvaluator

In [28]:
r2score = RegressionEvaluator(labelCol="y", predictionCol="prediction")

In [29]:
r2score.evaluate(model.transform(test))

0.531126734504476