In [1]:
# Spark session
from pyspark.sql import SparkSession

# Settings applied to the session builder, in order: per-executor memory and
# the HDFS location of the SQL warehouse directory.
session_conf = {
    "spark.executor.memory": "512M",
    "spark.sql.warehouse.dir": "hdfs://main:9000/user/hive/warehouse",
}

builder = SparkSession.builder.appName("cs544").master("spark://main:7077")
for option, value in session_conf.items():
    builder = builder.config(option, value)

# Hive support is switched on before the session is created (or reused, if
# one already exists).
spark = builder.enableHiveSupport().getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/04/21 15:06:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# practice data
import pandas as pd
import numpy as np

# Seeded generator so that Restart & Run All regenerates the same synthetic
# rows instead of a new random dataset on every run.
rng = np.random.default_rng(42)

# Build the rows in pandas first, under their own name (`pdf`), so `df` only
# ever refers to the Spark DataFrame used by the rest of the notebook.
pdf = pd.DataFrame({"x1": rng.integers(0, 10, 100).astype(float),
                    "x2": rng.integers(0, 3, 100).astype(float)})
# Label: y = x1 + x2 plus uniform noise in [0, 1).
pdf["y"] = pdf["x1"] + pdf["x2"] + rng.random(len(pdf))

df = spark.createDataFrame(pdf)
df

  for column, series in pdf.iteritems():


DataFrame[x1: double, x2: double, y: double]

In [7]:
# Split roughly 75% / 25% into train and test.
# Caveat: even with a fixed seed, the split is not deterministic if the
# number of partitions changes.
train, test = df.randomSplit([0.75, 0.25], seed=42)
test.show()

+---+---+-------------------+
| x1| x2|                  y|
+---+---+-------------------+
|0.0|0.0| 0.8144591299932569|
|0.0|2.0|  2.547977748090765|
|1.0|1.0|  2.380611290225664|
|1.0|1.0|  2.795402057034453|
|2.0|0.0| 2.2419795906111952|
|3.0|1.0| 4.1661932059518465|
|3.0|2.0|  5.872730748876477|
|5.0|1.0|  6.508753967169269|
|5.0|1.0|  6.802270510622792|
|5.0|2.0|  7.477152666520166|
|6.0|0.0|   6.10838750314608|
|6.0|1.0|   7.27706866795352|
|8.0|0.0|  8.603665148950029|
|9.0|0.0|  9.736288997809417|
|9.0|1.0| 10.502762118275582|
|9.0|2.0| 11.049416971231631|
|9.0|2.0| 11.854626632831264|
|0.0|0.0|0.43732336330986155|
|1.0|0.0| 1.5846055927692997|
|1.0|0.0| 1.9041895678874652|
+---+---+-------------------+
only showing top 20 rows



# Deterministic Train/Test Split

In [10]:
# Persist both splits to HDFS as parquet. mode("ignore") skips the write when
# the target path already exists, so the split stored by the first run is the
# one that is reused afterwards.
train.write.format("parquet").mode("ignore").save("hdfs://main:9000/train.parquet")
test.write.format("parquet").mode("ignore").save("hdfs://main:9000/test.parquet")

In [11]:
# Rebind train/test to the copies stored on HDFS, so the following cells work
# from the persisted rows rather than from the in-memory random split.
train = spark.read.format("parquet").load("hdfs://main:9000/train.parquet")
test = spark.read.format("parquet").load("hdfs://main:9000/test.parquet")

In [12]:
# Row counts of the two splits read back from HDFS, as (train, test).
train.count(), test.count()

                                                                                

(68, 32)

In [13]:
from pyspark.ml.regression import LinearRegression, LinearRegressionModel
# LinearRegression is the unfitted estimator (configuration only).
# LinearRegressionModel is the fitted model that LinearRegression.fit() returns.

In [16]:
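# THE FEATURES COLUMN GIVEN TO A MODEL MUST CONTAIN VECTORS.
# "x1" is a plain double column, so the fit below does not work as written
# (kept commented out as a counter-example):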
# lr = LinearRegression(featuresCol="x1", labelCol="y")
# lr.fit(train)

In [20]:
from pyspark.ml.feature import VectorAssembler
# Assembler that packs the single input column "x1" into a vector column
# named "features", which is what the regression reads as its features column.
va = VectorAssembler(inputCols=["x1"], outputCol="features")

In [24]:
# `lr` is the unfitted estimator; fitting it on the assembled training rows
# produces the fitted model.
lr = LinearRegression(labelCol="y", featuresCol="features")
train_features = va.transform(train)
model = lr.fit(train_features)

# Show both types side by side: estimator vs. fitted model.
type(lr), type(model)

23/04/21 15:17:46 WARN Instrumentation: [c7d7710c] regParam is zero, which might cause numerical instability and overfitting.


(pyspark.ml.regression.LinearRegression,
 pyspark.ml.regression.LinearRegressionModel)

In [25]:
# Save the fitted model to HDFS; overwrite() replaces anything already stored
# at this path.
model.write().overwrite().save("hdfs://main:9000/model")

In [27]:
# List what was written under the saved model's directory.
!hdfs dfs -ls hdfs://main:9000/model

Found 2 items
drwxr-xr-x   - root supergroup          0 2023-04-21 15:18 hdfs://main:9000/model/data
drwxr-xr-x   - root supergroup          0 2023-04-21 15:18 hdfs://main:9000/model/metadata


In [28]:
# Load the saved model back from HDFS through the fitted-model class.
model = LinearRegressionModel.load("hdfs://main:9000/model")

In [30]:
# The model was fitted on the assembler's "features" column, so the test rows
# go through the same assembler before predictions are added.
model.transform(va.transform(test)).show()

+---+---+-------------------+--------+------------------+
| x1| x2|                  y|features|        prediction|
+---+---+-------------------+--------+------------------+
|0.0|0.0| 0.8144591299932569|   [0.0]|1.9662901332656484|
|0.0|2.0|  2.547977748090765|   [0.0]|1.9662901332656484|
|1.0|1.0|  2.380611290225664|   [1.0]|2.8783315921644856|
|1.0|1.0|  2.795402057034453|   [1.0]|2.8783315921644856|
|2.0|0.0| 2.2419795906111952|   [2.0]| 3.790373051063323|
|3.0|1.0| 4.1661932059518465|   [3.0]| 4.702414509962161|
|3.0|2.0|  5.872730748876477|   [3.0]| 4.702414509962161|
|5.0|1.0|  6.508753967169269|   [5.0]| 6.526497427759835|
|5.0|1.0|  6.802270510622792|   [5.0]| 6.526497427759835|
|5.0|2.0|  7.477152666520166|   [5.0]| 6.526497427759835|
|6.0|0.0|   6.10838750314608|   [6.0]| 7.438538886658672|
|6.0|1.0|   7.27706866795352|   [6.0]| 7.438538886658672|
|8.0|0.0|  8.603665148950029|   [8.0]| 9.262621804456348|
|9.0|0.0|  9.736288997809417|   [9.0]|10.174663263355184|
|9.0|1.0| 10.5

# Pipeline Model

In [31]:
from pyspark.ml.pipeline import Pipeline, PipelineModel

In [38]:
# Stage 0: pack both input columns into one "features" vector column.
assembler = VectorAssembler(inputCols=["x1", "x2"], outputCol="features")
# Stage 1: regression that reads "features" and writes "prediction".
regressor = LinearRegression(featuresCol="features", labelCol="y", predictionCol="prediction")

pipe = Pipeline(stages=[assembler, regressor])
pipe

Pipeline_edf30c319664

In [39]:
# Fit the whole pipeline directly on the raw training columns.
# NOTE: this rebinds `model`, which earlier held the standalone
# LinearRegressionModel, to the fitted pipeline.
model = pipe.fit(train)

23/04/21 15:24:07 WARN Instrumentation: [df3899e3] regParam is zero, which might cause numerical instability and overfitting.
                                                                                

In [40]:
# Compare the type of the unfitted pipeline with the type returned by fit().
type(pipe), type(model)

(pyspark.ml.pipeline.Pipeline, pyspark.ml.pipeline.PipelineModel)

In [41]:
# The fitted pipeline takes the raw test rows; no separate assembler call is
# needed here because the assembler is one of the pipeline's stages.
model.transform(test).show()

+---+---+-------------------+---------+------------------+
| x1| x2|                  y| features|        prediction|
+---+---+-------------------+---------+------------------+
|0.0|0.0| 0.8144591299932569|(2,[],[])|0.5478013485872577|
|0.0|2.0|  2.547977748090765|[0.0,2.0]|  2.52637456808389|
|1.0|1.0|  2.380611290225664|[1.0,1.0]| 2.526423658994852|
|1.0|1.0|  2.795402057034453|[1.0,1.0]| 2.526423658994852|
|2.0|0.0| 2.2419795906111952|[2.0,0.0]|2.5264727499058135|
|3.0|1.0| 4.1661932059518465|[3.0,1.0]|4.5050950603134075|
|3.0|2.0|  5.872730748876477|[3.0,2.0]| 5.494381670061723|
|5.0|1.0|  6.508753967169269|[5.0,1.0]| 6.483766461631963|
|5.0|1.0|  6.802270510622792|[5.0,1.0]| 6.483766461631963|
|5.0|2.0|  7.477152666520166|[5.0,2.0]| 7.473053071380279|
|6.0|0.0|   6.10838750314608|[6.0,0.0]| 6.483815552542925|
|6.0|1.0|   7.27706866795352|[6.0,1.0]| 7.473102162291241|
|8.0|0.0|  8.603665148950029|[8.0,0.0]|  8.46248695386148|
|9.0|0.0|  9.736288997809417|[9.0,0.0]|  9.4518226545207

In [42]:
from pyspark.ml.evaluation import RegressionEvaluator

In [43]:
# Evaluator configured for the "r2" metric, comparing the "prediction" column
# against the label column "y".
r2score = RegressionEvaluator(predictionCol="prediction", labelCol="y", metricName="r2")

In [44]:
# Score the fitted pipeline's predictions on the test split.
r2score.evaluate(model.transform(test))

0.9934888072620658

In [47]:
# stages[1] is the regression stage (stages[0] is the VectorAssembler), so
# these are the learned weights for the assembled inputs x1 and x2, in order.
model.stages[1].coefficients

DenseVector([0.9893, 0.9893])

In [48]:
# Intercept of the regression stage (stages[1]) of the fitted pipeline.
model.stages[1].intercept

0.5478013485872577