# Starter

In [1]:
from pyspark.sql import SparkSession

# Settings applied to the session builder before the session is created.
spark_settings = {
    "spark.executor.memory": "512M",
    "spark.sql.warehouse.dir": "hdfs://nn:9000/user/hive/warehouse",
}

# Connect to the standalone cluster master and apply each setting in turn.
builder = SparkSession.builder.appName("cs544").master("spark://boss:7077")
for setting_name, setting_value in spark_settings.items():
    builder = builder.config(setting_name, setting_value)

# Hive support is enabled so that tables saved later are tracked in the
# warehouse directory configured above.
spark = builder.enableHiveSupport().getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/03 13:30:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Copy sf.csv to the HDFS root so the next cell can load it through Spark.
# NOTE(review): the source path has no scheme -- presumably it resolves to the
# notebook's working directory; confirm against the cluster's default FS.
!hdfs dfs -cp sf.csv hdfs://nn:9000/sf.csv

In [3]:
# Configure a CSV reader: the first row holds column names and column types
# are inferred from the data.
csv_reader = spark.read.format("csv")
csv_reader = csv_reader.option("header", True)
csv_reader = csv_reader.option("inferSchema", True)

# Load the file that was copied into HDFS in the previous cell.
df = csv_reader.load("hdfs://nn:9000/sf.csv")

                                                                                

In [4]:
from pyspark.sql.functions import col, expr

# Give every column a space-free name by swapping spaces for underscores
# (presumably because the Parquet writer rejects spaces in column names --
# TODO confirm).
cols = []
for original_name in df.columns:
    cols.append(col(original_name).alias(original_name.replace(" ", "_")))

# mode("ignore") makes the write a no-op when the Parquet output already
# exists, so re-running this cell does not rewrite the data.
(df.select(cols)
   .write
   .mode("ignore")
   .format("parquet")
   .save("hdfs://nn:9000/sf.parquet"))

23/11/03 13:33:07 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [5]:
# The Parquet copy written above is what later cells read, so the raw CSV
# upload is removed from HDFS here.
!hdfs dfs -rm hdfs://nn:9000/sf.csv

Deleted hdfs://nn:9000/sf.csv


In [6]:
# Read the Parquet copy back and register it as the temp view "calls" so it
# can be queried with spark.sql / spark.table.
calls_parquet = spark.read.format("parquet").load("hdfs://nn:9000/sf.parquet")
calls_parquet.createOrReplaceTempView("calls")

                                                                                

# Lecture

In [7]:
# Group-by over the plain (un-bucketed) temp view. In the printed plan, look
# for the Exchange step sitting between the partial and final HashAggregate.
calls_count_query = """
SELECT Call_Type, COUNT(*) FROM calls GROUP BY Call_Type
"""
spark.sql(calls_count_query).explain(mode="formatted")

== Physical Plan ==
AdaptiveSparkPlan (5)
+- HashAggregate (4)
   +- Exchange (3)
      +- HashAggregate (2)
         +- Scan parquet  (1)


(1) Scan parquet 
Output [1]: [Call_Type#231]
Batched: true
Location: InMemoryFileIndex [hdfs://nn:9000/sf.parquet]
ReadSchema: struct<Call_Type:string>

(2) HashAggregate
Input [1]: [Call_Type#231]
Keys [1]: [Call_Type#231]
Functions [1]: [partial_count(1)]
Aggregate Attributes [1]: [count#302L]
Results [2]: [Call_Type#231, count#303L]

(3) Exchange
Input [2]: [Call_Type#231, count#303L]
Arguments: hashpartitioning(Call_Type#231, 200), ENSURE_REQUIREMENTS, [plan_id=52]

(4) HashAggregate
Input [2]: [Call_Type#231, count#303L]
Keys [1]: [Call_Type#231]
Functions [1]: [count(1)]
Aggregate Attributes [1]: [count(1)#298L]
Results [2]: [Call_Type#231, count(1)#298L AS count(1)#299L]

(5) AdaptiveSparkPlan
Output [2]: [Call_Type#231, count(1)#299L]
Arguments: isFinalPlan=false




In [9]:
# Sampling only shrinks the data so the demo runs faster; the bucketed write
# would work the same way on the full table.
sampled_calls = spark.table("calls").sample(withReplacement=True, fraction=0.01)

# Write a Hive table bucketed on Call_Type. The data is repartitioned on the
# same column and count first (presumably to limit how many files each bucket
# ends up with -- TODO confirm).
(sampled_calls
 .repartition(10, "Call_Type")
 .write
 .mode("overwrite")
 .bucketBy(10, "Call_Type")
 .saveAsTable("calls_by_type"))

23/11/03 13:41:48 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException
23/11/03 13:41:49 WARN HadoopFSUtils: The directory hdfs://nn:9000/user/hive/warehouse/calls_by_type was not found. Was it deleted very recently?
23/11/03 13:41:51 WARN FileUtils: File does not exist: hdfs://nn:9000/user/hive/warehouse/calls_by_type; Force to delete it.
23/11/03 13:41:51 ERROR FileUtils: Failed to delete hdfs://nn:9000/user/hive/warehouse/calls_by_type
23/11/03 13:42:20 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
23/11/03 13:42:20 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist
23/11/03 13:42:20 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
23/11/03 13:42:20 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist


In [10]:
# Same aggregation as before, but against the table bucketed on Call_Type.
# Compare the printed plan with the earlier one: the scan reports
# "Bucketed: true" and there is no Exchange step.
bucketed_count_query = """
SELECT Call_Type, COUNT(*)
FROM calls_by_type
GROUP BY Call_Type
"""
spark.sql(bucketed_count_query).explain(mode="formatted")

== Physical Plan ==
AdaptiveSparkPlan (4)
+- HashAggregate (3)
   +- HashAggregate (2)
      +- Scan parquet spark_catalog.default.calls_by_type (1)


(1) Scan parquet spark_catalog.default.calls_by_type
Output [1]: [Call_Type#518]
Batched: true
Bucketed: true
Location: InMemoryFileIndex [hdfs://nn:9000/user/hive/warehouse/calls_by_type]
ReadSchema: struct<Call_Type:string>
SelectedBucketsCount: 10 out of 10

(2) HashAggregate
Input [1]: [Call_Type#518]
Keys [1]: [Call_Type#518]
Functions [1]: [partial_count(1)]
Aggregate Attributes [1]: [count#554L]
Results [2]: [Call_Type#518, count#555L]

(3) HashAggregate
Input [2]: [Call_Type#518, count#555L]
Keys [1]: [Call_Type#518]
Functions [1]: [count(1)]
Aggregate Attributes [1]: [count(1)#514L]
Results [2]: [Call_Type#518, count(1)#514L AS count(1)#550L]

(4) AdaptiveSparkPlan
Output [2]: [Call_Type#518, count(1)#550L]
Arguments: isFinalPlan=false




# JOIN (single machine)

In [11]:
# Fruit rows are (kind_id, color) pairs; the same kind_id may appear many times.
fruits = [
    ("B", "Yellow"), ("A", "Green"), ("C", "Orange"),
    ("A", "Red"), ("C", "Purple"), ("B", "Green"),
]

# Kind rows are (kind_id, name) pairs; each kind_id is assumed to appear once.
kinds = [("A", "Apple"), ("B", "Banana"), ("C", "Carrot")]

# GOAL: print each color next to the name of its kind
# (Yellow Banana, Green Apple, ...), in any order.

In [12]:
# Hash Join -- build side: index the smaller table (kinds) by its join key so
# each fruit can find its kind with a single dictionary lookup.
kind_lookup = {kind_id: kind_name for kind_id, kind_name in kinds}
kind_lookup

{'A': 'Apple', 'B': 'Banana', 'C': 'Carrot'}

In [13]:
# Hash Join -- probe side: stream through fruits and look each kind_id up in
# the dictionary built from kinds.
for kind_id, color in fruits:
    # Inner-join semantics: a fruit whose kind_id has no entry in kind_lookup
    # is skipped instead of raising KeyError, which matches what the sort
    # merge join further down does with unmatched rows.
    if kind_id in kind_lookup:
        print(color, kind_lookup[kind_id])

Yellow Banana
Green Apple
Orange Carrot
Red Apple
Purple Carrot
Green Banana


# Sort Merge Join

In [14]:
# Sort both tables in place. Tuples compare element by element, so the rows
# end up ordered by kind_id (their first field), which the merge step needs.
for table in (kinds, fruits):
    table.sort()

In [15]:
# Display kinds after the in-place sort: rows are now ordered by kind_id.
kinds

[('A', 'Apple'), ('B', 'Banana'), ('C', 'Carrot')]

In [16]:
# Display fruits after the in-place sort: rows with the same kind_id are now adjacent.
fruits

[('A', 'Green'),
 ('A', 'Red'),
 ('B', 'Green'),
 ('B', 'Yellow'),
 ('C', 'Orange'),
 ('C', 'Purple')]

In [17]:
# Sort merge join: both lists are sorted by kind_id, so a single forward pass
# over fruits is enough. fruit_idx is never reset between kinds.
fruit_idx = 0
for kind_id, kind_name in kinds:
    while fruit_idx < len(fruits):
        fruit_kind_id, color = fruits[fruit_idx]
        if fruit_kind_id > kind_id:
            # This fruit belongs to a later kind; leave fruit_idx where it is
            # and move on to the next kind.
            break
        if fruit_kind_id == kind_id:
            print(color, kind_name)
        # Matched rows and rows with a smaller (unmatched) kind_id are both
        # consumed here.
        fruit_idx += 1

Green Apple
Red Apple
Green Banana
Yellow Banana
Orange Carrot
Purple Carrot


# ML

In [18]:
import pandas as pd
import numpy as np

# Fixed seed so the synthetic data set (and everything derived from it) is
# identical on every Restart & Run All.
rng = np.random.default_rng(544)

# Two integer-valued features stored as floats: x1 in 0..9 and x2 in 0..2
# (the upper bound of Generator.integers is exclusive), 100 rows each.
pandas_df = pd.DataFrame({"x1": rng.integers(0, 10, 100).astype(float),
                          "x2": rng.integers(0, 3, 100).astype(float)})

# The label is the sum of both features plus uniform noise in [0, 1).
pandas_df["y"] = pandas_df["x1"] + pandas_df["x2"] + rng.random(len(pandas_df))

# The pandas frame gets its own name so that `df` only ever refers to the
# Spark DataFrame that the following cells split and train on.
df = spark.createDataFrame(pandas_df)
df

  if should_localize and is_datetime64tz_dtype(s.dtype) and s.dt.tz is not None:


DataFrame[x1: double, x2: double, y: double]

In [19]:
# 75% / 25% random split; the fixed seed keeps the split repeatable for the
# same input DataFrame.
split_frames = df.randomSplit([0.75, 0.25], seed=42)
train = split_frames[0]
test = split_frames[1]

# Preview the held-out rows.
test.show()

                                                                                

+---+---+------------------+
| x1| x2|                 y|
+---+---+------------------+
|0.0|0.0|0.9108395611170045|
|1.0|2.0| 3.641410393897231|
|1.0|2.0|3.9464489632070117|
|2.0|0.0| 2.527813970394922|
|2.0|2.0| 4.590305173822881|
|4.0|0.0| 4.553553282229819|
|5.0|2.0| 7.913702301507376|
|6.0|1.0| 7.868747400415531|
|6.0|2.0| 8.486328666168749|
|7.0|0.0| 7.275548337048227|
|7.0|0.0| 7.728987032697484|
|7.0|1.0| 8.264617871091119|
|8.0|2.0|10.429529795420434|
|8.0|2.0|10.802189014971018|
|9.0|1.0|10.551584632951641|
|9.0|2.0|11.336390999413332|
|9.0|2.0|11.944050961335062|
|0.0|1.0|1.5202523837201376|
|2.0|1.0|3.8331570846415604|
|2.0|2.0| 4.559821959933614|
+---+---+------------------+
only showing top 20 rows



In [20]:
# Persist both splits as Parquet. mode("ignore") leaves an existing output
# untouched, so a re-run keeps whatever split was written first.
split_outputs = (
    (train, "hdfs://nn:9000/train.parquet"),
    (test, "hdfs://nn:9000/test.parquet"),
)
for split_frame, output_path in split_outputs:
    split_frame.write.format("parquet").mode("ignore").save(output_path)

                                                                                

In [21]:
# From here on, train and test refer to the copies stored in HDFS rather than
# to the in-memory result of randomSplit.
train, test = (
    spark.read.format("parquet").load(split_path)
    for split_path in ("hdfs://nn:9000/train.parquet", "hdfs://nn:9000/test.parquet")
)

In [22]:
# Row counts of the reloaded splits, as a (train, test) tuple.
tuple(split_frame.count() for split_frame in (train, test))

(68, 32)

In [23]:
from pyspark.ml.regression import DecisionTreeRegressor, DecisionTreeRegressionModel
# DecisionTreeRegressor: unfit model
# DecisionTreeRegressionModel: fitted model

In [24]:
# need vectors! featuresCol has to name a vector column, so pointing it at the
# plain double column "x1" is expected to fail. The lines below are kept
# commented out as a counter-example only.
# dt = DecisionTreeRegressor(featuresCol="x1", labelCol="y")
# dt.fit(train)

In [25]:
from pyspark.ml.feature import VectorAssembler

In [26]:
# Combine the two numeric input columns into one vector column named
# "features", which the regressor is configured to read.
feature_columns = ["x1", "x2"]
va = VectorAssembler(outputCol="features", inputCols=feature_columns)
va

VectorAssembler_7d8ca541415c

In [27]:
# Optional preview (left disabled): uncomment to see the training rows with
# the assembled "features" column added.
#va.transform(train).show()

In [28]:
# Unfit estimator: reads the assembled "features" vector and predicts "y".
dt = DecisionTreeRegressor(labelCol="y", featuresCol="features")

# The assembler has to run first so that the "features" column exists.
train_with_features = va.transform(train)
model = dt.fit(train_with_features)

                                                                                

In [29]:
# Estimator class vs. the class of the fitted model returned by dt.fit().
tuple(type(ml_object) for ml_object in (dt, model))

(pyspark.ml.regression.DecisionTreeRegressor,
 pyspark.ml.regression.DecisionTreeRegressionModel)

In [30]:
from pyspark.ml.pipeline import Pipeline, PipelineModel
# Pipeline: unfit
# PipelineModel: fitted

In [31]:
# Chain the assembler and the regressor so that a single fit/transform call
# runs both stages in order.
pipeline_stages = [va, dt]
pipe = Pipeline(stages=pipeline_stages)

In [34]:
# Fitting the pipeline on the raw training rows; `model` now holds the fitted
# pipeline and replaces the stand-alone tree model from earlier.
model = pipe.fit(train)

# Unfit pipeline class vs. fitted pipeline class.
tuple(type(ml_object) for ml_object in (pipe, model))

                                                                                

(pyspark.ml.pipeline.Pipeline, pyspark.ml.pipeline.PipelineModel)

In [37]:
# Store the fitted pipeline in HDFS; overwrite() replaces a model saved by an
# earlier run at the same path.
model_writer = model.write().overwrite()
model_writer.save("hdfs://nn:9000/model")

                                                                                

In [39]:
# List what was written under the saved model's stages/ directory.
!hdfs dfs -ls hdfs://nn:9000/model/stages

Found 2 items
drwxr-xr-x   - root supergroup          0 2023-11-03 14:59 hdfs://nn:9000/model/stages/0_VectorAssembler_7d8ca541415c
drwxr-xr-x   - root supergroup          0 2023-11-03 14:59 hdfs://nn:9000/model/stages/1_DecisionTreeRegressor_93027dcaa5db


In [41]:
# Reload the fitted pipeline from HDFS; `model` now refers to the loaded copy
# instead of the object that was fitted in this session.
model = PipelineModel.load("hdfs://nn:9000/model")

In [42]:
# Show the held-out rows before they are scored by the loaded model.
test.show()

+---+---+------------------+
| x1| x2|                 y|
+---+---+------------------+
|0.0|0.0|0.9108395611170045|
|1.0|2.0| 3.641410393897231|
|1.0|2.0|3.9464489632070117|
|2.0|0.0| 2.527813970394922|
|2.0|2.0| 4.590305173822881|
|4.0|0.0| 4.553553282229819|
|5.0|2.0| 7.913702301507376|
|6.0|1.0| 7.868747400415531|
|6.0|2.0| 8.486328666168749|
|7.0|0.0| 7.275548337048227|
|7.0|0.0| 7.728987032697484|
|7.0|1.0| 8.264617871091119|
|8.0|2.0|10.429529795420434|
|8.0|2.0|10.802189014971018|
|9.0|1.0|10.551584632951641|
|9.0|2.0|11.336390999413332|
|9.0|2.0|11.944050961335062|
|0.0|1.0|1.5202523837201376|
|2.0|1.0|3.8331570846415604|
|2.0|2.0| 4.559821959933614|
+---+---+------------------+
only showing top 20 rows



                                                                                

In [44]:
# Score the held-out rows with the loaded pipeline; the displayed result
# carries the original columns plus "features" and "prediction".
test_predictions = model.transform(test)
test_predictions.show()

+---+---+------------------+---------+------------------+
| x1| x2|                 y| features|        prediction|
+---+---+------------------+---------+------------------+
|0.0|0.0|0.9108395611170045|(2,[],[])|0.3629253964995967|
|1.0|2.0| 3.641410393897231|[1.0,2.0]| 3.078665082659273|
|1.0|2.0|3.9464489632070117|[1.0,2.0]| 3.078665082659273|
|2.0|0.0| 2.527813970394922|[2.0,0.0]|2.7613042547531155|
|2.0|2.0| 4.590305173822881|[2.0,2.0]| 4.959259399838608|
|4.0|0.0| 4.553553282229819|[4.0,0.0]| 5.077959347669859|
|5.0|2.0| 7.913702301507376|[5.0,2.0]| 7.093386194668144|
|6.0|1.0| 7.868747400415531|[6.0,1.0]| 7.661040434961876|
|6.0|2.0| 8.486328666168749|[6.0,2.0]| 8.646661072348877|
|7.0|0.0| 7.275548337048227|[7.0,0.0]| 7.525592969042154|
|7.0|0.0| 7.728987032697484|[7.0,0.0]| 7.525592969042154|
|7.0|1.0| 8.264617871091119|[7.0,1.0]| 8.475563904560868|
|8.0|2.0|10.429529795420434|[8.0,2.0]|10.598821170877377|
|8.0|2.0|10.802189014971018|[8.0,2.0]|10.598821170877377|
|9.0|1.0|10.55

In [45]:
from pyspark.ml.evaluation import RegressionEvaluator

In [46]:
# Evaluator that compares the "prediction" column with the label "y" and
# reports the r2 metric.
r2score = RegressionEvaluator(
    metricName="r2",
    labelCol="y",
    predictionCol="prediction",
)

In [47]:
# r2 of the loaded pipeline on the held-out split.
scored_test = model.transform(test)
r2score.evaluate(scored_test)

                                                                                

0.9730704358867571

In [52]:
# Stage 0 of the pipeline is the assembler and stage 1 is the fitted decision
# tree. print() is used so the multi-line rule dump keeps its line breaks.
fitted_tree = model.stages[1]
print(fitted_tree.toDebugString)

DecisionTreeRegressionModel: uid=DecisionTreeRegressor_93027dcaa5db, depth=5, numNodes=47, numFeatures=2
  If (feature 0 <= 5.5)
   If (feature 0 <= 2.5)
    If (feature 1 <= 0.5)
     If (feature 0 <= 1.5)
      If (feature 0 <= 0.5)
       Predict: 0.3629253964995967
      Else (feature 0 > 0.5)
       Predict: 1.395077573815772
     Else (feature 0 > 1.5)
      Predict: 2.7613042547531155
    Else (feature 1 > 0.5)
     If (feature 0 <= 1.5)
      If (feature 1 <= 1.5)
       Predict: 2.1426232679288075
      Else (feature 1 > 1.5)
       Predict: 3.078665082659273
     Else (feature 0 > 1.5)
      If (feature 1 <= 1.5)
       Predict: 3.1434715268552127
      Else (feature 1 > 1.5)
       Predict: 4.959259399838608
   Else (feature 0 > 2.5)
    If (feature 0 <= 3.5)
     If (feature 1 <= 0.5)
      Predict: 3.5703031985879723
     Else (feature 1 > 0.5)
      If (feature 1 <= 1.5)
       Predict: 4.512185877675951
      Else (feature 1 > 1.5)
       Predict: 5.204072300321589
    E