# Starter

In [1]:
from pyspark.sql import SparkSession
# Build (or reuse) the SparkSession used by every cell below. It points at the
# master spark://boss:7077, requests 512M of memory per executor, and enables
# Hive support with the warehouse directory on HDFS (saveAsTable() output ends
# up under that directory).
spark = (SparkSession.builder.appName("cs544")
         .master("spark://boss:7077")
         .config("spark.executor.memory", "512M")
         .config("spark.sql.warehouse.dir", "hdfs://nn:9000/user/hive/warehouse")
         .enableHiveSupport()
         .getOrCreate())

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/03 18:15:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Copy sf.csv to hdfs://nn:9000/sf.csv, the path the next cell loads from.
!hdfs dfs -cp sf.csv hdfs://nn:9000/sf.csv

In [3]:
# Load the CSV from HDFS. header=True takes column names from the first line;
# inferSchema=True lets Spark work out the column types from the data.
df = (spark.read.format("csv")
      .option("header", True)
      .option("inferSchema", True)
      .load("hdfs://nn:9000/sf.csv"))

                                                                                

In [5]:
from pyspark.sql.functions import col, expr
# Alias every column so that spaces in its name become underscores (the SQL
# cells later refer to names such as Call_Type), then store the result as
# Parquet on HDFS.
# mode("ignore") skips the write when hdfs://nn:9000/sf.parquet already exists,
# so re-running this cell does not redo the conversion.
cols = [col(c).alias(c.replace(" ", "_")) for c in df.columns]
df.select(cols).write.mode("ignore").format("parquet").save("hdfs://nn:9000/sf.parquet")

In [6]:
# Later cells read the Parquet copy, so remove the CSV from HDFS.
!hdfs dfs -rm hdfs://nn:9000/sf.csv

Deleted hdfs://nn:9000/sf.csv


In [7]:
# Register the Parquet data as the temporary view "calls" so the spark.sql()
# cells below can refer to it by name.
(spark.read
 .format("parquet")
 .load("hdfs://nn:9000/sf.parquet")
 .createOrReplaceTempView("calls")
)

                                                                                

# Lecture

In [8]:
# Print the physical plan for a GROUP BY over the "calls" view. In the plan
# shown in the output, an Exchange (hash partitioning on Call_Type) sits
# between the partial and the final HashAggregate.
spark.sql("""
SELECT Call_Type, COUNT(*)
FROM calls
GROUP BY Call_Type
""").explain("formatted")

== Physical Plan ==
AdaptiveSparkPlan (5)
+- HashAggregate (4)
   +- Exchange (3)
      +- HashAggregate (2)
         +- Scan parquet  (1)


(1) Scan parquet 
Output [1]: [Call_Type#301]
Batched: true
Location: InMemoryFileIndex [hdfs://nn:9000/sf.parquet]
ReadSchema: struct<Call_Type:string>

(2) HashAggregate
Input [1]: [Call_Type#301]
Keys [1]: [Call_Type#301]
Functions [1]: [partial_count(1)]
Aggregate Attributes [1]: [count#372L]
Results [2]: [Call_Type#301, count#373L]

(3) Exchange
Input [2]: [Call_Type#301, count#373L]
Arguments: hashpartitioning(Call_Type#301, 200), ENSURE_REQUIREMENTS, [plan_id=69]

(4) HashAggregate
Input [2]: [Call_Type#301, count#373L]
Keys [1]: [Call_Type#301]
Functions [1]: [count(1)]
Aggregate Attributes [1]: [count(1)#368L]
Results [2]: [Call_Type#301, count(1)#368L AS count(1)#369L]

(5) AdaptiveSparkPlan
Output [2]: [Call_Type#301, count(1)#369L]
Arguments: isFinalPlan=false




In [9]:
# would work without sampling, just using it to make it faster
# sample(True, 0.01): sample with replacement, keeping roughly 1% of the rows.
# bucketBy(10, "Call_Type") + saveAsTable() stores the sample as the table
# "call_by_type", split into 10 buckets by Call_Type; mode("overwrite")
# replaces a table left by an earlier run.
(spark.table("calls")
 .sample(True, 0.01)
 .write
 .mode("overwrite")
 .bucketBy(10, "Call_Type")
 .saveAsTable("call_by_type"))

23/11/03 18:18:09 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
23/11/03 18:18:09 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
23/11/03 18:18:14 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
23/11/03 18:18:14 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@172.26.0.2
23/11/03 18:18:15 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException
23/11/03 18:18:49 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
23/11/03 18:18:49 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist
23/11/03 18:18:49 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
23/11/03 18:18:49 W

In [10]:
# Same GROUP BY, this time against the bucketed table. In the plan shown in
# the output the scan reports "Bucketed: true" and there is no Exchange step
# between the two HashAggregate steps.
spark.sql("""
SELECT Call_Type, COUNT(*)
FROM call_by_type
GROUP BY Call_Type
""").explain("formatted")

== Physical Plan ==
AdaptiveSparkPlan (4)
+- HashAggregate (3)
   +- HashAggregate (2)
      +- Scan parquet spark_catalog.default.call_by_type (1)


(1) Scan parquet spark_catalog.default.call_by_type
Output [1]: [Call_Type#518]
Batched: true
Bucketed: true
Location: InMemoryFileIndex [hdfs://nn:9000/user/hive/warehouse/call_by_type]
ReadSchema: struct<Call_Type:string>
SelectedBucketsCount: 10 out of 10

(2) HashAggregate
Input [1]: [Call_Type#518]
Keys [1]: [Call_Type#518]
Functions [1]: [partial_count(1)]
Aggregate Attributes [1]: [count#554L]
Results [2]: [Call_Type#518, count#555L]

(3) HashAggregate
Input [2]: [Call_Type#518, count#555L]
Keys [1]: [Call_Type#518]
Functions [1]: [count(1)]
Aggregate Attributes [1]: [count(1)#514L]
Results [2]: [Call_Type#518, count(1)#514L AS count(1)#550L]

(4) AdaptiveSparkPlan
Output [2]: [Call_Type#518, count(1)#550L]
Arguments: isFinalPlan=false




# JOIN Algorithms (for a single machine)

In [11]:
# One side of the join: (kind_id, color) pairs; a kind_id can appear several times.
fruits = [
    ("B", "Yellow"),
    ("A", "Green"),
    ("C", "Orange"),
    ("A", "Red"),
    ("C", "Purple"),
    ("B", "Green")
]

# Other side of the join: (kind_id, name) pairs (assume no duplicate kind_id's).
kinds = [
    ("A", "Apple"),
    ("B", "Banana"),
    ("C", "Carrot")
]

# GOAL: print "<color> <name>" for every fruit, e.g. Yellow Banana, Green Apple
# (any order).

In [12]:
# hash join, step 1: index the kinds by kind_id so that each fruit can later
# be matched with a single dictionary lookup
kind_lookup = {kind_id: name for kind_id, name in kinds}
kind_lookup

{'A': 'Apple', 'B': 'Banana', 'C': 'Carrot'}

In [13]:
# hash join, step 2: look up each fruit's kind_id in kind_lookup.
# A fruit whose kind_id has no entry is skipped (inner-join behaviour) rather
# than raising KeyError; the sort merge join further down also skips
# unmatched fruits.
for kind_id, color in fruits:
    if kind_id in kind_lookup:
        print(color, kind_lookup[kind_id])

Yellow Banana
Green Apple
Orange Carrot
Red Apple
Purple Carrot
Green Banana


In [14]:
# sort merge join: sort both inputs by kind_id, then walk through them together in a single pass

In [15]:
# Both inputs have to be ordered by kind_id before they can be merged.
fruits = sorted(fruits)
kinds = sorted(kinds)
fruits

[('A', 'Green'),
 ('A', 'Red'),
 ('B', 'Green'),
 ('B', 'Yellow'),
 ('C', 'Orange'),
 ('C', 'Purple')]

In [16]:
# Display kinds after sorting, to check it is in kind_id order.
kinds

[('A', 'Apple'), ('B', 'Banana'), ('C', 'Carrot')]

In [17]:
# Merge step. fruit_idx is a cursor into the sorted fruits list and only ever
# moves forward, so every fruit is inspected once across all kinds.
fruit_idx = 0
for kind_id, food_name in kinds:
    # Consume fruits up to and including this kind_id; stop at the first
    # fruit that belongs to a later kind and leave it for the next iteration.
    while fruit_idx < len(fruits) and fruits[fruit_idx][0] <= kind_id:
        if fruits[fruit_idx][0] == kind_id:
            print(fruits[fruit_idx][1], food_name)
        # fruits with a smaller kind_id have no matching kind: step past them
        fruit_idx += 1

Green Apple
Red Apple
Green Banana
Yellow Banana
Orange Carrot
Purple Carrot


# Spark ML

In [18]:
import pandas as pd
import numpy as np
# Synthetic regression data, 100 rows: x1 is an integer in [0, 10) and x2 an
# integer in [0, 3), both stored as floats; y = x1 + x2 plus uniform noise in
# [0, 1). The generator is seeded so the rows are the same on every run.
rng = np.random.default_rng(544)
df = pd.DataFrame({"x1": rng.integers(0, 10, 100).astype(float),
                   "x2": rng.integers(0, 3, 100).astype(float)})
df["y"] = df["x1"] + df["x2"] + rng.random(len(df))

In [19]:
# Convert the pandas frame into a Spark DataFrame, reusing the name df.
# The isinstance guard keeps the cell re-runnable: once df is a Spark
# DataFrame, handing it to createDataFrame() again raises an error.
if isinstance(df, pd.DataFrame):
    df = spark.createDataFrame(df)
df

  if should_localize and is_datetime64tz_dtype(s.dtype) and s.dt.tz is not None:


DataFrame[x1: double, x2: double, y: double]

In [20]:
# not truly deterministic overall, just at the partition level
# Split df into train/test with weights 0.75 / 0.25 and a fixed seed, then
# print the first 20 rows of the test part.
train, test = df.randomSplit([0.75, 0.25], seed=42)
test.show()

[Stage 5:>                                                          (0 + 1) / 1]

+---+---+------------------+
| x1| x2|                 y|
+---+---+------------------+
|0.0|0.0|0.9158805831109543|
|1.0|0.0|1.1401461642969826|
|1.0|2.0|3.6487009924055704|
|2.0|0.0|2.0717519473221353|
|2.0|2.0| 4.546826654000136|
|4.0|0.0|4.5906962786996255|
|5.0|1.0| 6.138227466126343|
|5.0|2.0| 7.546845934732009|
|6.0|0.0| 6.042072999619121|
|6.0|1.0| 7.116403353920589|
|6.0|2.0| 8.118565731071259|
|6.0|2.0| 8.246993737593817|
|8.0|0.0| 8.946340368002927|
|9.0|1.0|10.575857248651747|
|9.0|2.0|11.097887594073411|
|9.0|2.0|11.625428140472932|
|9.0|2.0|11.830727541097351|
|0.0|0.0|0.5661008051409427|
|2.0|2.0| 4.017432291320147|
|2.0|2.0| 4.108159563044744|
+---+---+------------------+
only showing top 20 rows



                                                                                

In [21]:
# Store both splits on HDFS as Parquet.
# NOTE(review): mode("ignore") silently skips the write when the path already
# exists, so files left by an earlier run are kept even if train/test have
# changed since -- delete the paths (or use "overwrite") to refresh them.
train.write.format("parquet").mode("ignore").save("hdfs://nn:9000/train.parquet")
test.write.format("parquet").mode("ignore").save("hdfs://nn:9000/test.parquet")

In [22]:
# Rebind train/test to the copies stored on HDFS; the cells below use these
# instead of the in-memory randomSplit() results.
train = spark.read.format("parquet").load("hdfs://nn:9000/train.parquet")
test = spark.read.format("parquet").load("hdfs://nn:9000/test.parquet")

In [23]:
# Row counts of the two stored splits.
train.count(), test.count()

                                                                                

(68, 32)

In [24]:
from pyspark.ml.regression import DecisionTreeRegressor, DecisionTreeRegressionModel
# DecisionTreeRegressor: unfit model
# DecisionTreeRegressionModel: fitted model
# In Spark, names ending in "Model" are the fitted ones

In [25]:
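# ALWAYS need a vector column: featuresCol must name a single vector-typed
# column, so pointing it at the plain numeric column "x1" (as in the
# commented-out attempt below) does not work.
# dt = DecisionTreeRegressor(featuresCol="x1", labelCol="y")
# dt.fit(train)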

In [26]:
from pyspark.ml.feature import VectorAssembler

In [27]:
# VectorAssembler combines the x1 and x2 columns into one vector column named
# "features"; transform() returns the input rows with that column appended.
va = VectorAssembler(inputCols=["x1", "x2"], outputCol="features")
va.transform(train).show()

+---+---+------------------+---------+
| x1| x2|                 y| features|
+---+---+------------------+---------+
|0.0|0.0|0.9199799858102046|(2,[],[])|
|0.0|1.0|1.9237899890234949|[0.0,1.0]|
|0.0|2.0|2.4115514968030958|[0.0,2.0]|
|0.0|2.0|2.5050748072620093|[0.0,2.0]|
|1.0|0.0|1.7061313609343225|[1.0,0.0]|
|1.0|2.0| 3.333662686499369|[1.0,2.0]|
|2.0|0.0| 2.003528532155789|[2.0,0.0]|
|2.0|0.0|2.2192665598861536|[2.0,0.0]|
|2.0|0.0| 2.712624351595545|[2.0,0.0]|
|2.0|1.0| 3.222200036760564|[2.0,1.0]|
|2.0|2.0| 4.643240011769639|[2.0,2.0]|
|2.0|2.0| 4.655538264429331|[2.0,2.0]|
|2.0|2.0|4.6807817806409115|[2.0,2.0]|
|3.0|2.0|5.6720218292104425|[3.0,2.0]|
|4.0|0.0| 4.343506601500501|[4.0,0.0]|
|5.0|1.0| 6.560117748527708|[5.0,1.0]|
|5.0|1.0| 6.719313708593002|[5.0,1.0]|
|5.0|1.0| 6.730630186118628|[5.0,1.0]|
|5.0|1.0| 6.999800566404603|[5.0,1.0]|
|5.0|2.0| 7.460433637573166|[5.0,2.0]|
+---+---+------------------+---------+
only showing top 20 rows



In [28]:
# Two manual steps: assemble the feature vector, then fit a decision tree
# regressor on it with y as the label. fit() returns the fitted model.
va = VectorAssembler(inputCols=["x1", "x2"], outputCol="features")
dt = DecisionTreeRegressor(featuresCol="features", labelCol="y")

model = dt.fit(va.transform(train))

23/11/03 18:19:19 WARN BlockManager: Asked to remove block broadcast_31_piece0, which does not exist
23/11/03 18:19:20 WARN BlockManager: Asked to remove block broadcast_31, which does not exist
                                                                                

In [29]:
# dt is the unfit estimator; model is the fitted object that fit() returned.
type(dt), type(model)

(pyspark.ml.regression.DecisionTreeRegressor,
 pyspark.ml.regression.DecisionTreeRegressionModel)

In [30]:
from pyspark.ml.pipeline import Pipeline, PipelineModel
# unfit: Pipeline
# fitted: PipelineModel

In [31]:
# Chain the assembler and the tree into one pipeline: stage 0 is va, stage 1 is dt.
pipe = Pipeline(stages=[va, dt])

In [34]:
# Fit the whole pipeline directly on train (no separate va.transform() call);
# this rebinds model to the fitted pipeline.
model = pipe.fit(train)

In [35]:
# pipe is the unfit Pipeline; model is now the fitted PipelineModel.
type(pipe), type(model)

(pyspark.ml.pipeline.Pipeline, pyspark.ml.pipeline.PipelineModel)

In [39]:
# stages[1] is the fitted decision tree (stage 0 is the VectorAssembler);
# toDebugString is a text rendering of its learned if/else rules.
print(model.stages[1].toDebugString)

DecisionTreeRegressionModel: uid=DecisionTreeRegressor_73c04ff51a69, depth=5, numNodes=47, numFeatures=2
  If (feature 0 <= 4.5)
   If (feature 0 <= 1.5)
    If (feature 1 <= 1.5)
     If (feature 1 <= 0.5)
      If (feature 0 <= 0.5)
       Predict: 0.7800744764970607
      Else (feature 0 > 0.5)
       Predict: 1.706131360934322
     Else (feature 1 > 0.5)
      Predict: 1.6506517487404657
    Else (feature 1 > 1.5)
     If (feature 0 <= 0.5)
      Predict: 2.37192942461925
     Else (feature 0 > 0.5)
      Predict: 3.333662686499369
   Else (feature 0 > 1.5)
    If (feature 0 <= 2.5)
     If (feature 1 <= 0.5)
      Predict: 2.311806481212496
     Else (feature 1 > 0.5)
      If (feature 1 <= 1.5)
       Predict: 3.598877925615361
      Else (feature 1 > 1.5)
       Predict: 4.585099468466768
    Else (feature 0 > 2.5)
     If (feature 1 <= 1.5)
      If (feature 1 <= 0.5)
       Predict: 4.4191793805811805
      Else (feature 1 > 0.5)
       Predict: 5.358576016281355
     Else (fe

In [43]:
# Save the fitted pipeline to HDFS, overwriting anything already stored at this path.
model.write().overwrite().save("hdfs://nn:9000/model")

                                                                                

In [45]:
# List what was written under the saved model's stages directory.
!hdfs dfs -ls hdfs://nn:9000/model/stages

Found 2 items
drwxr-xr-x   - root supergroup          0 2023-11-03 18:25 hdfs://nn:9000/model/stages/0_VectorAssembler_fb04de5c9f7b
drwxr-xr-x   - root supergroup          0 2023-11-03 18:25 hdfs://nn:9000/model/stages/1_DecisionTreeRegressor_73c04ff51a69


In [46]:
# Load the saved pipeline back from HDFS and rebind model to the loaded copy.
model = PipelineModel.load("hdfs://nn:9000/model")

In [47]:
# Columns of the test data before the model is applied.
test

DataFrame[x1: double, x2: double, y: double]

In [48]:
# The transformed frame has two extra columns: features and prediction.
model.transform(test)

DataFrame[x1: double, x2: double, y: double, features: vector, prediction: double]

In [49]:
# Print the first rows of test with the model's prediction next to the true y.
model.transform(test).show()

+---+---+------------------+---------+------------------+
| x1| x2|                 y| features|        prediction|
+---+---+------------------+---------+------------------+
|0.0|0.0|0.7541983929002224|(2,[],[])|0.7800744764970607|
|0.0|2.0|2.0895679475224442|[0.0,2.0]|  2.37192942461925|
|2.0|0.0|2.8887728263009396|[2.0,0.0]| 2.311806481212496|
|2.0|1.0| 3.108342200029332|[2.0,1.0]| 3.598877925615361|
|3.0|1.0| 4.406243872036037|[3.0,1.0]| 5.358576016281355|
|4.0|1.0| 5.995543550470651|[4.0,1.0]| 5.358576016281355|
|5.0|1.0| 6.085087714599171|[5.0,1.0]| 6.681592758647609|
|7.0|0.0| 7.776839011240047|[7.0,0.0]| 7.565273901399568|
|7.0|1.0| 8.288747701615705|[7.0,1.0]| 8.896133847712079|
|7.0|2.0| 9.186696500928358|[7.0,2.0]| 9.689394106501865|
|7.0|2.0|  9.61693890608781|[7.0,2.0]| 9.689394106501865|
|7.0|2.0| 9.704914189877377|[7.0,2.0]| 9.689394106501865|
|9.0|1.0|10.933586729559503|[9.0,1.0]|11.145289250702923|
|9.0|2.0|11.116121782661093|[9.0,2.0]|11.145289250702923|
|9.0|2.0|11.42

In [50]:
from pyspark.ml.evaluation import RegressionEvaluator

In [51]:
# Evaluator that compares the "prediction" column with the label column "y"
# using the r2 metric.
r2score = RegressionEvaluator(predictionCol="prediction", labelCol="y", metricName="r2")
r2score

RegressionEvaluator_e367dae4265a

In [53]:
# r2 of the loaded pipeline's predictions on the test split.
r2score.evaluate(model.transform(test))

                                                                                

0.9826104185724672