# Starter

In [1]:
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("cs544")
    .master("spark://boss:7077")               # cluster master URL
    .config("spark.executor.memory", "512M")   # memory given to each executor
    # Tables created with saveAsTable are stored under this HDFS directory.
    .config("spark.sql.warehouse.dir", "hdfs://nn:9000/user/hive/warehouse")
    .enableHiveSupport()
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/01 14:25:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Copy sf.csv to hdfs://nn:9000/sf.csv, the path the next cell loads with Spark.
!hdfs dfs -cp sf.csv hdfs://nn:9000/sf.csv

In [3]:
# Load the CSV from HDFS: the first line supplies the column names and the
# column types are inferred from the data.
df = spark.read.csv("hdfs://nn:9000/sf.csv", header=True, inferSchema=True)

                                                                                

In [5]:
from pyspark.sql.functions import col, expr
# Alias every column so spaces in its name become underscores, then save the
# renamed data as Parquet. mode="ignore" makes the write a no-op when
# sf.parquet already exists.
cols = [
    col(name).alias(name.replace(" ", "_"))
    for name in df.columns
]
df.select(cols).write.parquet("hdfs://nn:9000/sf.parquet", mode="ignore")

In [6]:
# Remove the CSV copy from HDFS; later cells read sf.parquet instead.
!hdfs dfs -rm hdfs://nn:9000/sf.csv

Deleted hdfs://nn:9000/sf.csv


In [7]:
# Register the Parquet data as the temporary view "calls" so Spark SQL can query it.
spark.read.parquet("hdfs://nn:9000/sf.parquet").createOrReplaceTempView("calls")

                                                                                

# Lecture

In [15]:
# Physical plan for grouping the un-bucketed view: note the Exchange (shuffle)
# between the partial and the final HashAggregate.
spark.sql("""
SELECT Call_Type, COUNT(*) FROM calls GROUP BY Call_Type
""").explain(mode="formatted")

== Physical Plan ==
AdaptiveSparkPlan (5)
+- HashAggregate (4)
   +- Exchange (3)
      +- HashAggregate (2)
         +- Scan parquet  (1)


(1) Scan parquet 
Output [1]: [Call_Type#301]
Batched: true
Location: InMemoryFileIndex [hdfs://nn:9000/sf.parquet]
ReadSchema: struct<Call_Type:string>

(2) HashAggregate
Input [1]: [Call_Type#301]
Keys [1]: [Call_Type#301]
Functions [1]: [partial_count(1)]
Aggregate Attributes [1]: [count#460L]
Results [2]: [Call_Type#301, count#461L]

(3) Exchange
Input [2]: [Call_Type#301, count#461L]
Arguments: hashpartitioning(Call_Type#301, 200), ENSURE_REQUIREMENTS, [plan_id=94]

(4) HashAggregate
Input [2]: [Call_Type#301, count#461L]
Keys [1]: [Call_Type#301]
Functions [1]: [count(1)]
Aggregate Attributes [1]: [count(1)#456L]
Results [2]: [Call_Type#301, count(1)#456L AS count(1)#457L]

(5) AdaptiveSparkPlan
Output [2]: [Call_Type#301, count(1)#457L]
Arguments: isFinalPlan=false




In [18]:
# Write a sample of "calls" as a table bucketed on Call_Type (10 buckets).
# The sample only makes the demo faster; the bucketing works without it.
# A fixed seed is passed to sample(), and mode("ignore") lets this cell be
# re-run without raising once calls_by_type already exists (the same save mode
# the other writes in this notebook use).
(spark.table("calls")
 .sample(withReplacement=True, fraction=0.01, seed=42)
 .repartition(10, "Call_Type")
 .write
 .mode("ignore")
 .bucketBy(10, "Call_Type")
 .saveAsTable("calls_by_type"))

23/11/01 15:00:06 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
23/11/01 15:00:06 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
23/11/01 15:00:13 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
23/11/01 15:00:13 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@172.20.0.5
23/11/01 15:00:13 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException
23/11/01 15:00:52 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
23/11/01 15:00:52 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist
23/11/01 15:00:52 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
23/11/01 15:00:52 WARN 

In [19]:
# Same aggregation against the table bucketed on Call_Type: the plan printed
# below has no Exchange step between the two HashAggregates.
spark.sql("""
SELECT Call_Type, COUNT(*)
FROM calls_by_type
GROUP BY Call_Type
""").explain(mode="formatted")

== Physical Plan ==
AdaptiveSparkPlan (4)
+- HashAggregate (3)
   +- HashAggregate (2)
      +- Scan parquet spark_catalog.default.calls_by_type (1)


(1) Scan parquet spark_catalog.default.calls_by_type
Output [1]: [Call_Type#641]
Batched: true
Bucketed: true
Location: InMemoryFileIndex [hdfs://nn:9000/user/hive/warehouse/calls_by_type]
ReadSchema: struct<Call_Type:string>
SelectedBucketsCount: 10 out of 10

(2) HashAggregate
Input [1]: [Call_Type#641]
Keys [1]: [Call_Type#641]
Functions [1]: [partial_count(1)]
Aggregate Attributes [1]: [count#677L]
Results [2]: [Call_Type#641, count#678L]

(3) HashAggregate
Input [2]: [Call_Type#641, count#678L]
Keys [1]: [Call_Type#641]
Functions [1]: [count(1)]
Aggregate Attributes [1]: [count(1)#637L]
Results [2]: [Call_Type#641, count(1)#637L AS count(1)#673L]

(4) AdaptiveSparkPlan
Output [2]: [Call_Type#641, count(1)#673L]
Arguments: isFinalPlan=false




# JOIN (single machine)

In [20]:
# Each fruit row is (kind_id, color).
fruits = [
    ("B", "Yellow"), ("A", "Green"), ("C", "Orange"),
    ("A", "Red"), ("C", "Purple"), ("B", "Green"),
]

# Each kind row is (kind_id, name); kind_id values are assumed to be unique.
kinds = [("A", "Apple"), ("B", "Banana"), ("C", "Carrot")]

# GOAL: print "Yellow Banana", "Green Apple", etc. -- one line per fruit, in any order.

In [22]:
# Hash Join, build phase: index the smaller table (kinds) by its join key.
kind_lookup = {kind_id: name for kind_id, name in kinds}
kind_lookup

{'A': 'Apple', 'B': 'Banana', 'C': 'Carrot'}

In [24]:
# Hash Join, probe phase: look up each fruit's kind_id in the hash table.
for kind_id, color in fruits:
    # Inner-join semantics: a fruit whose kind_id has no entry in kind_lookup
    # is skipped instead of raising KeyError, matching the sort merge join
    # further down, which also ignores unmatched rows.
    if kind_id in kind_lookup:
        print(color, kind_lookup[kind_id])

Yellow Banana
Green Apple
Orange Carrot
Red Apple
Purple Carrot
Green Banana


# Sort Merge Join

In [25]:
# Sort both tables in place; tuples compare element-wise, so rows end up
# ordered by kind_id first. The merge loop below relies on this order.
kinds[:] = sorted(kinds)
fruits[:] = sorted(fruits)

In [26]:
# Show kinds after the in-place sort.
kinds

[('A', 'Apple'), ('B', 'Banana'), ('C', 'Carrot')]

In [27]:
# Show fruits after the in-place sort.
fruits

[('A', 'Green'),
 ('A', 'Red'),
 ('B', 'Green'),
 ('B', 'Yellow'),
 ('C', 'Orange'),
 ('C', 'Purple')]

In [30]:
# Merge phase of the sort merge join. fruit_idx only ever moves forward, so
# each sorted table is walked once. Note that `fruit_name` holds the name of
# the current kind (e.g. "Apple").
fruit_idx = 0
for kind_id, fruit_name in kinds:
    while fruit_idx < len(fruits):
        row = fruits[fruit_idx]
        if row[0] > kind_id:
            # This fruit belongs to a later kind: advance to the next kind
            # without consuming the fruit.
            break
        if row[0] == kind_id:
            print(row[1], fruit_name)
        # Matched rows and rows with a smaller key are both consumed.
        fruit_idx += 1

Green Apple
Red Apple
Green Banana
Yellow Banana
Orange Carrot
Purple Carrot


# ML

In [31]:
import pandas as pd
import numpy as np
# Synthetic regression data: y = x1 + x2 + uniform noise in [0, 1).
# A seeded generator keeps the data the same on every run, so the seeded
# randomSplit in the next cell produces a repeatable train/test split.
rng = np.random.default_rng(42)
n_rows = 100
# Build the pandas frame under its own name so `df` only ever refers to the
# Spark DataFrame.
pdf = pd.DataFrame({
    "x1": rng.integers(0, 10, n_rows).astype(float),  # integers 0..9
    "x2": rng.integers(0, 3, n_rows).astype(float),   # integers 0..2
})
pdf["y"] = pdf["x1"] + pdf["x2"] + rng.random(n_rows)
df = spark.createDataFrame(pdf)
df

  if should_localize and is_datetime64tz_dtype(s.dtype) and s.dt.tz is not None:


DataFrame[x1: double, x2: double, y: double]

In [38]:
# Seeded random split with weights 0.75 / 0.25; preview the test portion.
train, test = df.randomSplit(weights=[0.75, 0.25], seed=42)
test.show()

+---+---+------------------+
| x1| x2|                 y|
+---+---+------------------+
|0.0|1.0| 1.264283604109949|
|0.0|2.0| 2.094890224170356|
|0.0|2.0| 2.873546913558723|
|1.0|1.0|2.0277341720023143|
|1.0|2.0|3.5553849236676727|
|3.0|2.0|5.8380918111023465|
|4.0|2.0|6.0396629102184445|
|5.0|2.0| 7.908860662294932|
|6.0|0.0| 6.900819059812529|
|6.0|2.0| 8.770396197908603|
|7.0|0.0| 7.494266814295333|
|7.0|1.0| 8.752501791927529|
|9.0|0.0| 9.285365159223563|
|9.0|0.0| 9.780787928310911|
|9.0|1.0|10.647385930231051|
|9.0|2.0|11.137063529364992|
|9.0|2.0|11.361406110699855|
|0.0|0.0|0.3542937425216831|
|0.0|1.0|1.7711931101036376|
|0.0|2.0|2.5348876398044298|
+---+---+------------------+
only showing top 20 rows



In [40]:
# Save both splits to HDFS as Parquet. mode="ignore" leaves any existing copy
# untouched, so re-running this cell does not replace the saved split.
train.write.parquet("hdfs://nn:9000/train.parquet", mode="ignore")
test.write.parquet("hdfs://nn:9000/test.parquet", mode="ignore")

In [41]:
# Rebind train/test to the copies saved in HDFS.
train = spark.read.parquet("hdfs://nn:9000/train.parquet")
test = spark.read.parquet("hdfs://nn:9000/test.parquet")

In [42]:
# Row counts of the train and test sets reloaded from Parquet.
train.count(), test.count()

                                                                                

(68, 32)

In [44]:
from pyspark.ml.regression import DecisionTreeRegressor, DecisionTreeRegressionModel
# DecisionTreeRegressor: unfitted model (call .fit to train it)
# DecisionTreeRegressionModel: fitted model (what .fit returns)

In [47]:
# This does not work: featuresCol needs a vector column, but "x1" is a plain double column.
# dt = DecisionTreeRegressor(featuresCol="x1", labelCol="y")
# dt.fit(train)

In [48]:
from pyspark.ml.feature import VectorAssembler

In [51]:
# Combine the x1 and x2 columns into a single vector column named "features".
va = VectorAssembler(
    outputCol="features",
    inputCols=["x1", "x2"],
)
va

VectorAssembler_fa9a2b1c3058

In [55]:
# Uncomment to preview the assembled "features" column:
# va.transform(train).show()

In [59]:
# Train a decision tree on the assembled "features" column to predict y.
dt = DecisionTreeRegressor(
    labelCol="y",
    featuresCol="features",
)
model = dt.fit(va.transform(train))

In [60]:
# Compare the type of the unfitted regressor with the type of the object fit() returned.
type(dt), type(model)

(pyspark.ml.regression.DecisionTreeRegressor,
 pyspark.ml.regression.DecisionTreeRegressionModel)

In [62]:
from pyspark.ml.pipeline import Pipeline, PipelineModel
# Pipeline: unfitted (a sequence of stages)
# PipelineModel: fitted (what Pipeline.fit returns)

In [63]:
# Chain the vector assembler and the (unfitted) decision tree into one pipeline.
pipe = Pipeline(
    stages=[
        va,  # x1, x2 -> "features" vector column
        dt,  # decision tree regressor reading "features"
    ],
)

In [64]:
# Fit the whole pipeline directly on train (no manual va.transform needed).
# This rebinds `model`, which previously held the standalone tree model.
model = pipe.fit(train)
type(pipe), type(model)

(pyspark.ml.pipeline.Pipeline, pyspark.ml.pipeline.PipelineModel)