# Starter

In [1]:
from pyspark.sql import SparkSession
# Create (or reuse) the notebook's Spark session: it targets the standalone
# master at boss:7077, sets executor memory to 512M, and enables Hive support
# with the warehouse directory stored on HDFS.
spark = (
    SparkSession.builder
    .master("spark://boss:7077")
    .appName("cs544")
    .config("spark.sql.warehouse.dir", "hdfs://nn:9000/user/hive/warehouse")
    .config("spark.executor.memory", "512M")
    .enableHiveSupport()
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/11/01 18:06:44 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
# Copy the local sf.csv to the root of HDFS; the next cell loads it from there.
!hdfs dfs -cp sf.csv hdfs://nn:9000/sf.csv

In [3]:
# Load the CSV from HDFS: the first row supplies the column names and the
# column types are inferred from the data.
df = spark.read.csv(
    "hdfs://nn:9000/sf.csv",
    header=True,
    inferSchema=True,
)

                                                                                

In [4]:
from pyspark.sql.functions import col, expr
# Replace spaces in the CSV column names with underscores (later SQL refers to
# names such as Call_Type), then persist the data as Parquet on HDFS.
# mode("overwrite") replaces output left by a previous run; with the default
# save mode the write raises once the path exists, which breaks re-running
# the notebook from the top.
cols = [col(c).alias(c.replace(" ", "_")) for c in df.columns]
(df.select(cols)
   .write.format("parquet")
   .mode("overwrite")
   .save("hdfs://nn:9000/sf.parquet"))

23/11/01 18:09:12 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

In [5]:
# Remove the CSV copy from HDFS; later cells read sf.parquet instead.
!hdfs dfs -rm hdfs://nn:9000/sf.csv

Deleted hdfs://nn:9000/sf.csv


In [6]:
# Register the Parquet data as the temporary view "calls" so it can be
# queried with Spark SQL.
spark.read.parquet("hdfs://nn:9000/sf.parquet").createOrReplaceTempView("calls")

# Lecture

In [10]:
# Print the formatted physical plan for a per-Call_Type row count over the
# "calls" view. The plan shows an Exchange between the partial and the final
# HashAggregate.
spark.sql("""
SELECT Call_Type, COUNT(*)
FROM calls
GROUP BY Call_Type
""").explain(mode="formatted")

== Physical Plan ==
AdaptiveSparkPlan (5)
+- HashAggregate (4)
   +- Exchange (3)
      +- HashAggregate (2)
         +- Scan parquet  (1)


(1) Scan parquet 
Output [1]: [Call_Type#231]
Batched: true
Location: InMemoryFileIndex [hdfs://nn:9000/sf.parquet]
ReadSchema: struct<Call_Type:string>

(2) HashAggregate
Input [1]: [Call_Type#231]
Keys [1]: [Call_Type#231]
Functions [1]: [partial_count(1)]
Aggregate Attributes [1]: [count#347L]
Results [2]: [Call_Type#231, count#348L]

(3) Exchange
Input [2]: [Call_Type#231, count#348L]
Arguments: hashpartitioning(Call_Type#231, 200), ENSURE_REQUIREMENTS, [plan_id=65]

(4) HashAggregate
Input [2]: [Call_Type#231, count#348L]
Keys [1]: [Call_Type#231]
Functions [1]: [count(1)]
Aggregate Attributes [1]: [count(1)#343L]
Results [2]: [Call_Type#231, count(1)#343L AS count(1)#344L]

(5) AdaptiveSparkPlan
Output [2]: [Call_Type#231, count(1)#344L]
Arguments: isFinalPlan=false




In [14]:
# Save a sampled copy of "calls" as the table call_by_type, bucketed into
# 10 buckets by Call_Type.
# would work without sampling, just using it to make it faster
# mode("overwrite") replaces the table if an earlier run already created it;
# with the default save mode saveAsTable raises when the table exists, which
# breaks re-running the notebook from the top.
(spark.table("calls")
 .sample(True, 0.01)
 .write.bucketBy(10, "Call_Type")
 .mode("overwrite")
 .saveAsTable("call_by_type"))

23/11/01 18:25:29 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
23/11/01 18:25:29 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
23/11/01 18:25:35 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
23/11/01 18:25:35 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore UNKNOWN@172.21.0.5
23/11/01 18:25:35 WARN ObjectStore: Failed to get database default, returning NoSuchObjectException
23/11/01 18:26:06 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
23/11/01 18:26:07 WARN HiveConf: HiveConf of name hive.internal.ss.authz.settings.applied.marker does not exist
23/11/01 18:26:07 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
23/11/01 18:26:07 WARN 

In [16]:
# Same aggregation as before, now against the bucketed table call_by_type.
# The formatted plan reports "Bucketed: true" on the scan and contains no
# Exchange step between the two HashAggregate nodes.
spark.sql("""
SELECT Call_Type, COUNT(*)
FROM call_by_type
GROUP BY Call_Type
""").explain(mode="formatted")

== Physical Plan ==
AdaptiveSparkPlan (4)
+- HashAggregate (3)
   +- HashAggregate (2)
      +- Scan parquet spark_catalog.default.call_by_type (1)


(1) Scan parquet spark_catalog.default.call_by_type
Output [1]: [Call_Type#528]
Batched: true
Bucketed: true
Location: InMemoryFileIndex [hdfs://nn:9000/user/hive/warehouse/call_by_type]
ReadSchema: struct<Call_Type:string>
SelectedBucketsCount: 10 out of 10

(2) HashAggregate
Input [1]: [Call_Type#528]
Keys [1]: [Call_Type#528]
Functions [1]: [partial_count(1)]
Aggregate Attributes [1]: [count#570L]
Results [2]: [Call_Type#528, count#571L]

(3) HashAggregate
Input [2]: [Call_Type#528, count#571L]
Keys [1]: [Call_Type#528]
Functions [1]: [count(1)]
Aggregate Attributes [1]: [count(1)#566L]
Results [2]: [Call_Type#528, count(1)#566L AS count(1)#567L]

(4) AdaptiveSparkPlan
Output [2]: [Call_Type#528, count(1)#567L]
Arguments: isFinalPlan=false




# JOIN Algorithms (for a single machine)

In [18]:
# Left input of the join: (kind_id, color) rows, deliberately unsorted.
fruits = [
    ("B", "Yellow"), ("A", "Green"), ("C", "Orange"),
    ("A", "Red"), ("C", "Purple"), ("B", "Green"),
]

# Right input of the join: (kind_id, name) rows.
# Assumption: every kind_id appears at most once in this list.
kinds = [("A", "Apple"), ("B", "Banana"), ("C", "Carrot")]

# GOAL: print Yellow Banana, Green Apple, etc (any order)

In [21]:
# hash join
# Build phase: index the smaller list (kinds) by kind_id so that each probe
# is a single dictionary lookup.
kind_lookup = {kind_id: name for kind_id, name in kinds}
kind_lookup

{'A': 'Apple', 'B': 'Banana', 'C': 'Carrot'}

In [22]:
# Probe phase of the hash join: look up each fruit's kind_id in the
# dictionary and print "<color> <name>".
for kind_id, color in fruits:
    food_name = kind_lookup[kind_id]
    print(color, food_name)

Yellow Banana
Green Apple
Orange Carrot
Red Apple
Purple Carrot
Green Banana


In [23]:
# sort merge join: sort both lists by kind_id, then match rows in one forward pass

In [24]:
# Order both inputs by kind_id (tuples compare on their first element first),
# then display the sorted fruits.
fruits = sorted(fruits)
kinds = sorted(kinds)
fruits

[('A', 'Green'),
 ('A', 'Red'),
 ('B', 'Green'),
 ('B', 'Yellow'),
 ('C', 'Orange'),
 ('C', 'Purple')]

In [25]:
# Display the kinds list after sorting.
kinds

[('A', 'Apple'), ('B', 'Banana'), ('C', 'Carrot')]

In [29]:
# Merge phase of the sort-merge join. Both lists are sorted by kind_id, so a
# single cursor into `fruits` only ever moves forward.
fruit_idx = 0
for kind_id, food_name in kinds:
    # Consume fruits until the cursor reaches a key beyond the current kind;
    # that fruit is left in place for the next kind to examine.
    while fruit_idx < len(fruits) and not fruits[fruit_idx][0] > kind_id:
        entry = fruits[fruit_idx]
        if entry[0] == kind_id:
            print(entry[1], food_name)
        fruit_idx += 1

Green Apple
Red Apple
Green Banana
Yellow Banana
Orange Carrot
Purple Carrot


# Spark ML

In [30]:
import pandas as pd
import numpy as np
# Synthetic regression data: 100 rows with two integer-valued float features
# (x1 in 0..9, x2 in 0..2) and a label y = x1 + x2 + uniform noise in [0, 1).
# A seeded Generator makes the data identical on every run, so downstream
# results are reproducible after a kernel restart.
rng = np.random.default_rng(544)
df = pd.DataFrame({
    "x1": rng.integers(0, 10, 100).astype(float),
    "x2": rng.integers(0, 3, 100).astype(float),
})
df["y"] = df["x1"] + df["x2"] + rng.random(len(df))

In [32]:
# Convert the pandas frame into a Spark DataFrame and echo its schema.
# NOTE(review): `df` is rebound from a pandas frame to a Spark DataFrame here,
# so re-running this cell alone would pass a Spark DataFrame back in —
# re-run the cell that builds the pandas frame first.
df = spark.createDataFrame(data=df)
df

  if should_localize and is_datetime64tz_dtype(s.dtype) and s.dt.tz is not None:


DataFrame[x1: double, x2: double, y: double]

In [37]:
# Split the rows roughly 75% / 25% into train and test, then preview test.
# not truly deterministic overall, just at the partition level
train, test = df.randomSplit(weights=[0.75, 0.25], seed=42)
test.show(n=20)

+---+---+-------------------+
| x1| x2|                  y|
+---+---+-------------------+
|0.0|0.0| 0.5648266803421843|
|0.0|2.0| 2.0939332656644014|
|0.0|2.0|  2.994013520808395|
|1.0|0.0| 1.5516098432369945|
|2.0|0.0|  2.285419378010519|
|3.0|0.0|   3.94722988657639|
|5.0|0.0|  5.117541828375912|
|5.0|2.0| 7.3723615588151405|
|5.0|2.0| 7.6335844378293904|
|6.0|1.0|  7.209392324924814|
|6.0|1.0|  7.309871628543936|
|6.0|2.0|  8.325226894959687|
|8.0|0.0|   8.65686179465806|
|9.0|0.0|  9.335490381175925|
|9.0|0.0|  9.930572957740925|
|9.0|1.0| 10.627217306329767|
|9.0|2.0| 11.480261333812567|
|0.0|0.0|0.21694709548391078|
|1.0|1.0|  2.061167689108809|
|1.0|1.0|  2.471207033151983|
+---+---+-------------------+
only showing top 20 rows



In [39]:
# Persist both splits to HDFS as Parquet. Save mode "ignore" leaves existing
# output untouched, so files written by an earlier run are kept as they are.
train.write.parquet("hdfs://nn:9000/train.parquet", mode="ignore")
test.write.parquet("hdfs://nn:9000/test.parquet", mode="ignore")

                                                                                

In [40]:
# Rebind train/test to the Parquet files on HDFS so later cells work from the
# stored splits.
train = spark.read.parquet("hdfs://nn:9000/train.parquet")
test = spark.read.parquet("hdfs://nn:9000/test.parquet")

In [41]:
# Row counts of the reloaded train and test splits.
train.count(), test.count()

                                                                                

(68, 32)

In [42]:
from pyspark.ml.regression import DecisionTreeRegressor, DecisionTreeRegressionModel
# DecisionTreeRegressor: unfit model
# DecisionTreeRegressionModel: fitted model
# In Spark, names ending in "Model" are the fitted ones

In [46]:
# ALWAYS need a vector column: featuresCol must name a vector-typed column
# (built with VectorAssembler), not a plain numeric column such as "x1".
# The lines below are kept commented out as the counter-example; presumably
# fit() rejects the double column — TODO confirm the exact error by running them.
# dt = DecisionTreeRegressor(featuresCol="x1", labelCol="y")
# dt.fit(train)

In [48]:
from pyspark.ml.feature import VectorAssembler

In [50]:
# Pack x1 and x2 into a single vector column named "features" and preview the
# result on the training split.
va = VectorAssembler(outputCol="features", inputCols=["x1", "x2"])
va.transform(train).show(n=20)

+---+---+------------------+---------+
| x1| x2|                 y| features|
+---+---+------------------+---------+
|0.0|0.0|0.5415786559153639|(2,[],[])|
|0.0|1.0| 1.518929039270323|[0.0,1.0]|
|0.0|1.0|1.7877870719920037|[0.0,1.0]|
|1.0|0.0|1.1368566307892398|[1.0,0.0]|
|2.0|0.0|2.9836589427882205|[2.0,0.0]|
|2.0|2.0| 4.214838018715973|[2.0,2.0]|
|2.0|2.0| 4.787701540320342|[2.0,2.0]|
|2.0|2.0| 4.915889037042399|[2.0,2.0]|
|3.0|0.0|3.0305886739745196|[3.0,0.0]|
|3.0|0.0| 3.442558020574335|[3.0,0.0]|
|3.0|0.0| 3.967805061470494|[3.0,0.0]|
|3.0|1.0| 4.755412091033398|[3.0,1.0]|
|3.0|2.0| 5.149914759956805|[3.0,2.0]|
|4.0|0.0|  4.40652818239079|[4.0,0.0]|
|4.0|0.0|4.5872573244424775|[4.0,0.0]|
|4.0|1.0| 5.773329264364453|[4.0,1.0]|
|4.0|1.0| 5.996211493339068|[4.0,1.0]|
|5.0|0.0| 5.015153196328459|[5.0,0.0]|
|5.0|0.0| 5.115777378498888|[5.0,0.0]|
|5.0|1.0| 6.004784812788598|[5.0,1.0]|
+---+---+------------------+---------+
only showing top 20 rows



In [52]:
# Assemble the feature vector, then fit a decision-tree regressor that
# predicts y from it. `dt` is the unfit estimator; `model` is what fit() returns.
va = VectorAssembler(outputCol="features", inputCols=["x1", "x2"])
dt = DecisionTreeRegressor(labelCol="y", featuresCol="features")

train_features = va.transform(train)
model = dt.fit(train_features)

                                                                                

In [53]:
# Compare the estimator's class with the class of the object returned by fit().
type(dt), type(model)

(pyspark.ml.regression.DecisionTreeRegressor,
 pyspark.ml.regression.DecisionTreeRegressionModel)