# Lab 1: PySpark

## SETTING UP SPARK
In the `data-engineer-handbook` repo, I changed into the `3-spark-fundamentals` directory
and ran `docker compose up -d`.

Then, I simply went to localhost:8888 in my browser to access the Spark jupyter notebook!

## Note
- None of the following code will run outside that Docker environment. All of the data, SQL databases, packages, etc. were injected into my environment by spinning up the docker containers (as described above).

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, col
# Reuse the running Spark session if there is one; otherwise start a new one.
spark = SparkSession.builder.appName("Jupyter").getOrCreate()

# Raw event log. `event_date` is derived with DATE_TRUNC('day', ...), which
# yields a TIMESTAMP floored to midnight rather than a DATE value.
events = (
    spark.read
    .option("header", "true")
    .csv("/home/iceberg/data/events.csv")
    .withColumn("event_date", expr("DATE_TRUNC('day', event_time)"))
)

# Device lookup table, matched to events through `device_id`.
devices = (
    spark.read
    .option("header", "true")
    .csv("/home/iceberg/data/devices.csv")
)

# Left join keeps every event even when no device row matches; the device
# columns are then renamed to the *_family names used by the rest of the lab.
df = (
    events
    .join(devices, on="device_id", how="left")
    .withColumnsRenamed({'browser_type': 'browser_family', 'os_type': 'os_family'})
)

df.show()

ModuleNotFoundError: No module named 'pyspark'

In [None]:
# Pull the first five joined rows back to the driver as a list of Row objects.
df.head(5)

In [None]:
# Both variants start from the same layout: 10 partitions keyed on event_date.
events_by_date = df.repartition(10, col("event_date"))

# Variant 1: order rows inside each existing partition only.
# (Named explicitly so the Python builtin `sorted` is not shadowed.)
sorted_within_partitions = (
    events_by_date
    .sortWithinPartitions(col("event_date"), col("host"))
    .withColumn("event_time", col("event_time").cast("timestamp"))
)

# Variant 2: a global sort across the whole dataset.
globally_sorted = (
    events_by_date
    .sort(col("event_date"), col("host"))
    .withColumn("event_time", col("event_time").cast("timestamp"))
)

# Print both physical plans so the two strategies can be compared side by side.
sorted_within_partitions.explain()
globally_sorted.explain()

In [None]:
## `.sortWithinPartitions()` sorts within partitions, whereas `.sort()` is a global sort, which is very slow

## Note - an `Exchange` step in the query plan is a shuffle

## AT SCALE, ALWAYS USE `.sortWithinPartitions()`

In [None]:
%%sql

-- Create the `bootcamp` database used by the tables below; a no-op if it already exists.
CREATE DATABASE IF NOT EXISTS bootcamp

In [None]:
%%sql

-- Remove any previous `bootcamp.events` table so it can be recreated below.
DROP TABLE IF EXISTS bootcamp.events

In [None]:
%%sql

-- Remove any previous `bootcamp.events_sorted` table so it can be recreated below.
DROP TABLE IF EXISTS bootcamp.events_sorted

In [None]:
%%sql

-- Iceberg table for events, partitioned by the year of event_date.
CREATE TABLE IF NOT EXISTS bootcamp.events (
    url             STRING,
    referrer        STRING,
    browser_family  STRING,
    os_family       STRING,
    device_family   STRING,
    host            STRING,
    event_time      TIMESTAMP,
    event_date      DATE
)
USING iceberg
PARTITIONED BY (years(event_date));


In [None]:
%%sql

-- Iceberg table that receives the sorted write, partitioned by the year of event_date.
CREATE TABLE IF NOT EXISTS bootcamp.events_sorted (
    url             STRING,
    referrer        STRING,
    browser_family  STRING,
    os_family       STRING,
    device_family   STRING,
    host            STRING,
    event_time      TIMESTAMP,
    event_date      DATE
)
USING iceberg
PARTITIONED BY (years(event_date));

In [None]:
%%sql

-- Iceberg table that receives the unsorted write. Same columns and the same
-- yearly partition transform as bootcamp.events and bootcamp.events_sorted,
-- so the sorted/unsorted size comparison differs only in row ordering.
CREATE TABLE IF NOT EXISTS bootcamp.events_unsorted (
    url STRING,
    referrer STRING,
    browser_family STRING,
    os_family STRING,
    device_family STRING,
    host STRING,
    event_time TIMESTAMP,
    event_date DATE
)
USING iceberg
PARTITIONED BY (years(event_date));

In [None]:

# Baseline (not sorted): 4 partitions keyed on event_date, with event_time
# cast to a timestamp.
start_df = (
    df.repartition(4, col("event_date"))
    .withColumn("event_time", col("event_time").cast("timestamp"))
)

# Same data and the same 4 partitions, but rows inside each partition are
# ordered by three columns.
first_sort_df = start_df.sortWithinPartitions(
    col("event_date"), col("browser_family"), col("host")
)

# Persist both variants so their on-disk footprint can be compared.
# NOTE(review): mode("overwrite") + saveAsTable may replace the existing table
# definition (schema and partition spec) instead of only its rows - confirm.
start_df.write.mode("overwrite").saveAsTable("bootcamp.events_unsorted")
first_sort_df.write.mode("overwrite").saveAsTable("bootcamp.events_sorted")

In [None]:
%%sql

-- Compare total bytes and file count of the sorted vs. unsorted tables using
-- their `.files` metadata tables. The label column is aliased explicitly;
-- without an alias it would be headed by the first literal ('sorted') even
-- though it also holds the 'unsorted' row.
SELECT SUM(file_size_in_bytes) AS size, COUNT(1) AS num_files, 'sorted' AS write_order
FROM demo.bootcamp.events_sorted.files

UNION ALL

SELECT SUM(file_size_in_bytes) AS size, COUNT(1) AS num_files, 'unsorted' AS write_order
FROM demo.bootcamp.events_unsorted.files



In [None]:
## When writing data out, sort by the LOWEST-cardinality column first, working up to the HIGHEST.
## (e.g. "event_date" only has a few hundred unique values, "browser_family" has fewer, and "host" here has EVEN FEWER.)
## TODO: double-check these cardinalities - as described, they run from highest to lowest, the reverse of the rule above.

In [None]:
%%sql
-- `.files` exposes the table's metadata instead of its rows.
SELECT
    *
FROM
    demo.bootcamp.events_sorted.files

In [None]:
%%sql
-- Total bytes and number of files recorded in the metadata of bootcamp.events.
SELECT
    SUM(file_size_in_bytes) AS size,
    COUNT(1) AS num_files
FROM demo.bootcamp.events.files;

In [None]:
%%sql
-- Number of entries in the `.files` metadata of bootcamp.matches_bucketed.
-- NOTE(review): no visible cell in this notebook creates that table - confirm it exists.
SELECT
    COUNT(1)
FROM
    bootcamp.matches_bucketed.files