In [1]:
import pyspark
from pyspark.sql import SparkSession, Row
from pyspark.conf import SparkConf
from pyspark.context import SparkContext
from pyspark.sql.window import Window
import pyspark.sql.functions as F
from datetime import datetime, timedelta
import os

In [None]:
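# One-time setup (shell command, run from the project directory): download the GCS connector jar that spark.jars points at
#gsutil cp gs://hadoop-lib/gcs/gcs-connector-hadoop3-2.2.5.jar lib/gcs-connector-hadoop3-2.2.5.jar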

In [None]:
# Modify spark environment template to fix host name for dockerization - ONLY NEED TO RUN ONCE
# Copies the stock spark-env.sh template over $SPARK_HOME/conf/spark-env.sh (replacing any
# spark-env.sh already there), then appends SPARK_MASTER_HOST=127.0.0.1 to the copy.
# Because of `&&`, the append only runs if the copy succeeded.
!cp $SPARK_HOME/conf/spark-env.sh.template $SPARK_HOME/conf/spark-env.sh && echo "SPARK_MASTER_HOST=127.0.0.1" >> $SPARK_HOME/conf/spark-env.sh

In [2]:
# Start Local Standalone cluster
# Launches the standalone master on 127.0.0.1:7078; the worker start command and the
# SparkConf master URL in this notebook use the same host and port.
!cd $SPARK_HOME && ./sbin/start-master.sh --host 127.0.0.1 --port 7078

starting org.apache.spark.deploy.master.Master, logging to /home/jdelzio/spark/spark-3.5.1-bin-hadoop3/logs/spark-jdelzio-org.apache.spark.deploy.master.Master-1-de-zoomcamp-deb.us-west2-a.c.intricate-reef-411403.internal.out


In [3]:
# Start cluster worker
# Starts a worker and points it at the master URL spark://127.0.0.1:7078.
!cd $SPARK_HOME && ./sbin/start-worker.sh spark://127.0.0.1:7078

starting org.apache.spark.deploy.worker.Worker, logging to /home/jdelzio/spark/spark-3.5.1-bin-hadoop3/logs/spark-jdelzio-org.apache.spark.deploy.worker.Worker-1-de-zoomcamp-deb.us-west2-a.c.intricate-reef-411403.internal.out


In [4]:
# Spark configuration for talking to GCP: local standalone master, the GCS connector jar,
# and service-account authentication via a JSON key file.
home_dir = os.getenv("HOME")
PROJECT_HOME = home_dir + "/data-engineering-zoomcamp/project"  # this may need to be updated when dockerized
credentials_location = PROJECT_HOME + "/.google/credentials/gcp.json"
gcs_connector_jar = PROJECT_HOME + "/lib/gcs-connector-hadoop3-2.2.5.jar"

conf = (
    SparkConf()
    .setMaster("spark://127.0.0.1:7078")
    .setAppName("process_raw_data")
    .setAll([
        ("spark.jars", gcs_connector_jar),
        ("spark.hadoop.google.cloud.auth.service.account.enable", "true"),
        ("spark.hadoop.google.cloud.auth.service.account.json.keyfile", credentials_location),
    ])
)

In [5]:
# set up spark context
# getOrCreate reuses a context that is already running, so re-running this cell no longer
# fails with "Cannot run multiple SparkContexts at once". Note that a reused context keeps
# the configuration it was originally created with.
sc = SparkContext.getOrCreate(conf=conf)

# Register the GCS connector filesystem classes and the service-account key on the
# context's Hadoop configuration (reached through the private _jsc handle).
hadoop_conf = sc._jsc.hadoopConfiguration()

gcs_hadoop_settings = {
    "fs.AbstractFileSystem.gs.impl": "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS",
    "fs.gs.impl": "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem",
    "fs.gs.auth.service.account.json.keyfile": credentials_location,
    "fs.gs.auth.service.account.enable": "true",
}
for _hadoop_key, _hadoop_value in gcs_hadoop_settings.items():
    hadoop_conf.set(_hadoop_key, _hadoop_value)

24/04/13 22:50:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [6]:
# Build (or reuse) the SparkSession using the configuration of the context connected to
# the standalone cluster
session_builder = SparkSession.builder.config(conf=sc.getConf())
spark = session_builder.getOrCreate()

In [7]:
# Read everything under raw/ in the GCS bucket as parquet into a Spark DataFrame,
# keeping only the sample id and elapsed-time columns used by the following cells
raw_data_glob = 'gs://test_bucket-intricate-reef-41103/raw/*'
df_test = spark.read.parquet(raw_data_glob).select("id", "Time (h)")

                                                                                

In [8]:
# Preview the first five rows of the loaded data
df_test.show(n=5)

                                                                                

+---+--------+
| id|Time (h)|
+---+--------+
|  1|     0.2|
|  2|     0.4|
|  3|     0.6|
|  4|     0.8|
|  5|     1.0|
+---+--------+
only showing top 5 rows



In [None]:
# fill null values with 0 (disabled: df_filled is not used below; later cells read df_test directly)
#df_filled = df_test.fillna(0)

In [None]:
# Order the samples by id and remember the total number of rows for later cells
df_sorted = df_test.sort("id")
nrows = df_sorted.count()

In [None]:
# Find new batch start indices: a new batch begins on every row where "Time (h)" equals 0.2
# (the first time value shown in the preview above).
# Batch numbers are assigned with row_number() over the id ordering instead of
# monotonically_increasing_id(): the latter is only guaranteed to be unique and increasing,
# NOT consecutive, once the data spans more than one partition, which would break the later
# `Batch Number == completed_batches + 1` lookup.
# The window has no partitionBy, so Spark moves these rows into a single partition; that is
# acceptable here because only one row per batch survives the filter.
batch_number_window = Window.orderBy("batch_start_id")
batch_start_df = (
    df_sorted
    .filter(df_sorted["Time (h)"] == 0.2)
    .select("id")
    .withColumnRenamed("id", "batch_start_id")
    .withColumn("Batch Number", F.row_number().over(batch_number_window))
)

In [None]:
# Add next batch start id for the join clause: each batch covers ids in
# [batch_start_id, next_batch_start_id)
window_frame = Window.orderBy("batch_start_id")
batch_start_df = batch_start_df.withColumn(
    "next_batch_start_id", F.lead("batch_start_id").over(window_frame)
)
# The last batch has no successor, so lead() leaves its next_batch_start_id null.
# Fill it with nrows + 1 (one past the last id, assuming ids run 1..nrows) so the final
# batch extends to the end of the data. `subset` restricts the fill to that column so an
# unexpected null elsewhere is not silently replaced.
batch_start_df = batch_start_df.fillna(nrows + 1, subset=["next_batch_start_id"])

In [None]:
# Attach a batch number to every sample: a sample belongs to the batch whose
# [batch_start_id, next_batch_start_id) id range contains it. The two range helper
# columns are discarded once the match is made.
in_batch_range = (
    (df_sorted["id"] >= batch_start_df["batch_start_id"])
    & (df_sorted["id"] < batch_start_df["next_batch_start_id"])
)
df_processed = (
    df_sorted
    .join(batch_start_df, on=in_batch_range, how="inner")
    .drop("batch_start_id", "next_batch_start_id")
)

In [None]:
# Inspect the final sample (highest id) to sanity-check the last batch assignment.
# Sorting on the cluster and fetching a single row avoids collecting the whole dataset to
# the driver and does not rely on the unspecified row order of the join output.
df_processed.orderBy(F.desc("id")).first()

In [None]:
# we want to simulate that 30 batches worth of the dataset have already been completed, while the final 70 are still to be performed
completed_batches = 30
# Take the smallest id in the first pending batch. head() on an unordered DataFrame may
# return any row of that batch, not necessarily its first sample.
first_new_batch = df_processed \
    .filter(df_processed["Batch Number"] == completed_batches + 1) \
    .agg(F.min("id")) \
    .head()[0]
# min() over zero rows yields null; fail here with a clear message instead of later
if first_new_batch is None:
    raise ValueError(
        f"No samples found for batch {completed_batches + 1}; cannot locate the first pending sample"
    )

In [None]:
# generate artificial sample production timestamps at around 0.06s per sample (of course this is highly accelerated for quick demonstration purposes)
# Sample `first_new_batch` is stamped with the current time; every other sample is offset
# from it by (id - first_new_batch) * 0.06 s, so completed samples lie in the past and
# pending samples in the future on one evenly spaced timeline.
# Previously the future offsets started at first_new_batch * 0.06 s instead of 0, which left
# a large gap between the last completed and the first pending sample.
# ts_current is a naive datetime holding UTC wall-clock time.
sample_interval = timedelta(seconds=0.06)
ts_current = datetime.utcnow()
ts_first_30_batches = [
    ts_current - (first_new_batch - i) * sample_interval
    for i in range(1, first_new_batch)
]
ts_last_70_batches = [
    ts_current + (i - first_new_batch) * sample_interval
    for i in range(first_new_batch, nrows + 1)
]
# Concatenate into a new list so sample_ts does not alias ts_first_30_batches
sample_ts = ts_first_30_batches + ts_last_70_batches

In [None]:
# Sanity check: there must be exactly one timestamp per sample row. Using assert stops a
# Run All here on mismatch instead of merely displaying False.
assert len(sample_ts) == nrows, f"expected {nrows} timestamps, got {len(sample_ts)}"

In [None]:
# Attach the generated timestamps to the samples: build a lookup frame keyed by a 1-based
# index, inner-join it on the sample id, then discard the helper key column
ts_rows = [Row(index=pos + 1, sample_ts=sample_ts[pos]) for pos in range(nrows)]
sample_ts_df = spark.createDataFrame(ts_rows)
id_matches_index = df_processed["id"] == sample_ts_df["index"]
df_processed = (
    df_processed
    .join(sample_ts_df, on=id_matches_index, how="inner")
    .drop("index")
)

In [None]:
# Fetch ten rows of the enriched data to the driver for a quick look
df_processed.take(10)

In [None]:
# Persist the enriched samples to GCS as parquet after redistributing them into 4 partitions.
# NOTE(review): no explicit save mode is set, so Spark's default applies (errors if the
# target path already exists) - confirm this is what is wanted for notebook re-runs.
processed_output_path = 'gs://test_bucket-intricate-reef-41103/processed/sample_context/'
df_processed.repartition(4).write.parquet(processed_output_path)

In [9]:
# Stop the Spark session before the standalone master and worker are shut down below
spark.stop()

In [10]:
# Stop Local Standalone cluster
# Runs Spark's stop-master.sh from $SPARK_HOME to shut down the master process.
!cd $SPARK_HOME && ./sbin/stop-master.sh

stopping org.apache.spark.deploy.master.Master


In [11]:
# Stop Worker
# Runs Spark's stop-worker.sh from $SPARK_HOME to shut down the worker process.
!cd $SPARK_HOME && ./sbin/stop-worker.sh

stopping org.apache.spark.deploy.worker.Worker
