# PySpark Notebook

This notebook performs the following operations:

- Execute query with SQL API
- Execute query with DataFrame API
- Create tables and fill them with data

This experiment was designed to determine whether Scala offers a notable performance advantage over PySpark. The equivalent Scala code can be found in the scala-toree notebook.

In [3]:
# Stop a Spark session left over from an earlier run so the next cell can build
# one with fresh settings. On a freshly started kernel `spark` does not exist
# yet, so only call stop() when the name is actually defined.
if "spark" in globals():
    spark.stop()

In [15]:
from pyspark.sql import SparkSession
import os

# Session settings, listed in the order they are applied to the builder.
# The metastore password is read from the MYSQL environment variable and
# falls back to the literal 'Default_Value' when that variable is unset.
spark_settings = {
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
    "spark.hadoop.hive.metastore.uris": "thrift://192.168.0.144:9083",
    "spark.hadoop.javax.jdo.option.ConnectionURL": "jdbc:mysql://192.168.0.144:3306/metastore_db",
    "spark.hadoop.javax.jdo.option.ConnectionDriverName": "com.mysql.cj.jdbc.Driver",
    "spark.hadoop.javax.jdo.option.ConnectionUserName": "lh",
    "spark.hadoop.javax.jdo.option.ConnectionPassword": os.getenv('MYSQL', 'Default_Value'),
    "spark.jars": "/usr/local/spark/jars/delta-storage-3.2.0.jar,/usr/local/spark/jars/delta-spark_2.12-3.2.0.jar",
    "spark.delta.logStore.class": "org.apache.spark.sql.delta.storage.HDFSLogStore",
    "spark.executor.memory": "9g",
    "spark.executor.cores": "3",
    "spark.driver.memory": "19g",
    "spark.driver.maxResultSize": "2g",
    "spark.hadoop.fs.defaultFS": "hdfs://192.168.0.144:9000",
    "spark.databricks.delta.clusteredTable.enableClusteringTablePreview": "true",
    "spark.sql.debug.maxToStringFields": "1000",
}

session_builder = (
    SparkSession.builder
    .appName("PySparkTest")
    .master("spark://192.168.0.144:7077")
)
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

# Hive support is enabled last, then the session is created (or an existing one reused).
spark = session_builder.enableHiveSupport().getOrCreate()

In [6]:
from pyspark.sql import SparkSession
from time import time
from collections import defaultdict

# Switch the session to the `zorder` database so the unqualified table names
# used in the benchmark query resolve against it.
spark.sql("USE zorder")

# Tables to benchmark, and how many times the query is repeated for each one.
tables = ["zorder_eventid_actorid_table_2"]
num_executions = 10

def measure_execution_time(query):
    """Run a SQL query once and return the elapsed time in seconds.

    The session cache is cleared before the clock starts. The measured
    interval covers ``spark.sql(query).show()``, i.e. building the query,
    executing it and printing the displayed rows.

    Args:
        query: SQL text passed to ``spark.sql``.

    Returns:
        Elapsed time in seconds, as a float.
    """
    # perf_counter is monotonic and high resolution; the wall clock returned by
    # time.time() can be adjusted while a run is in progress and skew the result.
    from time import perf_counter

    spark.catalog.clearCache()
    start_time = perf_counter()
    spark.sql(query).show()
    return perf_counter() - start_time

# Timings of the runs that completed, keyed by table name. A run that raises
# is reported and leaves no entry.
execution_times = defaultdict(list)

for table in tables:
    # The query text depends only on the table name, so it is built once per
    # table instead of once per repetition.
    query = f"""
        SELECT
            product_dim.product_name,
            location_dim.city,
            location_dim.state,
            location_dim.country,
            SUM({table}.value) AS total_value,
            COUNT({table}.event_id) AS event_count
        FROM
            {table}
        JOIN
            product_dim ON {table}.product_id = product_dim.product_id
        JOIN
            location_dim ON {table}.location_id = location_dim.location_id
        WHERE
            {table}.event_id BETWEEN 459999 AND 999999
            AND {table}.actor_id IN (5001, 5002, 5003)
        GROUP BY
            product_dim.product_name,
            location_dim.city,
            location_dim.state,
            location_dim.country
        ORDER BY
            total_value DESC
        """

    for _ in range(num_executions):
        try:
            execution_time = measure_execution_time(query)
            print(f"{table}: {execution_time} seconds")
            execution_times[table].append(execution_time)
        except Exception as e:
            # Keep benchmarking the remaining repetitions; the failure is
            # printed so it is visible in the cell output.
            print(f"Error for table {table}: {e}")

# Average over the runs that actually completed. Failed runs are never added
# to execution_times, so dividing by num_executions would understate the mean
# whenever a run raised.
average_times = {
    table: sum(times) / len(times)
    for table, times in execution_times.items()
    if times
}

# Show the per-table averages as a small two-column Spark DataFrame.
df = spark.createDataFrame(
    [(table, avg_time) for table, avg_time in average_times.items()],
    ["Table", "Average Execution Time"],
)
df.show()


                                                                                

+------------+------------+-----+-------+------------------+-----------+
|product_name|        city|state|country|       total_value|event_count|
+------------+------------+-----+-------+------------------+-----------+
|   Product G| San Antonio|   TX|    USA|16325.645384546522|      32631|
|   Product A|    New York|   NY|    USA|16293.041773393486|      32485|
|   Product B| Los Angeles|   CA|    USA|16260.506950983123|      32449|
|   Product F|Philadelphia|   PA|    USA|16259.047580883305|      32521|
|   Product H|   San Diego|   CA|    USA|16238.377853609758|      32430|
|   Product I|      Dallas|   TX|    USA|16214.700816333414|      32415|
|   Product C|     Chicago|   IL|    USA|16212.334961552855|      32367|
|   Product J|    San Jose|   CA|    USA|16205.668117725012|      32434|
|   Product D|     Houston|   TX|    USA|16115.736272952996|      32344|
|   Product E|     Phoenix|   AZ|    USA|15993.698453536641|      31985|
+------------+------------+-----+-------+----------

                                                                                

+------------+------------+-----+-------+------------------+-----------+
|product_name|        city|state|country|       total_value|event_count|
+------------+------------+-----+-------+------------------+-----------+
|   Product G| San Antonio|   TX|    USA| 16325.64538454652|      32631|
|   Product A|    New York|   NY|    USA|16293.041773393483|      32485|
|   Product B| Los Angeles|   CA|    USA|16260.506950983121|      32449|
|   Product F|Philadelphia|   PA|    USA|16259.047580883309|      32521|
|   Product H|   San Diego|   CA|    USA|16238.377853609756|      32430|
|   Product I|      Dallas|   TX|    USA|16214.700816333416|      32415|
|   Product C|     Chicago|   IL|    USA|16212.334961552853|      32367|
|   Product J|    San Jose|   CA|    USA|16205.668117725012|      32434|
|   Product D|     Houston|   TX|    USA|16115.736272952996|      32344|
|   Product E|     Phoenix|   AZ|    USA|15993.698453536641|      31985|
+------------+------------+-----+-------+----------

                                                                                

+------------+------------+-----+-------+------------------+-----------+
|product_name|        city|state|country|       total_value|event_count|
+------------+------------+-----+-------+------------------+-----------+
|   Product G| San Antonio|   TX|    USA|16325.645384546518|      32631|
|   Product A|    New York|   NY|    USA|16293.041773393486|      32485|
|   Product B| Los Angeles|   CA|    USA|16260.506950983123|      32449|
|   Product F|Philadelphia|   PA|    USA|16259.047580883309|      32521|
|   Product H|   San Diego|   CA|    USA| 16238.37785360976|      32430|
|   Product I|      Dallas|   TX|    USA|16214.700816333414|      32415|
|   Product C|     Chicago|   IL|    USA|16212.334961552859|      32367|
|   Product J|    San Jose|   CA|    USA| 16205.66811772501|      32434|
|   Product D|     Houston|   TX|    USA|16115.736272952998|      32344|
|   Product E|     Phoenix|   AZ|    USA|15993.698453536641|      31985|
+------------+------------+-----+-------+----------

                                                                                

+------------+------------+-----+-------+------------------+-----------+
|product_name|        city|state|country|       total_value|event_count|
+------------+------------+-----+-------+------------------+-----------+
|   Product G| San Antonio|   TX|    USA|16325.645384546522|      32631|
|   Product A|    New York|   NY|    USA|16293.041773393481|      32485|
|   Product B| Los Angeles|   CA|    USA|16260.506950983125|      32449|
|   Product F|Philadelphia|   PA|    USA|16259.047580883305|      32521|
|   Product H|   San Diego|   CA|    USA| 16238.37785360976|      32430|
|   Product I|      Dallas|   TX|    USA|16214.700816333416|      32415|
|   Product C|     Chicago|   IL|    USA|16212.334961552855|      32367|
|   Product J|    San Jose|   CA|    USA|16205.668117725018|      32434|
|   Product D|     Houston|   TX|    USA|   16115.736272953|      32344|
|   Product E|     Phoenix|   AZ|    USA|15993.698453536637|      31985|
+------------+------------+-----+-------+----------

                                                                                

+------------+------------+-----+-------+------------------+-----------+
|product_name|        city|state|country|       total_value|event_count|
+------------+------------+-----+-------+------------------+-----------+
|   Product G| San Antonio|   TX|    USA|16325.645384546518|      32631|
|   Product A|    New York|   NY|    USA|16293.041773393485|      32485|
|   Product B| Los Angeles|   CA|    USA|16260.506950983123|      32449|
|   Product F|Philadelphia|   PA|    USA| 16259.04758088331|      32521|
|   Product H|   San Diego|   CA|    USA|16238.377853609758|      32430|
|   Product I|      Dallas|   TX|    USA|16214.700816333418|      32415|
|   Product C|     Chicago|   IL|    USA|16212.334961552857|      32367|
|   Product J|    San Jose|   CA|    USA|16205.668117725014|      32434|
|   Product D|     Houston|   TX|    USA|16115.736272952998|      32344|
|   Product E|     Phoenix|   AZ|    USA| 15993.69845353664|      31985|
+------------+------------+-----+-------+----------

                                                                                

+------------+------------+-----+-------+------------------+-----------+
|product_name|        city|state|country|       total_value|event_count|
+------------+------------+-----+-------+------------------+-----------+
|   Product G| San Antonio|   TX|    USA|16325.645384546518|      32631|
|   Product A|    New York|   NY|    USA|16293.041773393481|      32485|
|   Product B| Los Angeles|   CA|    USA|16260.506950983123|      32449|
|   Product F|Philadelphia|   PA|    USA|16259.047580883309|      32521|
|   Product H|   San Diego|   CA|    USA|16238.377853609763|      32430|
|   Product I|      Dallas|   TX|    USA|16214.700816333414|      32415|
|   Product C|     Chicago|   IL|    USA|16212.334961552857|      32367|
|   Product J|    San Jose|   CA|    USA|16205.668117725008|      32434|
|   Product D|     Houston|   TX|    USA|16115.736272952992|      32344|
|   Product E|     Phoenix|   AZ|    USA|15993.698453536641|      31985|
+------------+------------+-----+-------+----------

                                                                                

+------------+------------+-----+-------+------------------+-----------+
|product_name|        city|state|country|       total_value|event_count|
+------------+------------+-----+-------+------------------+-----------+
|   Product G| San Antonio|   TX|    USA|16325.645384546522|      32631|
|   Product A|    New York|   NY|    USA|16293.041773393485|      32485|
|   Product B| Los Angeles|   CA|    USA|16260.506950983125|      32449|
|   Product F|Philadelphia|   PA|    USA|16259.047580883313|      32521|
|   Product H|   San Diego|   CA|    USA| 16238.37785360976|      32430|
|   Product I|      Dallas|   TX|    USA|16214.700816333416|      32415|
|   Product C|     Chicago|   IL|    USA|16212.334961552857|      32367|
|   Product J|    San Jose|   CA|    USA|16205.668117725016|      32434|
|   Product D|     Houston|   TX|    USA|16115.736272952996|      32344|
|   Product E|     Phoenix|   AZ|    USA|15993.698453536637|      31985|
+------------+------------+-----+-------+----------

                                                                                

+------------+------------+-----+-------+------------------+-----------+
|product_name|        city|state|country|       total_value|event_count|
+------------+------------+-----+-------+------------------+-----------+
|   Product G| San Antonio|   TX|    USA|16325.645384546518|      32631|
|   Product A|    New York|   NY|    USA|16293.041773393485|      32485|
|   Product B| Los Angeles|   CA|    USA| 16260.50695098312|      32449|
|   Product F|Philadelphia|   PA|    USA|16259.047580883305|      32521|
|   Product H|   San Diego|   CA|    USA|16238.377853609756|      32430|
|   Product I|      Dallas|   TX|    USA| 16214.70081633342|      32415|
|   Product C|     Chicago|   IL|    USA|16212.334961552857|      32367|
|   Product J|    San Jose|   CA|    USA|16205.668117725014|      32434|
|   Product D|     Houston|   TX|    USA|16115.736272952998|      32344|
|   Product E|     Phoenix|   AZ|    USA|15993.698453536643|      31985|
+------------+------------+-----+-------+----------

                                                                                

+------------+------------+-----+-------+------------------+-----------+
|product_name|        city|state|country|       total_value|event_count|
+------------+------------+-----+-------+------------------+-----------+
|   Product G| San Antonio|   TX|    USA|16325.645384546517|      32631|
|   Product A|    New York|   NY|    USA|16293.041773393485|      32485|
|   Product B| Los Angeles|   CA|    USA|16260.506950983121|      32449|
|   Product F|Philadelphia|   PA|    USA| 16259.04758088331|      32521|
|   Product H|   San Diego|   CA|    USA|16238.377853609758|      32430|
|   Product I|      Dallas|   TX|    USA|16214.700816333418|      32415|
|   Product C|     Chicago|   IL|    USA|16212.334961552857|      32367|
|   Product J|    San Jose|   CA|    USA|16205.668117725014|      32434|
|   Product D|     Houston|   TX|    USA|16115.736272952996|      32344|
|   Product E|     Phoenix|   AZ|    USA|15993.698453536641|      31985|
+------------+------------+-----+-------+----------

                                                                                

+------------+------------+-----+-------+------------------+-----------+
|product_name|        city|state|country|       total_value|event_count|
+------------+------------+-----+-------+------------------+-----------+
|   Product G| San Antonio|   TX|    USA|16325.645384546517|      32631|
|   Product A|    New York|   NY|    USA|16293.041773393488|      32485|
|   Product B| Los Angeles|   CA|    USA|16260.506950983123|      32449|
|   Product F|Philadelphia|   PA|    USA| 16259.04758088331|      32521|
|   Product H|   San Diego|   CA|    USA|16238.377853609756|      32430|
|   Product I|      Dallas|   TX|    USA|16214.700816333412|      32415|
|   Product C|     Chicago|   IL|    USA|16212.334961552857|      32367|
|   Product J|    San Jose|   CA|    USA|16205.668117725014|      32434|
|   Product D|     Houston|   TX|    USA|16115.736272952998|      32344|
|   Product E|     Phoenix|   AZ|    USA|15993.698453536637|      31985|
+------------+------------+-----+-------+----------

                                                                                

+--------------------+----------------------+
|               Table|Average Execution Time|
+--------------------+----------------------+
|zorder_eventid_ac...|    15.687098050117493|
+--------------------+----------------------+



                                                                                

In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, sum as _sum, count, desc
from time import time
from collections import defaultdict
import traceback

# Switch the session to the `zorder` database so the unqualified table names
# passed to spark.table() resolve against it.
spark.sql("USE zorder")

# Tables to benchmark, and how many times the query is repeated for each one.
tables = ["zorder_eventid_actorid_table_2"]
num_executions = 10

def measure_execution_time(df):
    """Display a DataFrame once and return the elapsed time in seconds.

    The session cache is cleared before the clock starts. The measured
    interval covers ``df.show()``, i.e. executing the query behind ``df`` and
    printing the displayed rows.

    Args:
        df: DataFrame whose ``show()`` call is timed.

    Returns:
        Elapsed time in seconds, as a float.
    """
    # perf_counter is monotonic and high resolution; the wall clock returned by
    # time.time() can be adjusted while a run is in progress and skew the result.
    from time import perf_counter

    spark.catalog.clearCache()
    start_time = perf_counter()
    df.show()
    return perf_counter() - start_time

# Timings of the runs that completed, keyed by table name.
execution_times = defaultdict(list)

for table in tables:
    # Join the event table to both dimension tables on their id columns.
    joined = (
        spark.table(table)
        .join(spark.table("product_dim"), col(f"{table}.product_id") == col("product_dim.product_id"))
        .join(spark.table("location_dim"), col(f"{table}.location_id") == col("location_dim.location_id"))
    )

    # Row filter: an event_id range combined with a fixed set of actor ids.
    event_in_range = col(f"{table}.event_id").between(459999, 999999)
    actor_selected = col(f"{table}.actor_id").isin(5001, 5002, 5003)

    # Aggregate per product/location and sort by the summed value, largest first.
    df = (
        joined
        .filter(event_in_range & actor_selected)
        .groupBy("product_dim.product_name", "location_dim.city", "location_dim.state", "location_dim.country")
        .agg(
            _sum(col(f"{table}.value")).alias("total_value"),
            count(col(f"{table}.event_id")).alias("event_count"),
        )
        .orderBy(desc("total_value"))
    )

    # The same DataFrame object is timed on every repetition.
    for _ in range(num_executions):
        try:
            execution_time = measure_execution_time(df)
            print(f"{table}: {execution_time} seconds")
            execution_times[table].append(execution_time)
        except Exception as e:
            # Report the failure with its traceback and move on to the next repetition.
            print(f"Error for table {table}: {str(e)}")
            traceback.print_exc()

# Average over the runs that actually completed. Failed runs are never added
# to execution_times, so dividing by num_executions would understate the mean
# whenever a run raised.
average_times = {
    table: sum(times) / len(times)
    for table, times in execution_times.items()
    if times
}

# Show the per-table averages as a small two-column Spark DataFrame.
df = spark.createDataFrame(
    [(table, avg_time) for table, avg_time in average_times.items()],
    ["Table", "Average Execution Time"],
)
df.show()


                                                                                

+------------+------------+-----+-------+------------------+-----------+
|product_name|        city|state|country|       total_value|event_count|
+------------+------------+-----+-------+------------------+-----------+
|   Product G| San Antonio|   TX|    USA|16325.645384546522|      32631|
|   Product A|    New York|   NY|    USA|16293.041773393485|      32485|
|   Product B| Los Angeles|   CA|    USA|16260.506950983123|      32449|
|   Product F|Philadelphia|   PA|    USA| 16259.04758088331|      32521|
|   Product H|   San Diego|   CA|    USA| 16238.37785360976|      32430|
|   Product I|      Dallas|   TX|    USA|16214.700816333414|      32415|
|   Product C|     Chicago|   IL|    USA|16212.334961552859|      32367|
|   Product J|    San Jose|   CA|    USA| 16205.66811772501|      32434|
|   Product D|     Houston|   TX|    USA|16115.736272952992|      32344|
|   Product E|     Phoenix|   AZ|    USA| 15993.69845353664|      31985|
+------------+------------+-----+-------+----------

                                                                                

+------------+------------+-----+-------+------------------+-----------+
|product_name|        city|state|country|       total_value|event_count|
+------------+------------+-----+-------+------------------+-----------+
|   Product G| San Antonio|   TX|    USA| 16325.64538454652|      32631|
|   Product A|    New York|   NY|    USA|16293.041773393483|      32485|
|   Product B| Los Angeles|   CA|    USA|16260.506950983121|      32449|
|   Product F|Philadelphia|   PA|    USA|16259.047580883307|      32521|
|   Product H|   San Diego|   CA|    USA|16238.377853609758|      32430|
|   Product I|      Dallas|   TX|    USA|16214.700816333416|      32415|
|   Product C|     Chicago|   IL|    USA|16212.334961552857|      32367|
|   Product J|    San Jose|   CA|    USA|16205.668117725016|      32434|
|   Product D|     Houston|   TX|    USA|16115.736272952998|      32344|
|   Product E|     Phoenix|   AZ|    USA| 15993.69845353664|      31985|
+------------+------------+-----+-------+----------

                                                                                

+------------+------------+-----+-------+------------------+-----------+
|product_name|        city|state|country|       total_value|event_count|
+------------+------------+-----+-------+------------------+-----------+
|   Product G| San Antonio|   TX|    USA|16325.645384546522|      32631|
|   Product A|    New York|   NY|    USA|16293.041773393483|      32485|
|   Product B| Los Angeles|   CA|    USA|16260.506950983121|      32449|
|   Product F|Philadelphia|   PA|    USA|16259.047580883309|      32521|
|   Product H|   San Diego|   CA|    USA| 16238.37785360976|      32430|
|   Product I|      Dallas|   TX|    USA|16214.700816333416|      32415|
|   Product C|     Chicago|   IL|    USA|16212.334961552857|      32367|
|   Product J|    San Jose|   CA|    USA| 16205.66811772501|      32434|
|   Product D|     Houston|   TX|    USA|16115.736272952996|      32344|
|   Product E|     Phoenix|   AZ|    USA|15993.698453536643|      31985|
+------------+------------+-----+-------+----------

                                                                                

+------------+------------+-----+-------+------------------+-----------+
|product_name|        city|state|country|       total_value|event_count|
+------------+------------+-----+-------+------------------+-----------+
|   Product G| San Antonio|   TX|    USA| 16325.64538454652|      32631|
|   Product A|    New York|   NY|    USA|16293.041773393483|      32485|
|   Product B| Los Angeles|   CA|    USA|16260.506950983121|      32449|
|   Product F|Philadelphia|   PA|    USA|16259.047580883307|      32521|
|   Product H|   San Diego|   CA|    USA|16238.377853609762|      32430|
|   Product I|      Dallas|   TX|    USA| 16214.70081633342|      32415|
|   Product C|     Chicago|   IL|    USA|16212.334961552857|      32367|
|   Product J|    San Jose|   CA|    USA|16205.668117725014|      32434|
|   Product D|     Houston|   TX|    USA|16115.736272953001|      32344|
|   Product E|     Phoenix|   AZ|    USA|15993.698453536643|      31985|
+------------+------------+-----+-------+----------

                                                                                

+------------+------------+-----+-------+------------------+-----------+
|product_name|        city|state|country|       total_value|event_count|
+------------+------------+-----+-------+------------------+-----------+
|   Product G| San Antonio|   TX|    USA|16325.645384546517|      32631|
|   Product A|    New York|   NY|    USA|16293.041773393485|      32485|
|   Product B| Los Angeles|   CA|    USA|16260.506950983123|      32449|
|   Product F|Philadelphia|   PA|    USA|16259.047580883309|      32521|
|   Product H|   San Diego|   CA|    USA| 16238.37785360976|      32430|
|   Product I|      Dallas|   TX|    USA|16214.700816333418|      32415|
|   Product C|     Chicago|   IL|    USA|16212.334961552859|      32367|
|   Product J|    San Jose|   CA|    USA| 16205.66811772501|      32434|
|   Product D|     Houston|   TX|    USA|   16115.736272953|      32344|
|   Product E|     Phoenix|   AZ|    USA|15993.698453536643|      31985|
+------------+------------+-----+-------+----------

                                                                                

+------------+------------+-----+-------+------------------+-----------+
|product_name|        city|state|country|       total_value|event_count|
+------------+------------+-----+-------+------------------+-----------+
|   Product G| San Antonio|   TX|    USA|16325.645384546518|      32631|
|   Product A|    New York|   NY|    USA|16293.041773393485|      32485|
|   Product B| Los Angeles|   CA|    USA|16260.506950983123|      32449|
|   Product F|Philadelphia|   PA|    USA| 16259.04758088331|      32521|
|   Product H|   San Diego|   CA|    USA|16238.377853609756|      32430|
|   Product I|      Dallas|   TX|    USA|16214.700816333418|      32415|
|   Product C|     Chicago|   IL|    USA|16212.334961552855|      32367|
|   Product J|    San Jose|   CA|    USA| 16205.66811772501|      32434|
|   Product D|     Houston|   TX|    USA|16115.736272952998|      32344|
|   Product E|     Phoenix|   AZ|    USA|15993.698453536643|      31985|
+------------+------------+-----+-------+----------

                                                                                

+------------+------------+-----+-------+------------------+-----------+
|product_name|        city|state|country|       total_value|event_count|
+------------+------------+-----+-------+------------------+-----------+
|   Product G| San Antonio|   TX|    USA|16325.645384546518|      32631|
|   Product A|    New York|   NY|    USA|16293.041773393486|      32485|
|   Product B| Los Angeles|   CA|    USA| 16260.50695098312|      32449|
|   Product F|Philadelphia|   PA|    USA|16259.047580883307|      32521|
|   Product H|   San Diego|   CA|    USA|16238.377853609756|      32430|
|   Product I|      Dallas|   TX|    USA|16214.700816333416|      32415|
|   Product C|     Chicago|   IL|    USA|16212.334961552857|      32367|
|   Product J|    San Jose|   CA|    USA|16205.668117725008|      32434|
|   Product D|     Houston|   TX|    USA|16115.736272952998|      32344|
|   Product E|     Phoenix|   AZ|    USA|15993.698453536643|      31985|
+------------+------------+-----+-------+----------

                                                                                

+------------+------------+-----+-------+------------------+-----------+
|product_name|        city|state|country|       total_value|event_count|
+------------+------------+-----+-------+------------------+-----------+
|   Product G| San Antonio|   TX|    USA|16325.645384546517|      32631|
|   Product A|    New York|   NY|    USA| 16293.04177339349|      32485|
|   Product B| Los Angeles|   CA|    USA|16260.506950983125|      32449|
|   Product F|Philadelphia|   PA|    USA| 16259.04758088331|      32521|
|   Product H|   San Diego|   CA|    USA| 16238.37785360976|      32430|
|   Product I|      Dallas|   TX|    USA|16214.700816333416|      32415|
|   Product C|     Chicago|   IL|    USA|16212.334961552859|      32367|
|   Product J|    San Jose|   CA|    USA| 16205.66811772501|      32434|
|   Product D|     Houston|   TX|    USA|16115.736272952998|      32344|
|   Product E|     Phoenix|   AZ|    USA|15993.698453536641|      31985|
+------------+------------+-----+-------+----------

                                                                                

+------------+------------+-----+-------+------------------+-----------+
|product_name|        city|state|country|       total_value|event_count|
+------------+------------+-----+-------+------------------+-----------+
|   Product G| San Antonio|   TX|    USA|16325.645384546517|      32631|
|   Product A|    New York|   NY|    USA|16293.041773393486|      32485|
|   Product B| Los Angeles|   CA|    USA| 16260.50695098312|      32449|
|   Product F|Philadelphia|   PA|    USA|16259.047580883309|      32521|
|   Product H|   San Diego|   CA|    USA| 16238.37785360976|      32430|
|   Product I|      Dallas|   TX|    USA|16214.700816333418|      32415|
|   Product C|     Chicago|   IL|    USA|16212.334961552855|      32367|
|   Product J|    San Jose|   CA|    USA|16205.668117725012|      32434|
|   Product D|     Houston|   TX|    USA|16115.736272953001|      32344|
|   Product E|     Phoenix|   AZ|    USA|15993.698453536641|      31985|
+------------+------------+-----+-------+----------

                                                                                

+------------+------------+-----+-------+------------------+-----------+
|product_name|        city|state|country|       total_value|event_count|
+------------+------------+-----+-------+------------------+-----------+
|   Product G| San Antonio|   TX|    USA|16325.645384546515|      32631|
|   Product A|    New York|   NY|    USA|16293.041773393485|      32485|
|   Product B| Los Angeles|   CA|    USA|16260.506950983125|      32449|
|   Product F|Philadelphia|   PA|    USA|16259.047580883309|      32521|
|   Product H|   San Diego|   CA|    USA|16238.377853609752|      32430|
|   Product I|      Dallas|   TX|    USA| 16214.70081633342|      32415|
|   Product C|     Chicago|   IL|    USA|16212.334961552851|      32367|
|   Product J|    San Jose|   CA|    USA|16205.668117725012|      32434|
|   Product D|     Houston|   TX|    USA|   16115.736272953|      32344|
|   Product E|     Phoenix|   AZ|    USA|15993.698453536641|      31985|
+------------+------------+-----+-------+----------

In [21]:
# Name of the database used by the table-creation cells that follow.
db='pyspark_test'
# HDFS directory given as the database LOCATION when the database is created.
delta_db_path = 'hdfs://192.168.0.144:9000/datalake/pyspark_test'

In [23]:
# IF NOT EXISTS keeps this cell re-runnable: a plain CREATE DATABASE raises once
# the database already exists. To rebuild from scratch, drop the database
# manually first (DROP DATABASE <name> CASCADE).
spark.sql(f"CREATE DATABASE IF NOT EXISTS {db} LOCATION '{delta_db_path}'")
# Make the database current so the unqualified table names below land in it.
spark.sql(f"use database {db}")

DataFrame[]

In [24]:
num = 3
max_rows = 8

# One row count per table, halving each time (8, 4, 2 for the values above).
row_counts = [max_rows // (2 ** exponent) for exponent in range(num)]

for row_count in row_counts:
    compaction_table_name = f"pyspark_test_{row_count}"

    # Drop any previous copy so every run starts from an empty table.
    spark.sql(f"DROP TABLE IF EXISTS {compaction_table_name}")

    create_table_sql = f"""
    CREATE TABLE {compaction_table_name} (
        timestamp TIMESTAMP,
        value DOUBLE,
        country STRING,
        event_id LONG,
        actor_id LONG,
        year INT,
        month LONG,
        day LONG,
        product_id INT,          
        location_id INT,         
        department_id INT,      
        campaign_id INT,         
        customer_id INT        
    )
    USING delta;
    """
    spark.sql(create_table_sql)

24/08/07 11:41:42 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider delta. Persisting data source table `spark_catalog`.`pyspark_test`.`pyspark_test_8` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive.
24/08/07 11:41:42 WARN SessionState: METASTORE_FILTER_HOOK will be ignored, since hive.security.authorization.manager is set to instance of HiveAuthorizerFactory.
24/08/07 11:41:43 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider delta. Persisting data source table `spark_catalog`.`pyspark_test`.`pyspark_test_4` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive.
24/08/07 11:41:44 WARN HiveExternalCatalog: Couldn't find corresponding Hive SerDe for data source provider delta. Persisting data source table `spark_catalog`.`pyspark_test`.`pyspark_test_2` into Hive metastore in Spark SQL specific format, which is NOT compatible with Hive.


In [25]:
# (target table, source table) pairs: each source is copied into its target.
table_pairs = [
    ('pyspark_test_2', 'raw_data.raw_data_2')
]

for target_table, source_table in table_pairs:
    # NOTE(review): INSERT ... SELECT * is expected to match columns by
    # position, so the source table must list its columns in the same order
    # as the target — confirm against the raw_data schema.
    insert_sql = f"""
    INSERT INTO {target_table}
    SELECT * FROM {source_table}
    """
    print(f"Inserting data for table {target_table}")
    # IPython line magic: runs the statement and prints its CPU and wall time.
    %time spark.sql(insert_sql)

Inserting data for table pyspark_test_2


                                                                                

CPU times: user 824 ms, sys: 379 ms, total: 1.2 s
Wall time: 37min 54s


In [26]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

def main():
    """Time a small filter / groupBy / count job on generated data.

    Builds a 1,000,000-row DataFrame with ``spark.range``, repartitions it
    into 10 partitions, adds ``value = id * 2``, then times filtering,
    grouping by ``value``, counting and collecting the result. The elapsed
    time is printed.

    Note: the shared ``spark`` session is stopped at the end, so the session
    must be re-created before any other Spark cell is run afterwards.
    """
    # perf_counter is monotonic and high resolution, which makes it the right
    # clock for measuring an interval; time.time() can jump if the system
    # clock is adjusted.
    from time import perf_counter

    num_rows = 1000000
    num_partitions = 10

    df = spark.range(num_rows).repartition(num_partitions) \
        .withColumn("value", col("id") * 2)

    start_time = perf_counter()

    # collect() is the action that triggers execution; the returned rows are
    # not needed, only the time it takes to produce them.
    df.filter(col("value") % 2 == 0) \
        .groupBy("value") \
        .count() \
        .collect()

    duration = perf_counter() - start_time

    print(f"PySpark execution time: {duration} seconds")

    spark.stop()

if __name__ == "__main__":
    main()


                                                                                

PySpark execution time: 4.870447158813477 seconds
