In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
import gresearch.spark.parquet
import os
import sys


In [2]:
# Point both PySpark interpreter settings at the Python executable that is
# running this kernel.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

In [3]:
# Directory intended as the Hive warehouse location (local Windows path; adjust for your machine).
warehouse_location = "D:/Data/HiveMetastore"

In [4]:
# Sanity check: displays True when the warehouse directory exists on disk.
os.path.isdir(warehouse_location)

True

In [5]:
# Build (or reuse) a local 4-core Spark session with Hive support and the
# spark-extension package on the classpath.
#
# spark.sql.warehouse.dir is a static setting: it is only honoured when the
# session is first created, so it must be supplied on the builder. Without it
# Spark falls back to a "spark-warehouse" folder in the working directory and
# `warehouse_location` is never used.
# https://spark.apache.org/docs/latest/sql-data-sources-hive-tables.html
spark = (
    SparkSession.builder
    .appName("Data with Nikk the Greek Spark Session")
    .master("local[4]")
    .enableHiveSupport()
    .config("spark.sql.warehouse.dir", warehouse_location)
    .config("spark.jars.packages", "uk.co.gresearch.spark:spark-extension_2.12:2.11.0-3.5")
    .getOrCreate()
)



In [6]:
# Display the warehouse directory the running session is actually configured with.
spark.conf.get("spark.sql.warehouse.dir")

'file:/d:/Data/HiveMetastore'

In [7]:
# Shorthand for the SparkContext; used in later cells to set job descriptions.
sc = spark.sparkContext

In [8]:
# Output locations for the two Parquet copies of the dataset -- adjust both paths to your machine.
# path_hive is the copy that table `test` is created on; path_no_hive is only read directly by path.
path_hive = "D:/Data/hive.parquet"
path_no_hive = "D:/Data/no_hive.parquet"

In [9]:
def sdf_generator(num_rows: int, num_partitions: int = None) -> "DataFrame":
    """Build a synthetic DataFrame with ids 0..num_rows-1 and derived columns.

    Columns produced, in order: ``id``, ``date`` (current date), ``timestamp``
    (current timestamp), ``idstring`` (``id`` cast to string), ``idfirst``
    (first character of ``idstring``) and ``idlast`` (last character).

    Args:
        num_rows: Number of rows to generate via ``spark.range``.
        num_partitions: Passed to ``spark.range`` as ``numPartitions``;
            ``None`` leaves the choice to Spark.

    Returns:
        The generated (lazy) Spark DataFrame.
    """
    id_text = f.col("idstring")

    sdf_ids = spark.range(num_rows, numPartitions=num_partitions)
    sdf_stamped = (
        sdf_ids
        .withColumn("date", f.current_date())
        .withColumn("timestamp", f.current_timestamp())
    )
    sdf_with_text = sdf_stamped.withColumn("idstring", f.col("id").cast("string"))

    return (
        sdf_with_text
        .withColumn("idfirst", id_text.substr(0, 1))
        .withColumn("idlast", id_text.substr(-1, 1))
    )

In [10]:
# 10 million rows in 4 partitions (-> 4 files when written)
sdf = sdf_generator(num_rows=10_000_000, num_partitions=4)

In [11]:
# Preview the generated rows.
sdf.show()

+---+----------+--------------------+--------+-------+------+
| id|      date|           timestamp|idstring|idfirst|idlast|
+---+----------+--------------------+--------+-------+------+
|  0|2024-03-18|2024-03-18 23:50:...|       0|      0|     0|
|  1|2024-03-18|2024-03-18 23:50:...|       1|      1|     1|
|  2|2024-03-18|2024-03-18 23:50:...|       2|      2|     2|
|  3|2024-03-18|2024-03-18 23:50:...|       3|      3|     3|
|  4|2024-03-18|2024-03-18 23:50:...|       4|      4|     4|
|  5|2024-03-18|2024-03-18 23:50:...|       5|      5|     5|
|  6|2024-03-18|2024-03-18 23:50:...|       6|      6|     6|
|  7|2024-03-18|2024-03-18 23:50:...|       7|      7|     7|
|  8|2024-03-18|2024-03-18 23:50:...|       8|      8|     8|
|  9|2024-03-18|2024-03-18 23:50:...|       9|      9|     9|
| 10|2024-03-18|2024-03-18 23:50:...|      10|      1|     0|
| 11|2024-03-18|2024-03-18 23:50:...|      11|      1|     1|
| 12|2024-03-18|2024-03-18 23:50:...|      12|      1|     2|
| 13|202

In [12]:
# Explicit DDL schema listing the columns produced by sdf_generator; handed to the reader in the "with schema" cells.
sdf_schema = "id bigint, date date, timestamp timestamp, idstring string, idfirst string, idlast string"

In [13]:
# Write the same DataFrame definition as Parquet to both locations,
# replacing whatever is already there.
sc.setJobDescription("Write Dataset")
for target_path in (path_hive, path_no_hive):
    sdf.write.format("parquet").mode("overwrite").save(target_path)

In [15]:
# Recreate table `test` on top of the Parquet files stored at path_hive.
drop_stmt = "DROP TABLE IF EXISTS test"
create_stmt = f"CREATE TABLE test USING PARQUET LOCATION '{path_hive}'"

spark.sql(drop_stmt)
spark.sql(create_stmt)

DataFrame[]

In [20]:
# Path-based read with an explicit schema; the noop sink forces the whole
# read to execute without producing any output.
spark.conf.set("spark.sql.sources.useV1SourceList", "parquet")
sc.setJobDescription("Load Parquet No Hive with schema")
sdf_parquet = (
    spark.read
    .format("parquet")
    .schema(sdf_schema)
    .load(path_no_hive)
)
(
    sdf_parquet.write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [21]:
# Path-based read with no schema supplied, so Spark has to derive it from the
# Parquet files; the noop sink forces the whole read to execute.
spark.conf.set("spark.sql.sources.useV1SourceList", "parquet")
sc.setJobDescription("Load Parquet No Hive without schema")
sdf_parquet = (
    spark.read
    .format("parquet")
    .load(path_no_hive)
)
(
    sdf_parquet.write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [22]:
# Path-based read (no schema) of the files that back table `test`;
# the noop sink forces the whole read to execute.
spark.conf.set("spark.sql.sources.useV1SourceList", "parquet")
sc.setJobDescription("Load Parquet Hive")
sdf_parquet = (
    spark.read
    .format("parquet")
    .load(path_hive)
)
(
    sdf_parquet.write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [23]:
# Full scan of table `test` through the catalog, executed via the noop sink.
sc.setJobDescription("Load Parquet Hive Table")
(
    spark.sql("Select * from test")
    .write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [28]:
# Filtered scan (id < 10000) of table `test`, executed via the noop sink.
sc.setJobDescription("Filter Hive Table")
(
    spark.sql("Select * from test where id < 10000")
    .write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [29]:
# Same id < 10000 filter, applied to the path-based read with an explicit
# schema, executed via the noop sink.
sc.setJobDescription("Filter Parquet Table")
sdf_parquet = (
    spark.read
    .format("parquet")
    .schema(sdf_schema)
    .load(path_no_hive)
    .filter(f.col("id") < 10000)
)
(
    sdf_parquet.write
    .format("noop")
    .mode("overwrite")
    .save()
)

In [31]:
# Global max(id) over the path-based read with an explicit schema.
spark.conf.set("spark.sql.sources.useV1SourceList", "parquet")
sc.setJobDescription("Max Parquet No Hive with schema")
sdf_parquet = (
    spark.read
    .format("parquet")
    .schema(sdf_schema)
    .load(path_no_hive)
)
# groupBy() without keys aggregates over the whole DataFrame.
sdf_max = sdf_parquet.groupBy().max("id")
sdf_max.show()

+-------+
|max(id)|
+-------+
|9999999|
+-------+



In [31]:
# Global max(id) over the path-based read with no schema supplied.
spark.conf.set("spark.sql.sources.useV1SourceList", "parquet")
sc.setJobDescription("Max Parquet No Hive without schema")
sdf_parquet = (
    spark.read
    .format("parquet")
    .load(path_no_hive)
)
# groupBy() without keys aggregates over the whole DataFrame.
sdf_max = sdf_parquet.groupBy().max("id")
sdf_max.show()

+-------+
|max(id)|
+-------+
|9999999|
+-------+



In [30]:
# Global max(id) over a path-based read of the files that back table `test`.
spark.conf.set("spark.sql.sources.useV1SourceList", "parquet")
sc.setJobDescription("Max Parquet Hive")
sdf_parquet = (
    spark.read
    .format("parquet")
    .load(path_hive)
)
# groupBy() without keys aggregates over the whole DataFrame.
sdf_max = sdf_parquet.groupBy().max("id")
sdf_max.show()

+-------+
|max(id)|
+-------+
|9999999|
+-------+



In [None]:
# Preview the rows of table `test`.
spark.sql("Select * from test").show()

In [25]:
# Compute statistics for every column of table `test`.
spark.sql("ANALYZE TABLE test COMPUTE STATISTICS FOR ALL COLUMNS")

DataFrame[]

In [None]:
# Show the column definitions of `test` followed by its detailed table metadata.
spark.sql("DESC EXTENDED test").show()

+--------------------+--------------------+-------+
|            col_name|           data_type|comment|
+--------------------+--------------------+-------+
|                  id|              bigint|   NULL|
|                date|                date|   NULL|
|           timestamp|           timestamp|   NULL|
|            idstring|              string|   NULL|
|             idfirst|              string|   NULL|
|              idlast|              string|   NULL|
|                    |                    |       |
|# Detailed Table ...|                    |       |
|             Catalog|       spark_catalog|       |
|            Database|             default|       |
|               Table|                test|       |
|               Owner|               nikol|       |
|        Created Time|Mon Mar 18 17:55:...|       |
|         Last Access|             UNKNOWN|       |
|          Created By|         Spark 3.5.1|       |
|                Type|            EXTERNAL|       |
|           

In [None]:
# Show the column-level details (type, min, max, null count, distinct count, ...) recorded for `id`.
spark.sql("DESC EXTENDED test id").show()

+--------------+----------+
|     info_name|info_value|
+--------------+----------+
|      col_name|        id|
|     data_type|    bigint|
|       comment|      NULL|
|           min|         0|
|           max|     99999|
|     num_nulls|         0|
|distinct_count|     95546|
|   avg_col_len|         8|
|   max_col_len|         8|
|     histogram|      NULL|
+--------------+----------+

