In [6]:
import os
import subprocess

# Point both the PySpark workers and the driver at the same Python 3.9
# interpreter so the two sides do not end up on mismatched versions.
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3.9'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/usr/bin/python3.9'

# Set HADOOP_CONF_DIR environment variable
os.environ['HADOOP_CONF_DIR'] = '/usr/odp/1.2.4.0-102/hadoop/etc/hadoop'

# Set ARROW_LIBHDFS_DIR environment variable
os.environ['ARROW_LIBHDFS_DIR'] = '/usr/odp/1.2.4.0-102/hadoop/lib/native/'

# Set CLASSPATH environment variable from the output of `hadoop classpath --glob`.
# Surrounding whitespace (e.g. the newline that terminates the command's
# output) is stripped so it does not become part of the last classpath entry.
# Raises subprocess.CalledProcessError / FileNotFoundError if the `hadoop`
# command fails or is not on PATH.
classpath = subprocess.check_output(['hadoop', 'classpath', '--glob'])
os.environ['CLASSPATH'] = classpath.decode('utf-8').strip()

from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark.sql.window import Window
from pyspark.sql.types import *

import numpy as np
import pandas as pd

In [7]:
# --- Iceberg runtime ---------------------------------------------------------
# Version triple that selects the iceberg-spark-runtime jar.
ICEBERG_VERSION = "1.4.3"
ICEBERG_SPARK_VERSION = "3.5"
ICEBERG_SCALA_VERSION = "2.12"
ICEBERG_PATH_JAR = f"/opt/spark-3.5.1-bin-hadoop3/jars/iceberg-spark-runtime-{ICEBERG_SPARK_VERSION}_{ICEBERG_SCALA_VERSION}-{ICEBERG_VERSION}.jar"

# Fully qualified class names handed to Spark's SQL extension / catalog settings.
ICEBERG_EXTENSION_CLASS = "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions"
ICEBERG_SPARK_SQL_CATALOG_NAME = "org.apache.iceberg.spark.SparkCatalog"

# Name under which the Iceberg catalog is registered in Spark SQL.
ICEBERG_HIVE_CATALOG_NAME = 'curated'

# --- Hive metastore (Thrift) -------------------------------------------------
THRIFT_IP = '10.53.2.116'
THRIFT_PORT = 9083
THRIFT_HIVE_URI = f'thrift://{THRIFT_IP}:{THRIFT_PORT}'
HIVE_POSTGRES_PATH_JAR = '/opt/spark-3.5.1-bin-hadoop3/jars/postgresql-42.2.25.jar'

# --- Extra jars --------------------------------------------------------------
# Comma-separated list in the format expected by the `spark.jars` setting.
SPARK_JARS_ADDITION = ",".join(
    (
        ICEBERG_PATH_JAR,
        HIVE_POSTGRES_PATH_JAR,
        '/home/VT_TTDLPT_MINHPN5/lib_spark/spark-avro_2.12-3.5.1.jar',
    )
)

# Create (or reuse) the notebook's Spark session: a small YARN application
# named "kedro_dev" with the Iceberg SQL extensions enabled and an Iceberg
# catalog (ICEBERG_HIVE_CATALOG_NAME) backed by the Hive metastore at
# THRIFT_HIVE_URI.
spark: SparkSession = (
    SparkSession.builder.appName("kedro_dev")
        .config("spark.master", "yarn")
        # The deploy-mode property is `spark.submit.deployMode`; the previous
        # key `spark.deploy.mode` is not a Spark setting and was ignored.
        # The driver lives in this notebook kernel, so the mode is "client"
        # (cluster mode is not available for an interactively created session).
        .config("spark.submit.deployMode", "client")
        # Resource sizing: 1 core / 1g driver, two 1 core / 1g executors.
        .config("spark.driver.cores", "1")
        .config("spark.driver.memory", "1g")
        .config("spark.executor.cores", "1")
        .config("spark.executor.memory", "1g")
        .config("spark.executor.instances", "2")
        .config("spark.executor.memoryOverhead", "512m")
        # Date/time compatibility settings: legacy time parser, and no
        # calendar rebasing of Parquet date/timestamp values on read or write.
        .config("spark.sql.legacy.timeParserPolicy", "LEGACY")
        # NOTE(review): vectorized Parquet reads are disabled; the reason is
        # not visible in this notebook — confirm it is still required.
        .config("spark.sql.parquet.enableVectorizedReader", "false")
        .config("spark.sql.parquet.int96RebaseModeInRead", "CORRECTED")
        .config("spark.sql.parquet.int96RebaseModeInWrite", "CORRECTED")
        .config("spark.sql.parquet.datetimeRebaseModeInRead", "CORRECTED")
        .config("spark.sql.parquet.datetimeRebaseModeInWrite", "CORRECTED")
        # Extra jars (Iceberg runtime, PostgreSQL JDBC driver, spark-avro).
        .config("spark.jars", SPARK_JARS_ADDITION)
        # Iceberg: SQL extensions plus a Hive-type catalog reached over Thrift.
        .config("spark.sql.extensions", ICEBERG_EXTENSION_CLASS)
        .config(f"spark.sql.catalog.{ICEBERG_HIVE_CATALOG_NAME}", ICEBERG_SPARK_SQL_CATALOG_NAME)
        .config(f"spark.sql.catalog.{ICEBERG_HIVE_CATALOG_NAME}.type", "hive")
        .config(f"spark.sql.catalog.{ICEBERG_HIVE_CATALOG_NAME}.uri", THRIFT_HIVE_URI)
        .getOrCreate()
)

25/06/20 15:16:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/06/20 15:16:15 WARN DomainSocketFactory: The short-circuit local reads feature cannot be used because libhadoop cannot be loaded.
25/06/20 15:16:16 WARN Client: Neither spark.yarn.jars nor spark.yarn.archive is set, falling back to uploading libraries under SPARK_HOME.


In [4]:
# List the namespaces (databases) registered in the `curated` catalog.
curated_namespaces = spark.sql("show namespaces in curated")
curated_namespaces.show()

+---------------+
|      namespace|
+---------------+
|    biss_mviews|
|        default|
|          hive1|
|      hive2_csv|
|     hive2_csv4|
|   iceberg_bcvm|
|iceberg_feature|
|iceberg_staging|
|  nzdr_20241012|
|         ttqtdl|
+---------------+



In [7]:
# Show the metadata rows Spark reports for the `curated.kedro_dev` namespace,
# printing full column values instead of truncating them.
kedro_dev_namespace_info = spark.sql("DESCRIBE NAMESPACE EXTENDED curated.kedro_dev")
kedro_dev_namespace_info.show(truncate=False)

+--------------+----------------------------------------------------------------------------------------------+
|info_name     |info_value                                                                                    |
+--------------+----------------------------------------------------------------------------------------------+
|Catalog Name  |curated                                                                                       |
|Namespace Name|kedro_dev                                                                                     |
|Comment       |Namespace for business intelligence and analytics marts                                       |
|Location      |hdfs://10.53.2.40:8020/user/VT_TTDLPT_MINHPN5/iceberg/kedro_dev                               |
|Owner         |VT_TTDLPT_MINHPN5                                                                             |
|Properties    |((hive.metastore.database.owner,VT_TTDLPT_MINHPN5), (hive.metastore.database.owner-type,