In [None]:
#!wget -qO /opt/spark/jars/sqlite-jdbc.jar https://repo1.maven.org/maven2/org/xerial/sqlite-jdbc/3.49.0.0/sqlite-jdbc-3.49.0.0.jar
#!wget -qO /opt/spark/jars/hive-metastore-3.1.3.jar https://repo1.maven.org/maven2/org/apache/hive/hive-metastore/3.1.3/hive-metastore-3.1.3.jar
#!wget -qO /opt/spark/jars/hive-metastore-2.3.9.jar https://repo1.maven.org/maven2/org/apache/hive/hive-metastore/2.3.9/hive-metastore-2.3.9.jar
#spark.stop()

In [None]:
# Create a SparkSession. The SQLite JDBC driver is not configured explicitly here; it is expected to be on Spark's classpath already.
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
import os
from pyspark.sql.functions import *
from delta import configure_spark_with_delta_pip
from delta.tables import DeltaTable

# Build (or reuse) the Hive-enabled SparkSession that the rest of the notebook
# works with.
# NOTE(review): no Delta Lake session extension or catalog is set here even
# though later cells read and write Delta tables - presumably supplied by the
# cluster defaults; confirm.
spark = (
    SparkSession.builder
    .appName("appname")
    # Hive catalog with the warehouse rooted in the s3a://hive/ bucket.
    .config("spark.sql.catalogImplementation", "hive")
    .config("spark.sql.warehouse.dir", "s3a://hive/")
    # Hive 3.1.3 metastore client, loaded from the jars under /opt/spark/hive/jars.
    .config("spark.sql.hive.metastore.version", "3.1.3")
    .config("spark.sql.hive.metastore.jars", "path")
    .config("spark.sql.hive.metastore.jars.path", "file:///opt/spark/hive/jars/*")
    .config("spark.sql.legacy.charVarcharAsString", True)
    .config("spark.sql.sources.partitionOverwriteMode", "dynamic")
    .config("hive.metastore.warehouse.dir", "s3a://hive/")
    .config("spark.hive.metastore.schema.verification", "false")
    # Dynamic partitioning without requiring a static partition column.
    .config("hive.exec.dynamic.partition", "true")
    .config("hive.exec.dynamic.partition.mode", "nonstrict")
    .enableHiveSupport()
    .getOrCreate()
)

# Disabled builder options, kept for reference:
#   .config("spark.sql.hive.metastore.version", "3.1.3") \
#   .config("spark.sql.hive.metastore.jars", "maven") \
#   .config("spark.sql.hive.metastore.jars.path", "f'./opt/spark/jars/'") \
#   .config("hive.metastore.uris", "thrift://metastore:9083") \

In [None]:
# Reduce log output: set the SparkContext log level to WARN, then obtain the
# log4j logger named "LOGGER" through the JVM gateway and set it to INFO.
spark.sparkContext.setLogLevel("WARN")
log4jLogger = spark._jvm.org.apache.log4j
logger = log4jLogger.LogManager.getLogger("LOGGER")
logger.setLevel(log4jLogger.Level.INFO)

In [None]:
# Resolve the SQLite database file against the current working directory and
# build the JDBC URL from the resulting absolute path.
relative_path = "data/FIDUS_AND.db"
absolute_path = os.path.abspath(relative_path)
jdbc_url = "jdbc:sqlite:" + absolute_path

In [None]:
# Example: list the names of all tables stored in the SQLite database by
# querying sqlite_master through the JDBC source.
df_tables = (
    spark.read.format("jdbc")
    .options(
        url=jdbc_url,
        dbtable="(SELECT name FROM sqlite_master WHERE type='table') as tables",
        driver="org.sqlite.JDBC",
    )
    .load()
)

df_tables.show(truncate=False)

In [None]:
# Read the `adrver` table from the SQLite database over JDBC and preview the
# first 10 rows.
df = (
    spark.read.format("jdbc")
    .options(url=jdbc_url, dbtable="adrver", driver="org.sqlite.JDBC")
    .load()
)

df.show(10)

In [None]:
# Read the bronze CSV file into `df` (header row, inferred column types).
# This replaces the JDBC-backed `df` from the previous cell.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv("s3a://bronze/fidus/fidus_and/adrver.csv")
)

# Register the same file as a table in the Hive metastore.
# Spark rejects `CREATE EXTERNAL TABLE ... USING <source>`; a data source table
# that is given an explicit path is already unmanaged, so the EXTERNAL keyword
# is not needed (and must not be used) here.
spark.sql("DROP TABLE IF EXISTS adrver")
spark.sql("""create table adrver using csv
options (path "s3a://bronze/fidus/fidus_and/adrver.csv", header "true", inferSchema="true")""")

In [None]:
# Write `df` in Delta format to the silver location and register it in the
# metastore as table `adrver`, replacing any existing content (mode overwrite).
(
    df.write.format("delta")
    .mode("overwrite")
    .option("path", "s3a://silver/fidus/fidus_and/adrver/")
    .saveAsTable("adrver")
)

In [None]:
# Preview up to 100 rows of the `adrver` table.
spark.table("adrver").show(100)

In [None]:
# Load the Delta table by path. The only Delta write in this notebook targets
# the silver location; the bronze prefix holds CSV files, so reading it with
# format("delta") would fail for lack of a Delta transaction log.
df2 = spark.read.format("delta").load("s3a://silver/fidus/fidus_and/adrver/")

# Show the rows untruncated; the row count is the cell's output value.
df2.show(truncate=False)
df2.count()

In [None]:
# Drop the schema `hive_test` (with everything in it) if it exists, then issue
# CREATE TABLE IF NOT EXISTS for a Delta table named `hive_test` located at
# s3a://hive/.
# NOTE(review): the first statement targets a schema while the second creates a
# table of the same name, and the location is the same URI the session uses as
# warehouse directory - confirm this is intended.
spark.sql("DROP SCHEMA IF EXISTS hive_test CASCADE")
spark.sql("""
  CREATE TABLE IF NOT EXISTS hive_test
  USING DELTA
  LOCATION 's3a://hive/'
""")

In [None]:
# Look up the Delta table by its catalog name and display its contents.
# NOTE(review): the Delta write in this notebook registers the table as plain
# `adrver`, and no visible cell creates or selects a `fidus_and` schema -
# confirm the qualified name resolves.
dt = DeltaTable.forName(spark, "fidus_and.adrver")
dt.toDF().show()

In [None]:
# Print the extended description of table `adrver` (up to 100 rows, untruncated).
spark.sql("DESCRIBE TABLe EXTENDED adrver").show(100,truncate=False)

In [None]:
# List the databases known to the session catalog (shown as the cell output).
spark.catalog.listDatabases()

In [None]:
# Disabled clean-up statements, kept for manual use:
#spark.sql("drop schema if exists delta_test cascade")
#spark.sql("DROP TABLE IF EXISTS adrver")
# Show the tables visible in the current database.
spark.sql("SHOW tables").show()

In [None]:
# Create the `delta_test` schema. `if not exists` keeps the cell re-runnable:
# a plain CREATE SCHEMA raises once the schema is already in the metastore.
spark.sql("create schema if not exists delta_test")

In [None]:
# List the tables of the `fidus_and` database (shown as the cell output).
# NOTE(review): no visible cell creates a `fidus_and` database - confirm it exists.
spark.catalog.listTables("fidus_and")

In [None]:
# List catalog functions, passing "hive_table" as the argument.
# NOTE(review): presumably this argument is a database name; no `hive_table`
# database is created in the visible cells - confirm against the PySpark API
# and the metastore.
spark.catalog.listFunctions("hive_table")

In [None]:
# Export `df` as comma-separated CSV with a header row, replacing whatever is
# stored at the target prefix.
# NOTE(review): the target is the parent directory of the adrver.csv file that
# an earlier cell reads `df` from; if `df` is still that lazily evaluated
# DataFrame, the overwrite may remove its source before it is read - confirm.
(
    df.write.format("csv")
    .options(header="true", delimiter=",")
    .mode("overwrite")
    .save("s3a://bronze/fidus/fidus_and/")
)
print("saved")