In [None]:
#spark.stop()

In [None]:
# SparkSession
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
import os
from pyspark.sql.functions import *
from delta import configure_spark_with_delta_pip
from delta.tables import DeltaTable

# Options applied to the Spark session builder (Hive metastore, warehouse
# location, dynamic partition handling, Delta column mapping).
session_options = {
    "spark.delta.columnMapping.mode": "name",
    "spark.sql.catalogImplementation": "hive",
    "spark.sql.warehouse.dir": "s3a://hive/",
    "spark.sql.hive.metastore.version": "3.1.3",
    "spark.sql.hive.metastore.jars": "path",
    "spark.sql.hive.metastore.jars.path": "file:///opt/spark/hive/jars/*",
    "spark.sql.legacy.charVarcharAsString": True,
    "spark.sql.sources.partitionOverwriteMode": "dynamic",
    "hive.metastore.warehouse.dir": "s3a://hive/",
    "spark.hive.metastore.schema.verification": "false",
    "hive.exec.dynamic.partition": "true",
    "hive.exec.dynamic.partition.mode": "nonstrict",
}

# Create the "lakehouse" session with Hive support, or reuse a running one.
builder = SparkSession.builder.appName("lakehouse")
for option_key, option_value in session_options.items():
    builder = builder.config(option_key, option_value)
spark = builder.enableHiveSupport().getOrCreate()

# Reduce log output: the Spark context logs at ERROR level, while the
# dedicated log4j logger named "LOGGER" is set to INFO.
spark.sparkContext.setLogLevel("ERROR")
log4jLogger = spark._jvm.org.apache.log4j
logger = log4jLogger.LogManager.getLogger("LOGGER")
logger.setLevel(log4jLogger.Level.INFO)

In [None]:
directory = "data/"
ignore_file = ".DS_Store"
# One 1-tuple per directory entry, skipping the entry with the ignored name
names = [(item.name,) for item in os.scandir(directory) if item.name != ignore_file]
# Spark DataFrame with a single column "name" holding the collected entry names
df_file_list = spark.createDataFrame(names, ["name"])

# Display the DataFrame without truncating the values
df_file_list.show(truncate=False)

In [None]:
DB_NAME = "fidus_hbs"
# Make sure the database exists in the metastore, then make it the current one
for statement in (f"CREATE DATABASE IF NOT EXISTS {DB_NAME}", f'USE {DB_NAME}'):
    spark.sql(statement)

# Build the path relative to the working directory and resolve it to an
# absolute path before embedding it in the JDBC URL.
relative_path = f"data/{DB_NAME}.db"
absolute_path = os.path.abspath(relative_path)
jdbc_url = "jdbc:sqlite:" + absolute_path

In [None]:
# Example: fetch the names of all tables in the SQLite database behind jdbc_url.
# The JDBC "query" option takes a bare SELECT statement: Spark itself wraps it
# in parentheses and assigns an alias. The "(subquery) AS alias" form belongs
# to the "dbtable" option and would be wrapped a second time here.
df_tables = (
    spark.read.format("jdbc")
    .option("url", jdbc_url)
    .option("query", "SELECT name FROM sqlite_master WHERE type='table'")
    .option("driver", "org.sqlite.JDBC")
    .load()
)
df_tables.show(truncate=False)
#spark.sql("CREATE DATABASE IF NOT EXISTS fidus")


In [None]:
# Collect the table names from df_tables and print each one on its own line
table_names = []
for record in df_tables.collect():
    table_names.append(record["name"])
    print(record["name"])

In [None]:
directory = "data/"
ignore_file = ".DS_Store"
# One 1-tuple per directory entry, skipping the entry with the ignored name
names = [(item.name,) for item in os.scandir(directory) if item.name != ignore_file]
# Spark DataFrame with a single column "name" holding the collected entry names
df = spark.createDataFrame(names, ["name"])

# Display the DataFrame without truncating the values
df.show(truncate=False)

In [None]:
# Copy every table listed in df_tables from the SQLite database behind
# jdbc_url into a Delta table stored under s3a://bronze/fidus/<DB_NAME>/<table>/
# and registered under the same table name in the current database
# (unqualified name, so it relies on the database selected earlier via USE).
table_names = [row["name"] for row in df_tables.collect()]
for TABLE_NAME in table_names:
    print(f"{TABLE_NAME} Started!")
    df = (
        spark.read.format("jdbc")
        .option("url", jdbc_url)
        .option("dbtable", TABLE_NAME)
        .option("driver", "org.sqlite.JDBC")
        .load()
    )
    # Remove a stale metastore entry for this table before rewriting it.
    # This has to be DROP TABLE: DROP DATABASE ... CASCADE would target a
    # database that merely shares the table's name and delete all of it.
    spark.sql(f"DROP TABLE IF EXISTS {TABLE_NAME}")
    (
        df.write.format("delta")
        .mode("overwrite")
        .option("path", f"s3a://bronze/fidus/{DB_NAME}/{TABLE_NAME}/")
        .saveAsTable(TABLE_NAME)
    )
    print(f"{TABLE_NAME} Done!")

print("SUCCESS!")
#df.show(10)

In [None]:
# Schreibt Delta Table und erstellt einen externen Table

#spark.sql(f"DROP DATABASE IF EXISTS {TABLE_NAME} cascade") # Löscht den MetaStore Eintrag
#df.write.format("delta").mode("overwrite").option("path", f's3a://bronze/fidus/fidus_and/{TABLE_NAME}/').saveAsTable(TABLE_NAME)
#print("done")

In [None]:
#spark.sql(f"select * from {TABLE_NAME}").show(5)

In [None]:
#df2 = spark.read.format("delta").load("s3a://bronze/fidus/fidus_and")
# Zeige die Daten an
#df2.show(truncate=False)
#df2.count()

In [None]:
#dt = DeltaTable.forName(spark, table_name)
#dt.toDF().show()

In [None]:
#spark.sql("DESCRIBE TABLe EXTENDED adrver").show(100,truncate=False)

In [None]:
#spark.sql("SHOW TABLES").show()

In [None]:
#spark.sql("drop database if exists begrun cascade")
#spark.sql("DROP database IF EXISTS fidus_sue CASCADE")
# Print the databases known to the session's catalog
spark.sql("SHOW databases").show()

In [None]:
# Create the delta_test schema. IF NOT EXISTS keeps the cell re-runnable:
# a plain CREATE SCHEMA raises once the schema already exists.
spark.sql("CREATE SCHEMA IF NOT EXISTS delta_test")

In [None]:
# List the tables of the "default" database (shown as the cell's output)
spark.catalog.listTables("default")

In [None]:
# List the functions registered for the database named "hive_table".
# NOTE(review): no cell in this notebook creates a database called
# "hive_table" — confirm it exists in the metastore before relying on this.
spark.catalog.listFunctions("hive_table")

In [None]:
# Ingest every SQLite database file (*.db) found in `directory`:
# for each file a metastore database with the file's base name is created and
# every table of the SQLite file is written as a Delta table under
# s3a://bronze/fidus/<database>/<table>/ and registered in that database.
directory = "data/"
ignore_file = ".DS_Store"

# Database names (file name without the ".db" suffix) as 1-tuples.
# Only regular files ending in ".db" qualify: the JDBC URL below is always
# built as "<name>.db", so any other entry would point to a file that does
# not exist. Sorting makes the processing order deterministic.
names = []
for entry in sorted(os.scandir(directory), key=lambda item: item.name):
    if entry.name == ignore_file or not entry.is_file():
        continue
    if not entry.name.endswith(".db"):
        continue
    names.append((os.path.splitext(entry.name)[0],))

# An explicit schema keeps this working when no database file was found
# (schema inference fails on an empty list).
df_file_list = spark.createDataFrame(names, "name string")

for (DB_NAME,) in names:
    spark.sql(f"CREATE DATABASE IF NOT EXISTS {DB_NAME}")
    spark.sql(f"USE {DB_NAME}")

    # Build the relative path and resolve it to an absolute path for the JDBC URL
    relative_path = os.path.join(directory, f"{DB_NAME}.db")
    absolute_path = os.path.abspath(relative_path)
    jdbc_url = f"jdbc:sqlite:{absolute_path}"

    # Read the table list from THIS database. A DataFrame created earlier for
    # another JDBC URL would keep returning that other database's tables.
    df_db_tables = (
        spark.read.format("jdbc")
        .option("url", jdbc_url)
        .option("query", "SELECT name FROM sqlite_master WHERE type='table'")
        .option("driver", "org.sqlite.JDBC")
        .load()
    )
    table_names = [table_row["name"] for table_row in df_db_tables.collect()]

    for TABLE_NAME in table_names:
        df = (
            spark.read.format("jdbc")
            .option("url", jdbc_url)
            .option("dbtable", TABLE_NAME)
            .option("driver", "org.sqlite.JDBC")
            .load()
        )
        # Remove a stale metastore entry for this table before rewriting it.
        # This has to be DROP TABLE: DROP DATABASE ... CASCADE would target a
        # database that merely shares the table's name and delete all of it.
        spark.sql(f"DROP TABLE IF EXISTS {TABLE_NAME}")
        (
            df.write.format("delta")
            .mode("overwrite")
            .option("path", f"s3a://bronze/fidus/{DB_NAME}/{TABLE_NAME}/")
            .saveAsTable(TABLE_NAME)
        )

    print(f"{DB_NAME} SUCCESS!")
print("ALL DONE!")


In [43]:
# Print the tables of the database that is currently selected in the session
spark.sql("SHOW tables").show()

+---------+---------+-----------+
|namespace|tableName|isTemporary|
+---------+---------+-----------+
|fidus_mhw| behdatei|      false|
|fidus_mhw| praxarzt|      false|
|fidus_mhw|   patdat|      false|
|fidus_mhw|   adrver|      false|
|fidus_mhw|   begrun|      false|
|fidus_mhw| arbehdat|      false|
|fidus_mhw| rechkopf|      false|
|fidus_mhw|  rechpos|      false|
|fidus_mhw| ekktexte|      false|
|fidus_mhw| ckeytabs|      false|
|fidus_mhw|  patinfo|      false|
+---------+---------+-----------+

