In [6]:
from pyspark.sql import SparkSession
from delta import configure_spark_with_delta_pip

# Build the Spark session with Delta Lake support: register Delta's SQL
# extension and make the Delta catalog the session catalog.
builder = (
    SparkSession.builder.appName("DeltaTutorial")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)

# configure_spark_with_delta_pip sets "spark.jars.packages" itself, using the
# Delta artifact that matches the pip-installed delta-spark version. A
# hand-pinned "io.delta:delta-core_2.12:2.3.0" on the builder would simply be
# overwritten by that call, so it is intentionally not set here.
spark = configure_spark_with_delta_pip(builder).getOrCreate()
# Keep notebook output readable: only log errors.
spark.sparkContext.setLogLevel("ERROR")

# Smoke test: write a tiny two-row DataFrame as a Delta table and read it back.
delta_table_path = "/tmp/delta-table"

data = [(1, "foo"), (2, "bar")]
df = spark.createDataFrame(data, ["id", "value"])

# Save as a Delta table (overwrite makes this cell safe to re-run)
df.write.format("delta").mode("overwrite").save(delta_table_path)

# Read the Delta table back and display it
df_read = spark.read.format("delta").load(delta_table_path)
df_read.show()



In [8]:
# Ingest the raw features CSV and land it in the bronze layer as a Delta table.
features_csv_path = "/home/dwdas/keglrtlprj/source_raw/Features data set.csv"
features_bronze_path = "hdfs://namenode:8020/user/hive/warehouse/retail0A/bronze/features"

# The first row is treated as the header and column types are inferred.
# NOTE(review): the preview output later in this notebook shows the MarkDown
# columns holding the literal text "NA"; if those are meant to be nulls,
# consider a nullValue="NA" read option -- confirm with downstream consumers.
features_df = spark.read.options(header=True, inferSchema=True).csv(features_csv_path)

# Overwrite mode replaces whatever is already stored at the target path.
features_df.write.save(features_bronze_path, format="delta", mode="overwrite")

In [10]:
# Ingest the raw sales CSV and land it in the bronze layer as a Delta table.
sales_csv_path = "/home/dwdas/keglrtlprj/source_raw/sales data-set.csv"
sales_bronze_path = "hdfs://namenode:8020/user/hive/warehouse/retail0A/bronze/sales"

# The first row is treated as the header and column types are inferred.
sales_df = spark.read.options(header=True, inferSchema=True).csv(sales_csv_path)

# Overwrite mode replaces whatever is already stored at the target path.
sales_df.write.save(sales_bronze_path, format="delta", mode="overwrite")

                                                                                

In [11]:
# Ingest the raw stores CSV and land it in the bronze layer as a Delta table.
stores_csv_path = "/home/dwdas/keglrtlprj/source_raw/stores data-set.csv"
stores_bronze_path = "hdfs://namenode:8020/user/hive/warehouse/retail0A/bronze/stores"

# The first row is treated as the header and column types are inferred.
stores_df = spark.read.options(header=True, inferSchema=True).csv(stores_csv_path)

# Overwrite mode replaces whatever is already stored at the target path.
stores_df.write.save(stores_bronze_path, format="delta", mode="overwrite")

In [13]:
# Verify each bronze Delta table by printing five rows from it.
bronze_root = "hdfs://namenode:8020/user/hive/warehouse/retail0A/bronze"

# Same path-based Delta query for every table, in the order features, sales, stores.
for table_name in ("features", "sales", "stores"):
    preview_sql = f"SELECT * FROM delta.`{bronze_root}/{table_name}` LIMIT 5"
    spark.sql(preview_sql).show()

+-----+----------+-----------+----------+---------+---------+---------+---------+---------+-----------+------------+---------+
|Store|      Date|Temperature|Fuel_Price|MarkDown1|MarkDown2|MarkDown3|MarkDown4|MarkDown5|        CPI|Unemployment|IsHoliday|
+-----+----------+-----------+----------+---------+---------+---------+---------+---------+-----------+------------+---------+
|    1|05/02/2010|      42.31|     2.572|       NA|       NA|       NA|       NA|       NA|211.0963582|       8.106|    false|
|    1|12/02/2010|      38.51|     2.548|       NA|       NA|       NA|       NA|       NA|211.2421698|       8.106|     true|
|    1|19/02/2010|      39.93|     2.514|       NA|       NA|       NA|       NA|       NA|211.2891429|       8.106|    false|
|    1|26/02/2010|      46.63|     2.561|       NA|       NA|       NA|       NA|       NA|211.3196429|       8.106|    false|
|    1|05/03/2010|       46.5|     2.625|       NA|       NA|       NA|       NA|       NA|211.3501429|       8

In [5]:
# Stop the Spark session. Run this only after every cell above has finished,
# because they all use `spark`.
spark.stop()