In [0]:
# Azure Storage settings: account and container names for the source (input)
# and destination (output) data. They are left blank in this file.
input_storage_account = ""
output_storage_account = ""
input_container = ""
output_container = ""

# Pair each storage account's Spark configuration key with the name of the
# secret (in the "hw2secret" scope) that holds its access key.
storage_key_settings = (
    (
        f"fs.azure.account.key.{input_storage_account}.blob.core.windows.net",
        "AZURE_STORAGE_ACCOUNT_KEY_SOURCE",
    ),
    (
        f"fs.azure.account.key.{output_storage_account}.blob.core.windows.net",
        "STORAGE_FINAL",
    ),
)

# Register the account keys in the Spark session configuration, input
# account first, then output account.
for account_conf_key, secret_name in storage_key_settings:
    spark.conf.set(
        account_conf_key,
        dbutils.secrets.get(scope="hw2secret", key=secret_name),
    )

# Make sure the target database exists before any table is registered in it.
spark.sql("CREATE DATABASE IF NOT EXISTS mydatabase")

# Expedia: source location (Avro files in the input container) and target
# location (Delta files in the output container).
expedia_source_path = f"wasbs://{input_container}@{input_storage_account}.blob.core.windows.net/expedia/"
expedia_delta_path = f"wasbs://{output_container}@{output_storage_account}.blob.core.windows.net/delta/expedia/"

# Load the Expedia data and rewrite it in Delta format, replacing both the
# data and the schema of anything already stored at the target location.
expedia_df = spark.read.format("avro").load(expedia_source_path)

(
    expedia_df.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .save(expedia_delta_path)
)

# Register the Expedia Delta table in the Metastore: drop any existing entry,
# then recreate it pointing at the Delta location written above.
spark.sql("DROP TABLE IF EXISTS mydatabase.expedia")
expedia_create_table_sql = f"""
    CREATE TABLE mydatabase.expedia
    USING DELTA
    LOCATION '{expedia_delta_path}'
"""
spark.sql(expedia_create_table_sql)

# Hotel-Weather: read the Parquet files from the source container and save
# them in Delta format to the output container, partitioned by date parts.
hotel_weather_reader = spark.read.format("parquet")

hotel_weather_source_path = f"wasbs://{input_container}@{input_storage_account}.blob.core.windows.net/hotel-weather/hotel-weather/"
hotel_weather_delta_path = f"wasbs://{output_container}@{output_storage_account}.blob.core.windows.net/delta/hotel-weather/"

hotel_weather_df = hotel_weather_reader.load(hotel_weather_source_path)

# Overwrite both data and schema at the target; files are laid out by
# year / month / day partition columns.
(
    hotel_weather_df.write.format("delta")
    .mode("overwrite")
    .partitionBy("year", "month", "day")
    .option("overwriteSchema", "true")
    .save(hotel_weather_delta_path)
)

# Register the Hotel-Weather Delta table in the Metastore: drop any existing
# entry, then recreate it pointing at the Delta location written above.
spark.sql("DROP TABLE IF EXISTS mydatabase.hotel_weather")
hotel_weather_create_table_sql = f"""
    CREATE TABLE mydatabase.hotel_weather
    USING DELTA
    LOCATION '{hotel_weather_delta_path}'
"""
spark.sql(hotel_weather_create_table_sql)

# Refresh the cached metadata of both registered tables so that later reads
# see the most up-to-date data.
for refresh_statement in (
    "REFRESH TABLE mydatabase.expedia",
    "REFRESH TABLE mydatabase.hotel_weather",
):
    spark.sql(refresh_statement)

# Both DataFrames carry a column with the same name ("id"), so the
# Hotel-Weather one is renamed before joining to avoid the clash.
hotel_weather_df = hotel_weather_df.withColumnRenamed("id", "accomodation_id")

# Left-join Expedia rows to Hotel-Weather rows on the hotel identifier.
hotel_match_condition = expedia_df.hotel_id == hotel_weather_df.accomodation_id
joined_df = expedia_df.join(hotel_weather_df, hotel_match_condition, "left")

# Save the intermediate joined DataFrame as Parquet in the output container,
# partitioned by year / month / day and replacing any previous output.
joined_output_path = f"wasbs://{output_container}@{output_storage_account}.blob.core.windows.net/joined_data/"
(
    joined_df.write.format("parquet")
    .mode("overwrite")
    .partitionBy("year", "month", "day")
    .save(joined_output_path)
)