In [0]:
# Azure Storage settings (left blank here; fill in before running).
input_storage_account = ""
input_container = ""
output_storage_account = ""
output_container = ""

# Spark configuration entry that carries the access key of the output storage account.
output_account_key_setting = f"fs.azure.account.key.{output_storage_account}.blob.core.windows.net"

# The key itself is read from the "hw2secret" secret scope instead of being hard-coded.
spark.conf.set(
    output_account_key_setting,
    dbutils.secrets.get(scope="hw2secret", key="STORAGE_FINAL"),
)
	
# Data mart 1: the 10 (address, year, month) groups of hotel_weather with the
# largest spread between their highest and lowest avg_tmpr_c value.
df_first = spark.sql("""
    SELECT
        address,
        year,
        month,
        -- MAX >= MIN inside a group, so the difference is already non-negative
        -- and needs no ABS(); rounded to 2 decimal places.
        ROUND(MAX(avg_tmpr_c) - MIN(avg_tmpr_c), 2) AS temp_diff
    FROM mydatabase.hotel_weather
    GROUP BY address, year, month
    -- Tie-breakers after temp_diff keep the top 10 reproducible when several
    -- groups share the same temperature difference.
    ORDER BY temp_diff DESC, address, year, month
    LIMIT 10
""")

# Write the result as parquet, partitioned by year and month, replacing any
# previous output under datamart/1/.
(
    df_first.write.format("parquet")
    .mode("overwrite")
    .partitionBy("year", "month")
    .save(f"wasbs://{output_container}@{output_storage_account}.blob.core.windows.net/datamart/1/")
)
	
# Data mart 2: for every (year, month), the hotel addresses whose visit-day
# count ranks in the top 10 (DENSE_RANK, so ties can yield more than 10 rows).
df_second = spark.sql("""
    WITH hotels AS (
        -- hotel_weather carries a wthr_date and several temperature rows per
        -- hotel id. Joining bookings to it directly repeats every stay-day once
        -- per weather row and inflates visits_count, so collapse it to exactly
        -- one row per hotel id first. MAX(address) is only a deterministic pick
        -- in case one id ever appears with more than one address.
        SELECT
            id,
            MAX(address) AS address
        FROM mydatabase.hotel_weather
        WHERE address IS NOT NULL
        GROUP BY id
    ),
    exploded_dates AS (
        -- One row per day of each stay: check-in day through the day before
        -- check-out. Bookings without a known hotel address are dropped by the
        -- inner join (the original LEFT JOIN + IS NOT NULL filter did the same).
        SELECT
            h.address,
            explode(sequence(
                CAST(ex.srch_ci AS DATE),
                CAST(ex.srch_co AS DATE) - INTERVAL 1 DAY,
                INTERVAL 1 DAY
            )) AS visit_date
        FROM mydatabase.expedia ex
        INNER JOIN hotels h
            ON ex.hotel_id = h.id
        -- sequence() needs start <= stop; this also discards NULL/unparseable dates.
        WHERE CAST(ex.srch_ci AS DATE) < CAST(ex.srch_co AS DATE)
    ),
    monthly_visits AS (
        -- A stay that crosses a month boundary contributes days to each month.
        SELECT
            address,
            YEAR(visit_date) AS year,
            MONTH(visit_date) AS month,
            COUNT(*) AS visits_count
        FROM exploded_dates
        GROUP BY address, YEAR(visit_date), MONTH(visit_date)
    ),
    ranked_hotels AS (
        SELECT
            address,
            year,
            month,
            visits_count,
            DENSE_RANK() OVER (PARTITION BY year, month ORDER BY visits_count DESC) AS rank
        FROM monthly_visits
    )
    SELECT
        address,
        year,
        month,
        visits_count,
        rank
    FROM ranked_hotels
    WHERE rank <= 10
""")

# Write the result as parquet, partitioned by year and month, replacing any
# previous output under datamart/2/.
(
    df_second.write.format("parquet")
    .mode("overwrite")
    .partitionBy("year", "month")
    .save(f"wasbs://{output_container}@{output_storage_account}.blob.core.windows.net/datamart/2/")
)
	

# Data mart 3: per booking with a 7-30 day stay, the temperature trend (last
# day's avg_tmpr_c minus first day's) and the average avg_tmpr_c, computed over
# the stay days that have a weather reading for the booked hotel.
df_third = spark.sql("""
    WITH exploded_dates AS (
        -- One row per day of each stay: check-in day through the day before check-out.
        SELECT
            ex.id AS booking_id,
            ex.hotel_id,
            explode(sequence(
                CAST(ex.srch_ci AS DATE),
                CAST(ex.srch_co AS DATE) - INTERVAL 1 DAY,
                INTERVAL 1 DAY
            )) AS visit_date
        FROM mydatabase.expedia ex
        WHERE DATEDIFF(ex.srch_co, ex.srch_ci) BETWEEN 7 AND 30
            AND ex.srch_co > ex.srch_ci
    ),
    joined_weather AS (
        -- Keep only stay days that have a non-NULL temperature for that hotel.
        -- The original LEFT JOIN was already reduced to an inner join by the
        -- avg_tmpr_c filter, so INNER JOIN states the actual behaviour.
        SELECT
            ed.booking_id,
            ed.hotel_id,
            ed.visit_date,
            hw.avg_tmpr_c,
            hw.address
        FROM exploded_dates ed
        INNER JOIN mydatabase.hotel_weather hw
            ON ed.hotel_id = hw.id
            AND CAST(hw.wthr_date AS DATE) = ed.visit_date
        WHERE hw.avg_tmpr_c IS NOT NULL
    ),
    windowed_temps AS (
        -- Attach the first and last temperature of the booking to every row.
        -- LAST_VALUE needs the explicit full frame: the default frame ends at
        -- the current row and would return the current row's own value.
        SELECT
            booking_id,
            hotel_id,
            address,
            visit_date,
            avg_tmpr_c,
            FIRST_VALUE(avg_tmpr_c) OVER (
                PARTITION BY booking_id
                ORDER BY visit_date
            ) AS first_temp,
            LAST_VALUE(avg_tmpr_c) OVER (
                PARTITION BY booking_id
                ORDER BY visit_date
                ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING
            ) AS last_temp
        FROM joined_weather
    ),
    temp_calculations AS (
        -- first_temp / last_temp are constant within a booking, so grouping by
        -- them only carries them through the aggregation.
        SELECT
            booking_id,
            hotel_id,
            address,
            MIN(visit_date) AS first_day,
            MAX(visit_date) AS last_day,
            ROUND(last_temp - first_temp, 2) AS temp_trend,
            ROUND(AVG(avg_tmpr_c), 2) AS avg_temperature
        FROM windowed_temps
        GROUP BY booking_id, hotel_id, address, first_temp, last_temp
    )
    SELECT
        booking_id,
        hotel_id,
        address,
        first_day,
        last_day,
        temp_trend,
        avg_temperature
    FROM temp_calculations
    WHERE temp_trend IS NOT NULL
        AND avg_temperature IS NOT NULL
        -- NOTE(review): visit_date stops the day before check-out, so a stay of
        -- exactly 7 days spans at most 6 days here and is always dropped, even
        -- though it passes the BETWEEN 7 AND 30 filter above. Confirm which
        -- bound is intended.
        AND DATEDIFF(last_day, first_day) >= 7
    -- NOTE(review): this global sort is probably not preserved by the
    -- partitioned write below; confirm whether it is still needed.
    ORDER BY ABS(temp_trend) DESC
""")

# Write the result as parquet, partitioned by first_day and last_day, replacing
# any previous output under datamart/3/.
(
    df_third.write.format("parquet")
    .mode("overwrite")
    .partitionBy("first_day", "last_day")
    .save(f"wasbs://{output_container}@{output_storage_account}.blob.core.windows.net/datamart/3/")
)