# Stage 4 — Final Dataset Assembly (Booking + Scraped Enrichment, Without Labels)

In [0]:
from pyspark.sql import functions as F

# -----------------------------------------------------------------
# Storage locations: the two Stage 4 input tables and the output
# -----------------------------------------------------------------
BOOKING_FEATURES_PATH = "dbfs:/tmp/booking_stage4/booking_features_complete_ml"
SCRAPED_ENRICH_PATH = "dbfs:/tmp/booking_stage4/scraped_enrichment_features_ml"
OUT_STAGE4_FEATURES = "dbfs:/tmp/booking_stage4/final_features_assembled_no_labels"

# -----------------------------------------------------------------
# Read both parquet tables (Booking first, then Scraped)
# -----------------------------------------------------------------
print("Loading datasets...")
df_booking, df_scraped = (
    spark.read.parquet(location)
    for location in (BOOKING_FEATURES_PATH, SCRAPED_ENRICH_PATH)
)

# Row counts as loaded, before any deduplication.
print(f"Booking Features Rows: {df_booking.count():,}")
print(f"Scraped Features Rows: {df_scraped.count():,}")

Loading datasets...
Booking Features Rows: 3,239,391
Scraped Features Rows: 3,239,391


In [0]:
# Collapse each source table to a single row per hotel_id.
# The original names are rebound on purpose: later cells that read
# `df_booking` / `df_scraped` therefore see the deduplicated frames.
df_booking, df_scraped = (
    source.dropDuplicates(["hotel_id"])
    for source in (df_booking, df_scraped)
)

print("Deduplication complete.")
print(f"Booking Unique: {df_booking.count():,}")
print(f"Scraped Unique: {df_scraped.count():,}")

Deduplication complete.
Booking Unique: 3,239,391
Scraped Unique: 3,239,391


## Enforce 1 row per hotel_id in each table 

In [0]:
def dup_hotel_ids(df, key="hotel_id"):
    """Count the `key` groups in `df` that contain more than one row.

    Args:
        df: DataFrame to inspect.
        key: Column to group by (defaults to "hotel_id").

    Returns:
        Number of groups whose row count exceeds 1; 0 means `key` is unique.
    """
    rows_per_key = df.groupBy(key).count()
    repeated_keys = rows_per_key.where(F.col("count") > 1)
    return repeated_keys.count()

# Report how many hotel_id values are repeated in each table.
print("Duplicates in Booking Features:", dup_hotel_ids(df_booking))
print("Duplicates in Scraped Features:", dup_hotel_ids(df_scraped))

# Build the `_u` (unique-per-hotel_id) frames so the join cannot fan out.
df_booking_u, df_scraped_u = (
    table.dropDuplicates(["hotel_id"])
    for table in (df_booking, df_scraped)
)

Duplicates in Booking Features: 0
Duplicates in Scraped Features: 0


## Check for column name collisions (besides hotel_id)


In [0]:
def collisions(df1, df2, key="hotel_id"):
    """Return the sorted column names shared by both frames, excluding `key`.

    Args:
        df1: First frame; only its `.columns` attribute is read.
        df2: Second frame; only its `.columns` attribute is read.
        key: Join column that is expected to be shared and is therefore ignored.

    Returns:
        Alphabetically sorted list of the overlapping column names.
    """
    right_columns = set(df2.columns)
    shared = {
        name
        for name in df1.columns
        if name in right_columns and name != key
    }
    return sorted(shared)

# Any non-key column present in both frames would be ambiguous after the join.
coll = collisions(df_booking_u, df_scraped_u)

print("Collisions found:", coll)

# Guard clause: stop the notebook here if any overlapping column was found.
if coll:
    raise ValueError(f"Found column collisions: {coll}. Rename columns in Scraped dataset before joining.")

print("✅ No collisions found. Safe to proceed.")

Collisions found: []
✅ No collisions found. Safe to proceed.


## Join Booking features with Scraped enrichment (left join on hotel_id)

In [0]:
# Left join on hotel_id: every row of the Booking frame is kept, and the
# columns of the Scraped frame are attached where a matching hotel_id exists.
# The join uses the `_u` frames because those are the ones that were
# deduplicated and collision-checked in the preceding cells; joining the
# un-suffixed frames only worked because an earlier cell happened to rebind them.
df_joined = df_booking_u.join(df_scraped_u, on="hotel_id", how="left")

# Replace null `has_enrichment` values with 0 so the flag is never missing.
# NOTE(review): presumably the nulls are hotels with no scraped match — confirm.
df_features = df_joined.na.fill(0, subset=["has_enrichment"])

print("Join Complete.")
print(f"Total Columns: {len(df_features.columns)}")
# Distribution of the enrichment flag across all hotels.
display(df_features.groupBy("has_enrichment").count())

Join Complete.
Total Columns: 71


has_enrichment,count
1,507220
0,2732171


## Coverage and missingness checks 

In [0]:
# Quick non-null check for key columns from both sources.
# Coverage rate = 1 - mean(is null), i.e. the fraction of rows with a value.
expected_cols = [
    "has_free_wifi",         # From Booking
    "has_kitchen",           # From Booking
    "landmark_count_2km",    # From Booking
    "count_cafes_500m",      # From Scraped
]

available_cols = set(df_features.columns)
check_cols = [c for c in expected_cols if c in available_cols]
missing_cols = [c for c in expected_cols if c not in available_cols]

# Surface skipped columns explicitly: silently dropping them makes an absent
# feature indistinguishable from one that was checked and found complete.
if missing_cols:
    print(f"WARNING: expected columns not found in df_features (skipped): {missing_cols}")

# agg() needs at least one expression, so only run it when something is left to check.
if check_cols:
    display(df_features.agg(*[
        (1 - F.mean(F.col(c).isNull().cast("double"))).alias(f"{c}_coverage_rate")
        for c in check_cols
    ]))
else:
    print("No coverage columns available to check.")

has_free_wifi_coverage_rate,has_kitchen_coverage_rate,landmark_count_2km_coverage_rate
1.0,1.0,1.0


## Save Stage 4 combined feature dataset (NO LABELS)

In [0]:
print(f"Saving final assembled dataset to: {OUT_STAGE4_FEATURES}")

# Write the assembled table as parquet, replacing any previous output at this path.
df_features.write.parquet(OUT_STAGE4_FEATURES, mode="overwrite")

print("SUCCESS! Final Feature Table Saved.")

Saving final assembled dataset to: dbfs:/tmp/booking_stage4/final_features_assembled_no_labels
SUCCESS! Final Feature Table Saved.
