In [0]:
booking_clean_path = "dbfs:/tmp/booking_clean/booking_clean.parquet"
booking_df = spark.read.parquet(booking_clean_path)


In [0]:
# Force Spark to forget old cached files
spark.catalog.clearCache()

# Re-read booking data fresh
booking_clean_path = "dbfs:/tmp/booking_clean/booking_clean.parquet"
booking_df = spark.read.parquet(booking_clean_path)


In [0]:
booking_df.count()


3239391

In [0]:
booking_df.printSchema()


root
 |-- hotel_id: string (nullable = true)
 |-- url: string (nullable = true)
 |-- title: string (nullable = true)
 |-- city: string (nullable = true)
 |-- country: string (nullable = true)
 |-- location: string (nullable = true)
 |-- lat: double (nullable = true)
 |-- lon: double (nullable = true)
 |-- metro_railway_access: boolean (nullable = true)
 |-- description: string (nullable = true)
 |-- fine_print: string (nullable = true)
 |-- property_highlights: string (nullable = true)
 |-- property_information: string (nullable = true)
 |-- most_popular_facilities: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- house_rules: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- rule: string (nullable = true)
 |    |    |-- description: string (nullable = true)
 |-- manager_language_spoken: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- availability: array (nullable = true)
 |    |-- element: st

In [0]:
# -------------------------------
# Positive evidence (weighted) — English only
# -------------------------------

FAMILY_KEYWORDS = {
    # strong signals (direct family amenities)
    "crib": 2,
    "cot": 2,
    "baby cot": 2,
    "high chair": 2,
    "stroller": 2,
    "playground": 2,
    "kids club": 2,
    "family friendly": 2,
    "child friendly": 2,
    "kids friendly": 2,
    "family rooms": 2,

    # weak signals (mentions of family/kids)
    "family": 1,
    "families": 1,
    "kids": 1,
    "children": 1,
    "child": 1,
    "baby": 1,
    "toddler": 1,
    "infant": 1,
}

REMOTE_KEYWORDS = {
    # strong signals (work setup + strong connectivity)
    "desk": 2,
    "work desk": 2,
    "workspace": 2,
    "work space": 2,
    "working remotely": 2,
    "remote work": 2,
    "good for work": 2,
    "work friendly": 2,
    "home office": 2,
    "fast wifi": 2,
    "high speed internet": 2,
    "great wifi": 2,
    "excellent wifi": 2,

    # weak signals (common but still relevant)
    "wifi": 1,
    "wi-fi": 1,
    "wi fi": 1,
    "internet": 1,
    "quiet": 1,
    "peaceful": 1,
    "calm": 1,
    "long stay": 1,
    "extended stay": 1,
}

TOURIST_KEYWORDS = {
    # strong signals (walkability / sights / prime location)
    "old town": 2,
    "walking distance": 2,
    "walkable": 2,
    "close to attractions": 2,
    "near attractions": 2,
    "close to city center": 2,
    "near city center": 2,
    "near metro": 2,
    "near subway": 2,
    "near train": 2,
    "near station": 2,
    "near beach": 2,

    # weak signals (broader tourism hints)
    "central": 1,
    "city center": 1,
    "downtown": 1,
    "sightseeing": 1,
    "attractions": 1,
    "restaurants nearby": 1,
    "bars nearby": 1,
    "nightlife": 1,
    "night market": 1,
    "museum": 1,
    "museums": 1,
}


In [0]:
# -------------------------------
# Negative evidence (penalties) — English only
# -------------------------------

FAMILY_NEGATIVE = {
    # safety / neighborhood
    "unsafe": 3,
    "not safe": 3,
    "felt unsafe": 3,
    "didn't feel safe": 3,
    "dangerous": 3,
    "danger": 2,
    "sketchy": 2,
    "shady area": 2,
    "bad neighborhood": 2,
    "rough area": 2,

    # kid-specific
    "unsafe for kids": 4,
    "not safe for kids": 4,
    "not suitable for kids": 3,
    "not child friendly": 3,
    "not family friendly": 3,

    # noise (families care)
    "noisy at night": 2,
    "very noisy": 2,
    "too noisy": 2,
    "loud at night": 2,
    "couldn't sleep": 2,
    "thin walls": 1,
    "noise from street": 1,
    "noise from neighbors": 1,

    # cleanliness / hygiene (often mentioned by families)
    "dirty": 2,
    "not clean": 2,
    "unclean": 2,
    "mold": 2,
    "smelly": 1,
    "bad smell": 1,
}

REMOTE_NEGATIVE = {
    # internet quality (critical)
    "no wifi": 4,
    "no wi-fi": 4,
    "no internet": 4,
    "without internet": 4,
    "wifi not working": 4,
    "wi-fi not working": 4,
    "internet not working": 4,

    # speed / stability
    "slow wifi": 3,
    "wifi was slow": 3,
    "slow internet": 3,
    "internet was slow": 3,
    "unstable internet": 3,
    "wifi kept dropping": 3,
    "kept disconnecting": 3,
    "poor wifi": 3,
    "bad wifi": 3,
    "weak wifi": 3,
    "wifi signal was weak": 3,

    # workspace (if they complain, it's negative evidence)
    "no desk": 2,
    "no workspace": 2,
    "not suitable for work": 3,
    "hard to work": 2,
    "couldn't work": 3,
}

TOURIST_NEGATIVE = {
    # location / accessibility
    "far from center": 3,
    "far from the center": 3,
    "far from city center": 3,
    "not central": 2,
    "too far": 2,
    "far away": 2,
    "long walk": 2,
    "long walk to": 2,
    "not walkable": 2,

    # transport access problems
    "far from metro": 2,
    "far from station": 2,
    "no public transport": 3,
    "hard to get to": 2,
    "difficult to reach": 2,

    # attraction / things to do
    "nothing to do": 2,
    "no attractions nearby": 3,
    "far from attractions": 2,

    # safety (tourists also mention this)
    "unsafe area": 3,
    "unsafe neighborhood": 3,
    "felt unsafe": 3,
}


In [0]:
import re

# -------------------------------
# Regex helpers
# -------------------------------

# Keep it English-only, but include common negation forms
NEGATION_PATTERN = r"(no|not|without|lack|lacking|never|none|can't|cannot|won't|isn't|aren't|wasn't|weren't|didn't|doesn't|don't)"

def word_regex(word: str) -> str:
    """
    Safe word-boundary regex for single words.
    Example: wifi -> \bwifi\b
    """
    escaped = re.escape(word.lower())
    return rf"\b{escaped}\b"

def phrase_regex(phrase: str) -> str:
    """
    Safe regex for multi-word phrases.
    We still add word boundaries at the ends to reduce false matches.
    Example: 'fast wifi' -> \bfast\ wifi\b
    """
    escaped = re.escape(phrase.lower())
    return rf"\b{escaped}\b"

def negated_pattern(phrase: str, window_words: int = 3) -> str:
    """
    Match negation within a short window before the phrase.
    Handles patterns like:
      - 'no wifi'
      - 'wifi was not working' (if phrase is 'working' you’d handle differently)
      - 'not stable internet'
    We allow up to `window_words` words between negation and phrase.
    """
    escaped = re.escape(phrase.lower())
    # allow: NEGATION + up to N words + phrase
    return rf"{NEGATION_PATTERN}(?:\s+\w+){{0,{window_words}}}\s+\b{escaped}\b"


In [0]:
from functools import reduce
from pyspark.sql import functions as F

def persona_score(text_col, positive_rules, negative_rules):
    """
    Compute persona score with:
    - weighted positives
    - negation blocking (don't count positive if negated)
    - explicit negative penalties (subtract)
    """
    score_exprs = []

    # Positive evidence
    for phrase, weight in positive_rules.items():
        pattern = phrase_regex(phrase) if " " in phrase else word_regex(phrase)
        neg_pattern = negated_pattern(phrase)

        score_exprs.append(
            F.when(
                (text_col.rlike(pattern)) & (~text_col.rlike(neg_pattern)),
                F.lit(int(weight))
            ).otherwise(F.lit(0))
        )

    # Negative evidence (penalties)
    for phrase, penalty in negative_rules.items():
        pattern = phrase_regex(phrase) if " " in phrase else word_regex(phrase)

        score_exprs.append(
            F.when(
                text_col.rlike(pattern),
                F.lit(-int(penalty))
            ).otherwise(F.lit(0))
        )

    # If no rules were passed, avoid reduce() crash
    if not score_exprs:
        return F.lit(0)

    return reduce(lambda a, b: a + b, score_exprs)


In [0]:
from pyspark.sql import functions as F

labeling_df = (
    booking_df
    .select(
        F.col("hotel_id"),
        # Get Reviews (Null-safe)
        F.coalesce(F.col("top_reviews"), F.array()).alias("top_reviews"),
        # Get Description (Null-safe) - NEW ADDITION
        F.lower(F.coalesce(F.col("description"), F.lit(""))).alias("description_text")
    )
    .withColumn(
        "reviews_array",
        F.expr("transform(top_reviews, x -> lower(coalesce(x.review, '')))")
    )
    # 1. Join reviews into one string
    .withColumn("reviews_joined", F.concat_ws(" ", F.col("reviews_array")))
    # 2. Combine Description + Reviews
    .withColumn("full_text_raw", F.concat_ws(" ", F.col("description_text"), F.col("reviews_joined")))
    # 3. Clean text (remove special chars/extra spaces)
    .withColumn("reviews_text", F.regexp_replace(F.col("full_text_raw"), r"[^a-z0-9\s]", " "))
    .withColumn("reviews_text", F.regexp_replace(F.col("reviews_text"), r"\s+", " "))
    .withColumn("reviews_text", F.trim(F.col("reviews_text")))
    # 4. Calculate Length (Critical for filtering)
    .withColumn("text_len", F.length(F.col("reviews_text")))
    # 5. Calculate has_reviews_text
    .withColumn("has_reviews_text", (F.col("text_len") > 0).cast("int"))
    # Clean up intermediate columns
    .drop("top_reviews", "reviews_array", "description_text", "reviews_joined", "full_text_raw")
)

display(labeling_df.select("hotel_id", "text_len", "has_reviews_text", "reviews_text").limit(3))

hotel_id,text_len,has_reviews_text,reviews_text
8908679,2323,1,providing a garden komfortowe noclegi provides accommodations in sza sza this homestay offers free private parking private check in and check out and free wifi g rnik zabrze is 5 6 miles away and ruch chorz w stadium is 18 miles from the homestay offering a balcony and garden views the homestay includes 2 bedrooms a living room cable flat screen tv an equipped kitchen and 1 bathroom with a shower towels and bed linen are featured in the homestay the accommodation is non smoking stadion l ski is 19 miles from the homestay while silesia city center shopping mall is 21 miles away katowice airport is 25 miles from the property beautiful spacious and very well equipped living space the hostess had thought of everything you might need friendly and responsive hostess quiet location very comfortable home stay will definately come again everything was perfect well decorated clean with a huge bathroom a nice kitchen and 2 comfy bedrooms nice appartment in new house in suburbs of gliwice private free parking nice garden host was very welcoming and nice 1 welcome dinner prepared 2 clean and tidy 3 garage offered 4 quite and communicative place closet to a1 highway 5 contacting you before arrival 6 any kind of vanity sets available 7 strongly recommended everything was excellent klidn lokalita na okraji obce naprosto luxusn ubytov n skv le vybaven pohodln postele v odd len ch lo nic ch je super m t soukrom prostorn kr sn koupelna s vanou i sprchov m koutem c tili jsme se snad l p ne doma pan majitelka byla velice mil a ochotn nem m v bec dn v hrady bylo to jedno z nejhez ch ubytov n kter jsem m la sch ne lage in gepflegter umgebung beste vermieterin die ich je getroffen habe super freundlich und hilfsbereit sehr sch nes appartement alles vorhanden was man sich vorstellen kann sogar eine garage f rs auto vorhanden kurzum besser geht nicht alles war perfekt v e naprosto perfektn ist utuln pani hostitelka velmi p jemn ochotn przemili w a ciciele wszystko czego potrzebujesz jest na miejscu czy ciutko wietna lokalizacja miejsce godne polecenia i ponownego odwiedzenia die freundlichkeit der gastgeber war unglaublich und allein das w re die reise wert gewesen es war einer der besten aufenthalte die wir je ber booking com gebucht haben es gibt absolut nichts kritisch anzumerken es hat uns alles gefallen
2246827,983,1,sunshine guest house offers accommodations in hualien city 10 miles from liyu lake and 24 miles from taroko national park popular points of interest nearby include nanbin park hualien city god temple and meilun mountain park free wifi a tour desk and a shared lounge are featured a terrace with sea view a cable flat screen tv and air conditioning are available in some units at the homestay every unit is equipped with a private bathroom a car rental service is available at the homestay popular points of interest near sunshine guest house include beibin park beach pine garden and eastern railway site hualien airport is 2 5 miles from the property location is good and host is very nice also the shower is very strong no nice sch nes ger umiges zimmer mit netter ausstattung unser zimmer hatte auch eine kleine nette terrasse inhaber waren sehr nett und freundlich haben uns viele empfehlungen f r ausfl ge und sehensw rdigkeiten gegeben k nnen wir auf jeden fall weiterempfehlen
12275653,1008,1,providing a garden apartmani srna igalo provides accommodations in igalo the property is around a 7 minute walk from titova vila galeb beach 1 8 miles from herceg novi clock tower and 2 1 miles from forte mare fortress private parking can be arranged at an extra charge units come with air conditioning and certain units at the apartment complex have a balcony at the apartment complex every unit includes a terrace a private bathroom and a flat screen tv roman mosaics is 18 miles from the apartment while sub city shopping center is 23 miles away domacica zdenka je divna izasla nam je u susret za sve sto nam je bilo potrebno zaista sve pohvale utrolig koselig dame som drev stedet sjekket alltid om alt gikk fint med oss smilte og var blid hele tiden f rste rommet vi l p var set fortsatt mat fra forrige folkene i fryseren s luktet litt men ellers bra du f r hva du betaler for og damen som eier er verdens hyggeligste dame ganske bratt opp til leilighetene s passer ikke g for de som er d rlig til bens


In [0]:
from pyspark.sql import functions as F

MIN_TEXT_LEN = 10  

scored_df = (
    labeling_df
    .withColumn("eligible_for_labeling", (F.col("text_len") >= MIN_TEXT_LEN).cast("int"))
    .withColumn(
        "family_score",
        F.when(F.col("eligible_for_labeling") == 1,
               persona_score(F.col("reviews_text"), FAMILY_KEYWORDS, FAMILY_NEGATIVE)
        ).otherwise(F.lit(0))
    )
    .withColumn(
        "remote_score",
        F.when(F.col("eligible_for_labeling") == 1,
               persona_score(F.col("reviews_text"), REMOTE_KEYWORDS, REMOTE_NEGATIVE)
        ).otherwise(F.lit(0))
    )
    .withColumn(
        "tourist_score",
        F.when(F.col("eligible_for_labeling") == 1,
               persona_score(F.col("reviews_text"), TOURIST_KEYWORDS, TOURIST_NEGATIVE)
        ).otherwise(F.lit(0))
    )
)

display(scored_df.select("hotel_id", "text_len", "eligible_for_labeling", "family_score", "remote_score", "tourist_score").limit(10))


hotel_id,text_len,eligible_for_labeling,family_score,remote_score,tourist_score
8908679,2323,1,0,2,1
2246827,983,1,0,3,0
12275653,1008,1,0,0,0
7877735,2825,1,0,1,4
1985847,2380,1,1,2,1
6921514,2892,1,1,1,4
4767573,3826,1,-1,3,2
291872,3165,1,1,3,1
1383532,3287,1,1,4,0
11140846,714,1,0,3,1


In [0]:
from pyspark.sql import functions as F


debug_df = (
    scored_df
    .select(
        "hotel_id",
        "eligible_for_labeling",
        "family_score",
        "remote_score",
        "tourist_score",
        F.substring(F.col("reviews_text"), 1, 300).alias("text_snippet")
    )
    .where(
        (F.col("eligible_for_labeling") == 1) &
        (
            (F.col("family_score") > 0) |
            (F.col("remote_score") > 0) |
            (F.col("tourist_score") > 0)
        )
    )
    .limit(10)
)

display(debug_df)


hotel_id,eligible_for_labeling,family_score,remote_score,tourist_score,text_snippet
8908679,1,0,2,1,providing a garden komfortowe noclegi provides accommodations in sza sza this homestay offers free private parking private check in and check out and free wifi g rnik zabrze is 5 6 miles away and ruch chorz w stadium is 18 miles from the homestay offering a balcony and garden views the homestay incl
2246827,1,0,3,0,sunshine guest house offers accommodations in hualien city 10 miles from liyu lake and 24 miles from taroko national park popular points of interest nearby include nanbin park hualien city god temple and meilun mountain park free wifi a tour desk and a shared lounge are featured a terrace with sea v
7877735,1,0,1,4,hvile 12 hvile stay in toru offers accommodations with free wifi a 14 minute walk from planetarium 0 6 miles from old town hall and a 12 minute walk from copernicus monument the property is around 1 6 miles from toru wschodni railway station 1 8 miles from atrium copernicus shopping center and 2 4 m
1985847,1,1,2,1,located in mount surprise discovery resorts undara has accommodations with a year round outdoor pool free wifi a garden and a restaurant some units are air conditioned and include a balcony and or a patio as well as a seating area a grill is available on site and hiking can be enjoyed within close p
6921514,1,1,1,4,prestigeloc business confort 3 grandes chambres 2 sdb hyper centre is located in orl ans just 2 8 miles from gare des aubrais and 12 miles from chateau de meung sur loire the apartment set in a building dating from 1948 is 30 miles from chateau de talcy and 31 miles from chateau de sully sur loire f
4767573,1,-1,3,2,located in shirahama a 10 minute walk from shirarahama beach seamore residence has accommodations with a garden free private parking a shared lounge and a terrace this 3 star hotel offers a shared kitchen and free wifi guests can have a drink at the bar at the hotel all rooms come with a desk the ro
291872,1,1,3,1,in vela s main square this family run hotel is just over one mile from trent train station with panoramic mountain views it has free wi fi and free parking in its garage hotel vela has a quiet location and easy access to public transportation into central trento with buses stopping right outside all
1383532,1,1,4,0,featuring free wifi and a restaurant hostal y camping los girasoles offers accommodations in salento the hostel has a barbecue and terrace and guests can enjoy a drink at the bar all rooms have a private bathroom fitted with a shower a tv is featured in some units there is a 24 hour front desk at th
11140846,1,0,3,1,located in yonago 1 7 miles from kaike onsen beach hotel wakow vacation stay 22137v has accommodations with free wifi and free private parking the property is around 11 miles from mizuki shigeru road 21 miles from lake shinji and 22 miles from lafcadio hearn memorial museum matsue station is 20 mile
2791826,1,0,3,0,villa des pitons is located in soufri re 1 6 miles from soufriere beach featuring room service this property also provides guests with a sun terrace there s a year round outdoor pool and guests can use free wifi and free private parking rooms are equipped with air conditioning a safety deposit box a


In [0]:
from pyspark.sql import functions as F

THR_FAMILY  = 2
THR_REMOTE  = 2
THR_TOURIST = 2

labeled_df = (
    scored_df
    .withColumn("label_family",  (F.col("family_score")  >= F.lit(THR_FAMILY)).cast("int"))
    .withColumn("label_remote",  (F.col("remote_score")  >= F.lit(THR_REMOTE)).cast("int"))
    .withColumn("label_tourist", (F.col("tourist_score") >= F.lit(THR_TOURIST)).cast("int"))
    .withColumn("n_labels", (F.col("label_family") + F.col("label_remote") + F.col("label_tourist")).cast("int"))
)


In [0]:
summary = (
    labeled_df
    .groupBy("n_labels")
    .agg(
        F.count("*").alias("rows"),
        F.sum("label_family").alias("family_count"),
        F.sum("label_remote").alias("remote_count"),
        F.sum("label_tourist").alias("tourist_count")
    )
    .orderBy("n_labels")
)

display(summary)

n_labels,rows,family_count,remote_count,tourist_count
0,1836550,0,0,0
1,980750,162728,594985,223037
2,380022,166128,354446,239470
3,42069,42069,42069,42069


In [0]:
from pyspark.sql import functions as F

coverage_overall = (
    labeled_df
    .agg(
        F.count("*").alias("total_rows"),
        F.sum(F.col("eligible_for_labeling")).alias("eligible_rows"),
        F.sum((F.col("n_labels") >= 1).cast("int")).alias("labeled_rows"),
    )
    .withColumn("eligible_pct", F.col("eligible_rows") / F.col("total_rows"))
    .withColumn("labeled_pct_overall", F.col("labeled_rows") / F.col("total_rows"))
    .withColumn("labeled_pct_among_eligible", F.col("labeled_rows") / F.col("eligible_rows"))
)

display(coverage_overall)


total_rows,eligible_rows,labeled_rows,eligible_pct,labeled_pct_overall,labeled_pct_among_eligible
3239391,3239386,1402841,0.9999984565000026,0.4330570159638031,0.4330576843883378


In [0]:
from pyspark.sql import functions as F

# total rows (cheap if cached)
total_rows = labeled_df.count()

overlap_summary = (
    labeled_df
    .groupBy("n_labels")
    .agg(F.count("*").alias("rows"))
    .withColumn("pct", F.col("rows") / F.lit(total_rows))
    .orderBy("n_labels")
)

display(overlap_summary)


n_labels,rows,pct
0,1836550,0.5669429840361969
1,980750,0.3027575244853122
2,380022,0.1173127912005682
3,42069,0.0129867002779226


In [0]:
class_balance = (
    labeled_df
    .agg(
        F.sum("label_family").alias("family_pos"),
        F.sum("label_remote").alias("remote_pos"),
        F.sum("label_tourist").alias("tourist_pos"),
        F.count("*").alias("total_rows"),
        F.sum(F.col("eligible_for_labeling")).alias("eligible_rows"),
    )
    .withColumn("family_pos_rate_overall", F.col("family_pos") / F.col("total_rows"))
    .withColumn("remote_pos_rate_overall", F.col("remote_pos") / F.col("total_rows"))
    .withColumn("tourist_pos_rate_overall", F.col("tourist_pos") / F.col("total_rows"))
)

display(class_balance)


family_pos,remote_pos,tourist_pos,total_rows,eligible_rows,family_pos_rate_overall,remote_pos_rate_overall,tourist_pos_rate_overall
370925,991500,504576,3239391,3239386,0.1145045473053422,0.3060760494796707,0.1557626109352035


In [0]:
class_balance_eligible = (
    labeled_df
    .where(F.col("eligible_for_labeling") == 1)
    .agg(
        F.sum("label_family").alias("family_pos"),
        F.sum("label_remote").alias("remote_pos"),
        F.sum("label_tourist").alias("tourist_pos"),
        F.count("*").alias("eligible_rows"),
    )
    .withColumn("family_pos_rate_eligible", F.col("family_pos") / F.col("eligible_rows"))
    .withColumn("remote_pos_rate_eligible", F.col("remote_pos") / F.col("eligible_rows"))
    .withColumn("tourist_pos_rate_eligible", F.col("tourist_pos") / F.col("eligible_rows"))
)

display(class_balance_eligible)


family_pos,remote_pos,tourist_pos,eligible_rows,family_pos_rate_eligible,remote_pos_rate_eligible,tourist_pos_rate_eligible
370925,991500,504576,3239386,0.1145047240433835,0.3060765219087815,0.1557628513551642


In [0]:
qa_len = (
    labeled_df
    .where(F.col("eligible_for_labeling") == 1)
    .withColumn("label_group", F.when(F.col("n_labels") >= 1, "labeled").otherwise("unlabeled"))
    .groupBy("label_group")
    .agg(
        F.count("*").alias("rows"),
        F.expr("percentile(text_len, 0.25)").alias("p25_len"),
        F.expr("percentile(text_len, 0.50)").alias("median_len"),
        F.expr("percentile(text_len, 0.75)").alias("p75_len"),
        F.expr("percentile(text_len, 0.90)").alias("p90_len")
    )
)

display(qa_len)


label_group,rows,p25_len,median_len,p75_len,p90_len
labeled,1402841,1044.0,2369.0,3177.0,3835.0
unlabeled,1836545,627.0,781.0,1397.0,2583.0


In [0]:
sample_labeled = (
    labeled_df
    .where((F.col("eligible_for_labeling") == 1) & (F.col("n_labels") >= 1))
    .select(
        "hotel_id",
        "label_family","label_remote","label_tourist","n_labels",
        "family_score","remote_score","tourist_score",
        "text_len",
        F.substring("reviews_text", 1, 500).alias("snippet")
    )
    .orderBy(F.rand(seed=42))
    .limit(20)
)

display(sample_labeled)


com.databricks.backend.common.rpc.CommandCancelledException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$5(SequenceExecutionState.scala:136)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:136)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:133)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:133)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:728)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:446)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:446)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.can

In [0]:
sample_unlabeled = (
    labeled_df
    .where((F.col("eligible_for_labeling") == 1) & (F.col("n_labels") == 0))
    .select(
        "hotel_id",
        "text_len",
        F.substring("reviews_text", 1, 500).alias("snippet")
    )
    .orderBy(F.col("text_len").desc())
    .limit(20)
)

display(sample_unlabeled)


com.databricks.backend.common.rpc.CommandCancelledException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$5(SequenceExecutionState.scala:136)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:136)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:133)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:133)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:728)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:446)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:446)
	at com.databricks.spark.chauffeur.ExecutionContextManagerV1.can

In [0]:
# 1. DEFINE PATH
TEACHER_OUT = "dbfs:/tmp/booking_stage4/teacher_labels_multilabel_desc_reviews_v2"

# 2. OPTIMIZED LOGIC (With Repartition)
teacher_labels = (
    labeled_df
    .where((F.col("eligible_for_labeling") == 1) & (F.col("n_labels") >= 1))
    .select(
        "hotel_id",
        "label_family", "label_remote", "label_tourist",
        "family_score", "remote_score", "tourist_score"
    )
    .repartition(200) 
)

# 3. WRITE TO DISK (This effectively "caches" it permanently)
teacher_labels.write.mode("overwrite").parquet(TEACHER_OUT)
print(f"Saved successfully to: {TEACHER_OUT}")

# 4. READ IT BACK (Instant Count)
# We read the saved file instead of re-calculating the variable.
saved_df = spark.read.parquet(TEACHER_OUT)
print("Teacher rows:", saved_df.count())
display(saved_df.limit(10))

Saved successfully to: dbfs:/tmp/booking_stage4/teacher_labels_multilabel_desc_reviews_v2
Teacher rows: 1402841


hotel_id,label_family,label_remote,label_tourist,family_score,remote_score,tourist_score
10268911,0,1,1,-1,4,3
9606539,0,0,1,0,1,2
6633437,0,1,0,-2,3,1
9667482,1,0,0,3,1,0
3935850,0,1,1,1,2,4
54396,0,1,0,0,5,1
381920,0,1,0,0,4,1
3033512,0,0,1,1,-2,2
6911108,0,1,1,0,3,2
12777908,0,1,1,0,4,2
